# RAG

## 1. Index Data

In [1]:
import hashlib
import json
import os

import numpy as np
import pandas as pd
import requests
from openai import OpenAI

In [2]:
# Location of the arXiv metadata sample. The default is a machine-specific
# absolute path, so it can be overridden through the ARXIV_DATA_PATH
# environment variable instead of editing the notebook.
DATA_PATH = os.environ.get(
    "ARXIV_DATA_PATH",
    r"D:\Personal\OneDrive\Gabrilyi\arxiv_project\arxiv_metadata_sample.parquet.gzip",
)
# Forward slashes resolve on Windows, Linux and macOS alike, and match the
# "../data/chroma" path this notebook uses for the vector store.
CATEGORIES_PATH = "../data/categories.json"

In [3]:
def load_data(p):
    """Read the parquet file at ``p`` with pandas and return the result."""
    frame = pd.read_parquet(p)
    return frame

def load_categories(p):
    """Parse the JSON file at ``p`` and return the decoded object.

    The file is opened explicitly as UTF-8 so that non-ASCII characters are
    decoded the same way on every platform, rather than with the locale's
    default encoding.
    """
    with open(p, "r", encoding="utf-8") as f:
        return json.load(f)


# Load both inputs once: `df` is the parquet file at DATA_PATH read by pandas,
# `categories` is the parsed content of the JSON file at CATEGORIES_PATH.
df = load_data(DATA_PATH)
categories = load_categories(CATEGORIES_PATH)

In [4]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,id,title,abstract,categories,update_date,title_words,abstract_words,mapped_categories,amount_categories,update_year
0,1911.1286,Double-Scaling Limit in Principal Chiral Model...,"We initiate a systematic, non-perturbative s...",[hep-th],2020-05-20,10,116,[High Energy Physics - Theory],1,2020
1,astro-ph/9509086,Genus Statistics of the Large-Scale Structure ...,As a statistical measure to quantify the top...,[astro-ph],2009-10-28,10,95,[Astrophysics],1,2009
2,1106.3718,Quantum Efficiency of Intermediate-Band Solar ...,As an appealing concept for developing next-...,[cond-mat.mtrl-sci],2012-10-10,12,165,[Materials Science],1,2012
3,2212.10001,Towards Understanding Chain-of-Thought Prompti...,Chain-of-Thought (CoT) prompting can dramati...,[cs.CL],2023-06-02,10,160,[Computation and Language],1,2023
4,1806.01954,Mining for meaning: from vision to language th...,Describing visual data into natural language...,[cs.CV],2020-05-26,11,161,[Computer Vision and Pattern Recognition],1,2020


In [11]:
# Work on the first 1000 rows only (explicit positional slice).
df = df.iloc[:1000]

In [12]:
from langchain_community.document_loaders import TextLoader
from langchain.docstore.document import Document

# Wrap every abstract in a LangChain Document, recording the row's "id" value
# as the document's "source" metadata.
docs = [
    Document(page_content=abstract, metadata={"source": paper_id})
    for abstract, paper_id in zip(df["abstract"], df["id"])
]

In [13]:
# One Document per row of `df`; the count should equal the subset size.
len(docs)

1000

### Split

In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split each abstract into overlapping character chunks. add_start_index stores
# each chunk's offset in its source text under metadata["start_index"].
# In the recorded run this gave 2353 chunks from 1000 abstracts (~2.35x).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=64,
    add_start_index=True,
)
docs_split = text_splitter.split_documents(docs)

len(docs_split)

2353

### Store

In [22]:
from langchain_chroma import Chroma
import chromadb


class Embedder:
    """Embedding adapter backed by an OpenAI-compatible HTTP endpoint.

    Provides ``embed_documents`` and ``embed_query``, the two methods this
    notebook relies on when an instance is passed as the ``embedding`` of the
    Chroma vector store.
    """

    def __init__(
        self,
        model="second-state/All-MiniLM-L6-v2-Embedding-GGUF",
        base_url="http://localhost:5000/v1",
        api_key="lm-studio",
        batch_size=64,
    ):
        """Create the API client.

        Args:
            model: Name of the embedding model to request.
            base_url: Root URL of the OpenAI-compatible server.
            api_key: Key sent with each request. The default is presumably a
                placeholder accepted by a local LM Studio server (confirm).
            batch_size: Maximum number of texts sent per embeddings request.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.model = model
        self.batch_size = batch_size
        self.client = OpenAI(base_url=base_url, api_key=api_key)

    def embed_documents(self, texts):
        """Return one embedding per input text, in input order.

        Texts are sent in slices of at most ``batch_size`` so a large corpus
        is not posted as a single request. An empty input returns ``[]``
        without contacting the server.
        """
        texts = list(texts)
        vectors = []
        for start in range(0, len(texts), self.batch_size):
            batch = texts[start:start + self.batch_size]
            response = self.client.embeddings.create(input=batch, model=self.model)
            # Results are appended in the order the server returns them.
            vectors.extend(item.embedding for item in response.data)
        return vectors

    def embed_query(self, query: str):
        """Return the embedding of a single query string."""
        response = self.client.embeddings.create(input=query, model=self.model)
        return response.data[0].embedding

embedder = Embedder()


def _chunk_id(doc):
    """Return a deterministic id built from a chunk's source, offset and text."""
    key = "\x1f".join(
        (
            str(doc.metadata.get("source")),
            str(doc.metadata.get("start_index")),
            doc.page_content,
        )
    )
    return hashlib.sha256(key.encode("utf-8")).hexdigest()


# Without explicit ids, re-running this cell against the persisted directory
# stores the same chunks again (the recorded retrieval output in this notebook
# returns an identical chunk twice). Stable ids are meant to let a re-run
# replace existing entries instead of appending new ones.
# The dict also collapses exact duplicates, since repeated ids in one insert
# are rejected by the store.
# NOTE(review): entries already persisted by earlier runs are not removed;
# delete ../data/chroma once to clear them.
chunks_by_id = {_chunk_id(doc): doc for doc in docs_split}

vectorstore = Chroma.from_documents(
    documents=list(chunks_by_id.values()),
    embedding=embedder,
    ids=list(chunks_by_id.keys()),
    persist_directory="../data/chroma",
)

client = chromadb.PersistentClient(path="../data/chroma")



In [23]:
# Similarity-search retriever over the vector store, limited to 6 results.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

# Smoke-test the retriever with an arbitrary sentence.
retrieved_docs = retriever.invoke("The quick brown fox jumps over the lazy dog.")

len(retrieved_docs)

6

In [26]:
# Inspect the retrieved chunks and their metadata.
# NOTE(review): the recorded output lists the same chunk (same source and
# start_index) twice, which suggests the persisted store contains duplicates.
retrieved_docs

[Document(page_content='exponent of the quenched-averaged dispersion of the walker is sub-ballistic but\nsuper-diffusive. We also show that the features are universal to a class of\nsub- and super-Poissonian distributed quenched randomized jumps.', metadata={'source': '1806.04024', 'start_index': 452}),
 Document(page_content='exponent of the quenched-averaged dispersion of the walker is sub-ballistic but\nsuper-diffusive. We also show that the features are universal to a class of\nsub- and super-Poissonian distributed quenched randomized jumps.', metadata={'source': '1806.04024', 'start_index': 452}),
 Document(page_content='with the nulling-drifting interaction: the unexpected brightness of the first\nactive pulse and the low post-null driftrate. We put forward a new\ninterpretation of the subpulse-position jump over the null, indicating that the\nspeedup time scale of the post-null drifting is much shorter than previously\nthought. The speedup time scale we find is no longer discrep