<a href="https://colab.research.google.com/github/hosseinhimself/Technical-Assessment-RAG/blob/main/Untitled19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q fastapi uvicorn
!pip install -q llama-index
!pip install -q llama-index-embeddings-huggingface

!pip install -q faiss-cpu
!pip install -q llama-index-vector-stores-faiss

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.5/142.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.7/307.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModel
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex


from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

import faiss
import numpy as np
import torch
import pickle


In [3]:
class DocumentProcessor:
    """Loads documents from a directory and drops chunks that contain boilerplate.

    The filtered documents are loaded eagerly in the constructor and kept in
    ``self.documents``.
    """

    def __init__(self, directory):
        """Remember ``directory`` and immediately load its filtered documents."""
        self.directory = directory
        self.documents = self.load_documents()

    def load_documents(self):
        """Read every document in ``self.directory`` and return the filtered list."""
        raw_docs = SimpleDirectoryReader(self.directory).load_data()
        return self.refine_documents(raw_docs)

    def refine_documents(self, documents):
        """Return the documents whose text contains none of the boilerplate markers."""
        boilerplate_markers = ("Member-only story", "The Data Entrepreneurs", " min read")
        return [
            candidate
            for candidate in documents
            if not any(marker in candidate.text for marker in boilerplate_markers)
        ]

    def get_texts(self):
        """Return the ``text`` of every kept document, in order."""
        texts = []
        for kept_doc in self.documents:
            texts.append(kept_doc.text)
        return texts

    def add_document(self, document):
        """Filter a single document; store and return it if kept, else return None."""
        survivors = self.refine_documents([document])
        if not survivors:
            return None
        self.documents.extend(survivors)
        return survivors[0]

In [4]:
class EmbeddingModel:
    """Sentence embedder built on a Hugging Face tokenizer/model pair.

    Embeddings are the mean of the final hidden states over the *real* tokens
    of each text (padding positions are excluded via the attention mask).
    """

    def __init__(self, model_name):
        """Load the tokenizer and model identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def get_embeddings(self, texts, batch_size=32):
        """Embed ``texts`` and return a float32 array with one row per text.

        Args:
            texts: A list of strings (a single string is treated as one text).
            batch_size: Number of texts sent through the model per forward pass,
                so a large corpus is not tokenized and embedded all at once.

        Returns:
            ``numpy.ndarray`` of shape ``(len(texts), hidden_dim)``. For empty
            input the array has zero rows.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        if isinstance(texts, str):
            texts = [texts]
        texts = list(texts)
        if not texts:
            # Width is taken from the model config when available, otherwise 0.
            width = getattr(getattr(self.model, "config", None), "hidden_size", 0)
            return np.empty((0, width), dtype=np.float32)

        pooled_batches = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
            with torch.no_grad():
                outputs = self.model(**inputs)
            hidden = outputs.last_hidden_state
            # Weight each position by its attention mask so padded positions do
            # not contribute; otherwise a text's embedding would change with the
            # length of the other texts it happens to be batched with.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            token_count = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append((token_sum / token_count).cpu().numpy())

        return np.concatenate(pooled_batches, axis=0).astype(np.float32, copy=False)

In [5]:
class FaissIndex:
    """Wrapper around a flat L2 FAISS index with save/load helpers."""

    def __init__(self, dimension):
        """Create an empty ``faiss.IndexFlatL2`` for vectors of size ``dimension``."""
        self.index = faiss.IndexFlatL2(dimension)

    def add_embeddings(self, embeddings):
        """Add one or more vectors to the index.

        Args:
            embeddings: Array-like of shape ``(n, dimension)``; a single 1-D
                vector is accepted and treated as one row. Empty input is a no-op.
        """
        # FAISS expects C-contiguous float32 input.
        vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
        if vectors.size == 0:
            return
        if vectors.ndim == 1:
            vectors = vectors.reshape(1, -1)
        self.index.add(vectors)

    def save(self, index_file, documents_file, embeddings_file, documents, embeddings=None):
        """Persist the index, the documents (pickled) and the embedding matrix.

        Args:
            index_file: Path for the serialized FAISS index.
            documents_file: Path for the pickled ``documents``.
            embeddings_file: Path for the ``.npy`` embedding matrix.
            documents: Object to pickle alongside the index.
            embeddings: Matrix to write to ``embeddings_file``. When omitted,
                the vectors currently stored in the index are written, so the
                method no longer depends on a global named ``embeddings``.
        """
        if embeddings is None:
            # A flat index keeps the raw vectors, so they can be read back.
            embeddings = self.index.reconstruct_n(0, self.index.ntotal)
        faiss.write_index(self.index, index_file)
        with open(documents_file, "wb") as f:
            pickle.dump(documents, f)
        np.save(embeddings_file, embeddings)

    def load(self, index_file, documents_file, embeddings_file):
        """Replace ``self.index`` from disk and return ``(documents, embeddings)``."""
        self.index = faiss.read_index(index_file)
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # that this code wrote itself or that come from a trusted source.
        with open(documents_file, "rb") as f:
            documents = pickle.load(f)
        embeddings = np.load(embeddings_file)
        return documents, embeddings

    def add_document(self, document, embedding):
        """Add the embedding of one document to the index.

        ``document`` is accepted but not stored here; the caller is
        responsible for keeping its own document list in the same order.
        """
        self.add_embeddings(np.array([embedding]))

In [6]:
class QueryEngine:
    """Embeds a query, retrieves matching documents and formats them as context."""

    def __init__(self, retriever, embedding_model):
        """Store the retriever and the embedding model used for queries."""
        self.retriever = retriever
        self.embedding_model = embedding_model

    def query(self, query_text):
        """Return a ``"Context:\\n"`` string followed by each retrieved document's text.

        Each document's text is followed by a blank line.
        """
        query_embedding = self.embedding_model.get_embeddings([query_text])[0]
        retrieved_documents = self.retriever.retrieve(query_embedding)
        return "Context:\n" + "".join(doc.text + "\n\n" for doc in retrieved_documents)

    def add_document(self, document_text):
        """Embed ``document_text`` and register it with the retriever.

        The vector is added to the index *before* the document is appended to
        ``retriever.documents`` so that index positions and list positions stay
        aligned even if embedding or indexing raises.

        Returns:
            The newly created document object.
        """
        # Local import: the file-level import block does not import Document.
        from llama_index.core import Document

        new_document = Document(text=document_text)
        new_embedding = self.embedding_model.get_embeddings([document_text])[0]

        index = self.retriever.index
        if hasattr(index, "add_document"):
            # Wrapper object exposing add_document(document, embedding).
            index.add_document(new_document, new_embedding)
        else:
            # Raw FAISS index: expects a float32 matrix with one row per vector.
            index.add(np.ascontiguousarray([new_embedding], dtype=np.float32))

        self.retriever.documents.append(new_document)
        return new_document

In [8]:
class FaissRetriever:
    """Looks up the ``top_k`` nearest documents for a query embedding."""

    def __init__(self, index, documents, top_k):
        """Store the search index, the parallel document list and ``top_k``.

        ``index`` must expose ``search(queries, k)`` returning
        ``(distances, indices)``; ``documents[i]`` is the document for index row ``i``.
        """
        self.index = index
        self.documents = documents
        self.top_k = top_k

    def retrieve(self, query_embedding):
        """Return the documents closest to ``query_embedding``.

        Fewer than ``top_k`` documents are returned when the index holds fewer
        vectors; an empty list is returned when there are no documents or
        ``top_k`` is not positive.
        """
        if len(self.documents) == 0 or self.top_k <= 0:
            return []
        # FAISS expects a float32 matrix with one row per query.
        query = np.ascontiguousarray(query_embedding, dtype=np.float32).reshape(1, -1)
        _distances, indices = self.index.search(query, self.top_k)
        # FAISS pads missing neighbours with -1; indexing a list with -1 would
        # silently return the last document, so out-of-range ids are dropped.
        n_docs = len(self.documents)
        return [self.documents[idx] for idx in indices[0] if 0 <= idx < n_docs]

In [12]:
if __name__ == "__main__":
    import os  # local import: only this driver cell needs it

    # Settings
    Settings.llm = None
    Settings.chunk_size = 256
    Settings.chunk_overlap = 25

    # Initialize components
    directory = "articles"
    model_name = "thenlper/gte-large"
    index_file = "faiss_index.index"
    documents_file = "documents.pkl"
    embeddings_file = "embeddings.npy"
    top_k = 3

    # The embedding model is needed both to build the index and to embed queries,
    # so it is created unconditionally (previously it only existed inside a
    # commented-out block, which made this cell fail on a fresh kernel).
    embedding_model = EmbeddingModel(model_name)

    cache_files = (index_file, documents_file, embeddings_file)
    if all(os.path.exists(path) for path in cache_files):
        # Load index, documents, and embeddings.
        # The dimension only sizes a placeholder index; load() replaces it.
        faiss_index = FaissIndex(dimension=embedding_model.model.config.hidden_size)
        documents, embedding_matrix = faiss_index.load(index_file, documents_file, embeddings_file)
    else:
        # Process documents
        doc_processor = DocumentProcessor(directory)
        texts = doc_processor.get_texts()
        print(f"Number of documents after refinement: {len(texts)}")

        # Generate embeddings
        embeddings = embedding_model.get_embeddings(texts)

        # Initialize FAISS index and add embeddings
        faiss_index = FaissIndex(dimension=embeddings.shape[1])
        faiss_index.add_embeddings(embeddings)

        # Save index, documents, and embeddings so the next run can load them
        faiss_index.save(index_file, documents_file, embeddings_file, doc_processor.documents)
        documents, embedding_matrix = doc_processor.documents, embeddings

    # Initialize retriever
    retriever = FaissRetriever(faiss_index.index, documents, top_k)

    # Initialize query engine
    query_engine = QueryEngine(retriever, embedding_model)

    # Example query
    query_text = "Power Laws Break STAT 101"
    context = query_engine.query(query_text)
    print(context)

LLM is explicitly disabled. Using MockLLM.
Context:
One of the biggest problems in using STAT 101 techniques to analyze Power
Laws (i.e. data from Extremistan) is quantities like mean, standard deviation,
variance, correlation, etc. all have little practical significance.
This all stems from a single core issue — insufficient data.
In statistics, we learn about the Law of Large Numbers, which says that if we
take N random samples, the sample mean will approach the true mean as N
→ ∞. This is true for ANY distribution (with finite mean): Gaussian, Power
Law, Uniform, you name it.
However, it turns out that this asymptotic behavior happens more slowly for
some distributions than others (e.g. slower for Power Laws than Gaussians).
And, in practice, where we (necessarily) have finite datasets, this can cause
problems. Here, I highlight 3 such problems.
Problem 1: The Mean is Meaningless (as well as many other
metrics)
Whenever we want to compare two sets of values (e.g. sales in April vs. 