In [1]:
!pip install -U faiss-cpu sentence_transformers transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cudnn_cu12-9.1

In [2]:
!pip install -qU langchain-text-splitters pypdf langchain-community

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m297.0/302.0 kB[0m [31m12.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m415.4/415.4 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [62]:
!pip install rank-bm25



### Data Parser & Data Loader

In [139]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

class DataParser:
    def __init__(self, split_method="character", chunk_size=1000, chunk_overlap=200):
        self.split_method = split_method
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def _parse(self, path):
        print(f"Loading file: {path} ...")
        loader = PyPDFLoader(file_path=path)
        documents = loader.load()
        print("Load complete.")
        return documents

    def _get_text_splitter(self):
        if self.split_method == "paragraph":
            print("Using paragraph-based splitting...")
            return RecursiveCharacterTextSplitter(
                separators=["\n\n", ". ", "\n", " "],
                chunk_size=1000,
                chunk_overlap=0
            )
        else:
            print("Using character-based splitting...")
            return RecursiveCharacterTextSplitter(
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap
            )

    def process_and_save(self, paths, output_dir):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        text_splitter = self._get_text_splitter()

        for path in paths:
            print(f"Processing File -- {path}")
            docs = self._parse(path)
            docs = text_splitter.split_documents(docs)
            docs = [doc for doc in docs if doc.page_content and len(doc.page_content) > 0]

            file_name = path.split("/")[-1].replace(".pdf", ".txt")
            output_path = os.path.join(output_dir, file_name)
            with open(output_path, "w", encoding="utf-8") as f:
                for doc in docs:
                    f.write(doc.page_content + "\n\n")

            print(f"Saved chunks from {path} to {output_path}")


In [9]:
import os
def read_data(folder_path):
  data_parser = DataParser()
  file_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".pdf")]
  data_parser.process_and_save(file_paths, "output")

In [10]:
drive_path = "/content/data"
chunked_data = read_data(drive_path)

Using character-based splitting...
Processing File -- /content/data/shop-20241231.pdf
Loading file: /content/data/shop-20241231.pdf ...
Load complete.
Saved chunks from /content/data/shop-20241231.pdf to output/shop-20241231.txt
Processing File -- /content/data/gddy-20241231.pdf
Loading file: /content/data/gddy-20241231.pdf ...
Load complete.
Saved chunks from /content/data/gddy-20241231.pdf to output/gddy-20241231.txt
Processing File -- /content/data/uber-20241231.pdf
Loading file: /content/data/uber-20241231.pdf ...
Load complete.
Saved chunks from /content/data/uber-20241231.pdf to output/uber-20241231.txt
Processing File -- /content/data/lyft-20241231.pdf
Loading file: /content/data/lyft-20241231.pdf ...
Load complete.
Saved chunks from /content/data/lyft-20241231.pdf to output/lyft-20241231.txt


In [140]:
import os

def read_chunks_from_folder(folder_path):
    chunks = []

    if not os.path.exists(folder_path):
        print(f"The folder {folder_path} does not exist.")
        return []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        if file_name.endswith(".txt"):
            with open(file_path, "r", encoding="utf-8") as f:
                chunks.extend(f.read().split("\n\n"))
            print(f"Read chunks from {file_name}")

    return chunks

folder_path = "output"
chunked_data = read_chunks_from_folder(folder_path)

chunked_data[:1]

Read chunks from shop-20241231.txt
Read chunks from uber-20241231.txt
Read chunks from gddy-20241231.txt
Read chunks from lyft-20241231.txt


["UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \nForm 10-K_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\uf078 ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the ﬁscal year ended December 31, 2024\nOR\n\uf06f TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the transition period from ___________ to ___________\n________________________________________________\nShopify Inc.\n(Exact name of registrant as speciﬁed in its charter)\n________________________________________________\nCanada 001-37400 98-0486686\n(State or other jurisdiction of incorporation) (Commission File Number) (IRS Employer Identiﬁcation No.)\n151 O'Connor Street, Ground Floor 148 Lafayette Street\nOttawa,Ontario New York,New York\nCanadaK2P 2L8 USA10012\n(Address of Principal Executive Ofﬁces)\nRegistrant’s Telephone Number, Including Area 

### Embedders (Semantic & Keyword)

In [63]:
from typing import Any
import faiss
from tqdm import tqdm
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
from joblib import Parallel, delayed

class SemanticEmbedder:
    def __init__(self, similarity_metric="l2"):
        """
        Initialize the embedder with a FAISS index.

        :param similarity_metric: "cosine" for cosine similarity, "l2" for Euclidean distance.
        """
        self.index = None
        self.data = []
        self.encoder = SentenceTransformer("all-mpnet-base-v2", device="cuda")
        self.similarity_metric = similarity_metric.lower()

    def load_data(self, data):
        """Load raw document data (without embeddings)."""
        self.data = data

    def load_index(self, index):
        """Manually load a FAISS index."""
        self.index = index

    def _embed(self, batch_size=32, num_workers=4):
        """Generate embeddings in parallel batches."""
        if not self.data:
            raise ValueError("No data available for embedding.")

        def encode_batch(batch):
            return self.encoder.encode(batch, convert_to_numpy=True)

        batches = [self.data[i : i + batch_size] for i in range(0, len(self.data), batch_size)]

        results = Parallel(n_jobs=num_workers)(
            delayed(encode_batch)(batch) for batch in tqdm(batches, desc="Embedding Progress")
        )

        embeddings = np.vstack(results)

        if self.similarity_metric == "cosine":
            faiss.normalize_L2(embeddings)

        dim = embeddings.shape[1]
        return embeddings, dim

    def build(self):
        """Build FAISS index based on the selected similarity metric."""
        embeddings, dim = self._embed()

        if self.similarity_metric == "cosine":
            self.index = faiss.IndexFlatIP(dim)
        else:
            self.index = faiss.IndexFlatL2(dim)
        self.index.add(embeddings)
        assert self.index.ntotal == len(self.data)

    def search(self, query, top_k=2):
        """Search for the top-k closest matches to the query."""
        if self.index is None:
            raise ValueError("FAISS index is not initialized.")

        query_embedding = self.encoder.encode([query], convert_to_numpy=True)
        if self.similarity_metric == "cosine":
            faiss.normalize_L2(query_embedding)

        D, I = self.index.search(query_embedding, top_k)

        results = []
        for i, idx in enumerate(I[0]):
            if idx == -1 or idx >= len(self.data):
                continue
            document = self.data[idx]
            distance = D[0][i]
            similarity_score = distance if self.similarity_metric == "cosine" else 1 / (1 + distance)
            results.append({"document": document, "similarity": similarity_score})
        return results



In [80]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

class KeywordEmbedder:
    def __init__(self):
        self.tokenized_corpus = []
        self.bm25 = None
        self.documents = []

    def build_index(self, documents):
        """
        Builds a BM25 index from the provided documents.
        """
        self.documents = documents
        self.tokenized_corpus = [word_tokenize(doc.lower()) for doc in documents]
        self.bm25 = BM25Okapi(self.tokenized_corpus)

    def search(self, query, top_k=5):
        """
        Searches BM25 index and returns top-k documents with normalized scores.
        """
        if not self.bm25:
            raise ValueError("BM25 index is not initialized.")

        query_tokens = word_tokenize(query.lower())
        scores = self.bm25.get_scores(query_tokens)
        max_score = scores.max() if scores.any() else 1
        normalized_scores = [score / max_score for score in scores]
        ranked_indices = sorted(range(len(normalized_scores)), key=lambda i: normalized_scores[i], reverse=True)[:top_k]
        results = [{"document": self.documents[i], "score": normalized_scores[i]} for i in ranked_indices]
        return results

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [187]:
semantic_embedder = SemanticEmbedder(similarity_metric="cosine")
semantic_embedder.load_data(chunked_data)
semantic_embedder.build()

Embedding Progress: 100%|██████████| 97/97 [02:15<00:00,  1.40s/it]


In [81]:
keyword_embedder = KeywordEmbedder()
keyword_embedder.build_index(chunked_data)

In [169]:
query = "How many total shares of common stock did Lyft and Uber each report as outstanding at the dates specified in their documents, and which exchange(s) are their shares listed on according to those filings?"
ans = semantic_embedder.search(query, top_k=5)
ans

[{'document': "The number of shares of the registrant's common stock outstanding as of February 11, 2025 was 2,089,008,865.\nDOCUMENTS INCORPORATED BY REFERENCE\nPortions of the registrant’s Definitive Proxy Statement relating to the Annual Meeting of Stockholders are incorporated by reference into\nPart III of this Annual Report on Form 10-K where indicated. Such Definitive Proxy Statement will be filed with the Securities and\nExchange Commission within 120 days after the end of the registrant’s fiscal year ended December 31, 2024.\n2/25/25, 10:46 AM uber-20241231\nhttps://www.sec.gov/Archives/edgar/data/1543151/000154315125000008/uber-20241231.htm 2/218",
  'similarity': 0.648477425375615},
 {'document': 'Total liabilities and stockholders’ equity $ 5,435,069 $ 4,564,467 \nThe accompanying notes are an integral part of these consolidated financial statements.\n73\n2/25/25, 10:47 AM lyft-20241231\nhttps://www.sec.gov/Archives/edgar/data/1759509/000175950925000025/lyft-20241231.htm 12

In [168]:
query = "How many total shares of common stock did Lyft and Uber each report as outstanding at the dates specified in their documents, and which exchange(s) are their shares listed on according to those filings?"
ans = keyword_embedder.search(query, top_k=5)
ans

[{'document': 'may have the effect of delaying, preventing or deterring a change in control of our company, could deprive our stockholders of an\nopportunity to receive a premium for their capital stock as part of a sale of our company and might ultimately affect the market price of\nour Class A common stock. Each Co-Founder’s voting power is as of December 31, 2024 and includes shares of Class A common stock\nexpected to be issued upon the vesting of such Co-Founder’s RSUs within 60 days of December 31, 2024.\nFuture transfers by the holders of Class B common stock will generally result in those shares converting into shares of Class A\ncommon stock, subject to limited exceptions, such as certain transfers effected for estate planning purposes. In addition, each share of\nClass B common stock will convert automatically into one share of Class A common stock upon (i) the date specified by affirmative',
  'score': 1.0},
 {'document': 'Indicate by check mark whether the registrant is a s

### LLM Layer

In [49]:
from openai import OpenAI
import os
from google.colab import userdata

os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
client = OpenAI()

class LLM:

    def __init__(self, model: str = "gpt-4o-mini"):
        self.model = model

    def get_model_response(self, prompt: str, query: str):
        completion = client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": query}
            ]
        )
        return completion.choices[0].message.content

### Query Rephrasing (Decomposing & Rewriting)

In [141]:
class QueryDecomposer:

    def __init__(self, llm: LLM, max_sub_queries: int = 5):
        self.llm = llm
        self.max_sub_queries = max_sub_queries

    def generate_sub_queries(self, original_query: str):
        prompt = f"""
        You are an AI assistant responsible for breaking down complex queries into simpler sub-queries for a Retrieval-Augmented Generation (RAG) system.

        Instructions:
        - Given an **original query**, decompose it into a set of **simpler sub-queries** that, when answered together, comprehensively address the original query.
        - Do **not** answer the sub-queries—only generate them.
        - Ensure sub-queries are concise, relevant, and logically structured.
        - Maintain key details and entities from the original query while breaking it down.
        - Generate at most {self.max_sub_queries} sub-queries.

        ---

        **Input:**
        **Original query:** {original_query}

        **Output:**
        A list of simpler sub-queries derived from the original query.
        """

        response = self.llm.get_model_response(prompt, original_query)
        return self._parse_response(response)

    def _parse_response(self, response: str):
        sub_queries = response.strip().split("\n")
        return [sub_query.strip("- ") for sub_query in sub_queries if sub_query.strip()]


In [142]:
class QueryRewriter:

    def __init__(self, llm: LLM, num_rewrites: int = 5):
        """
        Initialize the QueryRewriter.
        """
        self.llm = llm
        self.num_rewrites = num_rewrites

    def rewrite_query(self, original_query: str):
        """
        Generates multiple rewritten variations of the original query.
        """
        prompt = f"""
        You are an advanced AI assistant specializing in **query rewriting** for Retrieval-Augmented Generation (RAG) systems.

        **Instructions:**
        - Given an **original query**, generate **{self.num_rewrites} alternative rewrites** while preserving the original intent and meaning.
        - Ensure that each rewritten query is **clear, natural, and distinct**.
        - Avoid excessive length while maintaining important details.
        - Use different wording, sentence structures, or rephrase the query for better information retrieval.
        - Vary lexical choices while keeping the same semantic meaning.

        **Original Query:** {original_query}

        **Rewritten Queries:**
        """

        response = self.llm.get_model_response(prompt, original_query)
        return self._parse_response(response)

    def _parse_response(self, response: str):
        """
        Parses the LLM response to extract rewritten queries.
        """
        rewrites = response.strip().split("\n")
        return [rewrite.strip("- ") for rewrite in rewrites if rewrite.strip()]


### Query Processor (Handles all query processing with multiple modes)

In [154]:
import concurrent.futures
import numpy as np
from typing import Dict, List
import math

class QueryProcessor:
    def __init__(
        self,
        semantic_embedder: SemanticEmbedder,
        keyword_embedder: KeywordEmbedder,
        query_decomposer: QueryDecomposer = None,
        query_rewriter: QueryRewriter = None,
        mode="simple",
        hybrid=False
    ):
        """
        :param embedder: Embedding-based retriever.
        :param keyword_embedder: BM25-based retriever for hybrid search.
        :param query_decomposer: (Optional) Used when mode="decompose".
        :param query_rewriter: (Optional) Used when mode="rewrite".
        :param reranker: (Optional) LLM-based reranker.
        :param mode: "simple", "decompose", or "rewrite".
        """
        self.semantic_embedder = semantic_embedder
        self.keyword_embedder = keyword_embedder
        self.query_decomposer = query_decomposer
        self.query_rewriter = query_rewriter
        self.mode = mode
        self.hybrid=hybrid

    def process_query(self, original_query: str, top_k=4):
        """
        Process a query using decomposition, rewriting, or direct retrieval.
        """
        queries = [original_query]

        if self.mode == "decompose" and self.query_decomposer:
            queries.extend(self.query_decomposer.generate_sub_queries(original_query))
        elif self.mode == "rewrite" and self.query_rewriter:
            queries.extend(self.query_rewriter.rewrite_query(original_query))

        query_results = self._parallel_search(queries, top_k)
        fused_results = self._adaptive_fusion(query_results, top_k, original_query)

        return fused_results

    def _parallel_search(self, queries, top_k):
        """
        Executes BM25 and embedding-based searches for multiple queries in parallel.
        """
        results = {}

        def search_query(query):
            bm25_results = []
            if self.hybrid:
              bm25_results = self.keyword_embedder.search(query, top_k)
            embed_results = self.semantic_embedder.search(query, top_k)
            return query, {"bm25": bm25_results, "embeddings": embed_results}

        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_query = {executor.submit(search_query, query): query for query in queries}
            for future in concurrent.futures.as_completed(future_to_query):
                query, result = future.result()
                results[query] = result

        return results

    def _adaptive_fusion(self, retrieved_data: Dict[str, Dict[str, List[Dict]]], top_k: int, query: str):
        """
        Adaptively fuses results from BM25 and embeddings based on query complexity.
        """
        query_weight = self._compute_query_weight(query)
        if not self.hybrid:
          query_weight = 0
        print(f"Query: {query} | BM25 Weight: {query_weight:.2f}, Embedding Weight: {1 - query_weight:.2f}")

        fused_scores = {}

        for retrieval_source in retrieved_data.values():
            for method, results in retrieval_source.items():
                weight = query_weight if method == "bm25" else (1 - query_weight)

                for rank, result in enumerate(results):
                    doc_text = result["document"]["document"] if isinstance(result["document"], dict) else result["document"]
                    score = weight * (1 / (rank + 1))
                    fused_scores[doc_text] = fused_scores.get(doc_text, 0) + score

        ranked_results = sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
        return [{"document": doc, "score": score} for doc, score in ranked_results]

    def _compute_query_weight(self, query: str) -> float:
        """
        Computes an adaptive weight for BM25 vs. embeddings based on query complexity.
        """
        tokens = query.lower().split()
        token_freq = {word: tokens.count(word) for word in set(tokens)}
        total_tokens = len(tokens)

        entropy = -sum((freq / total_tokens) * math.log2(freq / total_tokens) for freq in token_freq.values())
        normalized_entropy = entropy / math.log2(total_tokens + 1)

        return max(0.2, min(0.8, 1 - normalized_entropy))


In [156]:
llm = LLM()

query_decomposer = QueryDecomposer(llm, max_sub_queries=5)
query_rewriter = QueryRewriter(llm, num_rewrites=5)

processor = QueryProcessor(
    semantic_embedder,
    keyword_embedder,
    query_decomposer=query_decomposer,
    query_rewriter=query_rewriter,
    mode="decompose",
    hybrid=False
)

query = "Identify from GoDaddy’s and Shopify’s documents how many countries each claims to serve or have a presence in, and which one has a higher count."
results = processor.process_query(query, top_k=5)

results

Query: Identify from GoDaddy’s and Shopify’s documents how many countries each claims to serve or have a presence in, and which one has a higher count. | BM25 Weight: 0.00, Embedding Weight: 1.00


[{'document': 'and living passionately. Our relentless pursuit of building value and doing right for our customers has been a crucial ingredient of our\ngrowth.\nAs of December 31, 2024, we employed 5,518 people worldwide, made up of 2,131 in care and services (who comprise a\nportion of our GoDaddy Guides), 2,247 in technology and development, 367 in marketing and advertising and 773 in general and\nadministrative functions. In addition, GoDaddy partners with various third-party providers and vendors to provide contracted care and\nsupport services to our customers; approximately 3,700 individuals are employed with or engaged by our external partners. These third-\nparty providers are primarily located in international markets, most significantly in India, the Philippines, and Malaysia. A majority of\nour employees are based in the U.S. and Europe. None of our U.S. employees are represented by a labor union or are party to any',
  'score': 4.0},
 {'document': "the United States, 30% i

### RAG (Retrieval Augmented Generation)

In [121]:
class RAG:
    def __init__(self, query_processor: QueryProcessor, llm: LLM):
        self.query_processor = query_processor
        self.llm = llm

    def answer_question(self, original_query: str, top_k=4):
        context = self.query_processor.process_query(original_query, top_k)

        prompt = f"""
        You are an advanced retrieval-augmented generation (RAG) assistant specializing in synthesizing accurate answers from multiple retrieved documents.

        ### Task:
        Given the retrieved documents, construct a factually accurate and well-structured answer using only the provided information. The response should integrate relevant details from multiple documents when applicable.

        ### Guidelines:
        - Synthesize information from multiple retrieved sources, ensuring completeness.
        - Use only the retrieved context—do not hallucinate or add external knowledge.
        - Structure your answer clearly, providing a direct and concise response.
        - Reference specific information from different documents to justify the response.
        - Ensure factual accuracy, maintaining coherence between sources.

        ### Input:
        **User Query:** {original_query}
        **Retrieved Context:** {context}

        ### Output:
        Provide a synthesized, well-structured answer:
        - Directly address the query, combining relevant details from multiple retrieved sources.
        - Use structured formatting where necessary (e.g., bullet points for clarity).
        - Avoid speculation or unnecessary verbosity.
        """

        return self.llm.get_model_response(prompt, original_query)

### Retrieval Evaluation

Can handle:
- Simple, Non-Composite Queries.
- Composite Queries.
- Semantic/Hybrid Mode.
- Simple/Decomposed/Rewrite Queries

In [180]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

class RetrieverEvaluator:
    def __init__(self, query_processor, k=5):
        self.query_processor = query_processor
        self.k = k
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def is_relevant(self, doc, ground_truth_list):
        normalized_doc = re.sub(r'\s+', ' ', doc.strip().lower())

        normalized_ground_truth_list = [
            re.sub(r'\s+', ' ', gt.strip().lower()) for gt in ground_truth_list
        ]
        return any(gt in normalized_doc for gt in normalized_ground_truth_list)

    def compute_mrr(self, ranked_list, ground_truth_list):
        """Computes Mean Reciprocal Rank (MRR)."""
        for i, doc in enumerate(ranked_list, start=1):
            if self.is_relevant(doc, ground_truth_list):
                return 1 / i
        return 0

    def compute_precision_recall_at_k(self, ranked_list, ground_truth_list):
        """Computes Precision@k and Recall@k."""
        top_k = ranked_list[:self.k]
        relevant_count = sum(1 for doc in top_k if self.is_relevant(doc, ground_truth_list))

        precision = relevant_count / self.k
        recall = relevant_count / len(ground_truth_list) if ground_truth_list else 0

        return precision, recall

    def compute_dcg(self, ranked_list, ground_truth_list):
        """Computes Discounted Cumulative Gain"""
        dcg = 0
        for i, doc in enumerate(ranked_list[:self.k], start=1):
            if self.is_relevant(doc, ground_truth_list):
                dcg = 1 / np.log2(i + 1)
                break
        return dcg

    def compute_ndcg_at_k(self, ranked_list, ground_truth_list):
        """Computes Normalized Discounted Cumulative Gain"""
        dcg = self.compute_dcg(ranked_list, ground_truth_list)
        idcg = 1 / np.log2(2)
        return dcg / idcg if idcg > 0 else 0

    def compute_cosine_similarity(self, text1, text2):
        """Computes cosine similarity between two texts"""
        embeddings1 = self.model.encode([text1])
        embeddings2 = self.model.encode([text2])
        return cosine_similarity(embeddings1, embeddings2)[0][0]

    def curate(self, text):
      pattern = r"\[Chunk[^\]]*\](.*?)\n(?=\[Chunk|\Z)"
      chunks = re.findall(pattern, text, flags=re.DOTALL)
      return chunks

    def evaluate(self, df):
        """Evaluates retrieval quality using MRR, Precision@k, Recall@k, NDCG@k, and Cosine Similarity."""
        generation_results = []
        for _, row in df.iterrows():
            ranked_docs = self.query_processor.process_query(row["question"], self.k)
            ranked_docs = [doc['document'] for doc in ranked_docs]
            print(ranked_docs)
            ground_truth = row["documents_referenced"]
            ground_truth_list = ground_truth if isinstance(ground_truth, list) else self.curate(ground_truth)
            print(ground_truth_list)

            mrr = self.compute_mrr(ranked_docs, ground_truth_list)
            precision, recall = self.compute_precision_recall_at_k(ranked_docs, ground_truth_list)
            ndcg = self.compute_ndcg_at_k(ranked_docs, ground_truth_list)

            similarity_scores = [self.compute_cosine_similarity(doc, ground_truth) for doc in ranked_docs]

            generation_results.append({
                "Cosine Similarity (Max)": np.max(similarity_scores)
            })

        return pd.DataFrame(generation_results)

df = pd.read_csv("/content/data/final_eval_data.csv")

llm = LLM()

query_decomposer = QueryDecomposer(llm, max_sub_queries=5)
query_rewriter = QueryRewriter(llm, num_rewrites=5)

processor = QueryProcessor(
    semantic_embedder,
    keyword_embedder,
    query_rewriter=query_rewriter,
    mode="decompose",
    hybrid=False
)

evaluator = RetrieverEvaluator(query_processor=processor, k=4)
evaluation_df = evaluator.evaluate(df)

evaluation_df.to_csv("retrieval_evaluation_results.csv", index=False)
print("Evaluation complete. Results saved to retrieval_evaluation_results.csv")
print(evaluation_df)


Query: How many total shares of common stock did Lyft and Uber each report as outstanding at the dates specified in their documents, and which exchange(s) are their shares listed on according to those filings? | BM25 Weight: 0.00, Embedding Weight: 1.00
["The number of shares of the registrant's common stock outstanding as of February 11, 2025 was 2,089,008,865.\nDOCUMENTS INCORPORATED BY REFERENCE\nPortions of the registrant’s Definitive Proxy Statement relating to the Annual Meeting of Stockholders are incorporated by reference into\nPart III of this Annual Report on Form 10-K where indicated. Such Definitive Proxy Statement will be filed with the Securities and\nExchange Commission within 120 days after the end of the registrant’s fiscal year ended December 31, 2024.\n2/25/25, 10:46 AM uber-20241231\nhttps://www.sec.gov/Archives/edgar/data/1543151/000154315125000008/uber-20241231.htm 2/218", 'Total liabilities and stockholders’ equity $ 5,435,069 $ 4,564,467 \nThe accompanying notes

### RAG Generation Evaluation

In [134]:
!pip install ragas
!pip install evaluate datasets langchain



In [185]:
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision
import json

df = pd.read_csv("/content/data/final_eval_data.csv")
# df = pd.read_csv("/content/data/multi_doc_data.csv")

def process_contexts(contexts):
    if isinstance(contexts, str):
      def curate(text):
        pattern = r"\[Chunk[^\]]*\](.*?)\n(?=\[Chunk|\Z)"
        chunks = re.findall(pattern, text, flags=re.DOTALL)
        return chunks
      return curate(contexts)

retrieved_contexts = df["documents_referenced"].apply(process_contexts)

llm = LLM()
query_decomposer = QueryDecomposer(llm, max_sub_queries=5)
query_rewriter = QueryRewriter(llm, num_rewrites=5)

processor = QueryProcessor(
    semantic_embedder,
    keyword_embedder,
    query_rewriter=query_rewriter,
    mode="decompose",
    hybrid=True
)

rag = RAG(processor, llm)

benchmark_data = []
for idx, row in df.iterrows():
    question = row['question']
    contexts = retrieved_contexts[idx]
    expected_answer = row['expected_answer']

    response = rag.answer_question(question)

    benchmark_data.append({
        "question": question,
        "contexts": contexts,
        "ground_truth": expected_answer,
        "response": response
    })

dataset = Dataset.from_list(benchmark_data)
metrics = [faithfulness, answer_relevancy, context_precision]
results = evaluate(dataset, metrics)
evaluation_df = pd.DataFrame([results])
print(evaluation_df)

Query: How many total shares of common stock did Lyft and Uber each report as outstanding at the dates specified in their documents, and which exchange(s) are their shares listed on according to those filings? | BM25 Weight: 0.20, Embedding Weight: 0.80
Query: Which two key ways do both Lyft and Uber claim to reduce cost and minimize wait times for riders, based on each company's data-driven network advantage, as described in their annual reports? | BM25 Weight: 0.20, Embedding Weight: 0.80
Query: From these GoDaddy and Shopify excerpts, what are the primary ways each company claims to help small businesses or entrepreneurs establish their online presence? | BM25 Weight: 0.20, Embedding Weight: 0.80
Query: According to statements in Shopify's and GoDaddy's filings, which categories of merchants or organizations do they target, and what broad sets of products do they each offer? | BM25 Weight: 0.20, Embedding Weight: 0.80
Query: Comparing Lyft and Uber, which company states it has three

Evaluating:   0%|          | 0/45 [00:00<?, ?it/s]

                                              scores  \
0  [{'faithfulness': 0.2222222222222222, 'answer_...   

                                             dataset binary_columns cost_cb  \
0  {'samples': [user_input='How many total shares...             []    None   

                                              traces  \
0  [{'scores': {'faithfulness': 0.222222222222222...   

                                        ragas_traces run_id  
0  {'c9e015b4-23ca-4c63-aec9-c8980e734a3e': run_i...   None  


In [186]:
evaluation_df

Unnamed: 0,scores,dataset,binary_columns,cost_cb,traces,ragas_traces,run_id
0,"[{'faithfulness': 0.2222222222222222, 'answer_...",{'samples': [user_input='How many total shares...,[],,[{'scores': {'faithfulness': 0.222222222222222...,{'c9e015b4-23ca-4c63-aec9-c8980e734a3e': run_i...,


### Deployment

In [None]:
!pip install pyngrok fastapi nest-asyncio uvicorn langchain-community -q
!pip install -U langchain-huggingface -q

In [None]:
from fastapi import FastAPI, Request, Body
from fastapi.responses import StreamingResponse
from threading import Thread
import time
from typing import Dict, Any
import json

app = FastAPI()

llm = LLM()
query_decomposer = QueryDecomposer(llm)
rag_pipeline = RAG(query_decomposer, embedder, llm)

start_time = 0
first_token_time = 0
token_times = []

def generate_output(query: str):
    global start_time, first_token_time, token_times
    start_time = time.time()
    token_times = []

    response = rag_pipeline.answer_question(query)

    model_output = ""
    for i, token in enumerate(response):
        model_output += token
        if i == 0:
            first_token_time = time.time()
        token_times.append(time.time())
        yield token

    metrics = calculate_metrics(start_time, first_token_time, token_times, model_output)
    print("Performance Metrics:", metrics)

def calculate_metrics(start_time, first_token_time, token_times, model_output):
    end_time = time.time()
    end_to_end_latency = end_time - start_time
    ttft = first_token_time - start_time if first_token_time > 0 else None
    itl = sum(x - y for x, y in zip(token_times[1:], token_times[:-1])) / max(1, (len(token_times) - 1))
    throughput = len(model_output) / end_to_end_latency if end_to_end_latency > 0 else None

    return {
        "End-to-End Latency": end_to_end_latency,
        "Time To First Token (TTFT)": ttft,
        "Inter-token Latency (ITL)": itl,
        "Throughput (tokens/sec)": throughput
    }

@app.get("/")
async def root():
    return {"message": "RAG API is running!"}

@app.post("/rag-inference")
async def rag_inference(query: str = Body(..., embed=True)):
    """Handles RAG inference requests and streams the response."""

    if not query:
        return {"error": "No query provided"}

    return StreamingResponse(generate_output(query), media_type="text/plain")

In [None]:
NGROK_KEY = userdata.get('NGROK_KEY')
!ngrok config add-authtoken $NGROK_KEY

import nest_asyncio
from pyngrok import ngrok
import uvicorn

# Apply nest_asyncio to allow running uvicorn within Jupyter notebooks
nest_asyncio.apply()

# Start ngrok tunnel
ngrok.kill()  # Kill any existing tunnels
ngrok_tunnel = ngrok.connect(8000)  # Expose port 8000 to the internet
print(f'Public URL: {ngrok_tunnel.public_url}')

# Start FastAPI server
uvicorn.run(app, host="127.0.0.1", port=8000)