## Setup environment

### Dependencies installation

In [1]:
%pip install --upgrade pip
%pip install llama-index llama-index-vector-stores-chroma==0.4.1 huggingface_hub==0.27.1 \
llama-index-embeddings-huggingface==0.5.1 llama-index-llms-huggingface==0.4.2 accelerate>=0.26.0 \
ragas==0.2.12 lmcache==0.1.4 lmcache_vllm==0.6.2.3 pandas==2.2.3 typing_extensions==4.12.2 \
pydantic==2.10.6 chromadb==0.6.3 sentence-transformers==3.4.0 tqdm==4.67.1 transformers==4.48.1 \
datasets==3.2.0 duckdb==1.1.3 cudf-cu12 cuml-cu12 --extra-index-url=https://pypi.nvidia.com

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


### Libraries import


In [2]:
import os, time, threading, sys, datetime, json, math, ast
import chromadb
import pandas as pd
import torch
import asyncio
import nest_asyncio
from io import StringIO
from tqdm import tqdm
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, TextIteratorStreamer, AutoModelForCausalLM
from huggingface_hub import login
from llama_index.core.vector_stores import VectorStoreQuery
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.core.schema import TextNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.llama_pack import download_llama_pack
from llama_index.llms.openai import OpenAI

os.environ["LMCACHE_CONFIG_FILE"] = "./lmcache_config.yaml"
import lmcache_vllm.vllm as vllm
from lmcache_vllm.vllm import LLM

[33mINFO LMCache: [0mInitializing lmcache_vllm version 0.6.2.3, supporting vllm versions: ['0.6.1.post2', '0.6.1.dev238+ge2c6e0a82'] [2025-03-07 12:55:43,834] -- /usr/local/lib/python3.11/dist-packages/lmcache_vllm/__init__.py:35
[33mINFO LMCache: [0mLoading LMCache config file ./lmcache_config.yaml [2025-03-07 12:55:43,888] -- /usr/local/lib/python3.11/dist-packages/lmcache_vllm/vllm_adapter.py:123


### Custom RAPTOR import

In [None]:
RaptorPack = download_llama_pack("RaptorPack", "./custom_raptor_pack")

In [3]:
# Print the current working directory
print("Current Directory:", os.getcwd())

raptor_pack_path = sys.path.insert(0, os.path.join(os.getcwd(), "custom_raptor_pack"))

# Verify the files in the 'raptor_pack/llama_index' directory
print("Files in 'raptor_pack/llama_index' Directory:", os.listdir(raptor_pack_path))

# Import RaptorPack
from llama_index.packs.raptor import RaptorRetriever
from custom_raptor_pack.llama_index.packs.raptor.base import RaptorPack
print("✅ Using locally modified RaptorPack!")

Current Directory: /workspace
Files in 'raptor_pack/llama_index' Directory: ['naive_rag_cache_07-03-2025', 'advanced_rag_raptor_07-03-2025', 'naive_rag_07-03-2025', 'sampled_summaries.pkl', 'lmcache_config.yaml', 'vector_store_raptor', 'vector_store_naive_rag', 'Faster-RAG-Experiments.ipynb', 'custom_raptor_pack', '.deepeval', '.deepeval_telemtry.txt', '=0.26.0', '.ipynb_checkpoints']
✅ Using locally modified RaptorPack!


### Enable GPU

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


### APIs keys

In [None]:
# Huggingface
login(token="your hugginface token")

# Open AI
os.environ["OPENAI_API_KEY"] = "your openai api key"

# Verify that the environment variables are set
print("OpenAI API Key:", os.environ.get("OPENAI_API_KEY")) 

DeepEval Key: oxzVhgRVKo8eEspdkldpsdskod5+CvN/IUZbNyNKzs1cJbPw=
OpenAI API Key: sk-proj-i4R5gYfQudspsdllsddlsdsakrzyh--
Welcome to [1mDeepEval[0m!
Login and grab your API key here: ]8;id=517129;https://app.confident-ai.com\[4;94mhttps://app.confident-ai.com[0m]8;;\ 
Congratulations! Login successful 🙌 
If you are new to DeepEval, follow our quickstart tutorial here: 
]8;id=374700;https://docs.confident-ai.com/docs/getting-started\[1;4;94mhttps://docs.confident-ai.com/docs/getting-started[0m]8;;\


## Utility functions

### LLM loading

In [6]:
def load_model(model_id, tokenizer, optimized=False):
    """Load the language model with or without LMCache optimization."""
    if optimized:
        # Instantiate a synchronous LMCache-enabled LLM.
        # Adjust parameters such as gpu_memory_utilization and max_model_len as needed.
        return LLM(
            model=model_id,
            gpu_memory_utilization=0.8,
            enable_chunked_prefill=False,
            max_model_len=32768
        )
    else:
        # Load the non-optimized model via Hugging Face and send it to GPU if available.
        return AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
        ).to("cuda" if torch.cuda.is_available() else "cpu")

### RAG pipeline

In [7]:
def process_queries(queries, retriever, model, tokenizer, experiment_name):
    # Setup folder and file names.
    date_str = datetime.date.today().strftime("%d-%m-%Y")
    folder_name = f"{experiment_name}_{date_str}"
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    csv_filename = os.path.join(folder_name, f"queries_results_{experiment_name}_{date_str}.csv")
    metrics_filename = os.path.join(folder_name, f"overall_metrics_{experiment_name}_{date_str}.txt")
    
    # Load cached results if available.
    if os.path.exists(csv_filename):
        print(f"{csv_filename} exists. Loading cached results...")
        df = pd.read_csv(csv_filename)
        responses = df.to_dict(orient="records")
        for row in responses:
            metadata = {
                "tfft": row.get("tfft"),
                "e2e_latency": row.get("e2e_latency"),
                "itl": row.get("itl"),
                "tps": row.get("tps"),
                "retriever_time": row.get("retriever_time"),
                "input_context_size": row.get("input_context_size"),
                "output_tokens_size": row.get("output_tokens_size")
            }
            context = row.get("context", "")

        return responses

    responses = []
    
    # Lists for metrics.
    all_tfft = []
    all_e2e_latency = []
    all_itl = []
    all_tps = []
    all_retriever_time = []
    all_input_context_size = []
    all_output_tokens_size = []
    
    overall_start_time = time.time()
    today_date_verbose = datetime.date.today().strftime("%d %B %Y")
    
    for i, row in tqdm(queries.iterrows(), desc="Processing Queries", total=len(queries)):
        query = row['question']['text']
        query_obj = VectorStoreQuery(query_str=query, similarity_top_k=50)
        
        # Retrieve context.
        retriever_start_time = time.time()
        results = retriever.query(query_obj)
        retriever_time = time.time() - retriever_start_time
        all_retriever_time.append(retriever_time)
        retrieved_nodes = results.nodes
        context = "\n".join(node.get_content() for node in retrieved_nodes)

        # Build the prompt.
        prompt = (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
            f"Cutting Knowledge Date: December 2023\n"
            f"Today Date: {today_date_verbose}\n\n"
            "You are a helpful assistant. Please provide a concise and accurate answer in one short paragraph based on the context.\n"
            "Use the provided context only, do not invent.\n"
            "Avoid repeating yourself\n"
            "<|eot_id|>\n"
            "<|start_header_id|>user<|end_header_id|>\n"
            f"Question: {query}\n"
            f"Context:\n{context}\n"
            "<|eot_id|>\n"
            "<|start_header_id|>assistant<|end_header_id|>\n"
        )
        
        # Tokenize to get input context size.
        inputs = tokenizer(prompt, return_tensors="pt")
        input_context_size = inputs.input_ids.shape[1]
        all_input_context_size.append(input_context_size)
        
        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = inputs.to(device)
        
        if "cache" in experiment_name:
            # --- LMCache branch ---
            sampling_params = vllm.SamplingParams(temperature=0.0, max_tokens=128)
            generation_start = time.time()
            outputs = model.generate([prompt], sampling_params)
            generation_end = time.time()
            
            # Extract metrics from the first output.
            request_output = outputs[0]
            metrics_obj = request_output.metrics
            tfft = metrics_obj.first_token_time - metrics_obj.arrival_time
        
            answer = request_output.outputs[0].text.strip()
            e2e_latency = generation_end - generation_start
            token_count = len(tokenizer.encode(answer))
            itl = (e2e_latency / token_count) if token_count > 1 else 0.0
            tps = (token_count / e2e_latency) if e2e_latency > 0 else 0.0
        
            all_tfft.append(tfft)
        else:
            # --- Non-cache branch: using synchronous generation with a streamer ---
            start_time_sync = time.time()
            streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
            generation_thread = threading.Thread(
                target=model.generate,
                kwargs={"inputs": inputs.input_ids, "max_new_tokens": 128, "streamer": streamer}
            )
            generation_thread.start()
            token_times = []
            generated_text = ""
            for token in streamer:
                now = time.time()
                token_times.append(now)
                if len(token_times) == 1:
                    tfft = now - start_time_sync
                    all_tfft.append(tfft)
                generated_text += token
            generation_thread.join()
            answer = generated_text.strip()
            end_time_sync = time.time()
            e2e_latency = end_time_sync - start_time_sync
            if len(token_times) > 1:
                diffs = [t2 - t1 for t1, t2 in zip(token_times[:-1], token_times[1:])]
                itl = sum(diffs) / len(diffs)
                generation_time = token_times[-1] - token_times[0]
                tps = len(token_times) / generation_time if generation_time > 0 else 0.0
            else:
                itl = 0.0
                tps = 0.0

        all_e2e_latency.append(e2e_latency)
        all_itl.append(itl)
        all_tps.append(tps)
        
        token_count = len(tokenizer.encode(answer))
        all_output_tokens_size.append(token_count)
        
        metadata = {
            "tfft": tfft,
            "e2e_latency": e2e_latency,
            "itl": itl,
            "tps": tps,
            "retriever_time": retriever_time,
            "input_context_size": input_context_size,
            "output_tokens_size": token_count
        }
        
        responses.append({
            "query": query,
            "context": context,
            "answer": answer,
            "expected_output": row['answers'][0]['text'],
            **metadata
        })
    
    overall_end_time = time.time()
    total_time = overall_end_time - overall_start_time
    rps = len(queries) / total_time if total_time > 0 else 0.0
    avg_tfft = sum(all_tfft) / len(all_tfft) if all_tfft else 0.0
    avg_e2e = sum(all_e2e_latency) / len(all_e2e_latency) if all_e2e_latency else 0.0
    avg_itl = sum(all_itl) / len(all_itl) if all_itl else 0.0
    avg_tps = sum(all_tps) / len(all_tps) if all_tps else 0.0
    avg_retriever = sum(all_retriever_time) / len(all_retriever_time) if all_retriever_time else 0.0
    avg_input_ctx = sum(all_input_context_size) / len(all_input_context_size) if all_input_context_size else 0.0
    avg_output_tokens = sum(all_output_tokens_size) / len(all_output_tokens_size) if all_output_tokens_size else 0.0
    
    metrics_str = (
        f"Overall RPS: {rps:.2f} req/sec, "
        f"Average TTFT: {avg_tfft:.2f} sec, "
        f"Average E2E Latency: {avg_e2e:.2f} sec, "
        f"Average ITL: {avg_itl:.2f} sec, "
        f"Average TPS: {avg_tps:.2f} tokens/sec, "
        f"Average Retriever Time: {avg_retriever:.2f} sec, "
        f"Average Input Context Size: {avg_input_ctx:.0f} tokens, "
        f"Average Output Tokens Size: {avg_output_tokens:.0f} tokens"
    )
    print("\nOverall Metrics:")
    print(metrics_str)
    
    with open(metrics_filename, "w") as mf:
        mf.write(metrics_str)
    print(f"Saved overall metrics to {metrics_filename}")
    
    df = pd.DataFrame(responses)
    df.to_csv(csv_filename, index=False)
    print(f"Saved intermediate results to {csv_filename}")
    
    return responses

## Dataset loading and preprocessing

### Loading

In [8]:
ds = load_dataset('deepmind/narrativeqa')
print(ds)

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/24 [00:00<?, ?files/s]

train-00003-of-00024.parquet:   0%|          | 0.00/27.2M [00:00<?, ?B/s]

train-00004-of-00024.parquet:   0%|          | 0.00/88.3M [00:00<?, ?B/s]

train-00000-of-00024.parquet:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

train-00014-of-00024.parquet:   0%|          | 0.00/35.0M [00:00<?, ?B/s]

train-00013-of-00024.parquet:   0%|          | 0.00/136M [00:00<?, ?B/s]

train-00001-of-00024.parquet:   0%|          | 0.00/67.2M [00:00<?, ?B/s]

train-00006-of-00024.parquet:   0%|          | 0.00/39.4M [00:00<?, ?B/s]

train-00007-of-00024.parquet:   0%|          | 0.00/132M [00:00<?, ?B/s]

train-00008-of-00024.parquet:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

train-00011-of-00024.parquet:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

train-00009-of-00024.parquet:   0%|          | 0.00/49.5M [00:00<?, ?B/s]

train-00015-of-00024.parquet:   0%|          | 0.00/73.4M [00:00<?, ?B/s]

train-00005-of-00024.parquet:   0%|          | 0.00/206M [00:00<?, ?B/s]

train-00002-of-00024.parquet:   0%|          | 0.00/233M [00:00<?, ?B/s]

train-00010-of-00024.parquet:   0%|          | 0.00/126M [00:00<?, ?B/s]

train-00016-of-00024.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00017-of-00024.parquet:   0%|          | 0.00/61.6M [00:00<?, ?B/s]

train-00018-of-00024.parquet:   0%|          | 0.00/107M [00:00<?, ?B/s]

train-00019-of-00024.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00020-of-00024.parquet:   0%|          | 0.00/74.2M [00:00<?, ?B/s]

train-00021-of-00024.parquet:   0%|          | 0.00/178M [00:00<?, ?B/s]

train-00012-of-00024.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

train-00022-of-00024.parquet:   0%|          | 0.00/11.9M [00:00<?, ?B/s]

train-00023-of-00024.parquet:   0%|          | 0.00/97.8M [00:00<?, ?B/s]

test-00000-of-00008.parquet:   0%|          | 0.00/8.56M [00:00<?, ?B/s]

test-00001-of-00008.parquet:   0%|          | 0.00/44.5M [00:00<?, ?B/s]

test-00002-of-00008.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

test-00003-of-00008.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

test-00004-of-00008.parquet:   0%|          | 0.00/60.8M [00:00<?, ?B/s]

test-00005-of-00008.parquet:   0%|          | 0.00/121M [00:00<?, ?B/s]

test-00006-of-00008.parquet:   0%|          | 0.00/243M [00:00<?, ?B/s]

test-00007-of-00008.parquet:   0%|          | 0.00/58.5M [00:00<?, ?B/s]

validation-00000-of-00003.parquet:   0%|          | 0.00/10.0M [00:00<?, ?B/s]

validation-00001-of-00003.parquet:   0%|          | 0.00/24.9M [00:00<?, ?B/s]

validation-00002-of-00003.parquet:   0%|          | 0.00/68.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32747 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10557 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3461 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'question', 'answers'],
        num_rows: 32747
    })
    test: Dataset({
        features: ['document', 'question', 'answers'],
        num_rows: 10557
    })
    validation: Dataset({
        features: ['document', 'question', 'answers'],
        num_rows: 3461
    })
})


### Preprocessing

In [28]:
# File to store the sampled summaries as a pickle.
sampled_filename = "sampled_summaries.pkl"

# Combine all splits into a single DataFrame (assuming ds is your dataset object)
dfs = []
for split in ['train', 'test', 'validation']:
    split_df = ds[split].to_pandas()
    dfs.append(split_df)
df = pd.concat(dfs, ignore_index=True)

# Extract summary text into a new column.
df['summary_text'] = df['document'].apply(lambda doc: doc['summary']['text'])

# Remove duplicates based on the 'summary_text' column.
unique_summaries = df.drop_duplicates(subset='summary_text').reset_index(drop=True)

# Check if the sample already exists.
if os.path.exists(sampled_filename):
    sampled_summaries = pd.read_pickle(sampled_filename)
else:
    # Sample 100 random rows from unique_summaries.
    sampled_summaries = unique_summaries.sample(n=1572, random_state=42)
    # If the "question" column doesn't exist or you want to create it based on summary_text:
    if "question" not in sampled_summaries.columns:
        sampled_summaries["question"] = sampled_summaries["summary_text"].apply(lambda x: {"text": x})
    # Save the sample using pickle.
    sampled_summaries.to_pickle(sampled_filename)

# Repeat each row 10 times to get 1000 rows overall.
repeated_summaries = sampled_summaries.loc[sampled_summaries.index.repeat(1)].reset_index(drop=True)

# Print stats for verification.
print(f"Total rows across all splits: {len(df)}")
print(f"Number of unique summaries: {len(unique_summaries)}")
print(f"Number of sampled rows: {len(sampled_summaries)}")
print(f"Total rows after repeating each sampled row 10 times: {len(repeated_summaries)}")

# Optionally display the first few rows.
display(repeated_summaries.head(10))

Total rows across all splits: 46765
Number of unique summaries: 1572
Number of sampled rows: 1572
Total rows after repeating each sampled row 10 times: 1572


Unnamed: 0,document,question,answers,summary_text
0,{'id': '09809dc4e1e63a28e32c8182d8493667a91b54...,{'text': 'What does Mrs. Tittlemouse keep besi...,"[{'text': 'A dust pan and brush', 'tokens': ['...",Mrs. Tittlemouse is a tale in which no humans...
1,{'id': 'bebb3d64b5731fd2cf2e4555686b4959eb2ac4...,{'text': 'What has developed in the Earth's co...,"[{'text': 'An advanced race', 'tokens': ['An',...",The plot concerns an advanced race which has ...
2,{'id': '2daf99dd0a17e8141b6697c4dad2621082cf54...,{'text': 'Why does Alan Stanwyk think Fletch i...,[{'text': 'Because Fletch is posing as a junki...,"Los Angeles Times reporter Irwin ""Fletch"" Fle..."
3,{'id': '7e2bef0b43cf243f513853e82e482d695801e4...,{'text': 'The Titus Brothers Contractors won a...,"[{'text': 'Peru', 'tokens': ['Peru']}, {'text'...",The Titus Brothers Contractors company have w...
4,{'id': '7805e001c28ab9aa85f51b3244a9f57fe972f5...,{'text': 'What does the Pope declare has been ...,"[{'text': 'Dire Offense ', 'tokens': ['Dire', ...","In the beginning of this mock-epic, Pope decl..."
5,{'id': '8ec2acc82d024a645fe81f51270d6dbba8b798...,"{'text': 'Where does this story take place?', ...","[{'text': 'In Socrates Cell?', 'tokens': ['In'...",The dialogue takes place in Socrates' prison ...
6,{'id': '7abce7387dae92c0d09c03d26bf6407237a8c7...,{'text': 'Why does Hammerly want to block the ...,[{'text': 'He believes it'll harm the privacy ...,"In the 1990s, U.S. National Security Agency o..."
7,{'id': '3feb46d105b7ef3bdb248064761dc309c18314...,"{'text': 'How many siblings does Chip have?', ...","[{'text': '1', 'tokens': ['1']}, {'text': 'One...",Beverly Sutphin appears to be a typical subur...
8,{'id': '739ac60705adb8084f1656b8bdb29e58e41f49...,"{'text': 'When he leaves baseball, where does ...","[{'text': 'In New York City.', 'tokens': ['In'...","Miguel ""Sugar"" Santos (Perez Soto) spends his..."
9,{'id': '0edf4e67b33906e690dc45a9abae06c34ca3e1...,{'text': 'Which of the brother's kills the loc...,"[{'text': 'Tuvia.', 'tokens': ['Tuvia', '.']},...","The film opens with on-screen text stating: ""..."


### Chunking

In [10]:
documents = [Document(text=row['summary_text']) for _, row in unique_summaries.iterrows()]
parser = SentenceSplitter(chunk_size=100, chunk_overlap=0)

chunks = parser.get_nodes_from_documents(documents)
chunk_documents = [chunk.get_content() for chunk in chunks]
ids = [f"chunk_{i}" for i in range(len(chunks))]

print(f"Total documents created: {len(documents)}")
print(f"Total chunks created: {len(chunks)}")
print(f"Total chunks documents extracted: {len(chunk_documents)}")
print(f"Total ids created: {len(ids)}")

Total documents created: 1572
Total chunks created: 14738
Total chunks documents extracted: 14738
Total ids created: 14738


## Indexing collections

### ChromaDB client config

In [11]:
client_naive = chromadb.PersistentClient(path="./vector_store_naive_rag")
client_raptor = chromadb.PersistentClient(path="./vector_store_raptor")
embed_model = HuggingFaceEmbedding("multi-qa-mpnet-base-cos-v1", device=device)

[33mINFO LMCache: [0mLoad pretrained SentenceTransformer: multi-qa-mpnet-base-cos-v1 [2025-03-07 12:57:44,423] -- /usr/local/lib/python3.11/dist-packages/sentence_transformers/SentenceTransformer.py:218


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[33mINFO LMCache: [0m2 prompts are loaded, with the keys: ['query', 'text'] [2025-03-07 12:57:48,876] -- /usr/local/lib/python3.11/dist-packages/sentence_transformers/SentenceTransformer.py:357


### NarrativeQA: Naive-RAG

#### Creating

In [12]:
collection_name_naive = "narrativeqa-naive"
#client.delete_collection(name=collection_name_naive)
collection_naive = client_naive.get_or_create_collection(collection_name_naive)
print(collection_naive)

Collection(name=narrativeqa-naive)


#### Indexing

In [13]:
if collection_naive.count () == 0:
    embeddings = embed_model.get_text_embedding_batch(
        chunk_documents,
        device=device,
        show_progress=True
    )
        
    collection_naive.add(
        documents=chunk_documents,
        ids=ids,
        embeddings=embeddings
    )
    
vector_store_naive_rag = ChromaVectorStore(
    client=client_naive, 
    chroma_collection=collection_naive
)

# Saving naive rag vector store to avoid re-indexing
storage_context_naive = StorageContext.from_defaults(vector_store=vector_store_naive_rag)

print(f"Total documents in {collection_name_naive} collection after indexing: {collection_naive.count()}")

Total documents in narrativeqa-naive collection after indexing: 14738


### NarrativeQA: RAPTOR

#### Creating

In [14]:
collection_name_raptor = "narrativeqa-raptor"
#client_raptor.delete_collection(name=collection_name_raptor)
collection_raptor = client_raptor.get_or_create_collection(collection_name_raptor)
print(collection_raptor)
nest_asyncio.apply()

Collection(name=narrativeqa-raptor)


#### Preprocessing

#### Indexing

In [15]:
vector_store_raptor = ChromaVectorStore(chroma_collection=collection_raptor)
storage_context_raptor = StorageContext.from_defaults(vector_store=vector_store_raptor)

if collection_raptor.count() == 0:
    raptor_pack = RaptorPack(
        documents=chunk_documents,
        ids=ids,
        embed_model=embed_model,
        llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
        vector_store=vector_store_raptor, 
        mode="collapsed",
        #transformations=[parser]
    )

print(f"Total documents in {collection_name_raptor} collection after indexing: {collection_raptor.count()}")

Total documents in narrativeqa-raptor collection after indexing: 15825


### Models and tokenizer config

In [16]:
model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

### Generator model

In [None]:
generator_model = load_model(model_id, tokenizer, optimized=False)

### Experiment configs

#### Naive-RAG

In [17]:
def run_naive_rag(queries, vector_store, tokenizer, model, experiment_name):
    return process_queries(queries, vector_store, model, tokenizer, experiment_name)

#### Advanced RAG

In [18]:
def run_advanced_rag(queries, vector_store, tokenizer, model, experiment_name):
    return process_queries(queries, vector_store, model, tokenizer, experiment_name)

## Run experiments

### Naive-RAG

In [None]:
print("Running Naive RAG")
naive_rag_responses = run_naive_rag(repeated_summaries, vector_store_naive_rag, tokenizer, generator_model, "naive_rag")

### Advanced RAG (Raptor)

In [None]:
print("Running Advanced RAG (Raptor)")
advanced_rag_responses = run_advanced_rag(repeated_summaries, vector_store_raptor, tokenizer, generator_model, "advanced_rag_raptor")

### Clean generator model from GPU

In [None]:
del generator_model
import gc
gc.collect()
torch.cuda.empty_cache()

### Generator optimized model

In [22]:
generator_optimized_model = load_model(model_id, tokenizer, optimized=True)

INFO 03-07 13:44:17 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, use_v2_block_manager=False, num_scheduler_steps=1, mult

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 03-07 13:44:22 model_runner.py:1025] Loading model weights took 14.9575 GB
INFO 03-07 13:44:26 gpu_executor.py:122] # GPU blocks: 9811, # CPU blocks: 2048
INFO 03-07 13:44:27 model_runner.py:1329] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 03-07 13:44:27 model_runner.py:1333] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 03-07 13:44:51 model_runner.py:1456] Graph capturing finished in 24 secs.


### Naive RAG (LM Cache Optimized)

In [None]:
print("Running Naive RAG (LM Cache Optimized)")
naive_rag_cache_responses = run_naive_rag(repeated_summaries, vector_store_naive_rag, tokenizer, generator_optimized_model, "naive_rag_cache")

Running Naive RAG (LM Cache Optimized)


Processing Queries:   0%|          | 0/1000 [00:00<?, ?it/s]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A[33mINFO LMCache: [0mKV cache retrieving mode: RetrieveStatus.PREFILL [2025-03-07 13:00:36,340] -- /usr/local/lib/python3.11/dist-packages/lmcache_vllm/vllm_injection.py:52
[33mINFO LMCache: [0mUsing default batched implementation of the get() method [2025-03-07 13:00:36,346] -- /usr/local/lib/python3.11/dist-packages/lmcache/storage_backend/abstract_backend.py:120
[33mINFO LMCache: [0mRetrieved 0 chunks [2025-03-07 13:00:36,347] -- /usr/local/lib/python3.11/dist-packages/lmcache/cache_engine.py:385
[33mDEBUG LMCache: [0mReturning the original input! [2025-03-07 13:00:36,349] -- /usr/local/lib/python3.11/dist-packages/lmcache_vllm/vllm_adapter.py:663
[33mINFO LMCache: [0mKV cache saving mode: [<StoreStatus.PREFILL: 1>] [2025-03-07 13:00:36,909] -- /usr/local/lib/python3.11/dist-packages/lmcache_vllm/vllm_in

### Advanced RAG (Raptor + LM Cache Optimized)

In [None]:
print("Running Advanced RAG (Raptor + LM Cache Optimized)")
advanced_rag_cache_responses = run_advanced_rag(repeated_summaries, vector_store_raptor, tokenizer, generator_optimized_model, "advanced_rag_cache_raptor")

Running Advanced RAG (Raptor + LM Cache Optimized)


Processing Queries:   0%|          | 0/1000 [00:00<?, ?it/s]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A[33mINFO LMCache: [0mKV cache retrieving mode: RetrieveStatus.PREFILL [2025-03-07 13:46:14,119] -- /usr/local/lib/python3.11/dist-packages/lmcache_vllm/vllm_injection.py:52
[33mINFO LMCache: [0mUsing default batched implementation of the get() method [2025-03-07 13:46:14,121] -- /usr/local/lib/python3.11/dist-packages/lmcache/storage_backend/abstract_backend.py:120
[33mINFO LMCache: [0mRetrieved 0 chunks [2025-03-07 13:46:14,122] -- /usr/local/lib/python3.11/dist-packages/lmcache/cache_engine.py:385
[33mDEBUG LMCache: [0mReturning the original input! [2025-03-07 13:46:14,123] -- /usr/local/lib/python3.11/dist-packages/lmcache_vllm/vllm_adapter.py:663
[33mINFO LMCache: [0mKV cache saving mode: [<StoreStatus.PREFILL: 1>] [2025-03-07 13:46:14,696] -- /usr/local/lib/python3.11/dist-packages/lmcache_vllm/vllm_in

In [21]:
del generator_optimized_model
import gc
gc.collect()
torch.cuda.empty_cache()