# **Track A - RAG with Hugging Face**

## Setup & Installs

In [None]:
# Install dependencies
!pip -q install -U langchain langchain-community chromadb sentence-transformers pypdf transformers accelerate



# Record the runtime environment in reproduce_artifacts/env_rag.json
import json, sys, platform, os, chromadb, transformers, sentence_transformers, langchain
import importlib

def get_version(pkg, import_name=None):
    """Look up the version string of an importable package.

    Args:
        pkg: Package name; used as the module name when ``import_name`` is
            not given (or is empty).
        import_name: Optional module name to import instead of ``pkg``.

    Returns:
        The module's ``__version__`` attribute, ``"unknown"`` when the module
        has no such attribute, or ``"not installed"`` when the import fails
        with ``ImportError``.
    """
    target = import_name if import_name else pkg
    try:
        module = importlib.import_module(target)
        version = getattr(module, "__version__", "unknown")
    except ImportError:
        version = "not installed"
    return version

# Probe PyTorch for its version and the accelerator it can see.
# These three values were previously read from names that no cell in this
# notebook defines, so the cell failed with NameError on a fresh kernel.
try:
    import torch
except ImportError:
    torch_v, cuda_ok, device_name = "not installed", False, "CPU"
else:
    torch_v = torch.__version__
    cuda_ok = torch.cuda.is_available()
    # Report the GPU model when CUDA is usable, otherwise fall back to "CPU".
    device_name = torch.cuda.get_device_name(0) if cuda_ok else "CPU"

# Snapshot of the interpreter, platform and key library versions for this run.
env = {
    "python": sys.version.split(" ")[0],
    "langchain": langchain.__version__,
    "platform": platform.platform(),
    "torch": torch_v,
    "cuda": cuda_ok,
    "device": device_name,
    "transformers": transformers.__version__,
    "sentence_transformers": sentence_transformers.__version__,
    "chromadb": chromadb.__version__
}

# Show the snapshot in the cell output and save it alongside the other artifacts.
os.makedirs("reproduce_artifacts", exist_ok=True)
print(json.dumps(env, indent=2))
with open("reproduce_artifacts/env_rag.json", "w") as f:
    json.dump(env, f, indent=2)




# Make sure the output folders exist: "logs" for the Q&A results and
# "chroma" for the Chroma directories. Existing folders are left untouched.
for out_dir in ("logs", "chroma"):
    os.makedirs(out_dir, exist_ok=True)





# Importing Academic Papers
import os
from langchain_community.document_loaders import PyPDFLoader
from google.colab import drive

# Mount Google Drive so the folder with the papers is reachable from the notebook.
drive.mount("/content/drive")

# Folder that holds the academic papers (PDF files) used as the RAG corpus.
pdf_dir = "/content/drive/MyDrive/Capstone/Academic Papers"
folder_listing = os.listdir(pdf_dir)
print("Files in folder:", folder_listing)






# Load PDFs into LangChain
# Select the PDF files once: the extension check is case-insensitive, and the
# names are sorted because os.listdir returns entries in arbitrary order, which
# would make the document order differ between runs.
pdf_files = sorted(name for name in os.listdir(pdf_dir) if name.lower().endswith(".pdf"))

docs = []
for file in pdf_files:
    loader = PyPDFLoader(os.path.join(pdf_dir, file))
    docs.extend(loader.load())

# Count only the PDFs that were actually loaded, not every entry in the folder.
print(f"Loaded {len(docs)} documents from {len(pdf_files)} PDFs.")







{
  "python": "3.12.11",
  "langchain": "0.3.27",
  "platform": "Linux-6.1.123+-x86_64-with-glibc2.35",
  "torch": "2.8.0+cu126",
  "cuda": false,
  "device": "CPU",
  "transformers": "4.56.1",
  "sentence_transformers": "5.1.0",
  "chromadb": "1.1.0"
}
Mounted at /content/drive
Files in folder: ['Chess GPT Paper.pdf', 'Maia-2 Paper.pdf', 'Chess Bench with Stockfish Paper.pdf']
Loaded 99 documents from 3 PDFs.


## LLM & Questions Setup

In [None]:

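# RAG flow used by every experiment below:
# 1. The question is embedded with the same embedding model as the document chunks.
# 2. The question embedding is compared against the chunk embeddings stored in the Chroma DB.
# 3. The k=4 most similar chunks are retrieved from the DB.
# 4. The text of those chunks is passed to the LLM as context, and the LLM generates the answer.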

import os, json
import pandas as pd
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from transformers import pipeline

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline




# Checkpoint used for answer generation.
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # fallback: "distilgpt2"

# Tokenizer and causal-LM weights are both loaded from the same checkpoint.
tok = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

# Text-generation pipeline; each call produces at most 200 new tokens.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tok,
    max_new_tokens=200,
)

# Wrap the pipeline so it can be passed to LangChain chains as the LLM.
llm = HuggingFacePipeline(pipeline=pipe)
print("LLM ready:", MODEL_ID)



# Questions asked in every experiment run
questions = [
    "What problems did the Chess GPT Paper solve?",
    "What improvements did Maia-2 introduce compared to Maia-1?",
    "How does the model mimic human play?",
]



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu


LLM ready: TinyLlama/TinyLlama-1.1B-Chat-v1.0


  llm = HuggingFacePipeline(pipeline=pipe)


## MiniLM  |  chunk_size=500, overlap=100


In [None]:
# Q&A results for MiniLM, chunk_size=500, overlap=100

# Run parameters — defined once so the splitter, the output file names and the
# saved run config cannot drift apart.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_tag = "MiniLM"
chunk_size = 500
chunk_overlap = 100
retriever_k = 4
run_tag = f"{embedding_tag}_{chunk_size}_{chunk_overlap}"

# Setup embedding model
emb = SentenceTransformerEmbeddings(model_name=embedding_model_name)

# Split docs
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunks = splitter.split_documents(docs)

# Create Chroma DB with unique persist dir
persist_dir = f"chroma/chroma_{run_tag}"
# from_documents adds to whatever is already stored under persist_dir, so a
# re-run of this cell would index every chunk twice and retrieval would return
# duplicates. Drop any collection left by a previous run first.
Chroma(persist_directory=persist_dir, embedding_function=emb).delete_collection()
vectordb = Chroma.from_documents(chunks, emb, persist_directory=persist_dir)
# No explicit vectordb.persist(): it is deprecated (it emitted a warning in the
# recorded run) because documents are persisted automatically when a
# persist_directory is given.

# Setup QA
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(search_kwargs={"k": retriever_k}),
    chain_type="stuff",
)

# Run Q&A
# qa.invoke replaces the deprecated qa.run; the answer text is under "result".
results = []
for q in questions:
    a = qa.invoke({"query": q})["result"]
    results.append({"Question": q, "Answer": a})

# Save results to CSV
csv_path = f"logs/results_{run_tag}.csv"
pd.DataFrame(results).to_csv(csv_path, index=False)
print(f"Saved Q&A results to {csv_path}")

# Save run config
# Values come from the parameters above and from MODEL_ID, so the file always
# describes what was actually run.
rag_run_config = {
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "embedding_model": embedding_model_name,
    "llm": MODEL_ID,
    "retriever_k": retriever_k
}
config_path = f"reproduce_artifacts/rag_run_config_{run_tag}.json"
with open(config_path, "w") as f:
    json.dump(rag_run_config, f, indent=2)
print(f"Saved run config to {config_path}")


  emb = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  vectordb.persist()
  a = qa.run(q)


Saved Q&A results to logs/results_MiniLM_500_100.csv
Saved run config to reproduce_artifacts/rag_run_config_MiniLM_500_100.json


## MiniLM  |  chunk_size=300, overlap=50


In [None]:
# Q&A results for MiniLM, chunk_size=300, overlap=50

# Run parameters — defined once so the splitter, the output file names and the
# saved run config cannot drift apart.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_tag = "MiniLM"
chunk_size = 300
chunk_overlap = 50
retriever_k = 4
run_tag = f"{embedding_tag}_{chunk_size}_{chunk_overlap}"

# Setup embedding model
emb = SentenceTransformerEmbeddings(model_name=embedding_model_name)

# Split docs
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunks = splitter.split_documents(docs)

# Create Chroma DB with unique persist dir
persist_dir = f"chroma/chroma_{run_tag}"
# from_documents adds to whatever is already stored under persist_dir, so a
# re-run of this cell would index every chunk twice and retrieval would return
# duplicates. Drop any collection left by a previous run first.
Chroma(persist_directory=persist_dir, embedding_function=emb).delete_collection()
vectordb = Chroma.from_documents(chunks, emb, persist_directory=persist_dir)
# No explicit vectordb.persist(): it is deprecated because documents are
# persisted automatically when a persist_directory is given.

# Setup QA
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(search_kwargs={"k": retriever_k}),
    chain_type="stuff",
)

# Run Q&A
# qa.invoke replaces the deprecated qa.run; the answer text is under "result".
results = []
for q in questions:
    a = qa.invoke({"query": q})["result"]
    results.append({"Question": q, "Answer": a})

# Save results to CSV
csv_path = f"logs/results_{run_tag}.csv"
pd.DataFrame(results).to_csv(csv_path, index=False)
print(f"Saved Q&A results to {csv_path}")

# Save run config
# Values come from the parameters above and from MODEL_ID, so the file always
# describes what was actually run.
rag_run_config = {
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "embedding_model": embedding_model_name,
    "llm": MODEL_ID,
    "retriever_k": retriever_k
}
config_path = f"reproduce_artifacts/rag_run_config_{run_tag}.json"
with open(config_path, "w") as f:
    json.dump(rag_run_config, f, indent=2)
print(f"Saved run config to {config_path}")


Saved Q&A results to logs/results_MiniLM_300_50.csv
Saved run config to reproduce_artifacts/rag_run_config_MiniLM_300_50.json


## E5-Small  |  chunk_size=500, overlap=100


In [None]:
# Q&A results for E5-small, chunk_size=500, overlap=100

# Run parameters — defined once so the splitter, the output file names and the
# saved run config cannot drift apart.
embedding_model_name = "intfloat/e5-small-v2"
embedding_tag = "E5-small"
chunk_size = 500
chunk_overlap = 100
retriever_k = 4
run_tag = f"{embedding_tag}_{chunk_size}_{chunk_overlap}"

# Setup embedding model
# NOTE(review): the e5 model card asks for "query: " / "passage: " prefixes on
# the input text; this cell embeds raw text for both — confirm whether the
# prefixes should be added before comparing against MiniLM.
emb = SentenceTransformerEmbeddings(model_name=embedding_model_name)

# Split docs
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunks = splitter.split_documents(docs)

# Create Chroma DB with unique persist dir
persist_dir = f"chroma/chroma_{run_tag}"
# from_documents adds to whatever is already stored under persist_dir, so a
# re-run of this cell would index every chunk twice and retrieval would return
# duplicates. Drop any collection left by a previous run first.
Chroma(persist_directory=persist_dir, embedding_function=emb).delete_collection()
vectordb = Chroma.from_documents(chunks, emb, persist_directory=persist_dir)
# No explicit vectordb.persist(): it is deprecated because documents are
# persisted automatically when a persist_directory is given.

# Setup QA
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(search_kwargs={"k": retriever_k}),
    chain_type="stuff",
)

# Run Q&A
# qa.invoke replaces the deprecated qa.run; the answer text is under "result".
results = []
for q in questions:
    a = qa.invoke({"query": q})["result"]
    results.append({"Question": q, "Answer": a})

# Save results to CSV
csv_path = f"logs/results_{run_tag}.csv"
pd.DataFrame(results).to_csv(csv_path, index=False)
print(f"Saved Q&A results to {csv_path}")

# Save run config
# Values come from the parameters above and from MODEL_ID, so the file always
# describes what was actually run.
rag_run_config = {
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "embedding_model": embedding_model_name,
    "llm": MODEL_ID,
    "retriever_k": retriever_k
}
config_path = f"reproduce_artifacts/rag_run_config_{run_tag}.json"
with open(config_path, "w") as f:
    json.dump(rag_run_config, f, indent=2)
print(f"Saved run config to {config_path}")


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Saved Q&A results to logs/results_E5-small_500_100.csv
Saved run config to reproduce_artifacts/rag_run_config_E5-small_500_100.json


## E5-Small  |  chunk_size=300, overlap=50


In [None]:
# Q&A results for E5-small, chunk_size=300, overlap=50

# Run parameters — defined once so the splitter, the output file names and the
# saved run config cannot drift apart.
embedding_model_name = "intfloat/e5-small-v2"
embedding_tag = "E5-small"
chunk_size = 300
chunk_overlap = 50
retriever_k = 4
run_tag = f"{embedding_tag}_{chunk_size}_{chunk_overlap}"

# Setup embedding model
# NOTE(review): the e5 model card asks for "query: " / "passage: " prefixes on
# the input text; this cell embeds raw text for both — confirm whether the
# prefixes should be added before comparing against MiniLM.
emb = SentenceTransformerEmbeddings(model_name=embedding_model_name)

# Split docs
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunks = splitter.split_documents(docs)

# Create Chroma DB with unique persist dir
persist_dir = f"chroma/chroma_{run_tag}"
# from_documents adds to whatever is already stored under persist_dir, so a
# re-run of this cell would index every chunk twice and retrieval would return
# duplicates. Drop any collection left by a previous run first.
Chroma(persist_directory=persist_dir, embedding_function=emb).delete_collection()
vectordb = Chroma.from_documents(chunks, emb, persist_directory=persist_dir)
# No explicit vectordb.persist(): it is deprecated because documents are
# persisted automatically when a persist_directory is given.

# Setup QA
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(search_kwargs={"k": retriever_k}),
    chain_type="stuff",
)

# Run Q&A
# qa.invoke replaces the deprecated qa.run; the answer text is under "result".
results = []
for q in questions:
    a = qa.invoke({"query": q})["result"]
    results.append({"Question": q, "Answer": a})

# Save results to CSV
csv_path = f"logs/results_{run_tag}.csv"
pd.DataFrame(results).to_csv(csv_path, index=False)
print(f"Saved Q&A results to {csv_path}")

# Save run config
# Values come from the parameters above and from MODEL_ID, so the file always
# describes what was actually run.
rag_run_config = {
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "embedding_model": embedding_model_name,
    "llm": MODEL_ID,
    "retriever_k": retriever_k
}
config_path = f"reproduce_artifacts/rag_run_config_{run_tag}.json"
with open(config_path, "w") as f:
    json.dump(rag_run_config, f, indent=2)
print(f"Saved run config to {config_path}")


Saved Q&A results to logs/results_E5-small_300_50.csv
Saved run config to reproduce_artifacts/rag_run_config_E5-small_300_50.json


In [None]:
# Bundle the logs/ and reproduce_artifacts/ folders (recursively) into TrackA.zip
!zip -r TrackA.zip logs reproduce_artifacts


  adding: logs/ (stored 0%)
  adding: logs/results_E5-small_300_50.csv (deflated 63%)
  adding: logs/results_E5-small_500_100.csv (deflated 61%)
  adding: logs/results_MiniLM_300_50.csv (deflated 57%)
  adding: logs/results_MiniLM_500_100.csv (deflated 59%)
  adding: reproduce_artifacts/ (stored 0%)
  adding: reproduce_artifacts/rag_run_config_E5-small_500_100.json (deflated 21%)
  adding: reproduce_artifacts/rag_run_config_MiniLM_300_50.json (deflated 21%)
  adding: reproduce_artifacts/rag_run_config_MiniLM_500_100.json (deflated 21%)
  adding: reproduce_artifacts/rag_run_config_E5-small_300_50.json (deflated 20%)
  adding: reproduce_artifacts/env_rag.json (deflated 32%)
