# **Track B: RAG with Gemini API**
#### This notebook demonstrates a Retrieval-Augmented Generation (RAG) pipeline using the Google Gemini API and LangChain.


## Set Up

In [None]:


# Installs
!pip install -q google-generativeai langchain langchain-community langchain-google-genai chromadb pypdf sentence-transformers



# API Key Setup
import os
import getpass

# Resolve the key from the first place that has it:
#   1. the GOOGLE_API_KEY environment variable,
#   2. the Colab Secrets store (queried explicitly through google.colab.userdata),
#   3. an interactive prompt.
# `key_source` records which one was used so the final message is accurate.
api_key = os.environ.get("GOOGLE_API_KEY")
key_source = "environment variable"

if not api_key:
    try:
        from google.colab import userdata
        api_key = userdata.get("GOOGLE_API_KEY")
        key_source = "Colab Secrets"
    except Exception:
        # Best effort only: not running on Colab, the secret does not exist, or
        # notebook access to it was not granted. Fall through to the prompt.
        api_key = None

if not api_key:
    api_key = getpass.getpass('Please input GOOGLE_API_KEY: ')
    key_source = "interactive prompt"

# Fail here, with a clear message, rather than later inside the first Gemini call.
api_key = (api_key or "").strip()
if not api_key:
    raise ValueError("GOOGLE_API_KEY is empty; cannot configure the Gemini client.")

# Make sure both env vars are aligned
os.environ["GOOGLE_API_KEY"] = api_key
os.environ["GENAI_API_KEY"] = api_key

print(f"API key loaded from {key_source}")



# Document Loading
import os
from langchain_community.document_loaders import PyPDFLoader
from google.colab import drive

drive.mount('/content/drive')

pdf_path = "/content/drive/My Drive/Capstone/Academic Papers"

print(f'\nDocuments found {os.listdir(pdf_path)}')

# Select the PDFs once so the loop and the summary line agree on what was loaded.
# Sorting makes the load order independent of the directory listing order, and the
# extension check is case-insensitive so "paper.PDF" is not silently skipped.
pdf_files = sorted(
    name for name in os.listdir(pdf_path) if name.lower().endswith('.pdf')
)

# `docs` collects everything PyPDFLoader.load() returns for each file, in file order.
docs = []
for file in pdf_files:
    loader = PyPDFLoader(os.path.join(pdf_path, file))
    docs.extend(loader.load())

# Report the number of PDFs actually loaded, not the number of directory entries.
print(f"\nLoaded {len(docs)} documents from {len(pdf_files)} PDFs.")



# env_rag.json creation

import importlib, json, os
os.makedirs("reproduce_artifacts", exist_ok=True)

def get_version(pkg, import_name=None):
    """Return the version string of an installed package.

    Args:
        pkg: Package name. Used as the module to import when ``import_name`` is
            not given, and as the distribution name for the metadata fallback.
        import_name: Optional module path to import instead of ``pkg``.

    Returns:
        The module's ``__version__`` if it defines one; otherwise the version
        recorded in the installed distribution metadata for ``pkg``;
        ``"unknown"`` if neither is available; ``"not installed"`` if the
        module cannot be imported.
    """
    try:
        mod = importlib.import_module(import_name or pkg)
    except ImportError:
        return "not installed"

    version = getattr(mod, "__version__", None)
    if version is not None:
        return version

    # Not every package exposes __version__; the installed distribution metadata
    # is the next best source before giving up with "unknown".
    from importlib import metadata
    try:
        return metadata.version(pkg)
    except (metadata.PackageNotFoundError, ValueError):
        return "unknown"

import platform

# Snapshot of the runtime used for this run. The Python version is read from the
# running interpreter so the file reflects the environment that actually executed
# the notebook rather than a hard-coded value.
env_rag = {
    "python": platform.python_version(),
    "packages": {
        "torch": get_version("torch"),
        "transformers": get_version("transformers"),
        "sentence-transformers": get_version("sentence_transformers"),
        "chromadb": get_version("chromadb"),
        "langchain": get_version("langchain"),
        "langchain-community": get_version("langchain_community"),
        "langchain-google-genai": get_version("langchain_google_genai"),
        "google-generativeai": get_version("google.generativeai"),
        "pypdf": get_version("pypdf"),
    }
}

with open("reproduce_artifacts/env_rag.json", "w") as f:
    json.dump(env_rag, f, indent=2)

print("\nSaved reproduce_artifacts/env_rag.json")



# Q&A helper: query the RAG chain, print the answer with source previews,
# and return a JSON-serialisable record of the exchange.
def ask(q, chain=None):
    """Ask one question and return the answer together with its sources.

    Args:
        q: The question text; sent to the chain as ``{"query": q}``.
        chain: Object with an ``invoke`` method that returns a mapping holding
            ``"result"`` and ``"source_documents"``. When omitted, the
            notebook-level ``qa`` chain is used, so existing ``ask(q)`` calls
            keep working.

    Returns:
        dict with keys ``"question"``, ``"answer"`` and ``"sources"``, where
        ``"sources"`` lists the base file name of every returned source
        document (one entry per document, so a file may appear more than once).
    """
    # Each experiment cell rebinds the global `qa`; passing `chain` explicitly
    # avoids depending on whichever experiment happened to run last.
    active_chain = qa if chain is None else chain

    r = active_chain.invoke({"query": q})
    answer = r.get("result", "")
    source_docs = r.get("source_documents", [])

    # Only keep the filename so full Drive paths are not written to the logs.
    raw_sources = [d.metadata.get("source", "?") for d in source_docs]
    sources = [os.path.basename(s) for s in raw_sources if s]

    print("\nQ:", q)
    print("A:", answer)
    print("\nSources:")
    # Preview at most the first three documents, 160 characters each.
    for i, d in enumerate(source_docs[:3], start=1):
        fname = os.path.basename(d.metadata.get("source", "?"))
        preview = d.page_content[:160].replace("\n", " ")
        print(f"[{i}] {fname} ::", preview + "...")

    return {"question": q, "answer": answer, "sources": sources}



API key loaded from Colab Secrets
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Documents found ['Chess GPT Paper.pdf', 'Maia-2 Paper.pdf', 'Chess Bench with Stockfish Paper.pdf']

Loaded 99 documents from 3 PDFs.

Saved reproduce_artifacts/env_rag.json


## Experiment: MiniLM | chunk_size=500, overlap=100


In [None]:
# Experiment: MiniLM | chunk_size=500, overlap=100
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains import RetrievalQA
import google.generativeai as genai
import os, getpass, pathlib

# Experiment settings, declared once so the pipeline and the saved run config
# cannot drift apart.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL = "gemini-2.5-flash"
RETRIEVER_K = 4
RUN_TAG = "minilm_500_100"
persist_dir = f"./chroma_{RUN_TAG}"

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
splits = splitter.split_documents(docs)
emb = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)

# The collection is persisted on disk, so re-running this cell would add every
# chunk a second time and the retriever would return duplicate passages.
# Drop whatever a previous run left behind before indexing again.
Chroma(persist_directory=persist_dir, embedding_function=emb).delete_collection()
vs = Chroma.from_documents(splits, embedding=emb, persist_directory=persist_dir)

# Pass k explicitly so the value recorded in the run config is the value used.
retriever = vs.as_retriever(search_kwargs={"k": RETRIEVER_K})

llm = ChatGoogleGenerativeAI(model=LLM_MODEL, temperature=0.2, google_api_key=os.environ['GOOGLE_API_KEY'])
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)


rag_run_config = {
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap": CHUNK_OVERLAP,
    "embedding_model": EMBEDDING_MODEL,
    "llm": LLM_MODEL,
    "retriever_k": RETRIEVER_K
}

filename = f"reproduce_artifacts/rag_run_config_{RUN_TAG}.json"
with open(filename, "w") as f:
    json.dump(rag_run_config, f, indent=2)
print(f"Saved {filename}")



Saved reproduce_artifacts/rag_run_config_minilm_500_100.json


In [None]:


# Put the three benchmark questions (one per source paper) to the current `qa` chain.
q1 = ask(
    "What role does Stockfish 16 play in annotating the ChessBench dataset, and how do these annotations support amortized planning models in learning better policies?"
)  # ChessBench

q2 = ask(
    "How does Maia-2’s skill-aware attention mechanism adjust move predictions when modeling players of different skill levels?"
)  # Maia-2

q3 = ask(
    "What types of datasets were used to train ChessGPT, and how do the game, language, and mixed datasets each contribute to improving its performance?"
)  # ChessGPT


import pandas as pd
import json
import os

# Store the three Q&A records as the log for this configuration.
os.makedirs("logs", exist_ok=True)
with open('logs/results_minilm_500_100.json', 'w') as f:
    f.write(json.dumps([q1, q2, q3], indent=2))
print('Saved logs/results_minilm_500_100.json')





Q: What role does Stockfish 16 play in annotating the ChessBench dataset, and how do these annotations support amortized planning models in learning better policies?
A: Based on the provided context, Stockfish 16 is mentioned as a point of comparison for the models, ablations, and searchless policy/value networks, alongside Leela Chess Zero and AlphaZero.

The text does not state that Stockfish 16 plays a role in annotating the ChessBench dataset, nor does it describe how such annotations would support amortized planning models in learning better policies. The details regarding dataset creation are referred to Appendix A, which is not provided here.

Sources:
[1] Chess Bench with Stockfish Paper.pdf :: series of benchmark results through our models, ablations, and comparisons with Stock- fish 16, Leela Chess Zero, and AlphaZero, and their searchless policy/val...
[2] Chess Bench with Stockfish Paper.pdf :: series of benchmark results through our models, ablations, and comparisons with

## Experiment: MiniLM | chunk_size=300, overlap=50


In [None]:
# Experiment: MiniLM | chunk_size=300, overlap=50
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains import RetrievalQA
import google.generativeai as genai
import os, getpass, pathlib

# Experiment settings, declared once so the pipeline and the saved run config
# cannot drift apart.
CHUNK_SIZE = 300
CHUNK_OVERLAP = 50
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL = "gemini-2.5-flash"
RETRIEVER_K = 4
RUN_TAG = "minilm_300_50"
persist_dir = f"./chroma_{RUN_TAG}"

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
splits = splitter.split_documents(docs)
emb = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)

# The collection is persisted on disk, so re-running this cell would add every
# chunk a second time and the retriever would return duplicate passages.
# Drop whatever a previous run left behind before indexing again.
Chroma(persist_directory=persist_dir, embedding_function=emb).delete_collection()
vs = Chroma.from_documents(splits, embedding=emb, persist_directory=persist_dir)

# Pass k explicitly so the value recorded in the run config is the value used.
retriever = vs.as_retriever(search_kwargs={"k": RETRIEVER_K})
llm = ChatGoogleGenerativeAI(model=LLM_MODEL, temperature=0.2, google_api_key=os.environ['GOOGLE_API_KEY'])
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)



rag_run_config = {
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap": CHUNK_OVERLAP,
    "embedding_model": EMBEDDING_MODEL,
    "llm": LLM_MODEL,
    "retriever_k": RETRIEVER_K
}

filename = f"reproduce_artifacts/rag_run_config_{RUN_TAG}.json"
with open(filename, "w") as f:
    json.dump(rag_run_config, f, indent=2)
print(f"Saved {filename}")



Saved reproduce_artifacts/rag_run_config_minilm_300_50.json


In [None]:

# Put the three benchmark questions (one per source paper) to the current `qa` chain.
q1 = ask(
    "What role does Stockfish 16 play in annotating the ChessBench dataset, and how do these annotations support amortized planning models in learning better policies?"
)  # ChessBench

q2 = ask(
    "How does Maia-2’s skill-aware attention mechanism adjust move predictions when modeling players of different skill levels?"
)  # Maia-2

q3 = ask(
    "What types of datasets were used to train ChessGPT, and how do the game, language, and mixed datasets each contribute to improving its performance?"
)  # ChessGPT



import pandas as pd
import json
import os

# Store the three Q&A records as the log for this configuration.
os.makedirs("logs", exist_ok=True)
with open('logs/results_minilm_300_50.json', 'w') as f:
    f.write(json.dumps([q1, q2, q3], indent=2))
print('Saved logs/results_minilm_300_50.json')




Q: What role does Stockfish 16 play in annotating the ChessBench dataset, and how do these annotations support amortized planning models in learning better policies?
A: Based on the provided context:

*   The text mentions that ChessBench shows the feasibility of "distilling an approximation of Stockfish 16, a complex planning algorithm." However, it **does not state that Stockfish 16 plays a role in annotating the ChessBench dataset.**
*   The context mentions "annotation language" (Lt) is paired with chessboard states (St) to train a ChessCLIP model to "bridge the modality of policy and language." While this suggests annotations support policy learning, the text **does not explain how these annotations specifically support "amortized planning models in learning better policies," nor does it link this process back to Stockfish 16.**

Sources:
[1] Chess Bench with Stockfish Paper.pdf :: 6 Conclusion Our paper introduces ChessBench, a large-scale, open source benchmark dataset for ches

## Experiment: e5-small-v2 | chunk_size=500, overlap=100


In [None]:
# Experiment: e5-small-v2 | chunk_size=500, overlap=100
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains import RetrievalQA
import google.generativeai as genai
import os, getpass, pathlib

# Experiment settings, declared once so the pipeline and the saved run config
# cannot drift apart.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100
EMBEDDING_MODEL = "intfloat/e5-small-v2"
LLM_MODEL = "gemini-2.5-flash"
RETRIEVER_K = 4
RUN_TAG = "e5small_500_100"
# The index directory is derived from the run tag; it was previously named
# "chroma_gemini_500_100" although this run embeds with e5-small-v2.
persist_dir = f"./chroma_{RUN_TAG}"

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
splits = splitter.split_documents(docs)
emb = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)

# The collection is persisted on disk, so re-running this cell would add every
# chunk a second time and the retriever would return duplicate passages.
# Drop whatever a previous run left behind before indexing again.
Chroma(persist_directory=persist_dir, embedding_function=emb).delete_collection()
vs = Chroma.from_documents(splits, embedding=emb, persist_directory=persist_dir)

# Pass k explicitly so the value recorded in the run config is the value used.
retriever = vs.as_retriever(search_kwargs={"k": RETRIEVER_K})
llm = ChatGoogleGenerativeAI(model=LLM_MODEL, temperature=0.2, google_api_key=os.environ['GOOGLE_API_KEY'])
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)


rag_run_config = {
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap": CHUNK_OVERLAP,
    "embedding_model": EMBEDDING_MODEL,
    "llm": LLM_MODEL,
    "retriever_k": RETRIEVER_K
}

filename = f"reproduce_artifacts/rag_run_config_{RUN_TAG}.json"
with open(filename, "w") as f:
    json.dump(rag_run_config, f, indent=2)
print(f"Saved {filename}")




In [None]:


# Put the three benchmark questions (one per source paper) to the current `qa` chain.
q1 = ask(
    "What role does Stockfish 16 play in annotating the ChessBench dataset, and how do these annotations support amortized planning models in learning better policies?"
)  # ChessBench

q2 = ask(
    "How does Maia-2’s skill-aware attention mechanism adjust move predictions when modeling players of different skill levels?"
)  # Maia-2

q3 = ask(
    "What types of datasets were used to train ChessGPT, and how do the game, language, and mixed datasets each contribute to improving its performance?"
)  # ChessGPT



import pandas as pd
import json
import os

# Store the three Q&A records as the log for this configuration.
os.makedirs("logs", exist_ok=True)
with open('logs/results_e5small_500_100.json', 'w') as f:
    f.write(json.dumps([q1, q2, q3], indent=2))
print('Saved logs/results_e5small_500_100.json')




Q: What role does Stockfish 16 play in annotating the ChessBench dataset, and how do these annotations support amortized planning models in learning better policies?
A: Stockfish 16 plays a crucial role in annotating the ChessBench dataset by providing high-quality, expert-level evaluations for each board state. Specifically, it annotates:

1.  **State-values:** The value of a given board state.
2.  **Best-action:** The optimal move to make from that board state.
3.  **Action-values:** The value for *all legal actions* possible in each board state (amounting to 15 billion data points).

These annotations support amortized planning models in learning better policies by serving as the ground truth for supervised training. By training on this massive dataset, models (such as feed-forward transformers) can "distill an approximation of Stockfish 16, a complex planning algorithm," into a network that can predict state-values, best-actions, and action-values. This allows the models to learn 

## Experiment: e5-small-v2 | chunk_size=300, overlap=50




In [None]:
# Experiment: e5-small-v2 | chunk_size=300, overlap=50
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains import RetrievalQA
import google.generativeai as genai
import os, getpass, pathlib


# Experiment settings, declared once so the pipeline and the saved run config
# cannot drift apart.
CHUNK_SIZE = 300
CHUNK_OVERLAP = 50
EMBEDDING_MODEL = "intfloat/e5-small-v2"
LLM_MODEL = "gemini-2.5-flash"
RETRIEVER_K = 4
RUN_TAG = "e5small_300_50"
persist_dir = f"./chroma_{RUN_TAG}"

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
splits = splitter.split_documents(docs)
emb = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)

# The collection is persisted on disk, so re-running this cell would add every
# chunk a second time and the retriever would return duplicate passages.
# Drop whatever a previous run left behind before indexing again.
Chroma(persist_directory=persist_dir, embedding_function=emb).delete_collection()
vs = Chroma.from_documents(splits, embedding=emb, persist_directory=persist_dir)

# Pass k explicitly so the value recorded in the run config is the value used.
retriever = vs.as_retriever(search_kwargs={"k": RETRIEVER_K})
llm = ChatGoogleGenerativeAI(model=LLM_MODEL, temperature=0.2, google_api_key=os.environ['GOOGLE_API_KEY'])
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)


rag_run_config = {
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap": CHUNK_OVERLAP,
    "embedding_model": EMBEDDING_MODEL,
    "llm": LLM_MODEL,
    "retriever_k": RETRIEVER_K
}

filename = f"reproduce_artifacts/rag_run_config_{RUN_TAG}.json"
with open(filename, "w") as f:
    json.dump(rag_run_config, f, indent=2)
print(f"Saved {filename}")



Saved reproduce_artifacts/rag_run_config_e5small_300_50.json


In [None]:

# Put the three benchmark questions (one per source paper) to the current `qa` chain.
q1 = ask(
    "What role does Stockfish 16 play in annotating the ChessBench dataset, and how do these annotations support amortized planning models in learning better policies?"
)  # ChessBench

q2 = ask(
    "How does Maia-2’s skill-aware attention mechanism adjust move predictions when modeling players of different skill levels?"
)  # Maia-2

q3 = ask(
    "What types of datasets were used to train ChessGPT, and how do the game, language, and mixed datasets each contribute to improving its performance?"
)  # ChessGPT


import pandas as pd
import json
import os

# Store the three Q&A records as the log for this configuration.
os.makedirs("logs", exist_ok=True)
with open('logs/results_e5small_300_50.json', 'w') as f:
    f.write(json.dumps([q1, q2, q3], indent=2))
print('Saved logs/results_e5small_300_50.json')




Q: What role does Stockfish 16 play in annotating the ChessBench dataset, and how do these annotations support amortized planning models in learning better policies?
A: Stockfish 16 plays a crucial role in annotating the ChessBench dataset by providing **state-values** for 530 million board states.

These annotations support amortized planning models in learning better policies by:
*   Providing expert-derived "state-values" which are useful from a reinforcement learning perspective.
*   Facilitating **policy learning** and **value function learning**, allowing models to learn from Stockfish 16's complex planning algorithm. This essentially helps in distilling an approximation of Stockfish 16 into searchless policy/value networks.

Sources:
[1] Chess Bench with Stockfish Paper.pdf :: 6 Conclusion Our paper introduces ChessBench, a large-scale, open source benchmark dataset for chess, and shows the feasibility of distilling an approximation o...
[2] Chess Bench with Stockfish Paper.pdf