In [2]:
# SETUP & DEPENDENCIES ---
import os
import sys

print("--- 1. SETTING UP ENVIRONMENT ---")

# 1. clone repo (if not exists)
if not os.path.exists("rag-financial-10k-analyzer"):
    !git clone https://github.com/imperfect-abhi/rag-financial-10k-analyzer.git
    print("Repository cloned.")
else:
    print("Repository already exists.")

%cd rag-financial-10k-analyzer
# --- STEP 1: MASTER ENVIRONMENT FIX ---
import os

print("--- Force-fixing Environment Binaries ---")

# 1. uninstall the conflicting numpy and associated libraries
!pip uninstall -y numpy sentence-transformers chromadb pypdf &> /dev/null

# 2. we use 1.26.4 because it is the most stable version for Python 3.12 before the 2.0 breaking changes
!pip install numpy==1.26.4
!pip install sentence-transformers==2.7.0
!pip install chromadb==0.5.3  # newer version handles NumPy 1.26.4 better
!pip install pypdf==4.2.0
!pip install langchain-text-splitters==0.2.0

print("\n--- INSTALLATION COMPLETE ---")
print("CRITICAL: Now go to 'Runtime' -> 'Restart session' before running anything else!")

--- 1. SETTING UP ENVIRONMENT ---
Repository already exists.
/content/rag-financial-10k-analyzer
--- Force-fixing Environment Binaries ---
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Installing collected packages: numpy
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-contrib-python 4.13.0.92 requires numpy>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
pytensor 2.37.0 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
rasterio 1.5.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.
jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
jax 0.7.2 requires numpy>=2.0, but you have

Collecting sentence-transformers==2.7.0
  Using cached sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers==2.7.0)
  Using cached transformers-4.57.6-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub>=0.15.1 (from sentence-transformers==2.7.0)
  Using cached huggingface_hub-0.36.2-py3-none-any.whl.metadata (15 kB)
Using cached sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
Using cached transformers-4.57.6-py3-none-any.whl (12.0 MB)
Using cached huggingface_hub-0.36.2-py3-none-any.whl (566 kB)
Installing collected packages: huggingface-hub, transformers, sentence-transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface_hub 1.4.0
    Uninstalling huggingface_hub-1.4.0:
      Successfully uninstalled huggingface_hub-1.4.0
  Attempting uninstall: transformers
    Found existing installation: transformers 5.0.0
    Uninstalling transformers-5.0.0:
      Suc


--- INSTALLATION COMPLETE ---
CRITICAL: Now go to 'Runtime' -> 'Restart session' before running anything else!


In [3]:
# Sanity check after the session restart: report which NumPy version is loaded
# and confirm that ChromaDB and sentence-transformers import against it.
import numpy as np
# Printed before the imports below so the version is visible even if one of them fails.
print(f"Current NumPy Version: {np.__version__}")

# `chromadb` is brought into scope here; the DB initialization cell relies on it.
import chromadb
from sentence_transformers import SentenceTransformer
print("ChromaDB and SentenceTransformers imported successfully!")

Current NumPy Version: 1.26.4
ChromaDB and SentenceTransformers imported successfully!


In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import os
from pypdf import PdfReader
import json
import time

In [5]:
# Chroma DB storage configuration
# Directory handed to chromadb.PersistentClient for the on-disk vector store.
DB_PATH = "/content/chroma_db_storage"
# Absolute path to the PDF folder inside the cloned repository.
PDF_DIR = "/content/rag-financial-10k-analyzer/data" # Explicit path

# 3. Parsing Logic
def parse_and_chunk(docs=None, chunk_size=1000, chunk_overlap=200):
    """Extract text from the filing PDFs and split it into overlapping chunks.

    Args:
        docs: Optional mapping of display name -> PDF path. When omitted, the
            two filings expected under ``PDF_DIR`` are used.
        chunk_size: Maximum chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        A list of dicts, one per chunk, shaped as
        ``{"text": str, "metadata": {"document", "page_number", "section"}}``.
        ``page_number`` is 1-based; ``section`` is always ``"General"``.
        Documents whose file is missing are reported and skipped, so the list
        may be empty.
    """
    print("Parsing PDFs...")

    if docs is None:
        docs = {
            "Apple 10-K": os.path.join(PDF_DIR, "10-Q4-2024-As-Filed.pdf"),
            "Tesla 10-K": os.path.join(PDF_DIR, "tsla-20231231-gen.pdf")
        }

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
    )

    all_chunks = []
    for doc_name, path in docs.items():
        if not os.path.exists(path):
            print(f"ERROR: {path} not found! Run the git clone step.")
            continue

        print(f"Reading {doc_name}...")
        reader = PdfReader(path)
        for page_number, page in enumerate(reader.pages, start=1):
            # Pages without a text layer yield nothing useful; skip them
            # instead of handing None/blank input to the splitter.
            text = page.extract_text() or ""
            if not text.strip():
                continue
            for chunk in text_splitter.split_text(text):
                all_chunks.append({
                    "text": chunk,
                    "metadata": {"document": doc_name, "page_number": page_number, "section": "General"}
                })
    return all_chunks

# 4. DB Initialization
# Opens the persistent Chroma store at DB_PATH and makes sure the
# "financial_filings" collection is populated. If the collection already holds
# chunks it is reused as-is; otherwise the PDFs are parsed, embedded on CPU and
# inserted. Any failure is reported and then re-raised, because later cells
# need a populated `collection` to exist.
client = chromadb.PersistentClient(path=DB_PATH)

try:
    # Try to get existing data
    collection = client.get_or_create_collection(name="financial_filings")
    existing_count = collection.count()
    if existing_count > 0:
        print(f"Database ready with {existing_count} chunks.")
    else:
        # Build if empty
        data = parse_and_chunk()
        if not data:
            # Nothing to index (e.g. the PDFs were not found); fail with a
            # clear message rather than an opaque error from the add() call.
            raise RuntimeError(
                "No chunks were produced from the PDFs; check that the repository "
                "was cloned and that PDF_DIR points at its data folder."
            )

        print("Loading Embedding Model (CPU)...")
        embedder = SentenceTransformer('BAAI/bge-small-en-v1.5', device='cpu')

        print(f"Embedding {len(data)} chunks...")
        texts = [d['text'] for d in data]
        metas = [d['metadata'] for d in data]
        ids = [f"id_{i}" for i in range(len(data))]

        # We embed in one go as bge-small is lightweight.
        # normalize_embeddings=True mirrors the query-time encode call so that
        # stored vectors and query vectors are produced the same way.
        embeddings = embedder.encode(
            texts, normalize_embeddings=True, show_progress_bar=True
        )

        collection.add(documents=texts, embeddings=embeddings.tolist(), metadatas=metas, ids=ids)
        print("Vector DB Building Complete.")
        del embedder # Free RAM
except Exception as e:
    print(f"Error: {e}")
    # Do not continue with a missing or empty index.
    raise

ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Parsing PDFs...
Reading Apple 10-K...
Reading Tesla 10-K...
Loading Embedding Model (CPU)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding 1137 chunks...


Batches:   0%|          | 0/36 [00:00<?, ?it/s]

ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given


Vector DB Building Complete.


In [6]:
# --- model loading ---
# Loads the three models used by the RAG pipeline and leaves them in the
# notebook globals `embedder`, `reranker`, `tokenizer`, `llm_model` and `pipe`:
# the embedder and cross-encoder re-ranker are placed on CPU, the Phi-3 LLM on
# the GPU.
import torch
import gc
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

print("--- initializing models ---")

# clearing GPU memory to ensure a fresh context
torch.cuda.empty_cache()
gc.collect()

# loading retrieval models on CPU
print("Loading Embedder & Re-ranker on CPU...")
# Same embedding model name as the one used to build the vector DB.
embedder = SentenceTransformer('BAAI/bge-small-en-v1.5', device='cpu')
reranker = CrossEncoder("BAAI/bge-reranker-base", device='cpu')

# loading LLM on GPU (Phi-3-Mini-128k)
print("Loading Phi-3-Mini-128k on GPU...")
model_id = "microsoft/Phi-3-mini-128k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE(review): trust_remote_code=True runs Python files downloaded from the
# model repository; the recorded output of this cell suggests pinning a
# revision to avoid silently picking up new versions of those files.
# NOTE(review): the recorded output also reports "`torch_dtype` is deprecated!
# Use `dtype` instead!" -- confirm the minimum transformers version before
# switching the keyword.
llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)

# creating the generation pipeline
# Sampling is enabled (do_sample=True) with a low temperature; no random seed
# is set in this cell, so generated answers may differ between runs.
pipe = pipeline(
    "text-generation",
    model=llm_model,
    tokenizer=tokenizer,
    max_new_tokens=600,
    temperature=0.1,
    top_p=0.9,
    do_sample=True
)

# memory_allocated() is in bytes; dividing by 1024**3 converts it for the GB report.
vram_used = torch.cuda.memory_allocated() / 1024**3
print(f"Models Loaded. Current GPU VRAM Usage: {vram_used:.2f} GB")

--- 3. INITIALIZING MODELS ---
Loading Embedder & Re-ranker on CPU...


config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

Loading Phi-3-Mini-128k on GPU...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Device set to use cuda


Models Loaded. Current GPU VRAM Usage: 7.12 GB


In [7]:
# --- RAG pipeline functions ---

def retrieve_and_rerank(query, k_initial=25, k_final=5):
    """Fetch candidate chunks by vector search, then order them with the cross-encoder.

    Args:
        query: The question text.
        k_initial: Number of nearest neighbours requested from the collection.
        k_final: Number of re-ranked chunks to return.

    Returns:
        Up to ``k_final`` dicts with keys ``text``, ``metadata`` and ``score``,
        highest score first; an empty list when the search returns no documents.
    """
    # Stage 1: embed the query and pull the nearest neighbours.
    query_embedding = embedder.encode([query], normalize_embeddings=True).tolist()
    hits = collection.query(query_embeddings=query_embedding, n_results=k_initial)

    documents = hits['documents'][0]
    if not documents:
        return []

    scored = [
        {'text': text, 'metadata': meta}
        for text, meta in zip(documents, hits['metadatas'][0])
    ]

    # Stage 2: score every (query, chunk) pair with the re-ranker.
    relevance = reranker.predict([[query, item['text']] for item in scored])
    for position, item in enumerate(scored):
        item['score'] = relevance[position]

    # Best first, then keep the requested number of chunks.
    scored.sort(key=lambda item: item['score'], reverse=True)
    return scored[:k_final]

def answer_question(query):
    """Answer one question: retrieve context, build the Phi-3 prompt, generate.

    Args:
        query: The question text.

    Returns:
        A dict ``{"answer": str, "sources": list[str]}``. ``sources`` holds one
        de-duplicated citation string per retrieved chunk, in retrieval order.
        When retrieval yields nothing, the answer is the fixed
        "Not specified in the document." message with no sources.
    """
    ranked_chunks = retrieve_and_rerank(query)
    if not ranked_chunks:
        return {"answer": "Not specified in the document.", "sources": []}

    # Render each chunk as a numbered source block and collect its citation.
    context_blocks = []
    sources_found = []
    for position, item in enumerate(ranked_chunks, start=1):
        meta = item['metadata']
        doc, section, page = meta['document'], meta['section'], meta['page_number']
        context_blocks.append(
            f"[SOURCE {position}]\nDoc: {doc}\nSection: {section}\nPage: {page}\nContent: {item['text']}\n\n"
        )
        citation = f"['{doc}', '{section}', 'p. {page}']"
        if citation not in sources_found:
            sources_found.append(citation)
    context_text = "".join(context_blocks)

    # Prompt wrapped in the Phi-3 chat tags <|user|>, <|end|>, <|assistant|>.
    prompt = f"""<|user|>
You are a precise financial analyst. Answer the question ONLY based on the provided Context.

RULES:
1. CITATIONS: Follow every fact with a citation: ['Document Name', 'Section', 'p. Page'].
2. SCOPE: If the info is missing, say "Not specified in the document."
3. OUT-OF-SCOPE: For stock forecasts or non-financial trivia, say "This question cannot be answered based on the provided documents."

CONTEXT:
{context_text}

QUESTION:
{query}<|end|>
<|assistant|>"""

    # Generate; only the newly produced text is requested back.
    generated = pipe(prompt, return_full_text=False)[0]['generated_text']
    return {"answer": generated.strip(), "sources": sources_found}

In [10]:
# --- batch execution and checkpointing ---
# Runs every question through answer_question() in small batches, writing the
# accumulated results to OUTPUT_FILE after each batch so an interrupted run can
# resume. Questions already present in the checkpoint are skipped; questions
# that raise are reported and left out, so a later run retries them.
OUTPUT_FILE = "submission_output.json"
BATCH_SIZE = 3

# Questions that are expected to be refused; when the model does refuse them,
# the retrieved sources are dropped from the entry.
REFUSAL_QUESTION_IDS = {11, 12, 13}
REFUSAL_MARKERS = ("cannot be answered", "not specified")

questions = [
    {"question_id": 1, "question": "What was Apple's total revenue for the fiscal year ended September 28, 2024?"},
    {"question_id": 2, "question": "How many shares of common stock were issued and outstanding as of October 18, 2024?"},
    {"question_id": 3, "question": "What is the total amount of term debt (current + non-current) reported by Apple as of September 28, 2024?"},
    {"question_id": 4, "question": "On what date was Apple's 10-K report for 2024 signed and filed with the SEC?"},
    {"question_id": 5, "question": "Does Apple have any unresolved staff comments from the SEC as of this filing? How do you know?"},
    {"question_id": 6, "question": "What was Tesla's total revenue for the year ended December 31, 2023?"},
    {"question_id": 7, "question": "What percentage of Tesla's total revenue in 2023 came from Automotive Sales (excluding Leasing)?"},
    {"question_id": 8, "question": "What is the primary reason Tesla states for being highly dependent on Elon Musk?"},
    {"question_id": 9, "question": "What types of vehicles does Tesla currently produce and deliver?"},
    {"question_id": 10, "question": "What is the purpose of Tesla's 'lease pass-through fund arrangements'?"},
    {"question_id": 11, "question": "What is Tesla's stock price forecast for 2025?"},
    {"question_id": 12, "question": "Who is the CFO of Apple as of 2025?"},
    {"question_id": 13, "question": "What color is Tesla's headquarters painted?"}
]


def save_checkpoint(entries, path=OUTPUT_FILE):
    """Write `entries` as JSON to `path` without ever leaving a partial file.

    The data goes to a sibling temporary file first and is then moved over the
    target with os.replace, so an interruption mid-write cannot truncate the
    checkpoint that the resume logic reads back with json.load.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(entries, f, indent=2)
    os.replace(tmp_path, path)


# loading existing progress if any
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, "r") as f:
        results = json.load(f)
    processed = [r["question_id"] for r in results]
    print(f"Resuming evaluation. {len(processed)} questions already processed.")
else:
    results = []
    processed = []

processed_ids = set(processed)
remaining = [q for q in questions if q["question_id"] not in processed_ids]

# batch processing loop
for i in range(0, len(remaining), BATCH_SIZE):
    batch = remaining[i : i + BATCH_SIZE]
    print(f"\nProcessing Batch {(i//BATCH_SIZE)+1}...")

    for q_item in batch:
        print(f"  Question {q_item['question_id']}...", end=" ", flush=True)
        try:
            res = answer_question(q_item["question"])

            # format according to requirements
            entry = {
                "question_id": q_item["question_id"],
                "answer": res["answer"],
                "sources": res["sources"]
            }

            # strict refusal logic: empty sources for Q11, 12, 13 if unanswerable
            if q_item["question_id"] in REFUSAL_QUESTION_IDS:
                answer_lower = res["answer"].lower()
                if any(marker in answer_lower for marker in REFUSAL_MARKERS):
                    entry["sources"] = []

            results.append(entry)
            print("Done.")
        except Exception as e:
            # Best effort per question: report and move on; the question is not
            # recorded, so it is retried on the next run.
            print(f"Error: {e}")

    # saving checkpoint
    save_checkpoint(results)

    # clearing memory
    torch.cuda.empty_cache()
    gc.collect()

print("\n=== FINAL GENERATED JSON ===\n")
print(json.dumps(results, indent=2))

Resuming evaluation. 13 questions already processed.

=== FINAL GENERATED JSON ===

[
  {
    "question_id": 1,
    "answer": "Total revenue for the fiscal year ended September 28, 2024, was $391,035 million, as stated in the CONSOLIDATED STATEMENTS OF OPERATIONS on Page 32 of the Apple 10-K document.",
    "sources": [
      "['Apple 10-K', 'General', 'p. 42']",
      "['Apple 10-K', 'General', 'p. 32']",
      "['Apple 10-K', 'General', 'p. 33']",
      "['Apple 10-K', 'General', 'p. 34']"
    ]
  },
  {
    "question_id": 2,
    "answer": "According to the Apple 10-K document, Section: General, Page: 2, as of October 18, 2024, there were 15,115,823,000 shares of common stock issued and outstanding. [Apple 10-K, Section: General, Page: 2]",
    "sources": [
      "['Apple 10-K', 'General', 'p. 2']",
      "['Apple 10-K', 'General', 'p. 22']",
      "['Apple 10-K', 'General', 'p. 34']",
      "['Tesla 10-K', 'General', 'p. 2']",
      "['Apple 10-K', 'General', 'p. 35']"
    ]
  },
  