# RAG System for SEC 10-K Financial Question Answering

This notebook implements a complete Retrieval-Augmented Generation (RAG) system that answers financial questions from Apple's FY2024 and Tesla's FY2023 10-K filings.

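**Colab-Ready**: Click "Run all" or execute cells sequentially. When running locally, start the notebook from the repository root so that relative paths such as `data/` and `requirements.txt` resolve.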

---

## Setup & Installation

### Step 1: Clone Repository

In [None]:
import os
import subprocess
import sys

# Detect the runtime: a successful import of google.colab is taken as the
# signal that the notebook is executing inside Google Colab.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print("✓ Running locally (not Colab)")
else:
    IN_COLAB = True
    print("✓ Running in Google Colab")

# Colab uses a fixed checkout location; a local run uses the current directory.
REPO_DIR = "/content/SecRAG-10K" if IN_COLAB else os.getcwd()

print(f"Working directory: {REPO_DIR}")

In [None]:
# Local runs use the directory as-is; on Colab the repository is cloned
# (only if it is not already present) and made the working directory.
if not IN_COLAB:
    print("✓ Local mode - using existing directory")
else:
    os.chdir("/content")
    if not os.path.exists(REPO_DIR):
        print("Cloning repository from GitHub...")
        clone_cmd = ["git", "clone", "https://github.com/kalpeshdahake/SecRAG-10K.git"]
        # check=True: a failed clone raises CalledProcessError and stops the cell.
        subprocess.run(clone_cmd, check=True)
    os.chdir(REPO_DIR)
    print(f"✓ Repository ready at {REPO_DIR}")

# Report, relative to the current working directory, which of the expected
# project folders are present. This is informational only; nothing is raised.
required_dirs = ["data", "ingestion", "embeddings", "retrieval", "llm", "pipeline"]
for dir_name in required_dirs:
    print(f"  ✓ {dir_name}/" if os.path.exists(dir_name) else f"  ✗ {dir_name}/ MISSING")

### Step 2: Install Dependencies

In [None]:
# Install the packages listed in requirements.txt into the interpreter that is
# running this kernel (sys.executable), so the install targets the right env.
print("Installing dependencies...")
install_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", "requirements.txt", "--quiet"],
    check=False  # do not raise; the outcome is reported explicitly below
)

# Report the real outcome instead of claiming success unconditionally: with
# check=False a failed pip run would otherwise be announced as installed.
if install_result.returncode == 0:
    print("✓ Dependencies installed")
else:
    print(f"✗ Dependency installation failed (pip exit code {install_result.returncode})")

### Step 3: Verify Imports

In [None]:
# Try importing each third-party dependency and report the outcome per package.
# Failures are collected so the final line reflects what actually happened
# rather than always announcing success.
print("Testing imports...")

failed_imports = []

try:
    import pypdf
    print("  ✓ pypdf")
except ImportError as e:
    failed_imports.append("pypdf")
    print(f"  ✗ pypdf: {e}")

try:
    from sentence_transformers import SentenceTransformer
    print("  ✓ sentence-transformers")
except ImportError as e:
    failed_imports.append("sentence-transformers")
    print(f"  ✗ sentence-transformers: {e}")

try:
    import chromadb
    print("  ✓ chromadb")
except ImportError as e:
    failed_imports.append("chromadb")
    print(f"  ✗ chromadb: {e}")

try:
    import torch
    print(f"  ✓ torch (GPU available: {torch.cuda.is_available()})")
except ImportError as e:
    failed_imports.append("torch")
    print(f"  ✗ torch: {e}")

try:
    from transformers import AutoTokenizer, AutoModelForCausalLM
    print("  ✓ transformers")
except ImportError as e:
    failed_imports.append("transformers")
    print(f"  ✗ transformers: {e}")

# Summarise: only claim success when every import above actually worked.
if failed_imports:
    print(f"\n✗ {len(failed_imports)} import(s) failed: {', '.join(failed_imports)}")
else:
    print("\n✓ All imports successful")

---

## PDF Indexing Pipeline

### Step 4: Load & Parse PDFs

In [None]:
# Import ingestion modules
import sys
sys.path.insert(0, REPO_DIR)

from ingestion.pdf_loader import load_pdf
from ingestion.section_parser import assign_items
from ingestion.chunker import chunk_text

def _load_filing(pdf_path, company, document):
    """
    Load one filing and split it into chunks.

    Runs the three ingestion steps in order: load_pdf -> assign_items ->
    chunk_text.

    Args:
        pdf_path (str): Path to the PDF, relative to the working directory.
        company (str): Company label forwarded to load_pdf.
        document (str): Document label forwarded to load_pdf.

    Returns:
        tuple: (pages, chunks) as returned by assign_items and chunk_text.

    Raises:
        ValueError: If chunking produced no chunks, so the problem is reported
            here rather than as an IndexError when the sample is printed.
    """
    pages = assign_items(load_pdf(pdf_path, company=company, document=document))
    chunks = chunk_text(pages)
    if not chunks:
        raise ValueError(
            f"No chunks produced from {pdf_path}; "
            "check that the PDF contains extractable text"
        )
    return pages, chunks


print("Loading PDFs...")

# Load Apple 10-K
apple_pages, apple_chunks = _load_filing(
    "data/10-Q4-2024-As-Filed.pdf",
    company="Apple",
    document="Apple 10-K"
)

print(f"Apple 10-K: {len(apple_pages)} pages → {len(apple_chunks)} chunks")
print(f"  Sample metadata: {apple_chunks[0]['metadata']}")

# Load Tesla 10-K
tesla_pages, tesla_chunks = _load_filing(
    "data/tsla-20231231-gen.pdf",
    company="Tesla",
    document="Tesla 10-K"
)

print(f"\nTesla 10-K: {len(tesla_pages)} pages → {len(tesla_chunks)} chunks")
print(f"  Sample metadata: {tesla_chunks[0]['metadata']}")

print(f"\n✓ Total chunks: {len(apple_chunks) + len(tesla_chunks)}")

### Step 5: Generate Embeddings & Index

In [None]:
from embeddings.embedder import Embedder
from embeddings.vector_store import VectorStore

print("Initializing embedding model & vector store...")

# One embedder and one store (persisted under "vector_db") serve both filings.
embedder = Embedder()
vector_store = VectorStore(persist_dir="vector_db")

# Each company gets its own collection so queries can be routed per filing.
apple_collection = vector_store.get_or_create_collection("apple_10k")
tesla_collection = vector_store.get_or_create_collection("tesla_10k")

print("✓ Collections created\n\nGenerating Apple embeddings...")

# Apple: embed the text of every chunk, then store chunks with their vectors.
apple_texts = [chunk["text"] for chunk in apple_chunks]
apple_embeddings = embedder.embed_texts(apple_texts)
vector_store.add_chunks(apple_collection, apple_chunks, apple_embeddings)

print(f"✓ Apple indexed: {len(apple_chunks)} chunks\n\nGenerating Tesla embeddings...")

# Tesla: same two steps against the Tesla collection.
tesla_texts = [chunk["text"] for chunk in tesla_chunks]
tesla_embeddings = embedder.embed_texts(tesla_texts)
vector_store.add_chunks(tesla_collection, tesla_chunks, tesla_embeddings)

print(f"✓ Tesla indexed: {len(tesla_chunks)} chunks\n\n✓ Indexing complete! Ready for inference.")

---

## Inference & Evaluation

Run the RAG pipeline on all 13 test questions.

### Step 6: Test Question Set

In [None]:
import json

# The assignment's 13 test questions, in order; IDs are assigned 1..13 below.
_QUESTION_TEXTS = [
    "What was Apple's total revenue for the fiscal year ended September 28, 2024?",
    "How many shares of common stock were issued and outstanding as of October 18, 2024?",
    "What is the total amount of term debt (current + non-current) reported by Apple as of September 28, 2024?",
    "On what date was Apple's 10-K report for 2024 signed and filed with the SEC?",
    "Does Apple have any unresolved staff comments from the SEC as of this filing? How do you know?",
    "What was Tesla's total revenue for the year ended December 31, 2023?",
    "What percentage of Tesla's total revenue in 2023 came from Automotive Sales (excluding Leasing)?",
    "What is the primary reason Tesla states for being highly dependent on Elon Musk?",
    "What types of vehicles does Tesla currently produce and deliver?",
    "What is the purpose of Tesla's 'lease pass-through fund arrangements'?",
    "What is Tesla's stock price forecast for 2025?",
    "Who is the CFO of Apple as of 2025?",
    "What color is Tesla's headquarters painted?",
]

test_questions = [
    {"question_id": number, "question": text}
    for number, text in enumerate(_QUESTION_TEXTS, start=1)
]

# Preview: show the first three questions (truncated) and count the rest.
print(f"Loaded {len(test_questions)} test questions")
for q in test_questions[:3]:
    print(f"  Q{q['question_id']}: {q['question'][:60]}...")
print(f"  ... and {len(test_questions) - 3} more")

### Step 7: Run RAG Pipeline

In [None]:
from pipeline.rag_pipeline import answer_question

print("Running RAG pipeline on test questions...\n")

# Routing table: lower-case company keyword -> collection holding that filing.
# Insertion order doubles as the search order when several collections apply.
combined_collection = {
    "apple": apple_collection,
    "tesla": tesla_collection
}


def answer_with_routing(query):
    """
    Answer a question using the collection(s) of the company it refers to.

    Routing is a case-insensitive substring match of the keys of
    `combined_collection` against the question text:

    * exactly one company named: that collection is queried and the pipeline
      result is returned as-is;
    * several companies named: the matching collections are tried in table
      order and the first non-refusal is returned (otherwise the last result);
    * no company named: every collection is tried in table order instead of
      refusing outright. Test question 2 asks about shares outstanding without
      naming a company and would otherwise never reach retrieval.

    Args:
        query (str): Natural-language question.

    Returns:
        dict: {"answer": str, "sources": list}. When no company is named and
        every collection refuses, the standard refusal with no sources.
    """
    q_lower = query.lower()
    named = [
        collection
        for keyword, collection in combined_collection.items()
        if keyword in q_lower
    ]

    result = None
    for collection in named or list(combined_collection.values()):
        result = answer_question(query, collection)
        # A refusal is recognised by the same phrase the evaluation summary uses.
        if "cannot be answered" not in result["answer"].lower():
            return result

    if named and result is not None:
        # The question named a company: surface the pipeline's own response.
        return result

    return {
        "answer": "This question cannot be answered based on the provided documents.",
        "sources": []
    }


# Run on all questions
results = []

for q in test_questions:
    qid = q["question_id"]
    query = q["question"]

    result = answer_with_routing(query)

    # Keep only the fields required in the exported results file.
    output = {
        "question_id": qid,
        "answer": result["answer"],
        "sources": result["sources"]
    }

    results.append(output)

    print(f"Q{qid}: {query[:70]}...")
    print(f"  Answer: {result['answer'][:80]}...")
    print(f"  Sources: {result['sources']}\n")

# Report the real count rather than a hard-coded number.
print(f"✓ All {len(results)} questions processed")

### Step 8: Export Results

In [None]:
# Save results to JSON
output_file = "results.json"

# Explicit encoding keeps the file identical across platforms.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print(f"✓ Results saved to {output_file}")

# Display summary
print("\n" + "="*60)
print("EVALUATION SUMMARY")
print("="*60)

# Split once into answered / refused; a refusal is any answer containing the
# phrase "cannot be answered" (case-insensitive).
answered_results = [r for r in results if "cannot be answered" not in r["answer"].lower()]
refused_results = [r for r in results if "cannot be answered" in r["answer"].lower()]

answered = len(answered_results)
refused = len(refused_results)
total = len(results)  # denominator follows the actual run, not a fixed 13

print(f"\nQuestions Answered: {answered}/{total}")
print(f"Questions Refused: {refused}/{total}")

print("\nAnswered:")
for r in answered_results:
    print(f"  Q{r['question_id']}: {r['answer'][:60]}... [{r['sources']}]")

print("\nRefused (Out-of-Scope):")
for r in refused_results:
    print(f"  Q{r['question_id']}: {r['answer'][:60]}...")

print("\n" + "="*60)

---

## Interactive Query Mode

Ask custom questions about the 10-K filings.

In [None]:
# Custom query function
def query_rag(question):
    """
    Ask the RAG system a free-form question.

    Thin convenience wrapper that forwards the question to
    `answer_with_routing` and hands back its result unchanged.

    Args:
        question (str): Question about the Apple or Tesla 10-K.

    Returns:
        dict: {"answer": str, "sources": list}
    """
    return answer_with_routing(question)

# Demonstrate query_rag on two sample questions, one per company.
print("Custom Query Examples:")
print("=" * 60)

custom_queries = ["What are Apple's main business segments?", "What risks does Tesla face?"]

for query in custom_queries:
    result = query_rag(query)
    print(f"Q: {query}")
    print(f"A: {result['answer']}")
    # Trailing newline produces the blank separator line between examples.
    print(f"Sources: {result['sources']}\n")

---

## Download Results

Download `results.json` to your machine (Colab only; in local mode the file is already saved in the working directory).

In [None]:
# In Colab the results file is offered as a browser download; locally it is
# already on disk, so the cell only says where it is.
if not IN_COLAB:
    print("Local mode: results saved to ./results.json")
else:
    from google.colab import files
    print("Downloading results.json...")
    files.download("results.json")
    print("✓ Download started")