1. Load and chunk the Wikipedia narrative documents.

In [7]:
import os
import json

# -----------------------------------
# Step 1: Chunking Function (~300 words with overlap)
# -----------------------------------
def chunk_text_fixed(text, chunk_size=300, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: The string to split. Words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, each built by joining its words with a
        single space. An empty or whitespace-only ``text`` yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, every later window would
        # hold only words already contained in this chunk, so stop here
        # instead of emitting a redundant tail chunk.
        if start + chunk_size >= len(words):
            break
    return chunks


# -----------------------------------
# Step 2: Main Load + Chunk + Print + Save Function
# -----------------------------------
def load_and_chunk_wikipedia(folder_path, output_path, chunk_size=300, overlap=50):
    """Chunk every ``.txt`` file in a folder and save the chunks as JSONL.

    Each file is read as UTF-8 and split with ``chunk_text_fixed``. Every
    chunk becomes a dict with the keys ``chunk_id`` (``<doc_title>_NNN``,
    numbered from 001 within its document), ``doc_title`` (the file name
    without its ``.txt`` suffix) and ``text``. The first three chunks are
    printed as indented JSON, and all chunks are written to ``output_path``,
    one JSON object per line.

    Args:
        folder_path: Directory containing the ``.txt`` source documents.
        output_path: Destination JSONL file. Its parent directory is created
            if it does not exist.
        chunk_size: Passed through to ``chunk_text_fixed``.
        overlap: Passed through to ``chunk_text_fixed``.

    Returns:
        The list of chunk dicts, in sorted file-name order.
    """
    suffix = '.txt'
    all_chunks = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # chunk order (and therefore the saved file) reproducible across runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(suffix):
            continue

        # Strip only the trailing extension; str.replace would also remove
        # any '.txt' occurring earlier in the file name.
        doc_title = filename[:-len(suffix)]
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
            text = f.read()

        doc_chunks = chunk_text_fixed(text, chunk_size=chunk_size, overlap=overlap)
        for number, chunk in enumerate(doc_chunks, start=1):
            all_chunks.append({
                'chunk_id': f"{doc_title}_{str(number).zfill(3)}",
                'doc_title': doc_title,
                'text': chunk
            })

    # --- Print First 3 Chunks as JSON ---
    print("\n--- Example Chunks (JSON) ---")
    for chunk in all_chunks[:3]:
        print(json.dumps(chunk, indent=2))

    # --- Save All Chunks to JSONL ---
    # dirname() is '' for a bare file name, and os.makedirs('') raises, so
    # only create the directory when the path actually names one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as out_file:
        for chunk in all_chunks:
            out_file.write(json.dumps(chunk) + '\n')

    print(f"\n‚úÖ Saved {len(all_chunks)} chunks to {output_path}")

    return all_chunks


# -----------------------------------
# Step 3: Setup Clean Project Paths (portable)
# -----------------------------------
# Assumes notebook is in /notebooks and data is in /data/narratives
# The project root is taken to be the parent of the current working directory.
NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.normpath(os.path.join(NOTEBOOK_DIR, os.pardir))

# Input documents and the JSONL output both live under <project root>/data.
NARRATIVE_DIR = os.path.join(PROJECT_ROOT, 'data', 'narratives')
CHUNKS_OUT_PATH = os.path.join(PROJECT_ROOT, 'data', 'chunks.jsonl')

# --- Run It ---
chunks = load_and_chunk_wikipedia(NARRATIVE_DIR, CHUNKS_OUT_PATH)



--- Example Chunks (JSON) ---
{
  "chunk_id": "2020_philadelphia_eagles_season_001",
  "doc_title": "2020_philadelphia_eagles_season",
  "text": "The 2020 season was the Philadelphia Eagles' 88th in the National Football League (NFL) and their fifth and final under head coach Doug Pederson. They failed to improve on their 9\u20137 record from the previous season following a 23\u201317 loss to the Seattle Seahawks in Week 12. They were eliminated from playoff contention for the first time since 2016 following a Week 16 loss to the Dallas Cowboys and finished with a dismal 4\u201311\u20131 record, the second-worst in the National Football Conference (NFC), and their worst since 2012. After starting 3\u20134\u20131 heading into their bye week and leading the NFC East, the Eagles would lose 7 of their last 8 games. Injuries and poor quarterback play were factors in their struggles in the season. On January 11, 2021, the Eagles announced head coach Doug Pederson would not return after the 

2. Embed the corpus and build the FAISS index

In [None]:
#! pip install -U sentence-transformers
#! pip install faiss-cpu

import os
import json
import numpy as np
import faiss
import requests
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# -----------------------------------
# Path Setup (local)
# -----------------------------------
NOTEBOOK_DIR = os.getcwd()
# The chunking step writes chunks.jsonl under the *parent* of the working
# directory (notebook in /notebooks, data in /data), so using the working
# directory unconditionally looks in the wrong place when run from the notebook
# folder. Prefer the working directory when it holds a 'data' folder (running
# from the project root), otherwise try its parent; fall back to the working
# directory if neither has one.
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (
            os.path.abspath(NOTEBOOK_DIR),
            os.path.abspath(os.path.join(NOTEBOOK_DIR, '..')),
        )
        if os.path.isdir(os.path.join(candidate, 'data'))
    ),
    os.path.abspath(NOTEBOOK_DIR),
)

# Inputs and outputs of the embedding step, all under <project root>/data.
CHUNKS_JSONL_PATH = os.path.join(PROJECT_ROOT, 'data', 'chunks.jsonl')
FAISS_INDEX_PATH = os.path.join(PROJECT_ROOT, 'data', 'narrative_index.faiss')
METADATA_JSON_PATH = os.path.join(PROJECT_ROOT, 'data', 'narrative_metadata.json')

# -----------------------------------
# Load Chunks from JSONL
# -----------------------------------
def load_chunks_from_jsonl(filepath):
    """Read a JSON Lines file and return its records as a list.

    Args:
        filepath: Path to a UTF-8 text file holding one JSON value per line.

    Returns:
        A list with the decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) are not valid JSON and
        # would make json.loads raise, so skip them.
        return [json.loads(line) for line in f if line.strip()]

# -----------------------------------
# Embed Chunks using Sentence-Transformers
# -----------------------------------
def embed_chunks(chunks, model_name='all-MiniLM-L6-v2'):
    """Attach an embedding to every chunk dict.

    The 'text' of each chunk is encoded with the SentenceTransformer model
    named by ``model_name`` (batch size 32, progress bar shown). Each vector
    is converted with ``tolist()`` and stored under the chunk's 'embedding'
    key. The chunk dicts are modified in place and the same ``chunks`` object
    is returned.
    """
    encoder = SentenceTransformer(model_name)
    corpus = [item['text'] for item in chunks]
    print(f"üîÅ Embedding {len(corpus)} chunks...")
    vectors = encoder.encode(corpus, show_progress_bar=True, batch_size=32)

    # Vectors come back in input order, so pair them with their chunks.
    for item, vector in zip(chunks, vectors):
        item['embedding'] = vector.tolist()
    return chunks

# -----------------------------------
# Build FAISS Index and Save Metadata
# -----------------------------------
def build_faiss_index(embedded_chunks, index_path, metadata_path):
    """Build a FAISS index from embedded chunks and save it with its metadata.

    The 'embedding' of every chunk is stacked into a float32 matrix and added
    to a ``faiss.IndexFlatL2`` index, which is written to ``index_path``. A
    JSON list holding each chunk's 'chunk_id', 'doc_title' and 'text' is
    written to ``metadata_path`` in the same order as the vectors, so a row
    number returned by the index can be used to look up its metadata entry.

    Args:
        embedded_chunks: Non-empty list of chunk dicts that each have the
            keys 'embedding', 'chunk_id', 'doc_title' and 'text'.
        index_path: Destination file for the FAISS index.
        metadata_path: Destination file for the JSON metadata.

    Raises:
        ValueError: If ``embedded_chunks`` is empty.
    """
    # An empty list yields a 1-D array with no second dimension, which would
    # otherwise fail below with an unhelpful IndexError on shape[1].
    if not embedded_chunks:
        raise ValueError("embedded_chunks is empty; nothing to index")

    embeddings = np.array([chunk['embedding'] for chunk in embedded_chunks]).astype('float32')
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # dirname() is '' for a bare file name and os.makedirs('') raises, so
    # only create directories that are actually named in the paths.
    for target in (index_path, metadata_path):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

    faiss.write_index(index, index_path)
    print(f"‚úÖ FAISS index saved to {index_path}")

    metadata = [
        {
            'chunk_id': chunk['chunk_id'],
            'doc_title': chunk['doc_title'],
            'text': chunk['text']
        }
        for chunk in embedded_chunks
    ]
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)
    print(f"‚úÖ Metadata saved to {metadata_path}")

# -----------------------------------
# Retrieval Function
# -----------------------------------
def retrieve_narrative_chunks(question, index_path, metadata_path, model_name='all-MiniLM-L6-v2', top_k=3):
    """Return the metadata of the chunks nearest to ``question``.

    Loads the FAISS index and the JSON metadata list from disk, embeds the
    question with the named SentenceTransformer model, searches the index
    for the ``top_k`` nearest vectors and prints a preview (first 500
    characters) of each hit.

    Args:
        question: The query text.
        index_path: Path of the saved FAISS index.
        metadata_path: Path of the JSON metadata list; entry ``i`` is expected
            to describe vector ``i`` of the index.
        model_name: SentenceTransformer model used to embed the question.
        top_k: Number of neighbours to request.

    Returns:
        A list of metadata dicts in the order returned by the index search.
        It holds fewer than ``top_k`` entries when the index cannot supply
        that many neighbours.
    """
    print(f"üîç Retrieving top {top_k} chunks for question: {question}")

    # Load FAISS index
    index = faiss.read_index(index_path)

    # Load metadata
    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    # Embed the question
    model = SentenceTransformer(model_name)
    query_vector = model.encode([question]).astype('float32')

    # Search FAISS
    _distances, indices = index.search(query_vector, top_k)
    # FAISS pads missing neighbours with label -1 (e.g. when top_k exceeds
    # the number of indexed vectors). Used as a list index, -1 would silently
    # return the last metadata entry, so drop anything outside the list.
    results = [metadata[i] for i in indices[0] if 0 <= i < len(metadata)]

    print("\nüìÑ Top Retrieved Chunks:\n")
    for i, r in enumerate(results):
        print(f"[{i+1}] ({r['chunk_id']}) from {r['doc_title']}")
        print(r['text'][:500] + "\n---\n")

    return results

# -----------------------------------
# Use Ollama + Mistral to Answer
# -----------------------------------
def answer_with_mistral(question, retrieved_chunks):
    """Ask a local Ollama 'mistral' model to answer using retrieved chunks.

    The 'text' of every retrieved chunk is joined with blank lines into a
    context block, placed in a prompt together with the question, and posted
    to the Ollama generate endpoint on localhost:11434. The answer is printed
    and returned.

    Args:
        question: The question to answer.
        retrieved_chunks: Iterable of dicts that each have a 'text' key.

    Returns:
        The value of the 'response' field of the JSON reply.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    context = "\n\n".join(chunk["text"] for chunk in retrieved_chunks)

    prompt = f"""Answer the question based on the following Eagles season narratives:

{context}

Question: {question}
Answer:"""

    response = requests.post(
        "http://localhost:11434/api/generate",
        # The endpoint streams newline-delimited JSON objects by default,
        # which response.json() cannot parse; request one complete reply.
        json={"model": "mistral", "prompt": prompt, "stream": False}
    )
    # Surface HTTP failures (e.g. model not available) directly instead of
    # failing later with a KeyError on the missing 'response' field.
    response.raise_for_status()
    output = response.json()["response"]
    print("\nü§ñ Mistral's Answer:\n")
    print(output)
    return output

# -----------------------------------
# Run It All
# -----------------------------------
# Driver: embed the saved chunks, build and save the index, then run one
# sample question through retrieval and answer generation.
if __name__ == '__main__':
    # Load the chunk records, add an embedding to each, and write the FAISS
    # index plus its metadata file.
    chunks = load_chunks_from_jsonl(CHUNKS_JSONL_PATH)
    embedded_chunks = embed_chunks(chunks)
    build_faiss_index(embedded_chunks, index_path=FAISS_INDEX_PATH, metadata_path=METADATA_JSON_PATH)

    # Try a test query against the index that was just written.
    question = "What happened in the 2022 Eagles season?"
    retrieved = retrieve_narrative_chunks(question, FAISS_INDEX_PATH, METADATA_JSON_PATH)

    # Let Mistral answer using the retrieved chunks as context.
    answer_with_mistral(question, retrieved)

FileNotFoundError: [Errno 2] No such file or directory: '/data/chunks.jsonl'