In [11]:
!git clone https://github.com/leeskelton8383/EaglesGPT.git
%cd EaglesGPT


Cloning into 'EaglesGPT'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 32 (delta 7), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (32/32), 2.51 MiB | 5.68 MiB/s, done.
Resolving deltas: 100% (7/7), done.
/content/EaglesGPT/EaglesGPT/EaglesGPT


1. Load and Chunk the wikipedia narrative docs.

In [13]:
import os
import json


# -----------------------------------
# Step 1: Chunking Function
# -----------------------------------
def chunk_text_fixed(text, chunk_size=300, overlap=50):
    """
    Splits text into overlapping chunks.
    Each chunk has `chunk_size` words, with `overlap` words shared with the next chunk.
    """
    words = text.split()
    chunks = []
    step = chunk_size - overlap

    if step <= 0:
        raise ValueError("chunk_size must be greater than overlap")

    for i in range(0, len(words), step):
        chunk = " ".join(words[i:i+chunk_size])
        chunks.append(chunk)

    return chunks

# -----------------------------------
# Step 2: Load .txt Files, Chunk, Print Examples, Save as JSONL
# -----------------------------------
def load_and_chunk_wikipedia(folder_path, output_path, chunk_size=300, overlap=50):
    all_chunks = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            full_path = os.path.join(folder_path, filename)

            with open(full_path, 'r', encoding='utf-8') as f:
                text = f.read()
                doc_title = filename.replace('.txt', '')

                chunks = chunk_text_fixed(text, chunk_size=chunk_size, overlap=overlap)

                for i, chunk in enumerate(chunks):
                    chunk_id = f"{doc_title}_{str(i+1).zfill(3)}"
                    all_chunks.append({
                        'chunk_id': chunk_id,
                        'doc_title': doc_title,
                        'text': chunk
                    })

    # --- Print first 3 chunks for inspection ---
    print("\n--- Example Chunks (JSON) ---")
    for chunk in all_chunks[:3]:
        print(json.dumps(chunk, indent=2))

    # --- Save all chunks to JSONL ---
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as out_file:
        for chunk in all_chunks:
            out_file.write(json.dumps(chunk) + '\n')

    print(f"\n‚úÖ Saved {len(all_chunks)} chunks to {output_path}")
    return all_chunks

# -----------------------------------
# Step 3: Setup Clean Project Paths (portable)
# -----------------------------------
NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR, '..'))

NARRATIVE_DIR = os.path.join(PROJECT_ROOT, 'data', 'narratives')
CHUNKS_OUT_PATH = os.path.join(PROJECT_ROOT, 'data', 'chunks.jsonl')

# --- Run It ---
chunks = load_and_chunk_wikipedia(folder_path=NARRATIVE_DIR, output_path=CHUNKS_OUT_PATH)



--- Example Chunks (JSON) ---
{
  "chunk_id": "2022_philadelphia_eagles_season_001",
  "doc_title": "2022_philadelphia_eagles_season",
  "text": "The 2022 season was the Philadelphia Eagles' 90th in the National Football League (NFL), their 20th playing home games at Lincoln Financial Field, their second under head coach Nick Sirianni, and seventh under general manager Howie Roseman since he returned to the position (twelfth overall). The Eagles started 8\u20130 for the first time in franchise history, before their winning streak was snapped with a 32\u201321 upset loss to the Washington Commanders in Week 10. They improved on their 9\u20138 record from last year after a 40\u201333 victory over the Green Bay Packers in Week 12. With a 48\u201322 road win over the division rival New York Giants in Week 14, the Eagles clinched their second straight playoff berth, and their fifth in six seasons. After a 25\u201320 road win over the Chicago Bears in Week 15, the Eagles matched their franc

2. Embed the corpus and index

In [24]:
#! pip install -U sentence-transformers
#! pip install faiss-cpu
!pip install -U bitsandbytes

import os
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm

# -----------------------------------
# Path Setup (works in Colab or local)
# -----------------------------------
NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR))  # assuming /content/EaglesGPT in Colab

CHUNKS_JSONL_PATH = os.path.join(PROJECT_ROOT, 'data', 'chunks.jsonl')
FAISS_INDEX_PATH = os.path.join(PROJECT_ROOT, 'data', 'narrative_index.faiss')
METADATA_JSON_PATH = os.path.join(PROJECT_ROOT, 'data', 'narrative_metadata.json')

# -----------------------------------
# Load Chunks
# -----------------------------------
def load_chunks_from_jsonl(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]

# -----------------------------------
# Embed Chunks
# -----------------------------------
def embed_chunks(chunks, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    texts = [chunk['text'] for chunk in chunks]
    print(f"üîÅ Embedding {len(texts)} chunks...")
    embeddings = model.encode(texts, show_progress_bar=True, batch_size=32)
    for i, emb in enumerate(embeddings):
        chunks[i]['embedding'] = emb.tolist()
    return chunks

# -----------------------------------
# Build FAISS Index
# -----------------------------------
def build_faiss_index(embedded_chunks, index_path, metadata_path):
    embeddings = np.array([c['embedding'] for c in embedded_chunks]).astype('float32')
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    os.makedirs(os.path.dirname(index_path), exist_ok=True)
    faiss.write_index(index, index_path)
    print(f"‚úÖ FAISS index saved to {index_path}")
    metadata = [{k: c[k] for k in ['chunk_id', 'doc_title', 'text']} for c in embedded_chunks]
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)
    print(f"‚úÖ Metadata saved to {metadata_path}")

# -----------------------------------
# Retrieve Chunks
# -----------------------------------
def retrieve_narrative_chunks(question, index_path, metadata_path, model_name='all-MiniLM-L6-v2', top_k=3):
    print(f"\nüîç Retrieving top {top_k} chunks for question: {question}")
    index = faiss.read_index(index_path)
    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    model = SentenceTransformer(model_name)
    query_vec = model.encode([question]).astype('float32')
    distances, indices = index.search(query_vec, top_k)
    results = [metadata[i] for i in indices[0]]
    for i, r in enumerate(results):
        print(f"[{i+1}] ({r['chunk_id']}) from {r['doc_title']}")
        print(r['text'][:400], "\n---\n")
    return results

# -----------------------------------
# Use HuggingFace Mistral to Answer
# -----------------------------------
def answer_with_mistral(question, retrieved_chunks):
    context = "\n\n".join(chunk["text"] for chunk in retrieved_chunks)
    prompt = f"""Answer the question based on the following Eagles season narratives:

{context}

Question: {question}
Answer:"""

    # Load once
    if not hasattr(answer_with_mistral, "model"):
        print("üì¶ Loading Mistral...")
        model_name = "mistralai/Mistral-7B-Instruct-v0.2"
        answer_with_mistral.tokenizer = AutoTokenizer.from_pretrained(model_name)
        answer_with_mistral.model = AutoModelForCausalLM.from_pretrained(
            model_name, device_map="auto"
        )

    tokenizer = answer_with_mistral.tokenizer
    model = answer_with_mistral.model
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=300)
    result = tokenizer.decode(output[0], skip_special_tokens=True)

    print("\nü§ñ Mistral's Answer:\n")
    print(result.split("Answer:")[-1].strip())
    return result



In [None]:


question = "What happened in the 2022 Eagles season?"
retrieved = retrieve_narrative_chunks(question, FAISS_INDEX_PATH, METADATA_JSON_PATH)
answer= answer_with_mistral(question,retrieved)


üîç Retrieving top 3 chunks for question: What happened in the 2022 Eagles season?
[1] (2022_philadelphia_eagles_season_001) from 2022_philadelphia_eagles_season
The 2022 season was the Philadelphia Eagles' 90th in the National Football League (NFL), their 20th playing home games at Lincoln Financial Field, their second under head coach Nick Sirianni, and seventh under general manager Howie Roseman since he returned to the position (twelfth overall). The Eagles started 8‚Äì0 for the first time in franchise history, before their winning streak was snapped wit 
---

[2] (2020_philadelphia_eagles_season_001) from 2020_philadelphia_eagles_season
The 2020 season was the Philadelphia Eagles' 88th in the National Football League (NFL) and their fifth and final under head coach Doug Pederson. They failed to improve on their 9‚Äì7 record from the previous season following a 23‚Äì17 loss to the Seattle Seahawks in Week 12. They were eliminated from playoff contention for the first time since 2

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
