In [13]:
!pip install -q sentence-transformers transformers accelerate bitsandbytes
!pip install -q huggingface_hub
import getpass
from huggingface_hub import login
hf_token = getpass.getpass("Enter your Hugging Face token:")
login(token=hf_token)

Enter your Hugging Face token:··········


We want the model to return the Larry Fitzgerald line in response to the question: "Who had the most consistent hands in NFL history?" We will step through a few methods that return the wrong answer and finish with a method that works.

This first method uses only a bi-encoder to measure semantic similarity. It incorrectly returns DeSean Jackson because it does not understand what "most consistent hands" means.

In [21]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# --- Corpus: four one-sentence NFL facts the retriever can choose from ---
documents = [
    "Larry Fitzgerald had a 29-game streak with no dropped passes from 2016 to 2018.",
    "DeSean Jackson led the NFL in yards per reception in 2009, with five touchdowns over 50 yards.",
    "The 2024 Eagles forced 25 turnovers and had the league's best turnover differential.",
    "Tom Brady threw 50 touchdown passes in the 2007 season, setting a new record at the time."
]

# --- Question to answer ---
query = "Who had the most consistent hands in NFL history?"

# --- Bi-encoder retrieval ---
# The documents and the query are embedded separately by the same model and
# compared afterwards with cosine similarity.
bi_encoder = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = bi_encoder.encode(documents, convert_to_tensor=True)
query_embedding = bi_encoder.encode(query, convert_to_tensor=True)

# Row 0 of the similarity matrix holds this single query's score per document.
cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0]

# argsort is ascending, so reverse it before truncating to the top_k best scores.
top_k = 3
ranked_idx = cosine_scores.cpu().numpy().argsort()[::-1]
top_k_idx = ranked_idx[:top_k]
top_k_docs = [documents[doc_idx] for doc_idx in top_k_idx]

# --- Report the single best bi-encoder hit ---
best_idx = 0  # position 0 of the ranked list is the highest-scoring document
best_doc = top_k_docs[best_idx]
top_score = float(cosine_scores[top_k_idx[best_idx]])

print("Best match:", best_doc)
print("Cosine similarity score:", round(top_score, 4))

Best match: DeSean Jackson led the NFL in yards per reception in 2009, with five touchdowns over 50 yards.
Cosine similarity score: 0.5012


Now we will add a cross-encoder. The bi-encoder quickly retrieves semantically similar candidates, and the cross-encoder re-ranks them in the hope that the top result better matches the question.

In this case it still fails, however, and returns DeSean Jackson again.

In [20]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import numpy as np

# Candidate passages (same corpus as the bi-encoder-only experiment)
documents = [
    "Larry Fitzgerald had a 29-game streak with no dropped passes from 2016 to 2018.",
    "DeSean Jackson led the NFL in yards per reception in 2009, with five touchdowns over 50 yards.",
    "The 2024 Eagles forced 25 turnovers and had the league's best turnover differential.",
    "Tom Brady threw 50 touchdown passes in the 2007 season, setting a new record at the time."
]

query = "Who had the most consistent hands in NFL history?"

# Stage 1: cheap candidate retrieval with the bi-encoder
bi_encoder = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = bi_encoder.encode(documents, convert_to_tensor=True)
query_embedding = bi_encoder.encode(query, convert_to_tensor=True)

cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0]

# top_k exceeds the corpus size here; the slice simply keeps every document,
# ordered from most to least similar.
top_k = 10
descending_idx = cosine_scores.cpu().numpy().argsort()[::-1]
top_k_idx = descending_idx[:top_k]
top_k_docs = [documents[doc_idx] for doc_idx in top_k_idx]

# Stage 2: the cross-encoder scores each (query, candidate) pair jointly
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
rerank_pairs = [[query, candidate] for candidate in top_k_docs]
rerank_scores = cross_encoder.predict(rerank_pairs)

# The candidate with the highest cross-encoder score is the final answer.
best_idx = int(np.argmax(rerank_scores))
best_doc = top_k_docs[best_idx]

print("Best match:", best_doc)
print("Re-ranked score:", rerank_scores[best_idx])


Best match: DeSean Jackson led the NFL in yards per reception in 2009, with five touchdowns over 50 yards.
Re-ranked score: -5.93768


Now we will try an LLM-first method, letting the LLM reason over the retrieved text to determine the best answer. This first step embeds the documents and retrieves the top-k candidates.


In [25]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# --- Knowledge base the LLM will be grounded on ---
documents = [
    "Larry Fitzgerald had a 29-game streak with no dropped passes from 2016 to 2018.",
    "DeSean Jackson led the NFL in yards per reception in 2009, with five touchdowns over 50 yards.",
    "The 2024 Eagles forced 25 turnovers and had the league's best turnover differential.",
    "Tom Brady threw 50 touchdown passes in the 2007 season, setting a new record at the time."
]

# --- User question ---
query = "Who had the most consistent hands in NFL history?"

# --- Embed the corpus and the question, then rank by cosine similarity ---
bi_encoder = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = bi_encoder.encode(documents, convert_to_tensor=True)
query_embedding = bi_encoder.encode(query, convert_to_tensor=True)

cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0]

# With top_k larger than the corpus, every document is kept; only the ordering
# (most similar first) is decided here.
top_k = 10
descending_idx = cosine_scores.cpu().numpy().argsort()[::-1]
top_k_idx = descending_idx[:top_k]

# `retrieved_docs` is what the generation cell joins into the prompt context.
retrieved_docs = [documents[doc_idx] for doc_idx in top_k_idx]

This step passes the retrieved documents to the LLM so that it can reason over them and arrive at the right answer.

In [29]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# === Load Open LLM (Falcon/Mistral works here) ===
# Requires `retrieved_docs` and `query` from the retrieval cell above.
model_id = "tiiuae/falcon-7b-instruct"  # swap with Mistral if you have access
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
    # NOTE(review): trust_remote_code lets Python code shipped in the model repo
    # run locally. Keep it only for repos you trust; confirm whether the
    # installed transformers version still needs it for this model.
    trust_remote_code=True
)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# === Format Prompt ===
# One retrieved document per line, in retrieval order.
context = "\n".join(retrieved_docs)
prompt = f"""You are a football expert. Use only the context below to answer the question. If the answer is not in the context, say "Not in context."

Context:
{context}

Question: {query}
Answer:"""

# === Generate ===
# return_full_text=False drops the echoed prompt so only the model's answer is
# printed; do_sample=False pins greedy decoding so re-runs give the same text.
generation = pipe(
    prompt,
    max_new_tokens=100,
    do_sample=False,
    return_full_text=False,
)
response = generation[0]['generated_text'].strip()
print(response)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

FileNotFoundError: [Errno 2] No such file or directory: 'LLMRAG.ipynb'