<a href="https://colab.research.google.com/github/khawar-khan520/nlp_project/blob/main/retrieval_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install and Import Libraries:

In [5]:
!mkdir -p baseline/data baseline/generator baseline/retriever


In [None]:
!pip install openai sentence-transformers faiss-cpu hf_xet

In [6]:
from google.colab import files
uploaded = files.upload()


Saving winnie_the_pooh.txt to winnie_the_pooh (1).txt


In [7]:
!mv winnie_the_pooh.txt baseline/data/


Load and Chunk your Document:

In [8]:

with open('winnie_the_pooh.txt', 'r') as file:
    # Read the entire content of the file into a string
    text = file.read()

chunks = [text[i:i+200] for i in range(0, len(text), 200)]

FileNotFoundError: [Errno 2] No such file or directory: 'winnie_the_pooh.txt'

Generate Embeddings with SenteceTransformers:

In [9]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(chunks)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

NameError: name 'chunks' is not defined

Store Embeddings in a FAISS Index for Similarity Search:

In [None]:
import faiss
import numpy as np

index = faiss.IndexFlatL2(embeddings[0].shape[0])
index.add(np.array(embeddings))

# Search
query = "Who is always sad?"
query_embedding = model.encode([query])
D, I = index.search(np.array(query_embedding), k=3)

In [None]:
for i in I[0]:
    print(chunks[i])
    print("....")

Build the Prompt from Retrieved Chunks:

In [None]:

retrieved_chunks = [chunks[i] for i in I[0]]

# Format the prompt
context = "\n\n".join(retrieved_chunks)
#query = "What is the capital of France?"

prompt = f"""You are a helpful assistant. Use the following context to answer the question.

Context:
{context}

Question:
{query}

Answer:"""

print(prompt)

Generate an Answer Using a Lightweight Language Model:

In [None]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load a small, instruction-tuned model
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Build prompt from chunks
retrieved_chunks = [chunks[i] for i in I[0]]
context = "\n\n".join(retrieved_chunks)


# Simple instruction-style prompt for T5
prompt = f"Answer the question based on the context.\n\nContext:\n{context}\n\nQuestion:\n{query}"

# Tokenize input
inputs = tokenizer(prompt, return_tensors="pt", truncation=True)

# Generate output
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=100)

# Decode and print
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Answer:", answer)

## Task 2: Retriever Class with FAISS"

In [None]:
!pip install sentence-transformers faiss-cpu


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os
import pickle

class Retriever:
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.documents = []
        self.embeddings = None

    def chunk_text(self, text, chunk_size=200, overlap=50):
        chunks = []
        for i in range(0, len(text), chunk_size - overlap):
            chunk = text[i:i + chunk_size]
            chunks.append(chunk)
        return chunks

    def add_documents(self, texts):
        chunks = []
        for text in texts:
            chunks.extend(self.chunk_text(text))
        self.documents.extend(chunks)
        embeddings = self.model.encode(chunks, show_progress_bar=True)
        self.embeddings = np.array(embeddings).astype("float32")
        self.index = faiss.IndexFlatL2(self.embeddings.shape[1])
        self.index.add(self.embeddings)

    def query(self, question, top_k=3):
        query_embedding = self.model.encode([question]).astype("float32")
        D, I = self.index.search(query_embedding, top_k)
        return [self.documents[i] for i in I[0]]

    def save(self, path="retriever_data"):
        os.makedirs(path, exist_ok=True)
        faiss.write_index(self.index, os.path.join(path, "index.faiss"))
        with open(os.path.join(path, "documents.pkl"), "wb") as f:
            pickle.dump(self.documents, f)

    def load(self, path="retriever_data"):
        self.index = faiss.read_index(os.path.join(path, "index.faiss"))
        with open(os.path.join(path, "documents.pkl"), "rb") as f:
            self.documents = pickle.load(f)



In [None]:
def load_txt(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        return f.read()



In [None]:
text = load_txt("winnie_the_pooh.txt")
retriever = Retriever()
retriever.add_documents([text])


In [None]:
query = "What is the main idea of the document?"
results = retriever.query(query)
for i, r in enumerate(results, 1):
    print(f"Result {i}:\n{r}\n")


In [None]:
retriever.save("my_retriever")
retriever.load("my_retriever")


In [None]:
def test_retriever():
    doc = "This is a test document about AI and NLP."
    retriever = Retriever()
    retriever.add_documents([doc])
    result = retriever.query("What is it about?")
    assert "AI and NLP" in result[0]

test_retriever()


In [None]:
with open("retriever.py", "w") as f:
    f.write("""
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os
import pickle

class Retriever:
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.documents = []
        self.embeddings = None

    def chunk_text(self, text, chunk_size=200, overlap=50):
        chunks = []
        for i in range(0, len(text), chunk_size - overlap):
            chunk = text[i:i + chunk_size]
            chunks.append(chunk)
        return chunks

    def add_documents(self, texts):
        chunks = []
        for text in texts:
            chunks.extend(self.chunk_text(text))
        self.documents.extend(chunks)
        embeddings = self.model.encode(chunks, show_progress_bar=True)
        self.embeddings = np.array(embeddings).astype("float32")
        self.index = faiss.IndexFlatL2(self.embeddings.shape[1])
        self.index.add(self.embeddings)

    def query(self, question, top_k=3):
        query_embedding = self.model.encode([question]).astype("float32")
        D, I = self.index.search(query_embedding, top_k)
        return [self.documents[i] for i in I[0]]

    def save(self, path="retriever_data"):
        os.makedirs(path, exist_ok=True)
        faiss.write_index(self.index, os.path.join(path, "index.faiss"))
        with open(os.path.join(path, "documents.pkl"), "wb") as f:
            pickle.dump(self.documents, f)

    def load(self, path="retriever_data"):
        self.index = faiss.read_index(os.path.join(path, "index.faiss"))
        with open(os.path.join(path, "documents.pkl"), "rb") as f:
            self.documents = pickle.load(f)
""")


In [None]:
retriever.save()  # Saves to folder
retriever.load()  # Loads from saved folder


In [None]:
def test_retriever():
    doc = "This is a test document about AI and NLP."
    retriever = Retriever()
    retriever.add_documents([doc])
    result = retriever.query("What is it about?")
    assert "AI and NLP" in result[0]

test_retriever()


In [None]:
from google.colab import files
files.download("retriever.py")  # Download retriever.py, or any other files

