<a href="https://colab.research.google.com/github/khawar-khan520/nlp_project/blob/main/Copy_of_intro_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install and Import Libraries:

In [1]:
!pip install openai sentence-transformers faiss-cpu hf_xet

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting hf_xet
  Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==

In [13]:
import os

from google.colab import files

# Ask for the corpus only when it is not already present in the runtime, so
# re-running the notebook does not stop at the upload widget every time.
if os.path.exists("winnie_the_pooh.txt"):
    uploaded = {}  # nothing new was uploaded on this run
else:
    uploaded = files.upload()


Saving winnie_the_pooh.txt to winnie_the_pooh.txt


Load and Chunk your Document:

In [14]:

CHUNK_SIZE = 200  # characters per chunk

# The book contains non-ASCII characters (curly quotes, the trademark sign),
# so decode it explicitly as UTF-8 instead of relying on the platform default.
with open('winnie_the_pooh.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file into a string
    text = file.read()

# Fixed-size, non-overlapping character windows; the last one may be shorter.
chunks = [text[i:i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]

Generate Embeddings with SentenceTransformers:

In [None]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model and encode every chunk of the book with it.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
embeddings = model.encode(chunks)

Store Embeddings in a FAISS Index for Similarity Search:

In [None]:
import faiss
import numpy as np

# FAISS works on float32 matrices, so make the dtype explicit instead of
# relying on whatever the encoder happens to return.
embedding_matrix = np.asarray(embeddings, dtype="float32")

# Flat L2 index whose dimensionality matches the embedding vectors.
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

# Search: embed the question the same way and fetch its 3 nearest chunks.
# D holds the distances, I the positions of the matching chunks.
query = "Who is always sad?"
query_embedding = np.asarray(model.encode([query]), dtype="float32")
D, I = index.search(query_embedding, k=3)

In [None]:
# Print each retrieved chunk followed by a separator line.
for i in I[0]:
    print(chunks[i], "....", sep="\n")

Build the Prompt from Retrieved Chunks:

In [None]:

# Collect the text of every chunk returned by the similarity search.
retrieved_chunks = []
for hit in I[0]:
    retrieved_chunks.append(chunks[hit])

# Format the prompt: the chunks are separated by a blank line.
chunk_separator = "\n\n"
context = chunk_separator.join(retrieved_chunks)

prompt = f"""You are a helpful assistant. Use the following context to answer the question.

Context:
{context}

Question:
{query}

Answer:"""

print(prompt)

Generate an Answer Using a Lightweight Language Model:

In [None]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load a small, instruction-tuned model.
# It is bound to `generator`, not `model`, so the sentence-embedding `model`
# created earlier is not overwritten: re-running the search cell after this one
# would otherwise call `.encode` on the T5 model.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
generator = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Build prompt from chunks
retrieved_chunks = [chunks[i] for i in I[0]]
context = "\n\n".join(retrieved_chunks)


# Simple instruction-style prompt for T5
prompt = f"Answer the question based on the context.\n\nContext:\n{context}\n\nQuestion:\n{query}"

# Tokenize input (over-long prompts are truncated by the tokenizer)
inputs = tokenizer(prompt, return_tensors="pt", truncation=True)

# Generate output; no gradients are needed for inference
with torch.no_grad():
    outputs = generator.generate(**inputs, max_new_tokens=100)

# Decode and print
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Answer:", answer)

## Task 2: Retriever Class with FAISS

In [None]:
!pip install sentence-transformers faiss-cpu


Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)
[0mInstalling collected packages: nvidia-cublas-cu12
[0mSuccessfully installed nvidia-cublas-cu12


In [8]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os
import pickle

class Retriever:
    """Semantic retriever: chunks text, embeds the chunks, and searches them with FAISS.

    The i-th entry of ``documents`` is the text of the i-th vector stored in
    ``index``; every method keeps the two aligned.
    """

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        """Load the embedding model; the index stays empty until documents are added.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        self.index = None  # FAISS index, created by add_documents() or load()
        self.documents = []  # chunk texts, in the same order as the index vectors
        self.embeddings = None  # float32 matrix of the indexed vectors, or None

    def chunk_text(self, text, chunk_size=200, overlap=50):
        """Split ``text`` into character windows of ``chunk_size`` that share ``overlap`` characters.

        Args:
            text: String to split.
            chunk_size: Maximum number of characters per chunk; must be positive.
            overlap: Characters shared by consecutive chunks; must be smaller
                than ``chunk_size``.

        Returns:
            List of chunks (empty for empty ``text``); trailing chunks may be shorter.

        Raises:
            ValueError: If the window could never advance through the text.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if overlap >= chunk_size:
            # A step of zero crashes range(); a negative one silently yields nothing.
            raise ValueError("overlap must be smaller than chunk_size")
        step = chunk_size - overlap
        return [text[i:i + chunk_size] for i in range(0, len(text), step)]

    def add_documents(self, texts):
        """Chunk, embed and index ``texts``, appending to whatever is already indexed.

        Args:
            texts: Iterable of strings; a single string is treated as one document.
        """
        if isinstance(texts, str):
            texts = [texts]

        chunks = []
        for text in texts:
            chunks.extend(self.chunk_text(text))
        if not chunks:
            # Nothing to embed; leave the current index untouched.
            return

        vectors = np.asarray(
            self.model.encode(chunks, show_progress_bar=True), dtype="float32"
        )
        if self.index is None:
            self.index = faiss.IndexFlatL2(vectors.shape[1])
            self.embeddings = vectors
        elif self.embeddings is None:
            self.embeddings = vectors
        else:
            self.embeddings = np.vstack([self.embeddings, vectors])

        # Append to the existing index instead of rebuilding it from the new
        # chunks only, so index positions keep matching self.documents.
        self.index.add(vectors)
        self.documents.extend(chunks)

    def query(self, question, top_k=3):
        """Return up to ``top_k`` indexed chunks closest to ``question``, nearest first.

        Returns an empty list when nothing has been indexed yet.
        """
        if self.index is None or self.index.ntotal == 0:
            return []
        top_k = min(top_k, self.index.ntotal)
        if top_k <= 0:
            return []

        query_embedding = np.asarray(self.model.encode([question]), dtype="float32")
        _, I = self.index.search(query_embedding, top_k)
        # FAISS reports missing neighbours as -1; skip those rather than letting
        # a negative index wrap around to the last document.
        return [self.documents[i] for i in I[0] if 0 <= i < len(self.documents)]

    def save(self, path="retriever_data"):
        """Write the FAISS index and the chunk texts into the directory ``path``.

        Raises:
            RuntimeError: If no index has been built or loaded yet.
        """
        if self.index is None:
            raise RuntimeError("Nothing to save: no documents have been indexed yet")
        os.makedirs(path, exist_ok=True)
        faiss.write_index(self.index, os.path.join(path, "index.faiss"))
        with open(os.path.join(path, "documents.pkl"), "wb") as f:
            pickle.dump(self.documents, f)

    def load(self, path="retriever_data"):
        """Replace the current index and chunk texts with the ones stored in ``path``.

        Raises:
            ValueError: If the stored index and document list differ in length.
        """
        index = faiss.read_index(os.path.join(path, "index.faiss"))
        # NOTE(security): pickle.load can execute arbitrary code. Only load
        # directories written by save() from a trusted source.
        with open(os.path.join(path, "documents.pkl"), "rb") as f:
            documents = pickle.load(f)
        if index.ntotal != len(documents):
            raise ValueError(
                "Stored index and documents are out of sync: "
                f"{index.ntotal} vectors vs {len(documents)} documents"
            )

        self.index = index
        self.documents = documents
        # Recover the vectors from the flat index written by save() so that
        # self.embeddings describes the loaded data, not the previous state.
        self.embeddings = index.reconstruct_n(0, index.ntotal) if index.ntotal else None



In [9]:
def load_txt(filepath):
    """Return the entire contents of the text file at ``filepath``, decoded as UTF-8."""
    with open(filepath, mode="r", encoding="utf-8") as source:
        contents = source.read()
    return contents



In [15]:
# Read the book, then chunk, embed and index it in a fresh Retriever.
text = load_txt("winnie_the_pooh.txt")
retriever = Retriever()
retriever.add_documents(texts=[text])


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [16]:
# Ask the retriever a question and list the returned passages, numbered from 1.
query = "What is the main idea of the document?"
results = retriever.query(query)
for rank, passage in enumerate(results, start=1):
    print(f"Result {rank}:\n{passage}\n")


Result 1:
Information About Project Gutenberg™ electronic works

Professor Michael S. Hart was the originator of the Project
Gutenberg™ concept of a library of electronic works that could be
freely shared with 

Result 2:
” described in paragraph 1.F.3, the Project
Gutenberg Literary Archive Foundation, the owner of the Project
Gutenberg™ trademark, and any other party distributing a Project
Gutenberg™ electronic work 

Result 3:
s Project Gutenberg™
electronic work, you indicate that you have read, understand, agree to
and accept all the terms of this license and intellectual property
(trademark/copyright) agreement. If you d



In [6]:
# Round-trip the retriever through a named directory: write it out, then read it back.
retriever_dir = "my_retriever"
retriever.save(retriever_dir)
retriever.load(retriever_dir)


In [24]:
def test_retriever():
    """Smoke test: a one-sentence document must be returned for a related question."""
    doc = "This is a test document about AI and NLP."
    retriever = Retriever()
    retriever.add_documents([doc])
    # Only one chunk is indexed, so ask for exactly one neighbour. Asking for
    # more than the index holds makes FAISS pad its result with -1, and the
    # test would then pass only because -1 indexes the last document.
    result = retriever.query("What is it about?", top_k=1)
    assert len(result) == 1, f"expected exactly one hit, got {len(result)}"
    assert "AI and NLP" in result[0], f"unexpected top hit: {result[0]!r}"

test_retriever()


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
# Source of the standalone retriever.py module. It is kept as a string so the
# notebook can write it to disk. Compared with a naive version, it appends to
# the existing index on repeated add_documents() calls, validates chunking
# parameters, and never returns wrapped-around hits when top_k exceeds the
# number of indexed chunks.
RETRIEVER_SOURCE = '''
"""Standalone retriever: chunk text, embed it, and search it with FAISS."""
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os
import pickle


class Retriever:
    """Semantic retriever whose documents list stays aligned with its FAISS index."""

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)
        self.index = None  # FAISS index, created by add_documents() or load()
        self.documents = []  # chunk texts, in the same order as the index vectors
        self.embeddings = None  # float32 matrix of the indexed vectors, or None

    def chunk_text(self, text, chunk_size=200, overlap=50):
        """Split text into windows of chunk_size characters sharing overlap characters."""
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")
        step = chunk_size - overlap
        return [text[i:i + chunk_size] for i in range(0, len(text), step)]

    def add_documents(self, texts):
        """Chunk, embed and index texts, appending to what is already indexed."""
        if isinstance(texts, str):
            texts = [texts]

        chunks = []
        for text in texts:
            chunks.extend(self.chunk_text(text))
        if not chunks:
            return

        vectors = np.asarray(
            self.model.encode(chunks, show_progress_bar=True), dtype="float32"
        )
        if self.index is None:
            self.index = faiss.IndexFlatL2(vectors.shape[1])
            self.embeddings = vectors
        elif self.embeddings is None:
            self.embeddings = vectors
        else:
            self.embeddings = np.vstack([self.embeddings, vectors])

        # Append instead of rebuilding so index positions match self.documents.
        self.index.add(vectors)
        self.documents.extend(chunks)

    def query(self, question, top_k=3):
        """Return up to top_k indexed chunks closest to question, nearest first."""
        if self.index is None or self.index.ntotal == 0:
            return []
        top_k = min(top_k, self.index.ntotal)
        if top_k <= 0:
            return []

        query_embedding = np.asarray(self.model.encode([question]), dtype="float32")
        _, I = self.index.search(query_embedding, top_k)
        # FAISS reports missing neighbours as -1; never let that wrap around.
        return [self.documents[i] for i in I[0] if 0 <= i < len(self.documents)]

    def save(self, path="retriever_data"):
        """Write the FAISS index and the chunk texts into the directory path."""
        if self.index is None:
            raise RuntimeError("Nothing to save: no documents have been indexed yet")
        os.makedirs(path, exist_ok=True)
        faiss.write_index(self.index, os.path.join(path, "index.faiss"))
        with open(os.path.join(path, "documents.pkl"), "wb") as f:
            pickle.dump(self.documents, f)

    def load(self, path="retriever_data"):
        """Replace the current index and chunk texts with the ones stored in path."""
        index = faiss.read_index(os.path.join(path, "index.faiss"))
        # NOTE(security): pickle.load can execute arbitrary code. Only load
        # directories written by save() from a trusted source.
        with open(os.path.join(path, "documents.pkl"), "rb") as f:
            documents = pickle.load(f)
        if index.ntotal != len(documents):
            raise ValueError(
                "Stored index and documents are out of sync: "
                f"{index.ntotal} vectors vs {len(documents)} documents"
            )

        self.index = index
        self.documents = documents
        # Recover the vectors from the flat index written by save().
        self.embeddings = index.reconstruct_n(0, index.ntotal) if index.ntotal else None
'''

# Explicit encoding so the written file does not depend on the platform default.
with open("retriever.py", "w", encoding="utf-8") as f:
    f.write(RETRIEVER_SOURCE)


In [22]:
# Persist the index and chunk texts to the default "retriever_data" folder ...
retriever.save()
# ... and read them back into the same Retriever instance.
retriever.load()


In [23]:
# Re-run the smoke test defined in the earlier test cell instead of redefining
# an identical copy of the function here.
test_retriever()


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [27]:
from google.colab import files
# Download the exported retriever.py from the Colab runtime; the same call works for any other file.
files.download("retriever.py")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>