<a href="https://colab.research.google.com/github/harjeet88/LLM_experiemnts/blob/main/LLMs_course/week1/day2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch faiss-cpu # Install necessary libraries

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
import os

import faiss
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer, T5ForConditionalGeneration, T5Tokenizer



In [None]:
# Model setup

# Generator: Flan-T5 turns "context + question" prompts into answers.
model_name = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Encoder used to embed documents and questions for retrieval.
embedding_model_name = 'sentence-transformers/all-mpnet-base-v2' # or another sentence embedding model
embedding_model = AutoModel.from_pretrained(embedding_model_name)
# AutoTokenizer loads the tokenizer that matches the embedding checkpoint
# (it is imported together with the other transformers classes at the top).
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, weighting each token by its mask value.

    Args:
        model_output: Model output whose first element holds the token embeddings.
        attention_mask: Mask tensor with one entry per token. It is expanded to the
            shape of the token embeddings and used as a multiplicative weight, so
            tokens with mask value 0 do not contribute to the average.

    Returns:
        The mask-weighted sum of the token embeddings over dimension 1, divided by
        the summed mask along the same dimension.
    """
    per_token = model_output[0]
    # Give the mask a trailing axis and stretch it across the embedding dimension.
    weights = attention_mask.unsqueeze(-1).expand(per_token.size()).float()
    weighted_total = (per_token * weights).sum(dim=1)
    # Clamp the denominator so a fully masked sequence does not divide by zero.
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / weight_total

In [None]:
def get_embeddings(texts):
    """Embed a list of texts with the module-level embedding model.

    The texts are tokenized as one padded, truncated batch, passed through
    `embedding_model` without gradient tracking, and reduced to one vector per
    text with `mean_pooling`.

    Args:
        texts: List of strings to embed.

    Returns:
        The pooled sentence embeddings converted to a NumPy array.
    """
    batch = embedding_tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        encoder_output = embedding_model(**batch)
    pooled = mean_pooling(encoder_output, batch['attention_mask'])
    return pooled.numpy()


In [None]:
def ingest_documents(document_paths, index, texts):
    """Read documents from disk and add them to the vector index.

    Only files that were actually read are embedded and stored, so entry ``i``
    of ``texts`` always corresponds to vector ``i`` in ``index``. Unreadable
    files are reported and skipped.

    Args:
        document_paths: Iterable of file paths to read as UTF-8 text.
        index: Vector index exposing ``add(embeddings)`` (e.g. a FAISS index).
        texts: List of already ingested document texts; extended in place with
            the newly ingested ones.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    new_texts = []
    for doc_path in document_paths:
        try:
            with open(doc_path, "r", encoding="utf-8") as f:
                new_texts.append(f.read())
        except FileNotFoundError:
            print(f"File not found: {doc_path}")
        except Exception as e:
            print(f"Error processing {doc_path}: {e}")

    # Nothing was read (empty path list or every file failed): leave the index
    # and the text store untouched instead of re-embedding older texts.
    if not new_texts:
        print("No documents were ingested.")
        return

    # Embed exactly the texts read in this call. Slicing `texts` by
    # len(document_paths) would pick the wrong entries whenever a file was skipped.
    embeddings = get_embeddings(new_texts)
    index.add(embeddings)
    # Extend the text store only after the vectors were added, so both stay aligned
    # even if embedding or indexing raises.
    texts.extend(new_texts)
    print("Documents ingested successfully.")


In [None]:
def retrieve_relevant_context(question, index, texts, k=5):
    """Retrieve the texts closest to the question from the vector index.

    Args:
        question: Question string to embed and search for.
        index: Vector index exposing ``ntotal`` and ``search(queries, k)``
            (e.g. a FAISS index).
        texts: List of document texts, where entry ``i`` belongs to vector ``i``
            of ``index``.
        k: Maximum number of neighbours to retrieve (default 5).

    Returns:
        The retrieved texts joined with single spaces, or an empty string when
        nothing is stored yet.
    """
    # Never request more neighbours than exist: FAISS fills missing results with
    # the id -1, which would silently index the *last* element of `texts`.
    k = min(k, index.ntotal, len(texts))
    if k <= 0:
        return ""

    question_embedding = get_embeddings([question])
    _, neighbor_ids = index.search(question_embedding, k)
    # Defensive filter: drop padding ids (-1) and anything outside the text store.
    relevant_texts = [texts[i] for i in neighbor_ids[0] if 0 <= i < len(texts)]
    return " ".join(relevant_texts)

In [None]:
def generate_answer(context, question):
    """Answer a question from the given context with the module-level Flan-T5 model.

    Args:
        context: Text placed after the ``context:`` marker of the prompt.
        question: Text placed after the ``question:`` marker of the prompt.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        removed. Generation is called with ``max_length=50``.
    """
    prompt = f"context: {context} question: {question}"
    encoded_prompt = tokenizer(prompt, return_tensors="pt")
    generated = model.generate(encoded_prompt.input_ids, max_length=50)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


In [None]:
def rag_pipeline(question, index, texts):
    """Answer a question by retrieving context from the index and querying Flan-T5.

    Args:
        question: Question string.
        index: Vector index passed on to ``retrieve_relevant_context``.
        texts: Document texts passed on to ``retrieve_relevant_context``.

    Returns:
        The generated answer, or a fixed fallback message when the retrieved
        context is empty.
    """
    context = retrieve_relevant_context(question, index, texts)
    # Guard clause: without any context there is nothing to ask the model about.
    if not context:
        return "No relevant information found."
    return generate_answer(context, question)


In [None]:
# Example usage:
texts = [] # List to store document texts; entry i belongs to vector i of the index
# Read the vector size from the loaded encoder instead of hard-coding it, so that
# changing `embedding_model_name` cannot leave the index with a mismatched
# dimension (it is 768 for all-mpnet-base-v2).
embeddings_dim = embedding_model.config.hidden_size
index = faiss.IndexFlatL2(embeddings_dim) # Create FAISS index


In [None]:
# Create example documents (for testing)
# Alternative minimal corpus, kept for quick smoke tests:
#doc1_content = "The Eiffel Tower is in Paris."
#doc2_content = "The capital of Japan is Tokyo."
#doc3_content = "Python is a popular programming language."

# Harry Potter corpus used by the example questions in this notebook

doc1_content = "Harry Potter is a series of seven fantasy novels written by British author J. K. Rowling. The novels chronicle the lives of a young wizard, Harry Potter, and his friends, Hermione Granger and Ron Weasley, all of whom are students at Hogwarts School of Witchcraft and Wizardry. The main story arc concerns Harry's conflict with Lord Voldemort, a dark wizard who intends to become immortal, overthrow the wizard governing body known as the Ministry of Magic, and subjugate all wizards and Muggles (non-magical people)."
doc2_content = "Harry learns that his parents, Lily and James Potter, also had magical powers and were murdered by the dark wizard Lord Voldemort when Harry was a baby."
doc3_content = "He gains the friendship of Ron Weasley, a member of a large but poor wizarding family, and Hermione Granger, a witch of non-magical, or Muggle, parentage."
# The leading "T" of this passage was missing ("he first book ...").
doc4_content = "The first book concludes with Harry's confrontation with Voldemort, who, in his quest to regain a body, yearns to possess the Philosopher's Stone, a substance that bestows everlasting life."
doc5_content = "Tom riddle is actual Voldemort. Harry learns from a drunken Slughorn that he used to teach Tom Riddle, and that Voldemort divided his soul into pieces, creating a series of Horcruxes."




In [None]:
# Write every sample document to its own text file so that the ingestion step
# can read it back from disk. Each file receives the content with the matching
# number (doc4.txt gets doc4_content, not doc5_content).
sample_documents = {
    "doc1.txt": doc1_content,
    "doc2.txt": doc2_content,
    "doc3.txt": doc3_content,
    "doc4.txt": doc4_content,
    "doc5.txt": doc5_content,
}
for path, content in sample_documents.items():
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)

In [None]:
# Ingest the first three sample files; doc4.txt and doc5.txt are not indexed here.
document_paths = [
    "doc1.txt",
    "doc2.txt",
    "doc3.txt",
]
ingest_documents(document_paths, index=index, texts=texts)

Documents ingested successfully.


In [None]:
# Ask about Harry's friends and show the question together with the model's answer.
question = "who are friends of harry potter?"
answer = rag_pipeline(question, index, texts)
print(f"Question: {question}", f"Answer: {answer}", sep="\n")

Question: who are friends of harry potter?
Answer: Hermione Granger and Ron Weasley


In [None]:
# Ask about Harry's enemy and show the question together with the model's answer.
question = "who is enemy of harry potter"
answer = rag_pipeline(question, index, texts)
print(f"Question: {question}", f"Answer: {answer}", sep="\n")

Question: who is enemy of harry potter
Answer: Lord Voldemort
