In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

In [None]:
!pip install transformers datasets faiss-cpu sentence-transformers gradio rouge-score ipywidgets datasets beautifulsoup4

In [None]:
# Import libraries
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import faiss
import torch
from bs4 import BeautifulSoup
from datasets import load_dataset, Dataset

# Report whether PyTorch can see a CUDA device
gpu_available = torch.cuda.is_available()
print(gpu_available)

In [None]:
# Sentence-embedding model used for retrieval; weights are cached under ./models
embedding_model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    cache_folder="models",
)

# Seq2seq generator and its matching tokenizer, cached in the same folder
# ("t5-base" is a possible alternative checkpoint)
generative_model = AutoModelForSeq2SeqLM.from_pretrained(
    "t5-small",
    cache_dir="models",
)
tokenizer = AutoTokenizer.from_pretrained(
    "t5-small",
    cache_dir="models",
)

In [None]:
# Fetch the "train" split of https://huggingface.co/datasets/JLK-ptbk/faq,
# caching the download under ./datasets
faq_dataset = load_dataset(
    "JLK-ptbk/faq",
    split="train",
    cache_dir="datasets",
)

# Peek at the first three records
for row_number in (0, 1, 2):
    print(faq_dataset[row_number])

In [None]:
import ast
import re
from typing import List

def parse_data(data_str: str) -> List[str]:
    """
    Parses a string representation of a list into an actual list.

    Args:
        data_str: Text expected to hold a Python list literal,
            e.g. "['first', 'second']".

    Returns:
        The evaluated list when `data_str` is a valid list literal; an empty
        list when it evaluates to some other literal or is not a string at all;
        and, when it cannot be evaluated, every single-quoted substring found
        by a regex scan (best-effort recovery for malformed input).
    """
    if not isinstance(data_str, str):
        # Nothing to evaluate or scan (e.g. a missing value); the regex
        # fallback below would raise TypeError on non-string input.
        return []

    try:
        # Safely evaluate the string to a Python object (literals only)
        data_list = ast.literal_eval(data_str)
    except (SyntaxError, ValueError, TypeError):
        # literal_eval raises TypeError for some malformed literals (for
        # example a dict with an unhashable key), so it is caught alongside
        # SyntaxError and ValueError. Fall back to pulling out quoted strings.
        return re.findall(r"'(.*?)'", data_str)

    return data_list if isinstance(data_list, list) else []

def remove_html(text: str) -> str:
    """
    Returns the text content of an HTML fragment with the markup dropped.

    The fragment is parsed with the built-in 'html.parser' backend; the
    extracted text pieces are stripped and joined with a single space.
    """
    return BeautifulSoup(text, 'html.parser').get_text(separator=" ", strip=True)

def clean_entry(data_str: str) -> List[str]:
    """
    Converts one raw 'data' field into a list of cleaned text snippets.

    Each parsed item has its HTML removed and its whitespace collapsed to
    single spaces. Snippets of 10 characters or fewer, and snippets that
    contain the standalone word "Startin" (any casing), are discarded.
    """
    kept = []
    for raw_item in parse_data(data_str):
        # Strip markup first, then squeeze runs of whitespace
        text = ' '.join(remove_html(raw_item).split())
        if len(text) <= 10:
            # Too short to be useful
            continue
        if re.search(r'\bStartin\b', text, re.IGNORECASE):
            # Marker of an incomplete entry
            continue
        kept.append(text)
    return kept

# Row-level adapter so the cleaner can be used with Dataset.map
def apply_cleaning(example) -> dict:
    """Builds the new 'faq' column for one row from its raw 'data' field."""
    return {'faq': clean_entry(example['data'])}

# Clean every row and drop the listed source columns from the result
obsolete_columns = ['Unnamed: 0', 'index', 'data']
cleaned_dataset = faq_dataset.map(apply_cleaning, remove_columns=obsolete_columns)

# Show the second cleaned record
print(cleaned_dataset[1])

In [37]:
# Process the dataset into a list of dictionaries
import pandas as pd

faq_list = cleaned_dataset["faq"]

# Consecutive rows are treated as (question, answer). Stopping at len - 1 means a
# trailing row without a partner is skipped instead of raising IndexError on
# faq_list[i + 1] when the number of rows is odd.
faq_pairs = [
    {"question": faq_list[i], "answer": faq_list[i + 1]}
    for i in range(0, len(faq_list) - 1, 2)
]

# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(pd.DataFrame(faq_pairs))

In [None]:
# Embed the documents
answers = dataset['answer']

# Each stored answer is the list of text fragments produced by clean_entry, whereas
# the encoder expects one string per document. Join the fragments for embedding only;
# `answers` itself is left untouched for the cells that read it later.
answer_texts = [
    fragments if isinstance(fragments, str) else " ".join(fragments)
    for fragments in answers
]
embeddings = embedding_model.encode(answer_texts)

# Determine the dimensionality of the embeddings
d = embeddings.shape[1]

# Initialize the FAISS index
index = faiss.IndexFlatL2(d)  # Using L2 distance; consider IndexHNSWFlat or others for larger datasets

# Add embeddings to the index (row i of the index corresponds to answers[i])
index.add(embeddings)

print(f"Number of vectors in the index: {index.ntotal}")

In [46]:
# Load generative model
import numpy as np

# NOTE: this rebinds `tokenizer`, which an earlier cell bound to the "t5-small" tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

# Instruction text that rag_qa places ahead of the question and the retrieved context
system_prompt = (
    "You are an AI assistant helping users with their queries about PetBacker services."
)


def rag_qa(question, top_k=1):
    """
    Answers a question by retrieving stored answers and feeding them to the generator.

    Relies on the notebook-level `embedding_model`, `index`, `answers`,
    `system_prompt`, `tokenizer` and `model`.

    Args:
        question: The user's question as plain text.
        top_k: How many nearest stored answers to retrieve as context.

    Returns:
        The text decoded from the generator's first output sequence.
    """
    # Encode the question; the index is searched with a 2-D float32 array
    question_embedding = embedding_model.encode([question], convert_to_tensor=False)
    question_embedding = np.array(question_embedding).astype("float32")

    # Search for the top_k most similar answers (distances are not needed)
    _, indices = index.search(question_embedding, top_k)

    # FAISS fills missing neighbours with -1 when the index holds fewer than top_k
    # vectors; skip those so they do not silently select the last answer.
    retrieved_answers = [answers[int(idx)] for idx in indices[0] if idx >= 0]

    # Use every retrieved answer, not just the first. An answer may be a single
    # string or a list of text fragments; fragments are joined with spaces.
    context = ' '.join(
        answer if isinstance(answer, str) else ' '.join(answer)
        for answer in retrieved_answers
    )

    # Prepare the input for the generator, led by the system prompt
    input_text = f"{system_prompt}\nQuestion: {question}\nContext: {context}"

    # Tokenize the input, truncating to at most 512 tokens
    inputs = tokenizer.encode(
        input_text, return_tensors="pt", truncation=True, max_length=512
    )

    # Generate the answer with beam search
    outputs = model.generate(inputs, max_length=150, num_beams=5, early_stopping=True)

    # Decode the generated answer
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Sample query used to exercise rag_qa
user_question = "What payment options can I use on PetBacker?"

# Run retrieval + generation for the sample query
generated_answer = rag_qa(user_question)

# Show the question next to the generated answer
for label, text in (("Q:", user_question), ("A:", generated_answer)):
    print(label, text)

In [79]:
# RAG pipeline function
def rag_pipeline(query, retrieval_model, faiss_index, documents, top_k=3):
    """
    Retrieves the nearest documents for `query` and generates a response from them.

    Generation uses the notebook-level `tokenizer` and `generative_model`.

    Args:
        query: The user's query as plain text.
        retrieval_model: Object whose `encode` turns a list of texts into vectors.
        faiss_index: Index searched with the query vector.
        documents: Sequence of strings; position i corresponds to vector i of the index.
        top_k: Number of documents to retrieve (defaults to the previous fixed value, 3).

    Returns:
        The text decoded from the generator's first output sequence.
    """
    query_embedding = retrieval_model.encode([query])
    _, retrieved_indices = faiss_index.search(query_embedding, top_k)

    # FAISS fills missing neighbours with -1 when the index holds fewer than top_k
    # vectors; skip those so they do not silently select documents[-1].
    context = " ".join(documents[int(i)] for i in retrieved_indices[0] if i >= 0)

    input_text = f"Context: {context} Query: {query}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
    outputs = generative_model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Test the RAG pipeline
# `documents` was never defined. rag_pipeline joins documents as plain strings, while
# each stored answer is a list of text fragments, so build one string per answer.
# The order matches the rows that were embedded into `index`.
documents = [
    fragments if isinstance(fragments, str) else " ".join(fragments)
    for fragments in dataset["answer"]
]

query = "What is Hugging Face?"
response = rag_pipeline(query, embedding_model, index, documents)
print("Response:", response)

In [None]:
## Step 4: Evaluate the RAG Model
from rouge_score import rouge_scorer

# Reference response for evaluation
reference = "Hugging Face is a company creating open-source libraries."

# Evaluation with ROUGE
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
# RougeScorer.score takes (target, prediction): the reference goes first. With the
# arguments swapped, the reported precision and recall are exchanged.
scores = scorer.score(reference, response)
print("ROUGE scores:", scores)

In [None]:
# Load CLIP model for multi-modal retrieval
from transformers import CLIPProcessor, CLIPModel

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Example: embed a text query with CLIP (image features would be computed separately)
text_query = "open-source libraries"
text_inputs = clip_processor(text=[text_query], return_tensors="pt", padding=True)

# The processor returns a mapping (input_ids, attention_mask, ...), so it must be
# unpacked into keyword arguments rather than passed as a single positional value.
with torch.no_grad():  # inference only, no gradients needed
    text_features = clip_model.get_text_features(**text_inputs)

# Multi-modal response: Extend to use image features if applicable

In [None]:
import gradio as gr

# rag_pipeline joins documents as plain strings, while each stored answer is a list
# of text fragments, so build one string per answer (same order as the FAISS index).
documents = [
    fragments if isinstance(fragments, str) else " ".join(fragments)
    for fragments in dataset["answer"]
]


def generate_response(query):
    """Gradio callback: runs the RAG pipeline on the submitted text and returns its reply."""
    # rag_pipeline takes (query, retrieval_model, faiss_index, documents); the generator
    # and tokenizer are read from notebook globals and must not be passed positionally.
    return rag_pipeline(query, embedding_model, index, documents)


gr.Interface(fn=generate_response, inputs="text", outputs="text").launch()