In [80]:
from sentence_transformers import SentenceTransformer
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import faiss
import os
import json
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch



In [75]:
class Retrieval:
    """
    Similarity-based text retrieval backed by a FAISS index, plus prompt-based
    response generation with a language model.

    The wrapped ``dataset`` object must expose:
        * ``embeddings``: 2-D array with one embedding per row (added to the index).
        * ``data``: indexable collection of dict-like records; record ``i`` is
          assumed to correspond to embedding row ``i``.
        * ``get_embedding(text)``: returns the embedding vector for ``text``.

    ``generate_response`` additionally relies on two module-level names,
    ``model`` and ``tokenizer``, that are not created by this class and must
    already exist in the notebook namespace when it is called.
    """

    def __init__(self, dataset):
        """
        Builds a flat L2 FAISS index from the dataset's embeddings.

        Args:
            dataset (Embeddings): Object holding the text records and their
                precomputed embeddings (see the class docstring for the
                attributes that are used).
        """
        self.dataset = dataset

        # FAISS works on C-contiguous float32 matrices. Coercing here is a no-op
        # when the embeddings already have that layout and avoids dtype errors
        # (e.g. float64 input) otherwise.
        embeddings = np.ascontiguousarray(self.dataset.embeddings, dtype=np.float32)

        # The index dimensionality is the width of one embedding row.
        embedding_dim = embeddings.shape[1]

        # Exact (brute-force) index using L2 (Euclidean) distance.
        self.index = faiss.IndexFlatL2(embedding_dim)
        self.index.add(embeddings)

    def search(self, query, k=1):
        """
        Finds the documents whose embeddings are closest to the query embedding.

        Args:
            query (str): The query text to search for similar entries.
            k (int, optional): Maximum number of results to return. Defaults to 1.

        Returns:
            list: Up to ``k`` dictionaries with the keys 'title' and 'url', ordered
            as returned by the index. Records lacking a key get the placeholder
            'No Title' / 'No URL'. Fewer than ``k`` entries are returned when the
            index cannot supply ``k`` valid neighbours.
        """
        # Embed the query as a single float32 row, matching what the index holds.
        query_vector = np.ascontiguousarray(
            self.dataset.get_embedding(query), dtype=np.float32
        ).reshape(1, -1)

        # Distances are not part of the returned payload, only the row ids are used.
        _distances, indices = self.index.search(query_vector, k)

        records = self.dataset.data
        results = []
        for i in indices[0]:
            # FAISS pads the result with -1 when fewer than k neighbours exist.
            # Without the lower bound, -1 would silently select the LAST record.
            if 0 <= i < len(records):
                results.append(
                    {
                        "title": records[i].get("title", "No Title"),
                        "url": records[i].get("url", "No URL"),
                    }
                )

        return results

    def generate_response(self, query, k=3):
        """
        Retrieves relevant documents and asks the language model to answer the query.

        Uses the module-level ``tokenizer`` and ``model`` objects; a NameError is
        raised if they have not been defined before this method is called.

        Args:
            query (str): The user query.
            k (int, optional): Number of documents to retrieve as context. Defaults to 3.

        Returns:
            str: The decoded first sequence produced by ``model.generate``, with
            special tokens removed.
            NOTE(review): for causal language models this text presumably starts
            with the prompt itself - confirm before displaying it to users.
        """
        retrieved_results = self.search(query, k)

        # Only the title and URL of each hit are available as context.
        context = "\n".join([f"Title: {res['title']}\nURL: {res['url']}" for res in retrieved_results])

        # The prompt text (including its indentation) is sent to the model verbatim.
        prompt = f"""You are an AI assistant helping answer questions. Based on the following retrieved documents, generate a useful response.

        Context:
        {context}

        User Query: {query}

        AI Response:
        """

        # NOTE(review): the device is hard-coded, so this fails on machines without
        # CUDA or when `model` lives on another device.
        input_tokens = tokenizer(prompt, return_tensors="pt").to("cuda")

        # NOTE(review): `max_length` presumably bounds prompt + generated tokens;
        # `max_new_tokens` may be what is intended for long contexts - confirm.
        output_tokens = model.generate(**input_tokens, max_length=500)
        response = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

        return response


In [76]:

# Load the corpus. `Embeddings` is not defined in this part of the notebook; the
# Retrieval class below relies on the resulting object exposing `.data`,
# `.embeddings` and `.get_embedding(text)`.
dataset = Embeddings("json_input_data.json")  # JSON file with a list of dictionaries


In [77]:
# Sample question used to sanity-check the retrieval step.
query = "Healthcare benefits for international students"

# Index the loaded dataset, then fetch the three closest documents
# (each hit is a dict with 'title' and 'url').
retriever = Retrieval(dataset)
top_result = retriever.search(query, k=3)

In [None]:
# `top_result` is a flat list of {'title', 'url'} dicts. Iterating a dict yields
# its keys, so a nested loop would print 'title'/'url' instead of the records.
# Print one record per line.
for result in top_result:
    print(result)
    

{'title': 'International Student Support | Northeastern International Student Hub', 'url': 'https://international.northeastern.edu/student-support/'}
{'title': 'Health Insurance - Office of Global Services', 'url': 'https://international.northeastern.edu/ogs/student-support/global-campuses/canada/health-insurance/'}
{'title': 'Health - Office of Global Services', 'url': 'https://international.northeastern.edu/ogs/health/'}


In [78]:
# Show the raw list returned by `retriever.search` in a single line.
print(top_result)

[{'title': 'International Student Support | Northeastern International Student Hub', 'url': 'https://international.northeastern.edu/student-support/'}, {'title': 'Health Insurance - Office of Global Services', 'url': 'https://international.northeastern.edu/ogs/student-support/global-campuses/canada/health-insurance/'}, {'title': 'Health - Office of Global Services', 'url': 'https://international.northeastern.edu/ogs/health/'}]
