In [1]:
from sentence_transformers import SentenceTransformer
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import faiss
import os
import json
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch






In [2]:
class Embeddings:
    """
    Generates and stores sentence embeddings for a JSON dataset.

    Each dataset entry (a dictionary) is flattened into one string by joining
    the string form of its values with spaces; that string is what gets embedded.

    Attributes:
        model: The encoder used to produce embeddings. Either the
            SentenceTransformer loaded from the given model name, or the
            encoder object that was passed in directly.
        data (list): List of dictionaries loaded from the JSON file.
        embeddings (np.ndarray): float32 array of shape (len(data), dim),
            one row per dataset entry. It is always 2-D, even when the
            dataset is empty.
    """

    def __init__(self, data_path, model="sentence-transformers/all-MiniLM-L6-v2", max_length=512):
        """
        Loads the dataset and computes one embedding per entry.

        Args:
            data_path (str): Path to a JSON file containing a list of dictionaries.
            model (str or object, optional): Name of the SentenceTransformer model
                to load, or an already constructed encoder exposing
                ``encode(...)`` and ``get_sentence_embedding_dimension()``.
                Defaults to "sentence-transformers/all-MiniLM-L6-v2".
            max_length (int, optional): Accepted for backward compatibility;
                it is not used by this class. Defaults to 512.

        Raises:
            ValueError: If the JSON file does not contain a list of dictionaries.
        """
        # A string is a model name to load; anything else is used as the encoder as-is,
        # which lets callers share one loaded model between several datasets.
        self.model = SentenceTransformer(model) if isinstance(model, str) else model
        self.data = self.load_data(data_path)

        # Fail early with a clear message instead of an AttributeError on `.values()`.
        if not isinstance(self.data, list) or not all(isinstance(item, dict) for item in self.data):
            raise ValueError(
                "Expected the JSON file to contain a list of dictionaries: {}".format(data_path)
            )

        texts = [self._entry_text(item) for item in self.data]
        if texts:
            # Encode all entries in one call so the model can batch them,
            # rather than invoking it once per entry.
            vectors = self.model.encode(texts, convert_to_numpy=True)
            # FAISS needs a C-contiguous float32 matrix.
            self.embeddings = np.ascontiguousarray(vectors, dtype=np.float32)
        else:
            # Keep the array 2-D for an empty dataset so `embeddings.shape[1]`
            # is still available to consumers.
            dim = self.model.get_sentence_embedding_dimension() or 0
            self.embeddings = np.empty((0, dim), dtype=np.float32)

    @staticmethod
    def _entry_text(item):
        """Flattens one dataset entry into the text that is embedded."""
        return " ".join(map(str, item.values()))

    def load_data(self, data_path):
        """
        Reads and parses a UTF-8 JSON file.

        Args:
            data_path (str): Path to the JSON file.

        Returns:
            The parsed JSON content. ``__init__`` requires this to be a list of
            dictionaries, one per dataset entry.
        """
        with open(data_path, "r", encoding="utf-8") as file:
            return json.load(file)

    def get_embedding(self, text):
        """
        Generates an embedding for a single text using the encoder.

        Args:
            text (str): Input text to encode.

        Returns:
            np.ndarray: The encoder output converted to float32.
        """
        return self.model.encode(text, convert_to_numpy=True).astype(np.float32)


In [3]:
class Retrieval:
    """
    Similarity-based text retrieval over an embedded dataset using FAISS.

    Attributes:
        dataset (Embeddings): Provides ``data`` (list of dicts), ``embeddings``
            (2-D array) and ``get_embedding(text)``.
        index (faiss.IndexFlatL2): Exact L2-distance index holding the dataset embeddings.
    """

    def __init__(self, dataset):
        """
        Builds a FAISS L2 index from the dataset's embeddings.

        Args:
            dataset (Embeddings): An instance of the Embeddings class containing
                text data and computed embeddings.
        """
        self.dataset = dataset
        # FAISS expects a C-contiguous float32 matrix; this is a no-op when the
        # embeddings are already in that layout.
        vectors = np.ascontiguousarray(self.dataset.embeddings, dtype=np.float32)
        # The second axis of the embedding matrix is the vector dimension.
        self.index = faiss.IndexFlatL2(vectors.shape[1])
        self.index.add(vectors)

    def search(self, query, k=1):
        """
        Finds the top-k nearest dataset entries for a query text.

        Args:
            query (str): The query text to search for similar entries.
            k (int, optional): The number of top results to request. Defaults to 1.

        Returns:
            list or dict or None: If k > 1, a list of dictionaries with 'title'
            and 'url' (possibly shorter than k when the index holds fewer
            entries). If k == 1, a single such dictionary, or None when the
            index returned no valid neighbour.

        Raises:
            ValueError: If k is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer, got {!r}".format(k))

        # FAISS searches a batch of queries, so shape the single vector as (1, dim).
        query_vector = self.dataset.get_embedding(query).reshape(1, -1)
        _, indices = self.index.search(query_vector, k)

        entries = self.dataset.data
        total = len(entries)
        results = []
        for raw_index in indices[0]:
            position = int(raw_index)
            # FAISS pads missing neighbours with -1. A plain `< total` check would
            # let -1 through and Python would then return the *last* entry.
            if not 0 <= position < total:
                continue
            entry = entries[position]
            results.append(
                {
                    "title": entry.get("title", "No Title"),
                    "url": entry.get("url", "No URL"),
                }
            )

        if k > 1:
            return results
        # k == 1: unwrap the single hit; None signals "nothing found".
        return results[0] if results else None

In [5]:

# Load the JSON records (a list of dictionaries) and embed every entry.
dataset = Embeddings(data_path="json_input_data.json")


In [None]:
# Index the embedded dataset, then fetch the three nearest entries for a sample query.
query = "Healthcare benefits for international students"
retriever = Retrieval(dataset=dataset)
top_result = retriever.search(query=query, k=3)

In [9]:
# Show each retrieved hit (a dict with 'title' and 'url') on its own line.
# `top_result` is a list here because the search was requested with k > 1.
for result in top_result:
    print(result)

{'title': 'International Student Support | Northeastern International Student Hub', 'url': 'https://international.northeastern.edu/student-support/'}
{'title': 'Health Insurance - Office of Global Services', 'url': 'https://international.northeastern.edu/ogs/student-support/global-campuses/canada/health-insurance/'}
{'title': 'Health - Office of Global Services', 'url': 'https://international.northeastern.edu/ogs/health/'}


In [78]:
# Dump the whole list of hits at once, as a single-line repr.
print(top_result)

[{'title': 'International Student Support | Northeastern International Student Hub', 'url': 'https://international.northeastern.edu/student-support/'}, {'title': 'Health Insurance - Office of Global Services', 'url': 'https://international.northeastern.edu/ogs/student-support/global-campuses/canada/health-insurance/'}, {'title': 'Health - Office of Global Services', 'url': 'https://international.northeastern.edu/ogs/health/'}]
