In [9]:
class Embeddings:
    """
    Loads a JSON dataset and stores one sentence embedding per entry.

    Each entry's text is built by joining the string form of all of its values
    with single spaces; that text is what gets embedded.

    Attributes:
        model (SentenceTransformer): Pre-trained transformer model for generating embeddings.
        data (list): List of dictionaries representing the dataset.
        embeddings (np.ndarray): C-contiguous float32 array of shape
            (len(data), embedding_dim). It is always 2-D, even for an empty dataset.
    """

    def __init__(self, data_path, model="sentence-transformers/all-MiniLM-L6-v2", max_length=512):
        """
        Initializes the Embeddings class by loading data and computing embeddings.

        Args:
            data_path (str): Path to the JSON file containing text data.
            model (str, optional): Name of the SentenceTransformer model to use.
                Defaults to "sentence-transformers/all-MiniLM-L6-v2".
            max_length (int, optional): Accepted for interface compatibility but
                not used by this class. Defaults to 512.

        Raises:
            ValueError: If the JSON file does not contain a list of dictionaries.
        """
        self.model = SentenceTransformer(model)  # Load the sentence transformer model
        self.data = self.load_data(data_path)  # Load dataset from JSON file

        # One text per record: every value of the dict, stringified and space-joined.
        texts = [" ".join(map(str, item.values())) for item in self.data]

        if texts:
            # Encode all texts in one call so the model can batch them instead of
            # running a separate forward pass per record.
            vectors = self.model.encode(texts, convert_to_numpy=True)
        else:
            # No records: probe the model once for its output width so the array
            # still has shape (0, dim) and `.shape[1]` works for index builders.
            embedding_dim = self.get_embedding("").shape[-1]
            vectors = np.empty((0, embedding_dim))

        # float32 + C-contiguous layout for FAISS compatibility
        self.embeddings = np.ascontiguousarray(vectors, dtype=np.float32)

    def load_data(self, data_path):
        """
        Loads JSON data into a list of dictionaries.

        Args:
            data_path (str): Path to the JSON file.

        Returns:
            list: A list of dictionaries, where each dictionary represents an entry in the dataset.

        Raises:
            ValueError: If the top-level JSON value is not a list, or any entry is not a dictionary.
        """
        with open(data_path, "r", encoding="utf-8") as file:
            records = json.load(file)

        # Fail early with a clear message instead of an AttributeError on `.values()` later.
        if not isinstance(records, list) or not all(isinstance(entry, dict) for entry in records):
            raise ValueError(
                f"Expected {data_path!r} to contain a JSON list of objects (dictionaries)."
            )
        return records

    def get_embedding(self, text):
        """
        Generates an embedding for a given text using the SentenceTransformer model.

        Args:
            text (str): Input text to encode.

        Returns:
            np.ndarray: The generated embedding as a NumPy array in float32 format.
        """
        return self.model.encode(text, convert_to_numpy=True).astype(np.float32)


In [5]:
!pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [8]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")

In [10]:
from sentence_transformers import SentenceTransformer
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import faiss
import os
import json
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch



In [7]:
!pip install --upgrade torch transformers


Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cusparselt-cu12==0.6.2 (from torch)
  Downloading nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting triton==3.2.0 (from torch)
  Downloading triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl (766.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.7/766.7 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl (150.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.1/150.1 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[

In [2]:
!pip install --upgrade torch torchvision


Collecting torchvision
  Downloading torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)
Downloading torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchvision
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.20.1+cu124
    Uninstalling torchvision-0.20.1+cu124:
      Successfully uninstalled torchvision-0.20.1+cu124
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.18 requires torch<2.6,>=1.10, but you have torch 2.6.0 which is incompatible.[0m[31m
[0mSuccessfully installed torchvision-0.21.0


In [7]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [3]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, how many and which one.
if not torch.cuda.is_available():
    print("No GPU found!")
else:
    print("CUDA is available!")
    print("Number of GPUs:", torch.cuda.device_count())
    print("GPU Name:", torch.cuda.get_device_name(0))


CUDA is available!
Number of GPUs: 1
GPU Name: Tesla T4


In [11]:
class Retrieval:
    """
    A class to perform similarity-based text retrieval using FAISS.

    Attributes:
        dataset (Embeddings): An instance of the Embeddings class containing text data and embeddings.
        index (faiss.IndexFlatL2): FAISS index built using L2 distance for fast retrieval.
    """

    def __init__(self, dataset):
        """
        Initializes the Retrieval class by creating a FAISS index from the dataset's embeddings.

        Args:
            dataset (Embeddings): An instance of the Embeddings class containing text data and
                computed embeddings. `dataset.embeddings` must be 2-D (rows, embedding_dim).
        """
        self.dataset = dataset  # Store dataset instance
        # Extract the embedding dimension from the dataset
        embedding_dim = self.dataset.embeddings.shape[1]
        # Create a FAISS index with L2 (Euclidean) distance for similarity search
        self.index = faiss.IndexFlatL2(embedding_dim)
        # Add dataset embeddings to the FAISS index
        self.index.add(self.dataset.embeddings)

    def search(self, query, k=1):
        """
        Finds the top-k most relevant documents using FAISS similarity search.

        Args:
            query (str): The query text to search for similar entries.
            k (int, optional): The number of top results to return. Defaults to 1.

        Returns:
            list, dict or None: Each hit is a dictionary with 'title', 'url' and
                'content' (missing keys fall back to "No Title" / "No URL" /
                "No Content").
                If k > 1, returns a list of hits, nearest first; it may hold fewer
                than k entries when the index has fewer than k vectors.
                Otherwise returns the single best hit, or None when there is no hit.
        """
        # Convert query text into a (1, dim) embedding, the shape FAISS expects
        query_vector = self.dataset.get_embedding(query).reshape(1, -1)
        # Search FAISS index for the k nearest neighbors (distances are not used)
        _distances, indices = self.index.search(query_vector, k)

        records = self.dataset.data
        results = [
            {
                "title": records[int(i)].get("title", "No Title"),
                "url": records[int(i)].get("url", "No URL"),
                "content": records[int(i)].get("content", "No Content"),
            }
            for i in indices[0]
            # FAISS fills missing neighbours with label -1; without the lower bound
            # that would index records[-1] and return the last record as a bogus hit.
            if 0 <= i < len(records)
        ]

        if k > 1:
            return results
        # Single-result mode: hand back the hit itself, or None if nothing matched.
        return results[0] if results else None

In [15]:

# Load the JSON dataset and compute its embeddings (both happen inside Embeddings.__init__).
dataset = Embeddings("json_input_data.json")  # JSON file with a list of dictionaries


In [64]:
# Question to answer from the retrieved context.
query = "Can i do a part time 20hr week internship or Coop?"
# Build the FAISS-backed retriever over the embedded dataset.
retriever = Retrieval(dataset)
# With k > 1, search() returns a list of result dicts (despite the singular variable name).
top_result = retriever.search(query=query, k=5)

In [65]:
# Concatenate the 'content' field of every retrieved result, in rank order,
# with no separator between documents.
content_parts = []
for result in top_result:
    content_parts.append(result["content"])
combined_content = "".join(content_parts)

In [66]:
# Inspect the concatenated retrieved text before it is used as LLM context.
print(combined_content)

Students Studying in 
																								The United States Summer Term At Northeastern, classes run year-round. However, not all students are required or choose to study during the summer term. Whether you plan to study or travel, it is important to maintain your status in the summer. You can learn more below. Summer Enrollment at Northeastern During required academic terms, F-1 and J-1 students must maintain full-time enrollment and appropriate on-ground presence. While the summer term is considered a vacation term for most continuing students, some students are required to maintain full-time enrollment in the summer. Please read the following information carefully to make sure you understand your summer enrollment requirements. What summer terms are available at Northeastern? CPS Undergraduate students and School of Law Students Full Summer CPS Graduate Students Summer Quarter All other Undergraduate and Graduate students Full Summer Summer 1* Summer 2* *Degree-seeking students 

In [17]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `capstone_project` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `c

In [None]:

!huggingface-cli download meta-llama/Llama-3.2-1B-Instruct --local-dir Llama-3.2-1B-Instruct

Fetching 13 files:   0% 0/13 [00:00<?, ?it/s]Downloading 'LICENSE.txt' to 'Llama-3.2-1B-Instruct/.cache/huggingface/download/cyBuwAu93UXke23CJCWORBYR70A=.085b47c1575cb889b7024030e60b78f54f0b8c9e.incomplete'
Downloading '.gitattributes' to 'Llama-3.2-1B-Instruct/.cache/huggingface/download/wPaCkH-WbT7GsmxMKKrNZTV4nSM=.a6344aac8c09253b3b630fb776ae94478aa0275b.incomplete'
Downloading 'USE_POLICY.md' to 'Llama-3.2-1B-Instruct/.cache/huggingface/download/UcPfAI5B08awK9_TiALuc0iOThI=.ac3c5f21b9779e3da0677d6d3c587778fe3a331e.incomplete'
Downloading 'README.md' to 'Llama-3.2-1B-Instruct/.cache/huggingface/download/Xn7B-BWUGOee2Y6hCZtEhtFu4BE=.b3c62ef6190712fb3e31af1628f0b85706fc4ba8.incomplete'
Downloading 'config.json' to 'Llama-3.2-1B-Instruct/.cache/huggingface/download/8_PA_wEVGiVa2goH2H4KQOQpvVY=.3e3aaf51a035cb5092d9f6827a0dc074657ba88c.incomplete'
Downloading 'model.safetensors' to 'Llama-3.2-1B-Instruct/.cache/huggingface/download/xGOKKLRSlIhH692hSVvI1-gpoa8=.1ff795ff6a07e6a68085d206fb

In [30]:
# Local directory the tokenizer and model are loaded from; it matches the
# --local-dir used by the `huggingface-cli download` cell.
LLAMA_MODEL_PATH = "Llama-3.2-1B-Instruct"

In [31]:
# Load the tokenizer and the causal-LM weights from the LLAMA_MODEL_PATH directory.
# local_files_only=True is passed for the tokenizer only; the model call does not pass it.
# NOTE(review): trust_remote_code=True allows custom code shipped with a checkpoint to run;
# presumably unnecessary for a stock Llama checkpoint — confirm and drop if so.
tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL_PATH, trust_remote_code=True, local_files_only=True)
llama_model = AutoModelForCausalLM.from_pretrained(
        LLAMA_MODEL_PATH,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        device_map="auto",
        offload_folder="offload",        # Ensure this folder exists or can be created.
        offload_state_dict=True
)

In [32]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# NOTE(review): llama_model is passed as an instantiated object, so torch_dtype and
# device_map given here may not affect how it was loaded — confirm against the
# transformers pipeline documentation. If bfloat16 is intended, it likely belongs
# on the from_pretrained call instead.
chat_pipeline = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)


Device set to use cuda:0


In [33]:
class ChatHuggingFace:
    """
    Thin callable wrapper around a text-generation callable (e.g. a pipeline).

    Attributes:
        llm: Callable invoked as ``llm(messages, max_new_tokens=...)``; its result
            must be indexable as ``result[0]["generated_text"]``.
        max_new_tokens (int): Default generation budget used when a call does not
            override it.
    """

    def __init__(self, llm, max_new_tokens=256):
        """
        Args:
            llm: The generation callable to wrap.
            max_new_tokens (int, optional): Default cap on newly generated tokens
                per call. Defaults to 256.
        """
        self.llm = llm
        self.max_new_tokens = max_new_tokens

    def __call__(self, prompt, max_new_tokens=None):
        """
        Runs one generation request.

        Args:
            prompt: Either a list of chat messages (passed through unchanged) or
                any other object, which is stringified and wrapped as a single
                user message.
            max_new_tokens (int, optional): Per-call override of the generation
                budget. When None, the instance default is used.

        Returns:
            The ``"generated_text"`` field of the first output returned by ``llm``.
        """
        if not isinstance(prompt, list):
            # Normalise plain prompts into chat-message form.
            prompt = [{"role": "user", "content": str(prompt)}]
        token_budget = self.max_new_tokens if max_new_tokens is None else max_new_tokens
        outputs = self.llm(prompt, max_new_tokens=token_budget)
        return outputs[0]["generated_text"]

# Module-level chat wrapper around the text-generation pipeline; called by summarize_abstract().
chat_model = ChatHuggingFace(chat_pipeline)

In [43]:
def summarize_abstract(content: str, query: str) -> list:
    """
    Asks the chat model to answer `query` using only the supplied `content`.

    Builds a single user message containing the assistant persona, the query,
    the retrieved content and a fallback instruction, then sends it to the
    module-level `chat_model`.

    Args:
        content (str): Retrieved context text to ground the answer in.
        query (str): The user's question.

    Returns:
        list: Whatever `chat_model` returns, passed through unchanged. In this
            notebook's recorded run that is the conversation as a list of
            message dicts — the user prompt followed by the assistant reply —
            so the answer text is read from the last message's "content".
    """
    prompt = (
        "You are Northeastern's university student visa assistant.\n"
        f"Query: {query}\n"
        f"Content: {content}\n"
        "If the query can be answered with the context provided in the content, frame an answer. "
        "Otherwise, state: 'As an Northeastern's OGS assistant I do not have an answer to that question.'"
    )
    return chat_model([{"role": "user", "content": prompt}])


In [67]:
# Ask the model to answer `query` using the concatenated retrieved content as context.
output = summarize_abstract(combined_content, query)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [68]:
# Dump the raw return value; in the recorded run it is a list of message dicts, not a plain string.
print(output)

[{'role': 'user', 'content': "You are Northeastern's university student visa assistant.\nQuery: Can i do a part time 20hr week internship or Coop?\nContent: Students Studying in \n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tThe United States Summer Term At Northeastern, classes run year-round. However, not all students are required or choose to study during the summer term. Whether you plan to study or travel, it is important to maintain your status in the summer. You can learn more below. Summer Enrollment at Northeastern During required academic terms, F-1 and J-1 students must maintain full-time enrollment and appropriate on-ground presence. While the summer term is considered a vacation term for most continuing students, some students are required to maintain full-time enrollment in the summer. Please read the following information carefully to make sure you understand your summer enrollment requirements. What summer terms are available at Northeastern? CPS Undergraduate studen

In [69]:
# Print each element of the returned conversation on its own line.
for message in output:
    print(message)

{'role': 'user', 'content': "You are Northeastern's university student visa assistant.\nQuery: Can i do a part time 20hr week internship or Coop?\nContent: Students Studying in \n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tThe United States Summer Term At Northeastern, classes run year-round. However, not all students are required or choose to study during the summer term. Whether you plan to study or travel, it is important to maintain your status in the summer. You can learn more below. Summer Enrollment at Northeastern During required academic terms, F-1 and J-1 students must maintain full-time enrollment and appropriate on-ground presence. While the summer term is considered a vacation term for most continuing students, some students are required to maintain full-time enrollment in the summer. Please read the following information carefully to make sure you understand your summer enrollment requirements. What summer terms are available at Northeastern? CPS Undergraduate student

In [70]:
# Show only the text of the second message; in the recorded run that is the assistant's reply.
# NOTE(review): assumes exactly one user message precedes the reply — confirm if the prompt format changes.
print(output[1]["content"])

As an Northeastern's OGS assistant, I can provide guidance on the different types of internships and co-ops available to F-1 and J-1 students.

According to the content, a part-time 20-hour week internship or co-op is allowed. However, there are some conditions and requirements to be met:

- If you are in your first term of enrollment at Northeastern, you must maintain full-time enrollment and on-ground presence during the summer term(s).
- If you are in your final term of enrollment, you must maintain full-time enrollment and on-ground presence throughout the duration of your final term.
- You must have a valid I-20 or DS-2019.
- You must have an approved change of degree level (CDL) or change of status (COS) or be returning to Northeastern after a leave of absence.
- You must be enrolled full-time in the full summer or in the summer quarter.
- You must be enrolled in a course that requires an experiential learning component in the syllabus.
- You must be enrolled in a course that req