In [1]:
# import torch.cuda
# !pip install transformers accelerate torch
# !pip install sentence_transformers
# !pip install langchain
# !pip install chromadb
# !pip install pypdf
# !pip install PyMuPDF
# !pip install langchain_community

In [2]:
import os

os.environ['HF_HOME'] = '/mnt/data/thomas/.cache' #Used to change where to save model. Uncomment this if you want to use default location
# os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
import torch

In [None]:
# !pip install -q huggingface_hub
# from huggingface_hub import notebook_login
#
# notebook_login()

In [3]:
import fitz
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
# from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModel
from langchain_huggingface import HuggingFaceEmbeddings

In [4]:
# === Step 1: Load and clean PDF ===
def load_and_clean_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    exact_removal = {
        "Fulbright University Vietnam Ground Floor, 105 Ton Dat Tien, Tan Phu, Quan 7, Ho Chi Minh City"
    }

    all_paragraphs = []

    for i, page in enumerate(doc):
        raw_paragraphs = [p.strip() for p in page.get_text().split("\n") if p.strip()]
        filtered = []

        for p in raw_paragraphs:
            if p in exact_removal:
                continue
            if p.isdigit():
                continue
            if p.lower().startswith("internal"):
                continue
            if re.match(r"^\d+\s*\|\s*Page$", p):
                continue
            if re.match(r"^Page\s+\d+\s+of\s+\d+", p, re.IGNORECASE):
                continue
            filtered.append(p)

        if i > 0 and filtered:
            first_word = filtered[0].split()[0] if filtered[0].split() else ""
            if first_word and not first_word[0].isupper():
                all_paragraphs[-1] += " " + filtered[0]
                filtered = filtered[1:]

        all_paragraphs.extend(filtered)

    return all_paragraphs

In [5]:
# === Step 2: Chunk text ===
def chunk_paragraphs(paragraphs):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
    return splitter.split_text("\n\n".join(paragraphs))



In [6]:
# === Step 3: Build vector store ===
def build_vectorstore(chunks, persist_path="./chroma_fulbright2"):
    documents = [Document(page_content=chunk) for chunk in chunks]
    print(type(documents[0]))
    embedding_model = HuggingFaceEmbeddings(model_name="Alibaba-NLP/gte-multilingual-base")
    # model_name_or_path = "Alibaba-NLP/gte-multilingual-base"
    # tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    # model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        persist_directory=persist_path
    )
    vectorstore.persist()
    return vectorstore


In [12]:
# === Step 4: Load local LLM ===
def load_local_llm(model_id="Qwen/Qwen2.5-7B-Instruct-1M"):
    tokenizer = AutoTokenizer.from_pretrained(model_id, timeout=60)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype="auto",
        device_map="auto"
    )
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return pipe

# === Step 5: Ask questions ===
def ask_question(llm_pipe, vectorstore, query, top_k=3):
    docs = vectorstore.similarity_search(query, k=top_k)
    context = "\n\n".join([doc.page_content for doc in docs])

    prompt = f"""Answer the question based on the following context:\n\n{context}\n\nQuestion: {query}\nAnswer:"""

    print("\n=== PROMPT ===\n", prompt)

    response = llm_pipe(prompt, max_new_tokens=300, do_sample=True, temperature=0.7)[0]["generated_text"]
    print("\n=== RESPONSE ===\n", response[len(prompt):].strip())


In [13]:
# === Main ===
def main():
    pdf_path = "../data/Academic-Policy_V5.1.pdf" #Change the data path
    paragraphs = load_and_clean_pdf(pdf_path)
    chunks = chunk_paragraphs(paragraphs)
    vectorstore = build_vectorstore(chunks)

    print("✅ Vectorstore built and persisted.")

    llm_pipe = load_local_llm()

    list_questions = []
    list_questions.append("Can you differentiate a cross-listed course and elective applied course in the case of a double major?")
    list_questions.append("I would like to ask for the capstone withdrawal policy. What will be the impact when I choose to drop the capstone before the Fall 2025 term starts? Will there be any penalties associated with withdrawing, such as a 'W' notation on my transcript?")
    list_questions.append("Does withdrawal from a course after 4th week result in a 4 credits deduction in the total credits covered by financial aid?")
    list_questions.append("What are the consequences if I withdraw from capstone II?")
    list_questions.append("Can I continue my study for more than 4 years with financial aid?")
    list_questions.append("How many courses can be double counted between two majors? And among three majors?")
    list_questions.append("Can a student with relationship breakdown request special considerations for their course’s assessment?")
    list_questions.append("How is grade calculated for a retaken course?")
    for question in list_questions:
        ask_question(llm_pipe, vectorstore, question)

if __name__ == "__main__":
    main()

<class 'langchain_core.documents.base.Document'>


Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: {'classifier.weight', 'classifier.bias'}
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


✅ Vectorstore built and persisted.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0



=== PROMPT ===
 Answer the question based on the following context:

and minor) and does not restrict the total number of double-counted

credits or courses a student can claim. For example, a student can major in

both A and B and minor in C with a total of 6 double-counted courses: two

in A and B, two in B and C, and two in A and C. This rule does not apply to

courses that are not cross-listed or tagged. For instance, if major A requires

certain courses from major B for out-of-area foundation, exploration, or

applications of the major, and these courses are not cross-listed between A

and B, they do not count toward the 8-credit limit.

1 In interdisciplinary minors, the requirements mainly consist of courses from other majors.

These courses are usually not cross-listed with the minors and are referred to as 'tagged' by

these minors. 13.4  Experiential Learning Requirements

Experiential Learning can count toward elective or major requirements

(upon advisor approval). Fulbrig

In [8]:
!zip -r chroma_fulbright.zip chroma_fulbright/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: chroma_fulbright/ (stored 0%)
  adding: chroma_fulbright/b71b0971-1fd1-4d91-a078-b7c85d8ca9e1/ (stored 0%)
  adding: chroma_fulbright/b71b0971-1fd1-4d91-a078-b7c85d8ca9e1/header.bin (deflated 61%)
  adding: chroma_fulbright/b71b0971-1fd1-4d91-a078-b7c85d8ca9e1/length.bin (deflated 34%)
  adding: chroma_fulbright/b71b0971-1fd1-4d91-a078-b7c85d8ca9e1/link_lists.bin (stored 0%)
  adding: chroma_fulbright/b71b0971-1fd1-4d91-a078-b7c85d8ca9e1/data_level0.bin (deflated 41%)
  adding: chroma_fulbright/chroma.sqlite3 (deflated 62%)
