In [None]:
!pip install transformers accelerate torch
!pip install sentence_transformers
!pip install langchain
!pip install chromadb
!pip install pypdf
!pip install PyMuPDF
!pip install langchain_community

In [11]:
!pip install -q huggingface_hub
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import fitz
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# === Step 1: Load and clean PDF ===
def load_and_clean_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    exact_removal = {
        "Fulbright University Vietnam Ground Floor, 105 Ton Dat Tien, Tan Phu, Quan 7, Ho Chi Minh City"
    }

    all_paragraphs = []

    for i, page in enumerate(doc):
        raw_paragraphs = [p.strip() for p in page.get_text().split("\n") if p.strip()]
        filtered = []

        for p in raw_paragraphs:
            if p in exact_removal:
                continue
            if p.isdigit():
                continue
            if p.lower().startswith("internal"):
                continue
            if re.match(r"^\d+\s*\|\s*Page$", p):
                continue
            if re.match(r"^Page\s+\d+\s+of\s+\d+", p, re.IGNORECASE):
                continue
            filtered.append(p)

        if i > 0 and filtered:
            first_word = filtered[0].split()[0] if filtered[0].split() else ""
            if first_word and not first_word[0].isupper():
                all_paragraphs[-1] += " " + filtered[0]
                filtered = filtered[1:]

        all_paragraphs.extend(filtered)

    return all_paragraphs

# === Step 2: Chunk text ===
def chunk_paragraphs(paragraphs):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100) 
    return splitter.split_text("\n\n".join(paragraphs))

# === Step 3: Build vector store ===
def build_vectorstore(chunks, persist_path="./chroma_fulbright"):
    documents = [Document(page_content=chunk) for chunk in chunks]
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        persist_directory=persist_path
    )
    vectorstore.persist()
    return vectorstore

# === Step 4: Load local LLM ===
def load_local_llm(model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
    tokenizer = AutoTokenizer.from_pretrained(model_id, timeout=60)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype="auto",
        device_map="auto"
    )
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return pipe

# === Step 5: Ask questions ===
def ask_question(llm_pipe, vectorstore, query, top_k=3):
    docs = vectorstore.similarity_search(query, k=top_k)
    context = "\n\n".join([doc.page_content for doc in docs])

    prompt = f"""Answer the question based on the following context:\n\n{context}\n\nQuestion: {query}\nAnswer:"""

    print("\n=== PROMPT ===\n", prompt)

    response = llm_pipe(prompt, max_new_tokens=300, do_sample=True, temperature=0.7)[0]["generated_text"]
    print("\n=== RESPONSE ===\n", response[len(prompt):].strip())

# === Main ===
def main():
    pdf_path = "data/Academic-Policy_V5.1.pdf"
    paragraphs = load_and_clean_pdf(pdf_path)
    chunks = chunk_paragraphs(paragraphs)
    vectorstore = build_vectorstore(chunks)

    print("✅ Vectorstore built and persisted.")

    llm_pipe = load_local_llm()
    query = "I would like to ask for the capstone withdrawal policy. What will be the impact when I choose to drop the capstone before the Fall 2025 term starts? Will there be any penalties associated with withdrawing, such as a 'W' notation on my transcript?"
    ask_question(llm_pipe, vectorstore, query)

if __name__ == "__main__":
    main()


✅ Vectorstore built and persisted.


Device set to use cuda:0



=== PROMPT ===
 Answer the question based on the following context:

wishing to do a capstone must apply during the term prior to the first term

of the capstone (Capstone I), and may only apply after having completed 80

credits of coursework. Students who do not do a capstone will need to fulfill

8 credits of coursework as specified in their major. Individual majors will

determine eligibility requirements, and students should discuss their

proposals with program coordinators and potential advisors to prepare

wishing to do a capstone must apply during the term prior to the first term

of the capstone (Capstone I), and may only apply after having completed 80

credits of coursework. Students who do not do a capstone will need to fulfill

8 credits of coursework as specified in their major. Individual majors will

determine eligibility requirements, and students should discuss their

proposals with program coordinators and potential advisors to prepare

wishing to do a capstone mus

In [16]:
!zip -r chroma_fulbright.zip chroma_fulbright/

  adding: chroma_fulbright/ (stored 0%)
  adding: chroma_fulbright/cc64a2d9-e7f3-48f6-a042-4589382c0fa2/ (stored 0%)
  adding: chroma_fulbright/cc64a2d9-e7f3-48f6-a042-4589382c0fa2/header.bin (deflated 61%)
  adding: chroma_fulbright/cc64a2d9-e7f3-48f6-a042-4589382c0fa2/data_level0.bin (deflated 100%)
  adding: chroma_fulbright/cc64a2d9-e7f3-48f6-a042-4589382c0fa2/link_lists.bin (stored 0%)
  adding: chroma_fulbright/cc64a2d9-e7f3-48f6-a042-4589382c0fa2/length.bin (deflated 40%)
  adding: chroma_fulbright/chroma.sqlite3 (deflated 63%)
