In [1]:
!pip install pymupdf4llm langchain-community pypdf langchain-core langchain-huggingface langchain-pymupdf4llm

Collecting pymupdf4llm
  Downloading pymupdf4llm-0.0.18-py3-none-any.whl.metadata (4.3 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting langchain-pymupdf4llm
  Downloading langchain_pymupdf4llm-0.2.0-py3-none-any.whl.metadata (6.5 kB)
Collecting pymupdf>=1.25.4 (from pymupdf4llm)
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloadi

In [19]:
!pip install -qU langchain-community faiss-cpu tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [41]:
import os
import pathlib
from langchain_core.documents import Document
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
import tqdm

In [3]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
import csv

In [4]:


def get_default_metadata_keys():
    metadata_keys = ['creationdate', 'keywords', 'source', 'title', 'total_pages']
    return metadata_keys


def get_defaultloader(file, mode):
    return PyMuPDFLoader(file, mode=mode)


class PDFDocLoader:

    def __init__(self, file_path, loader=None, metadata_keys=None, new_metadata=None, mode="page"):
        if new_metadata is None:
            new_metadata = dict()
        self.file_path = file_path
        self.metadata_keys = metadata_keys
        if loader is None:
            loader = get_defaultloader(file_path, mode)
        self.loader = loader
        self.new_metadata = new_metadata

    def load(self) -> list[Document]:
        loader = self.loader
        docs = loader.load()
        for doc in docs:
            metadata = dict()
            if self.metadata_keys:
                metadata = {key: doc.metadata[key] for key in self.metadata_keys if key in doc.metadata}
            metadata.update(self.new_metadata)
            if len(metadata):
                doc.metadata = metadata

        return docs


In [10]:

def get_default_embeddings(model_name="sentence-transformers/all-mpnet-base-v2"):
    """Return the default embedding model."""
    return HuggingFaceEmbeddings(model_name=model_name)

def get_default_splitter(**kwargs):
    """Return the default text splitter."""
    return RecursiveCharacterTextSplitter(chunk_size=kwargs.get("chunk_size"), chunk_overlap=kwargs.get("chunk_overlap"))

def get_default_vector_store(embedding):
    """Return the default vector store."""
    return InMemoryVectorStore(embedding)


class PDFToVectorDB:
    def __init__(self, embedding=None, splitter=None, vector_store=None, **kwargs):
        self.kwargs = kwargs

        if embedding is None:
            embedding = get_default_embeddings()
        self.embedding = embedding

        if splitter is None:
            chunk_size = self.kwargs.get("chunk_size", 1000)
            splitter = get_default_splitter(chunk_size=chunk_size, chunk_overlap=self.kwargs.get("chunk_overlap", int(chunk_size/5)))
        self.splitter = splitter

        if vector_store is None:
            vector_store = get_default_vector_store(self.embedding)
        self.vector_store = vector_store

    def process_pdf_and_store_in_vectorDB(self, file_path, loader=None, metadata_keys=None, new_metadata=None, mode="page"):
        """Process PDF file, split documents, and store in the vector DB."""
        if type(file_path) == type(""):
            file_path = pathlib.Path(file_path)
        file_name = file_path.name
        documents = self.load_pdf(file_path, loader=loader, metadata_keys=metadata_keys, new_metadata=new_metadata, mode=mode)
        if len(documents) !=0:
            split_documents = self.split_documents(documents)
            return self.add_documents_to_vector_store(split_documents, file_name)

    def load_pdf(self, file_path, loader=None, metadata_keys=None, new_metadata=None, mode="page"):
        """Load the PDF file using PDFDocLoader."""
        loader = PDFDocLoader(file_path, loader=loader, metadata_keys=metadata_keys, new_metadata=new_metadata, mode=mode)
        return loader.load()

    def split_documents(self, documents):
        """Split the loaded documents."""
        return self.splitter.split_documents(documents)

    def add_documents_to_vector_store(self, documents, file_name):
        """Add the split documents to the vector store."""
        ids = self.vector_store.add_documents(documents)
        return ids

    def save_local(self, file, index_name):
        self.vector_store.save_local(file, index_name=index_name)

def read_file_url_mapper(filename):
    """
    Reads the contents of a CSV file and returns the data as a list of dictionaries.

    Each dictionary corresponds to a row in the CSV file, with the keys
    being the column names ("file", "URL").

    If the file doesn't exist, it returns an empty list.
    """
    if not os.path.exists(filename):
        return []

    with open(filename, 'r', newline='') as f:
        reader = csv.DictReader(f)
        return [row for row in reader]


In [38]:
gd = pathlib.Path("/content/drive/MyDrive/Colab Notebooks/Capstone/Data")
files = [f for f in gd.iterdir()]

In [7]:
len(files)

673

In [8]:
files[0]

PosixPath('/content/drive/MyDrive/Colab Notebooks/Capstone/Data/Parking_1.pdf')

In [11]:
metadata_keys = [
 'creationdate',
 'source',
 'total_pages',
 'title',
 'keywords']

new_metadata = {"file_name": "test.pdf","url":"example.com"}

embedding = get_default_embeddings()
vector_store = get_default_vector_store(embedding)

processor = PDFToVectorDB(vector_store=vector_store, embedding=embedding)


In [12]:
processor.process_pdf_and_store_in_vectorDB(files[3], mode="single", metadata_keys=metadata_keys, new_metadata=new_metadata)

Added Parking_4.pdf file with 6 splits


['1a9ae7b9-c7a7-4fa1-8f8c-a9c6439bc2d4',
 'b1c15a9f-4205-4050-b34b-6b1fb1fb757e',
 'b1641c9b-4a7d-46bf-a288-1d87e686045e',
 '14a4cc0e-480b-4cb6-88f3-535316f23874',
 'cc514db8-aa75-45e0-8dc7-8a455bf73eb6',
 'b683a182-e2e5-4fcd-b430-7942f2a27276']

In [16]:
file_url_csv = read_file_url_mapper("/content/drive/MyDrive/Colab Notebooks/Capstone/FileURLMapper.csv")

In [17]:
file_url_map = { row["file"]:row["URL"] for row in file_url_csv}

In [20]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS



In [21]:
metadata_keys = [
 'creationdate',
 'source',
 'total_pages',
 'title',
 'keywords']

file_vectore_ids = {}

embedding = get_default_embeddings()

embedding_dim = len(embedding.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=embedding,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

processor = PDFToVectorDB(vector_store=vector_store, embedding=embedding)

In [34]:
for file in tqdm.tqdm(files):
    new_metadata = {"file_name": file.name,"url":file_url_map[file.stem]}
    ids = processor.process_pdf_and_store_in_vectorDB(file, mode="single", metadata_keys=metadata_keys, new_metadata=new_metadata)
    print(f"Added {file.name} file with {len(ids)} splits")
    file_vectore_ids[file.name] = ids

Added Parking_1.pdf file with 1 splits
Added Parking_2.pdf file with 1 splits
Added Parking_3.pdf file with 6 splits
Added Parking_4.pdf file with 6 splits
Added HuskyCT_1.pdf file with 1 splits
Added HuskyCT_2.pdf file with 2 splits
Added HuskyCT_3.pdf file with 3 splits
Added HuskyCT_4.pdf file with 4 splits
Added HuskyCT_5.pdf file with 1 splits
Added HuskyCT_6.pdf file with 3 splits
Added HuskyCT_7.pdf file with 4 splits
Added HuskyCT_8.pdf file with 2 splits
Added HuskyCT_9.pdf file with 2 splits
Added HuskyCT_10.pdf file with 3 splits
Added HuskyCT_11.pdf file with 5 splits
Added HuskyCT_12.pdf file with 2 splits
Added HuskyCT_13.pdf file with 2 splits
Added HuskyCT_14.pdf file with 4 splits
Added HuskyCT_15.pdf file with 9 splits
Added HuskyCT_16.pdf file with 2 splits
Added HuskyCT_17.pdf file with 4 splits
Added HuskyCT_18.pdf file with 15 splits
Added HuskyCT_19.pdf file with 2 splits
Added HuskyCT_20.pdf file with 3 splits
Added HuskyCT_21.pdf file with 2 splits
Added HuskyC

ValueError: not enough values to unpack (expected 2, got 1)

In [36]:
"StudentAdmin_58.pdf" in file_vectore_ids

True

In [42]:
for file in tqdm.tqdm(files):
    if file.name not in file_vectore_ids and file.stem in file_url_map:
        new_metadata = {"file_name": file.name,"url":file_url_map[file.stem]}
        ids = processor.process_pdf_and_store_in_vectorDB(file, mode="single", metadata_keys=metadata_keys, new_metadata=new_metadata)
        file_vectore_ids[file.name] = ids

100%|██████████| 673/673 [00:00<00:00, 554898.09it/s]


In [43]:
vector_store.save_local("faiss_index")


In [44]:
!zip -r faiss_index.zip /content/faiss_index


  adding: content/faiss_index/ (stored 0%)
  adding: content/faiss_index/index.pkl (deflated 76%)
  adding: content/faiss_index/index.faiss (deflated 7%)


In [45]:
new_vector_store = FAISS.load_local(
    "faiss_index", embedding, allow_dangerous_deserialization=True
)

In [46]:
new_vector_store.similarity_search_with_score("How faculty can purchase parking permit?",k=2,
    )

[(Document(id='903dcfc3-3957-4eb4-8092-6e6c01f92528', metadata={'creationdate': '2025-03-01T21:28:42+00:00', 'source': '/content/drive/MyDrive/Colab Notebooks/Capstone/Data/Parking_4.pdf', 'total_pages': 8, 'title': 'Faculty & Staff | Purchasing a Permit in the UConn Parking Portal via Payroll Deduction - Parking - UConn Knowledge Base', 'keywords': '', 'file_name': 'Parking_4.pdf', 'url': 'https://kb.uconn.edu/space/PAR/10894836121'}, page_content='Faculty & Staff | Purchasing a Permit in the\nUConn Parking Portal via Payroll Deduction\nLast updated 21 November, 2024 • 4 min read\nFaculty and staff, as well as special payroll employees, can purchase parking\npermits through the UConn Parking Services Ticketrak 10 Customer Portal. Permits\nare required to park in University lots from 7 am to 5 pm unless otherwise posted.\nStudents and graduate assistants should refer to the respective student article for\nrole-specific instructions.\xa0\xa0\nPurchasing a Permit \n1. Log in to the Ticke

In [53]:
new_vector_store.similarity_search_with_score("How student can view  assesmnet feedback?",k=3,
    )

[(Document(id='63b5e6c9-3e76-4d6d-894b-7681f8e41729', metadata={'creationdate': '2025-03-02T04:15:11+00:00', 'source': '/content/drive/MyDrive/Colab Notebooks/Capstone/Data/HuskyCT-Ultra_93.pdf', 'total_pages': 6, 'title': 'Viewing Assessment Feedback in Ultra Course View (Students) - Teaching and Learning - UConn Knowledge Base', 'keywords': '', 'file_name': 'HuskyCT-Ultra_93.pdf', 'url': 'https://kb.uconn.edu/space/TL/26302382154'}, page_content='Viewing Assessment Feedback in Ultra Course\nView (Students)\nLast updated 30 October, 2024 • 2 min read\nThis article is for students looking to view assignment or test feedback within\nHuskyCT Ultra courses.\nViewing Assignment Feedback \nFrom Course Content \n\uf05aIf you are unable to see View Submission and instead see View\nInstructions, that is because the assignment is still accepting new\nattempts. If one attempt has been graded you will see it under Your Grade\nin the same panel. Click on the Your Grade box to enter the graded\nsub

In [54]:
new_vector_store.similarity_search_with_score("move mail from gmail to outlook",k=3,
    )

[(Document(id='84c03607-0222-425e-abcf-d63fff9246e6', metadata={'creationdate': '2025-03-02T02:47:18+00:00', 'source': '/content/drive/MyDrive/Colab Notebooks/Capstone/Data/IT-Microsoft_4.pdf', 'total_pages': 5, 'title': 'Gmail to Microsoft Outlook - Information Technology - UConn Knowledge Base', 'keywords': '', 'file_name': 'IT-Microsoft_4.pdf', 'url': 'https://kb.uconn.edu/space/IKB/26521141307'}, page_content='Gmail to Microsoft Outlook\nLast updated 26 February, 2024 • 3 min read\nGmail and Microsoft 365 are both email services used to send messages. This\narticle covers the key differences in the browser experience and additional features.\n\uf05aStudents’ emails will be transferred to Outlook as part of the migration from\nGoogle to Microsoft 365 during the summer.\nGmail\nOutlook\nBasic Functionality \nLogging in: Students log in to Gmail using their email address and password.\nTo log into Outlook, you can use your email address and NetID password.\nChecking mail: Like in Gmai

In [55]:
new_vector_store.similarity_search_with_score("creat hidden group from student",k=3,
    )

[(Document(id='beeb64f4-6d1e-4380-a20a-d987c50a7b82', metadata={'creationdate': '2025-03-02T04:09:03+00:00', 'source': '/content/drive/MyDrive/Colab Notebooks/Capstone/Data/HuskyCT-Ultra_31.pdf', 'total_pages': 11, 'title': 'Creating Groups in Ultra Course View - Teaching and Learning - UConn Knowledge Base', 'keywords': '', 'file_name': 'HuskyCT-Ultra_31.pdf', 'url': 'https://kb.uconn.edu/space/TL/26052199835'}, page_content='Group Visibility \nIn Ultra, Group Sets can be visible or hidden from students.\n\x0cIf the Group Set is visible to students, they will be able to see their group and\ngroup members from the Groups tab.\nAny Group Set that will be used for a Group Assignment or Group Discussion must\nbe set to “Visible to students” before it can be selected from the\nAssignment/Discussion Settings panel.\n1. To adjust the visibility setting of a Group Set, click on the Groups tab.\n2. The visibility status of the Group Set will be indicated under the set name. To\nadjust the visi

In [47]:
file_vectore_ids

{'Parking_1.pdf': ['238f919c-c3fc-4813-9e23-086c8c778a0c'],
 'Parking_2.pdf': ['f9b856bb-7755-4fdc-a4f6-bf2c091f5fd4'],
 'Parking_3.pdf': ['3853d65c-100c-4d57-9469-ec890f924edc',
  '1a31dc20-97c8-4c09-9610-3c850237aabc',
  '2f0f876f-4e55-41b6-baeb-9885e1daa3e4',
  'a2bcb647-67cc-4536-ab5a-1f47c8a82c32',
  '33470fce-0a21-44e2-92a2-201d432a8900',
  '522709b2-b090-427a-87f7-6e6552453d57'],
 'Parking_4.pdf': ['68ade012-9421-4e16-a4a9-f9c3580e3cb4',
  'e5682a54-83e1-4674-b544-5bd46247c42e',
  'e8a35f67-ac46-44f7-97c1-1e3d43226349',
  '08627c92-5f6a-4c09-bdb5-c31192dc130e',
  '7749cd85-b3c1-481c-a6d0-cdcf7201fa79',
  '8c170a74-531c-4b93-904e-e98dd29e2a1e'],
 'HuskyCT_1.pdf': ['979af63f-feb8-4315-8a5b-f27c85fe459b'],
 'HuskyCT_2.pdf': ['3a7ef516-9b52-4eff-a5e0-e9ce853f86ba',
  '3a1a0322-54f1-45af-be21-65a0345f56fc'],
 'HuskyCT_3.pdf': ['383d8321-3d33-447c-b1c5-546ac40cd8ef',
  '987d0249-3203-4297-bd99-6a718403ee27',
  'e1ec72e3-26e0-4750-bbab-7ec1e4294384'],
 'HuskyCT_4.pdf': ['bec32e08-c6ca-

In [48]:
import json



with open("vectorDB_ids.json", "w") as file:
    json.dump(file_vectore_ids, file, indent=4)

In [49]:
new_vector_store.index.ntotal

2005

In [50]:
count = 0
for k in file_vectore_ids:
    count += len(file_vectore_ids[k])

In [51]:
count

1990

In [52]:
len(file_vectore_ids)

672