In [6]:
import os 
import tempfile

import chromadb
import ollama
import streamlit as st

from langchain_community.vectorstores import Chroma
from pypdf import PdfReader
import ollama
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from streamlit.runtime.uploaded_file_manager import UploadedFile
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.document_loaders import PyPDFDirectoryLoader

### Document Splitting Functions

In [2]:
def process_document(file_uploaded: UploadedFile) -> list[Document]:
    # Store uploaded file as a temp file
    temp_file = tempfile.NamedTemporaryFile("wb", suffix=".pdf", delete=False)
    temp_file.write(file_uploaded.read())

    docs = ""
    pdf = PdfReader(temp_file.name)
    for page in pdf.pages:
        docs += page.extract_text()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,    
    )

    return text_splitter.split_text(docs)

In [None]:
def split_documents(documents: list[Document]):

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,    
    )

    chunks = documents.load_and_split(text_splitter)

    return text_splitter.split_text(documents)

In [3]:
def get_collection():

    embedding_model = OllamaEmbeddings(model="nomic-embed-text:latest")

    chroma_client = chromadb.PersistentClient(path="./hsc-llm")

    # chroma_client.get_or_create_collection(
    #     name="biology_collection", 
    #     embedding_function=embedding_model,
    #     metadata={"hnsw:space": "cosine"})

    return Chroma(
        client=chroma_client,
        collection_name="biology_collection",
        embedding_function=embedding_model,
    )

In [None]:
def add_documents_to_collection(chunks):
    collection = get_collection()
    documents, ids = [],  []
    count = 0
    for idx, split in enumerate(chunks):
        documents.append(chunks)
        ids.append(f"{file_name}_{idx}")
        count += 1
        print(count)
    collection.add_documents(documents=documents, ids=ids)
    

### Document Loader

In [30]:
directory_path = ("./biology_dataset")
loader = PyPDFDirectoryLoader(directory_path)

In [31]:
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,    
    )

docs = loader.load_and_split(text_splitter)

In [37]:
docs[0]

Document(metadata={'source': 'biology_dataset\\2022-module-8-non-infectious-disease-635c95038fcf9.pdf', 'page': 0}, page_content='Module 8: non-infectious disease and disorders\nConstruct and interpret negative feedback loops that sho w homeostasis by using a range of \nsources, including but not limited to: (ACSBL101, ACSBL 110, ACSBL111)\n●temperature (ACSBL098)\n●glucose\n1.0 Homeostasis:\n●Processes by which biological systems maintain stability and equilibrium by \ncontrolling different factors that play direct role for  survival (temperature, pH, \ndifferent chemicals etc).')

In [36]:
len(docs)

2819

In [43]:
collection = get_collection()
count = 0
idx = 0
for doc in docs:
    documents, ids = [],  []
    documents.append(doc)
    ids.append(f"{doc.metadata['source']}_{idx}")
    count += 1
    idx+=1
    print(count)
    print(f"{doc.metadata['source']}_{idx}")
    collection.add_documents(documents=documents, ids=ids)

1
biology_dataset\2022-module-8-non-infectious-disease-635c95038fcf9.pdf_1
2
biology_dataset\2022-module-8-non-infectious-disease-635c95038fcf9.pdf_2
3
biology_dataset\2022-module-8-non-infectious-disease-635c95038fcf9.pdf_3
4
biology_dataset\2022-module-8-non-infectious-disease-635c95038fcf9.pdf_4
5
biology_dataset\2022-module-8-non-infectious-disease-635c95038fcf9.pdf_5


KeyboardInterrupt: 

In [None]:
all_docs = split_documents(docs)
add_documents_to_collection(all_docs, "biology_dataset")

In [None]:
for doc in docs:
    pages = doc.lo