# DIRECTORY EMBEDDING FILE

In [None]:

import os 
import tempfile

import chromadb
import streamlit as st

from langchain_chroma import Chroma
from pypdf import PdfReader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import CrossEncoder
from streamlit.runtime.uploaded_file_manager import UploadedFile
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, AIMessage
from langchain_ollama import OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser



### Document Splitting Functions

In [3]:
def process_document(file_uploaded: UploadedFile) -> list[Document]:
    # Store uploaded file as a temp file
    temp_file = tempfile.NamedTemporaryFile("wb", suffix=".pdf", delete=False)
    temp_file.write(file_uploaded.read())

    docs = ""
    pdf = PdfReader(temp_file.name)
    for page in pdf.pages:
        docs += page.extract_text()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,    
    )

    return text_splitter.split_text(docs)

In [5]:
def get_collection():

    embedding_model = OllamaEmbeddings(model="nomic-embed-text:latest")

    chroma_client = chromadb.PersistentClient(path="./hsc-llm")

    # chroma_client.get_or_create_collection(
    #     name="biology_collection", 
    #     embedding_function=embedding_model,
    #     metadata={"hnsw:space": "cosine"})

    return Chroma(
        client=chroma_client,
        collection_name="biology_collection",
        embedding_function=embedding_model,
    )

### Document Loader

In [6]:
directory_path = ("./biology_dataset")
loader = PyPDFDirectoryLoader(directory_path)

In [7]:
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,    
    )

docs = loader.load_and_split(text_splitter)

In [11]:
docs[0]

Document(metadata={'source': 'biology_dataset\\hsc-full-1.pdf', 'page': 0}, page_content='BIOLOGY')

In [12]:
len(docs)

2098

In [13]:
collection = get_collection()
count = 0
idx = 0
for doc in docs:
    documents, ids = [],  []
    documents.append(doc)
    ids.append(f"{doc.metadata['source']}_{idx}")
    count += 1
    idx+=1
    print(count)
    print(f"{doc.metadata['source']}_{idx}")
    collection.add_documents(documents=documents, ids=ids)

1
biology_dataset\hsc-full-1.pdf_1
2
biology_dataset\hsc-full-1.pdf_2
3
biology_dataset\hsc-full-1.pdf_3
4
biology_dataset\hsc-full-1.pdf_4
5
biology_dataset\hsc-full-1.pdf_5
6
biology_dataset\hsc-full-1.pdf_6
7
biology_dataset\hsc-full-1.pdf_7
8
biology_dataset\hsc-full-1.pdf_8
9
biology_dataset\hsc-full-1.pdf_9
10
biology_dataset\hsc-full-1.pdf_10
11
biology_dataset\hsc-full-1.pdf_11
12
biology_dataset\hsc-full-1.pdf_12
13
biology_dataset\hsc-full-1.pdf_13
14
biology_dataset\hsc-full-1.pdf_14
15
biology_dataset\hsc-full-1.pdf_15
16
biology_dataset\hsc-full-1.pdf_16
17
biology_dataset\hsc-full-1.pdf_17
18
biology_dataset\hsc-full-1.pdf_18
19
biology_dataset\hsc-full-1.pdf_19
20
biology_dataset\hsc-full-1.pdf_20
21
biology_dataset\hsc-full-1.pdf_21
22
biology_dataset\hsc-full-1.pdf_22
23
biology_dataset\hsc-full-1.pdf_23
24
biology_dataset\hsc-full-1.pdf_24
25
biology_dataset\hsc-full-1.pdf_25
26
biology_dataset\hsc-full-1.pdf_26
27
biology_dataset\hsc-full-1.pdf_27
28
biology_dataset\