In [1]:
# Libraries
import pymupdf
import re
import pandas as pd
from langchain_chroma import Chroma
from langchain_text_splitters import CharacterTextSplitter
import getpass
import os
from langchain_openai import OpenAIEmbeddings

In [None]:
# Strip the leading section number from a title
def clean_title(title):
    """Return ``title`` without a leading numbering prefix.

    The prefix is one or more digits, optional uppercase letters, a literal
    dot, and any whitespace that follows (e.g. "12A. "). Titles that do not
    start with such a prefix are returned unchanged.
    """
    numbering = re.match(r'^\d+[A-Z]*\.\s*', title)
    if numbering is None:
        return title
    # Drop everything the prefix pattern consumed and keep the remainder.
    return title[numbering.end():]

# Relocate the leading section number to the end of the title
def move_prefix_to_end(title):
    """Return ``title`` with its leading numbering prefix moved to the end.

    A prefix is one or more digits, optional uppercase letters, a literal dot
    and at least one whitespace character (e.g. "12A. "). When present, the
    result is the text after the prefix, a single space, then the prefix with
    surrounding whitespace stripped. Titles without such a prefix are returned
    unchanged.
    """
    parsed = re.match(r'^(\d+[A-Z]*\.\s+)(.*)', title)
    if parsed is None:
        return title
    numbering, remainder = parsed.groups()
    return remainder + " " + numbering.strip()

# load and clean CPF pdf
def clean_and_save(file, output_path="modified_document.txt", start_page=8):
    """Mark table-of-contents headings in a PDF's text with "##" and save it.

    Each table-of-contents title is rewritten by ``move_prefix_to_end`` and
    every occurrence of the rewritten title in the page text is prefixed with
    "##". Newlines inside a page are replaced by spaces, pages are joined with
    a single space, and the result is written as UTF-8 text.

    Args:
        file: Path of the PDF to open with ``pymupdf.open``.
        output_path: Destination text file; defaults to the previously
            hard-coded "modified_document.txt".
        start_page: Zero-based index of the first page to extract; defaults
            to the previously hard-coded 8.
            NOTE(review): presumably skips front matter -- confirm.

    Returns:
        None. The marked-up text is written to ``output_path``.
    """
    # The context manager closes the PDF even if extraction raises.
    with pymupdf.open(file) as doc:
        toc_titles = [title for _level, title, _page in doc.get_toc()]

        # Only the move_prefix_to_end() result was ever used: the earlier
        # clean_title() pass was overwritten before being read, so it is
        # dropped here.
        # dict.fromkeys de-duplicates while keeping TOC order, so a repeated
        # title is not marked twice ("####"). Blank markers are skipped
        # because str.replace("", ...) would insert "##" between every
        # character of the page text.
        heading_markers = [
            marker
            for marker in dict.fromkeys(move_prefix_to_end(t) for t in toc_titles)
            if marker.strip()
        ]

        # Collect the marked-up text of each page from start_page onwards.
        modified_content = []
        for page_num in range(start_page, len(doc)):
            text = doc.load_page(page_num).get_text("text").replace('\n', ' ')
            for marker in heading_markers:
                text = text.replace(marker, "##" + marker)
            modified_content.append(text)

    modified_text = " ".join(modified_content)

    # Use a distinct handle name so the `file` parameter is not shadowed.
    with open(output_path, "w", encoding='utf-8') as out_file:
        out_file.write(modified_text)
        

In [None]:
# Run the PDF clean-up on the Central Provident Fund Act PDF; clean_and_save
# writes the marked-up text to a .txt file.
clean_and_save("./Central Provident Fund Act 1953.pdf")

In [1]:
def chunking(textfile, separator="##", chunk_size=200):
    """Read a UTF-8 text file and split it into documents on a separator.

    Args:
        textfile: Path of the text file to read.
        separator: String the text is split on; defaults to the previously
            hard-coded "##".
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``;
            defaults to the previously hard-coded 200.

    Returns:
        The list of documents produced by
        ``CharacterTextSplitter.create_documents``.
    """
    # read as text
    with open(textfile, encoding='utf-8') as f:
        text = f.read()

    # NOTE(review): chunk_overlap is not set, so the splitter's default
    # applies -- confirm that default is intended relative to chunk_size.
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
    )

    # create_documents() splits each input string itself, so the whole text
    # is passed once instead of pre-splitting with split_text() and then
    # splitting every chunk a second time.
    return text_splitter.create_documents([text])

In [2]:
# Split the marked-up text file into documents; `documents` is passed to
# Chroma.from_documents in the embedding cell below.
documents = chunking("modified_document.txt")

### OpenAI Embedding

In [28]:
# Prompt for the key only when the environment does not already provide one,
# so re-running this cell neither re-prompts nor overwrites an existing key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Embed the documents into a Chroma vector store persisted at ./openai_chroma_db
# NOTE(review): re-running against an existing persist directory presumably
# adds the same documents again -- confirm, and clear the directory if so.
db2 = Chroma.from_documents(documents, embeddings, persist_directory="./openai_chroma_db")

In [None]:
# Reload the persisted Chroma vector store (uncomment to reuse it without re-embedding)
# db3 = Chroma(persist_directory="./openai_chroma_db",embedding_function=embeddings)