In [7]:
# Libraries
import pymupdf
import re
import pandas as pd
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
import getpass
import os

import streamlit as st

In [2]:
# clean the titles
def clean_title(title):
    """Return ``title`` without its leading section number.

    The number is one or more digits, optionally followed by uppercase
    letters, then a dot and any whitespace (e.g. ``"2. "`` or ``"15A."``).
    Titles that do not start with such a number are returned unchanged.
    """
    numbering = re.match(r'^\d+[A-Z]*\.\s*', title)
    if numbering is None:
        return title
    # Drop everything the numbering pattern consumed at the start.
    return title[numbering.end():]

# Function to move the prefix to the end
def move_prefix_to_end(title):
    """Move a leading section number to the end of the title.

    ``"2. Interpretation"`` becomes ``"Interpretation 2."``. The number must
    be followed by at least one whitespace character; titles that do not
    match are returned unchanged.
    """
    parsed = re.match(r'^(\d+[A-Z]*\.\s+)(.*)', title)
    if parsed is None:
        return title

    number, heading = parsed.groups()
    # The captured number still carries its trailing whitespace; trim it
    # before appending it after the heading.
    return heading + " " + number.strip()

# load and clean CPF pdf
def clean_and_save(file, start_page=8):
    """Extract the PDF's text and mark every section heading with ``##``.

    Titles are read from the PDF's table of contents and rewritten with
    ``move_prefix_to_end`` so that the section number follows the heading
    (``"2. Interpretation"`` -> ``"Interpretation 2."``). Each occurrence of
    such a heading in the page text is then prefixed with ``##``.

    Args:
        file: Path of the PDF to open with ``pymupdf``.
        start_page: Zero-based index of the first page to extract. Defaults
            to 8, the value this notebook has always used (presumably to skip
            front matter -- confirm against the PDF).

    Returns:
        A single string: the text of all extracted pages, newlines replaced
        by spaces, pages joined by a space. Nothing is written to disk; the
        caller is responsible for persisting the returned text.
    """
    # The context manager closes the PDF handle even if extraction fails.
    with pymupdf.open(file) as doc:
        # Each TOC entry unpacks to (level, title, page); only the title is used.
        toc_titles = [title for _level, title, _page in doc.get_toc()]

        # dict.fromkeys() drops repeated headings while keeping TOC order, so
        # a heading is never marked twice ("####"). Blank headings are skipped
        # because str.replace("", ...) would insert the marker between every
        # character of the page.
        headings = [
            heading
            for heading in dict.fromkeys(
                move_prefix_to_end(title) for title in toc_titles
            )
            if heading.strip()
        ]

        modified_content = []
        for page_num in range(start_page, len(doc)):
            text = doc.load_page(page_num).get_text("text").replace('\n', ' ')
            for heading in headings:
                text = text.replace(heading, "##" + heading)
            modified_content.append(text)

    return " ".join(modified_content)
        

In [3]:
# Build the marked-up text and write it to disk: the chunking step below
# reads "modified_document.txt", so it must exist on a fresh run.
modified_text = clean_and_save("./Central Provident Fund Act 1953.pdf")
with open("modified_document.txt", "w", encoding="utf-8") as out_file:
    out_file.write(modified_text)

In [4]:
# Preview only the start of the text; printing the whole Act floods the output.
print(modified_text[:1000])

##Interpretation 2.—(1) In this Act, unless the context otherwise requires — “additional interest” means the interest referred to in section 6(4B); “applicable person” means — (a) a citizen or permanent resident of Singapore; or (b) for the purposes of a provision of this Act or subsidiary legislation made under this Act in which the term appears, any other person of a class prescribed by regulations made under section 77(1) for the purposes of that provision; [Act 36 of 2023 wef 01/04/2024] “approved annuity” means an annuity approved by the Board for the purposes of section 15(6C)(b) or the former section 15(6C)(b); “approved bank” means a bank approved by the Board for the purposes of section 15(6C)(a) or the former section 15(6C)(b); “approved corporation” means any company which — (a) is incorporated in Singapore; (b) is — (i) wholly or partly owned by the Government; (ii) a subsidiary of a company wholly or partly owned by the Government; or (iii) a subsidiary of a statutory boar

In [5]:
def chunking(textfile, chunk_size=200, separator="##"):
    """Read a marked-up text file and split it into LangChain documents.

    Args:
        textfile: Path of a UTF-8 text file in which ``separator`` marks the
            start of each section.
        chunk_size: Target chunk length passed to ``CharacterTextSplitter``.
            It is not a hard cap: the splitter only cuts at ``separator``, so
            a section longer than this stays in one piece and the library
            logs a "Created a chunk of size ..." warning.
        separator: Marker string the text is split on.

    Returns:
        The list of documents produced by
        ``CharacterTextSplitter.create_documents``.
    """
    with open(textfile, encoding='utf-8') as f:
        text = f.read()

    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
    )

    # create_documents() splits each input text itself, so a separate
    # split_text() pass beforehand only repeats the same work.
    return text_splitter.create_documents([text])

In [6]:
# Chunk the marked-up text; chunking() reads this file from disk, so it must already exist.
documents = chunking("modified_document.txt")

Created a chunk of size 20557, which is longer than the specified 200
Created a chunk of size 2184, which is longer than the specified 200
Created a chunk of size 3131, which is longer than the specified 200
Created a chunk of size 1352, which is longer than the specified 200
Created a chunk of size 5796, which is longer than the specified 200
Created a chunk of size 443, which is longer than the specified 200
Created a chunk of size 387, which is longer than the specified 200
Created a chunk of size 917, which is longer than the specified 200
Created a chunk of size 7063, which is longer than the specified 200
Created a chunk of size 7351, which is longer than the specified 200
Created a chunk of size 402, which is longer than the specified 200
Created a chunk of size 1032, which is longer than the specified 200
Created a chunk of size 1360, which is longer than the specified 200
Created a chunk of size 1852, which is longer than the specified 200
Created a chunk of size 276, which is

### OpenAI Embedding

In [8]:
os.environ["OPENAI_API_KEY"] = st.secrets["OPEN_AI_KEY"]
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed the chunks and persist the vector store only when no store directory
# exists yet. A fresh environment therefore gets a populated store, while
# re-running this cell does not embed (and pay for) the same documents again.
# NOTE(review): an existing but empty directory is treated as "already built";
# delete ./openai_chroma_db to force a rebuild.
if not os.path.isdir("./openai_chroma_db"):
    Chroma.from_documents(documents, embeddings, persist_directory="./openai_chroma_db")

In [10]:
# Open the Chroma vector store persisted at ./openai_chroma_db (no documents are added here)
db3 = Chroma(persist_directory="./openai_chroma_db",embedding_function=embeddings)

In [11]:
@st.cache_resource(ttl="1h")
def retriever():
    """Build an MMR retriever over the persisted Chroma vector store.

    The OpenAI API key is copied from Streamlit secrets into the environment
    before the embedding client is created. The store is opened from
    ``./openai_chroma_db`` and queried with ``search_type="mmr"``, ``k=2``
    and ``fetch_k=4``.

    Returns:
        The retriever produced by ``Chroma.as_retriever``.
    """
    os.environ["OPENAI_API_KEY"] = st.secrets["OPEN_AI_KEY"]

    vector_store = Chroma(
        persist_directory="./openai_chroma_db",
        embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
    )

    mmr_settings = {"k": 2, "fetch_k": 4}
    return vector_store.as_retriever(search_type="mmr", search_kwargs=mmr_settings)

In [12]:
# Build the retriever once as a check; the cell output shows its configuration.
retriever()

2024-06-15 12:52:46.809 
  command:

    streamlit run c:\Users\User\.conda\envs\cs605_nlp\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000001DF262E9050>, search_type='mmr', search_kwargs={'k': 2, 'fetch_k': 4})