# Part 1: Embeddings

In [12]:
import os
import re
import string
from nltk.corpus import stopwords
from typing import List, Dict
import PyPDF2
from langchain.embeddings import HuggingFaceEmbeddings

In [13]:
def load_pdf(file_path: str) -> str:
    """
    Load a PDF file and extract its text content.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

In [14]:
def preprocess_text(text):
    text = text.lower() # Lowercase the text
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text) # Remove punctuation
    stop_words = set(stopwords.words('english'))
    text = " ".join([word for word in text.split() if word not in stop_words]) # Remove stopwords
    return text

In [21]:
def split_text(text: str, chunk_size: int = 500, overlap: int = 200) -> List[str]:
    """
    Split the input text into chunks of specified size with overlap.
    """
    text = preprocess_text(text)
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start = end - overlap
    return chunks


In [22]:
def load_and_split_pdfs(data_folder: str) -> List[Dict]:
    """
    Load all PDFs from a folder and split them into chunks.
    """
    chunks = []
    for filename in os.listdir(data_folder):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(data_folder, filename)
            text = load_pdf(pdf_path)
            split_texts = split_text(text)
            chunks.extend([{"text": chunk, "source": filename} for chunk in split_texts])
    return chunks

In [96]:
from sentence_transformers import SentenceTransformer

def get_embedding_function():
    # Load the Sentence-Transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Return the function to embed texts
    return model.encode

In [97]:
def process_pdfs(data_folder: str) -> List[Dict]:
    """
    Process all PDF files in a folder: load, split into chunks, and prepare for embedding.
    """
    # Load and split the PDFs
    chunks = load_and_split_pdfs(data_folder)
    
    # Get the embedding function
    embedding_function = get_embedding_function()
    
    # Add the embedding function to each chunk for later use
    for chunk in chunks:
        chunk['embedding_function'] = embedding_function
    
    print(f"Processed {len(chunks)} chunks from PDFs in {data_folder}")
    return chunks

In [98]:

if __name__ == "__main__":
    # Example usage
    data_folder = "../../../Data/"
    processed_chunks = process_pdfs(data_folder)
    print(f"Prepared {len(processed_chunks)} chunks for database insertion")

Processed 1308 chunks from PDFs in ../../../Data/
Prepared 1308 chunks for database insertion


In [99]:
processed_chunks[0]

{'text': 'united states securities exchange commission washington dc 20549 form 10k ☒ annual report pursuant section 13 15d securities exchange act 1934 fiscal year ended december 31 2023 ☐ transition report pursuant section 13 15d securities exchange act 1934 transition period commission file number 00140951 portillos inc exact name registrant specified charter delaware 871104304 state jurisdiction incorporation organization irs employer identification 2001 spring road suite 400 oak brook illinois 60523 ',
 'source': 'PTLO 2023 Q4 10K.pdf',
 'embedding_function': <bound method SentenceTransformer.encode of SentenceTransformer(
   (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
   (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoke

# Part 2: Populate DataBase (ChromaDB)

In [100]:
import os
import shutil
import argparse
import faiss 
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings 
from langchain.docstore import InMemoryDocstore  # Updated to use SimpleDocumentStore
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings


In [101]:
FAISS_INDEX_PATH = "../../Vectordatabase/"
DATA_PATH = "../../../Data/"  # Assuming embeddings or processed chunks are saved here
EMBEDDING_SIZE = 384

In [102]:
def load_preprocessed_chunks():
    print(f"Loading preprocessed chunks from {DATA_PATH}")
    
    # Assuming you have a function to load already embedded chunks from embeddings.py
    #chunks = load_embedded_chunks(DATA_PATH)
    chunks = process_pdfs(DATA_PATH)
    
    print(f"Loaded {len(chunks)} chunks")
    return chunks

In [103]:
c = load_preprocessed_chunks()

Loading preprocessed chunks from ../../../Data/
Processed 1308 chunks from PDFs in ../../../Data/
Loaded 1308 chunks


In [104]:
c[0]

{'text': 'united states securities exchange commission washington dc 20549 form 10k ☒ annual report pursuant section 13 15d securities exchange act 1934 fiscal year ended december 31 2023 ☐ transition report pursuant section 13 15d securities exchange act 1934 transition period commission file number 00140951 portillos inc exact name registrant specified charter delaware 871104304 state jurisdiction incorporation organization irs employer identification 2001 spring road suite 400 oak brook illinois 60523 ',
 'source': 'PTLO 2023 Q4 10K.pdf',
 'embedding_function': <bound method SentenceTransformer.encode of SentenceTransformer(
   (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
   (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoke

In [105]:
def save_faiss_index(db):
    print(f"Saving FAISS index to {FAISS_INDEX_PATH}")
    if not os.path.exists(FAISS_INDEX_PATH):
        os.makedirs(FAISS_INDEX_PATH)
    db.save_local(FAISS_INDEX_PATH)

In [114]:
def populate_faiss(chunks):
    print(f"Initializing FAISS index")
    
    # Create FAISS vector store
    embeddings = get_embedding_function()  # Using your embedding function from embeddings.py
    index = faiss.IndexFlatL2(EMBEDDING_SIZE)  # Create FAISS index for L2 similarity search

    # Use the FAISS vector store wrapper from LangChain
    db = FAISS(index=index, embedding_function=embeddings, docstore=InMemoryDocstore({}), index_to_docstore_id={})

    # Add chunks to the FAISS index
    # for chunk in chunks:
    #     db.add_texts(chunk["text"])
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1} of {len(chunks)}")
        db.add_texts(chunk["text"])

    # Save the index
    save_faiss_index(db)

In [115]:
def clear_faiss_index():
    if os.path.exists(FAISS_INDEX_PATH):
        shutil.rmtree(FAISS_INDEX_PATH)
        print(f"FAISS index at {FAISS_INDEX_PATH} cleared")
    else:
        print(f"No FAISS index found at {FAISS_INDEX_PATH}")

In [116]:

def main():
    # # Check if the index should be reset
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--reset", action="store_true", help="Reset the FAISS index.")
    # args = parser.parse_args()

    # if args.reset:
    #     print("✨ Clearing the FAISS index...")
    #     clear_faiss_index()

    # Load preprocessed chunks (from embeddings.py or another source)
    chunks = load_preprocessed_chunks()
    # Add chunks to FAISS index
    populate_faiss(chunks)

In [117]:
if __name__ == "__main__":
    main()

Loading preprocessed chunks from ../../../Data/
Processed 1308 chunks from PDFs in ../../../Data/
Loaded 1308 chunks
Initializing FAISS index
Processing chunk 1 of 1308
Processing chunk 2 of 1308
Processing chunk 3 of 1308
Processing chunk 4 of 1308


KeyboardInterrupt: 