In [1]:
import re
import string
from nltk.corpus import stopwords
import numpy as np
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
import pdfplumber
import os
import faiss
import pickle
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Lowercase the text
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    stop_words = set(stopwords.words('english'))
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Initialize tokenizer for token-based chunking
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')


In [3]:

# Function to chunk text based on tokens
def chunk_text_by_tokens(text, max_tokens=512):
    tokens = tokenizer.encode(text)
    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk = tokens[i:i+max_tokens]
        chunks.append(tokenizer.decode(chunk, skip_special_tokens=True))
    return chunks

# Initialize text embedding model
text_embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Text embeddings

# Initialize LLM
llm = pipeline("text-generation", model="gpt2", device=0)  # Replace with your preferred LLM


In [5]:
# Function to extract text from PDFs with preprocessing
def extract_text_and_preprocess(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text  # Append to overall text
    # Preprocess the text
    text = preprocess_text(text)
    return text

# Process PDFs and create chunks
def process_pdfs(data_folder):
    documents = []
    for filename in os.listdir(data_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(data_folder, filename)
            text = extract_text_and_preprocess(pdf_path)
            chunks = chunk_text_by_tokens(text, max_tokens=512)
            documents.append({
                'chunks': chunks,
                'filename': filename
            })
    return documents

# Folder containing your PDFs
data_folder = '../../Data/'

# Process the PDFs
documents = process_pdfs(data_folder)

Token indices sequence length is longer than the specified maximum sequence length for this model (47209 > 512). Running this sequence through the model will result in indexing errors


In [6]:




# Embed the chunks
def embed_chunks(documents):
    for document in documents:
        document['chunk_embeddings'] = [{'text': chunk, 'embedding': text_embedder.encode(chunk, convert_to_tensor=True)} for chunk in document['chunks']]
    return documents

# Embed the chunks of all PDFs
embedded_documents = embed_chunks(documents)

# Create a vector database
build_folder = '../Vectordatabase/'
if not os.path.exists(build_folder):
    os.makedirs(build_folder)

# Initialize FAISS index for text chunks
embedding_dim_text = 384  # Dimension of the sentence transformer for text
index_text = faiss.IndexFlatL2(embedding_dim_text)

# Flatten all chunk embeddings and store their metadata
all_chunk_embeddings = []
chunk_metadata = []

for document in embedded_documents:
    for chunk in document['chunk_embeddings']:
        all_chunk_embeddings.append(chunk['embedding'].cpu().numpy())
        chunk_metadata.append({'filename': document['filename'], 'text': chunk['text']})

# Convert embeddings to numpy arrays
all_chunk_embeddings = np.array(all_chunk_embeddings)

# Add embeddings to FAISS index
index_text.add(all_chunk_embeddings)

# Save FAISS index and metadata
faiss.write_index(index_text, os.path.join(build_folder, 'financial_docs_text_index.faiss'))

with open(os.path.join(build_folder, 'financial_chunks_metadata.pkl'), 'wb') as f:
    pickle.dump(chunk_metadata, f)


In [19]:
embedded_documents[0]["chunk_embeddings"][0]

{'text': 'united states securities exchange commission washington dc 20549 form 10k annual report pursuant section 13 15d securities exchange act 1934 fiscal year ended december 31 2023 transition report pursuant section 13 15d securities exchange act 1934 transition period commission file number 00140951 portillos inc exact name registrant specified charter delaware 871104304 state jurisdiction incorporation organization irs employer identification 2001 spring road suite 400 oak brook illinois 60523 address principal executive offices 630 9543773 registrant ’ s telephone number including area code securities registered pursuant section 12b act title class trading symbol name exchange registered class common stock 001 par value per share ptlo nasdaq global select market securities registered pursuant section 12g act none indicate check mark registrant wellknown seasoned issuer defined rule 405 securities act yes indicate check mark registrant required file reports pursuant section 13 s

In [26]:
def query_retrieval(query, index, metadata, k=5):
    # Preprocess and embed the query
    preprocessed_query = preprocess_text(query)
    query_embedding = text_embedder.encode(preprocessed_query, convert_to_tensor=True)

    # Search the index
    distances, indices = index.search(np.array([query_embedding.cpu().numpy()]), k)

    # Check if any indices are returned
    if indices.shape[1] == 0:
        return []  # Return an empty list if no indices found

    # Retrieve documents based on indices
    results = []
    for idx in indices[0]:
        if idx >= 0 and idx < len(metadata):  # Ensure valid index
            results.append(metadata[idx])
    return results

def generate_answer(query, retrieved_docs):
    # Construct context from retrieved documents
    context = "\n".join([doc['text'] for doc in retrieved_docs])  # Ensure you access the correct key

    # Create the prompt for the LLM
    prompt = f"""Use the following pieces of context to answer the question at the end. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer. 
    Use three sentences maximum and keep the answer concise.
    Context: {context}
    Question: {query}
    Helpful Answer:"""

    # Generate answer using the LLM
    return llm(prompt)




In [30]:
# Example usage
query = "What are the earnings in Q3 of 2023 for Portillo's"  # Replace this with an actual query
retrieved_docs = query_retrieval(query, index_text, chunk_metadata)


# if retrieved_docs:
#     answer = generate_answer(query, retrieved_docs)  # Pass both query and retrieved_docs
#     print("Generated Answer:", answer)
# else:
#     print("No relevant documents found.")


{'filename': 'PTLO 2023 Q4 10K.pdf',
 'text': '##based compensation 64 note 14 income taxes 68 note 15 earnings loss per share 70 note 16 contingencies 70 note 17 related party transactions 71 portillos inc form 10k 36table contents report independent registered public accounting firm stockholders board directors portillo ’ s inc opinion financial statements audited accompanying consolidated balance sheets portillo ’ s inc subsidiaries “ company ” december 31 2023 december 25 2022 related consolidated statements operations stockholders ’ members ’ equity cash flows three years period ended december 31 2023 related notes schedule listed index item 15 collectively referred “ financial statements ” opinion financial statements present fairly material respects financial position company december 31 2023 december 25 2022 results operations cash flows three years period ended december 31 2023 conformity accounting principles generally accepted united states america also audited accordance st