# Step 1: Dealing with the Financial Documents

#### Import

In [4]:
import re
import string
from nltk.corpus import stopwords
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import os
import pdfplumber
from PIL import Image
from sentence_transformers import SentenceTransformer

In [22]:
import faiss
import numpy as np
import pickle
import torch

## Preprocessing functions for text

In [3]:
# Preprocessing function
def preprocess_text(text):
    text = text.lower() # Lowercase the text
    
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text) # Remove punctuation
    
    stop_words = set(stopwords.words('english'))
    text = " ".join([word for word in text.split() if word not in stop_words]) # Remove stopwords
    return text


In [5]:
# Initialize tokenizer for token-based chunking
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Function to chunk text based on tokens
def chunk_text_by_tokens(text, max_tokens=512):
    tokens = tokenizer.encode(text)
    chunks = []
    
    for i in range(0, len(tokens), max_tokens):
        chunk = tokens[i:i+max_tokens]
        chunks.append(tokenizer.decode(chunk, skip_special_tokens=True))
    
    return chunks


Downloading: 100%|██████████| 350/350 [00:00<00:00, 560kB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 3.37MB/s]
Downloading: 100%|██████████| 455k/455k [00:00<00:00, 4.87MB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 335kB/s]


In [9]:
# Initialize text and image embedding models
text_embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Text embeddings
image_embedder = SentenceTransformer('clip-ViT-B-32')    # Image embeddings (using CLIP model)

# Function to process and embed images
def embed_image(image_path):
    img = Image.open(image_path)
    img = img.convert("RGB")
    # Resize image as needed by the CLIP model
    img = img.resize((224, 224))
    return image_embedder.encode(img)

# Function to extract and embed images from PDFs
def extract_text_and_images_with_preprocessing(pdf_path):
    text = ""
    images = []
    image_embeddings = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            # Extract text from page
            page_text = page.extract_text()
            if page_text:
                text += page_text  # Append to overall text
            
            # Extract images from the page (updated to handle image extraction reliably)
            for img in page.images:
                # Get the image from its bbox
                try:
                    image_object = page.within_bbox((img['x0'], img['top'], img['x1'], img['bottom'])).to_image()
                    # Save the image temporarily to a bytes buffer and open with Pillow
                    img_bytes = io.BytesIO()
                    image_object.save(img_bytes, format='PNG')
                    img_bytes.seek(0)
                    pil_img = Image.open(img_bytes)

                    # Resize and convert image as required by the CLIP model
                    pil_img = pil_img.convert("RGB")
                    pil_img = pil_img.resize((224, 224))

                    # Embed the image
                    img_embedding = image_embedder.encode(pil_img)
                    images.append(pil_img)  # Store the image
                    image_embeddings.append(img_embedding)  # Store the embedding
                except Exception as e:
                    print(f"Error processing image on page {page_num}: {e}")

    # Preprocess and chunk the text
    text = preprocess_text(text)
    chunks = chunk_text_by_tokens(text, max_tokens=512)

    return chunks, images, image_embeddings



ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [10]:
def process_pdfs_with_chunking_and_image_embedding(data_folder):
    documents = []
    for filename in os.listdir(data_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(data_folder, filename)
            chunks, images, image_embeddings = extract_text_and_images_with_preprocessing(pdf_path)
            documents.append({
                'chunks': chunks,
                'images': images,
                'image_embeddings': image_embeddings,
                'filename': filename
            })
    return documents

# Folder containing your PDFs
data_folder = '../Data/'

# Process the PDFs with chunking and image embedding
documents = process_pdfs_with_chunking_and_image_embedding(data_folder)


Error processing image on page 0: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 2: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 3: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 4: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 8: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 8: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 8: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 8: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 12: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 14: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 16: type object 'PdfDocument' has no 

Token indices sequence length is longer than the specified maximum sequence length for this model (47209 > 512). Running this sequence through the model will result in indexing errors


Error processing image on page 0: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 0: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 0: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 0: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 0: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 0: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 0: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 0: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 0: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 0: type object 'PdfDocument' has no attribute '_process_page'
Error processing image on page 0: type object 'PdfDocument' has no att

There is a slight issue with the image embedding so far, but the text has been preprocessed and remoed any stopwords and puctuation as well as split into chunks of identical size.

In [12]:
# Embed the chunks
def embed_chunks(documents):
    for document in documents:
        document['chunk_embeddings'] = [{'text': chunk, 'embedding': text_embedder.encode(chunk, convert_to_tensor=True)} for chunk in document['chunks']]
    return documents

# Embed the chunks of all PDFs
embedded_documents = embed_chunks(documents)


In [16]:

build_folder = 'Build/'
if not os.path.exists(build_folder):
    os.makedirs(build_folder)

# Initialize FAISS index for text chunks and image embeddings
embedding_dim_text = 384  # Dimension of the sentence transformer for text
#embedding_dim_image = 512  # Dimension of CLIP model for image embeddings

# Create separate FAISS indices for text and image embeddings
index_text = faiss.IndexFlatL2(embedding_dim_text)
#index_image = faiss.IndexFlatL2(embedding_dim_image)

# Flatten all chunk embeddings and image embeddings and store their metadata
all_chunk_embeddings = []
#all_image_embeddings = []
chunk_metadata = []
#image_metadata = []

for document in embedded_documents:
    # Add text chunk embeddings to index
    for chunk in document['chunk_embeddings']:
        all_chunk_embeddings.append(chunk['embedding'].cpu().numpy())
        chunk_metadata.append({'filename': document['filename'], 'text': chunk['text']})
    
    # Add image embeddings to index
    # for img_embedding in document['image_embeddings']:
    #     all_image_embeddings.append(img_embedding)
    #     image_metadata.append({'filename': document['filename'], 'image': 'image path'})

# Convert embeddings to numpy arrays
all_chunk_embeddings = np.array(all_chunk_embeddings)
#all_image_embeddings = np.array(all_image_embeddings)

# Add embeddings to FAISS indices
index_text.add(all_chunk_embeddings)
#index_image.add(all_image_embeddings)

# Save FAISS indices and metadata
faiss.write_index(index_text, 'financial_docs_text_index.faiss')
#faiss.write_index(index_image, 'Build/financial_docs_image_index.faiss')


with open('financial_chunks_metadata.pkl', 'wb') as f:
    pickle.dump(chunk_metadata, f)

# with open('Build/financial_images_metadata.pkl', 'wb') as f:
#     pickle.dump(image_metadata, f)


### Query

In [20]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel


In [21]:
# Load the GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Ensure the model is in evaluation mode
model.eval()


Downloading: 100%|██████████| 0.99M/0.99M [00:00<00:00, 8.06MB/s]
Downloading: 100%|██████████| 446k/446k [00:00<00:00, 3.99MB/s]
Downloading: 100%|██████████| 26.0/26.0 [00:00<00:00, 74.1kB/s]
Downloading: 100%|██████████| 1.29M/1.29M [00:00<00:00, 9.25MB/s]
Downloading: 100%|██████████| 665/665 [00:00<00:00, 1.39MB/s]
Downloading: 100%|██████████| 523M/523M [00:17<00:00, 31.4MB/s] 


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [25]:
def query_llm(query, top_k=5):
    # Embed the query using the text embedder
    query_embedding = text_embedder.encode(query, convert_to_tensor=True).cpu().numpy()
    
    # Search the FAISS index for text
    distances, indices = index_text.search(np.array([query_embedding]), top_k)
    
    # Collect relevant text chunks and limit the number of chunks to avoid exceeding token limit
    relevant_chunks = [chunk_metadata[i]['text'] for i in indices[0]]
    
    # Create context string, ensuring it doesn't exceed the maximum length for the model
    context = "\n".join(relevant_chunks)
    
    # Tokenize the context and query to check the total length
    combined_prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    inputs = tokenizer.encode(combined_prompt, return_tensors='pt')

    # Truncate the input to the model's maximum length
    max_length = model.config.n_positions  # Maximum length for the model
    if inputs.size(1) > max_length:
        inputs = inputs[:, -max_length:]  # Keep only the last max_length tokens

    # Generate a response
    with torch.no_grad():
        outputs = model.generate(inputs, max_length=300, num_return_sequences=1)
    
    # Decode the output
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return response


In [26]:
# Example usage
user_query = "How much did total sales increase in Q4?"
response = query_llm(user_query)
print("Response from LLM:")
print(response)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 1024, but ``max_length`` is set to 300. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


Response from LLM:
 campaign increased wages benefits at
##le digit wage inflation continue strategically offset expense increases menu price increases operational efficiencies january 2024 increased certain menu prices approximately 15 continue monitor cost pressures competitive landscape well consumer sentiment inform pricing decisions coming quarters believe strength brand consistency operations ongoing execution disciplined development strategy support business model intend continue develop shareholder value selffunded restaurant development ongoing focus operational excellence 2review fourth quarter 2023 financial results revenues fourth quarter ended december 31 2023 1879 million compared 1509 million fourth quarter ended december 25 2022 increase 370 million 245 increase inclusive favorable impact 53rd week resulted incremental revenue 139 million increase revenues primarily attributed opening new restaurants combined increase samerestaurant sales fourth quarter ended december 3

In [17]:
def search_text(query, index, chunk_metadata, top_k=5):
    query_embedding = text_embedder.encode(query, convert_to_tensor=True).cpu().numpy()
    _, indices = index.search(np.array([query_embedding]), top_k)
    results = [chunk_metadata[i] for i in indices[0]]
    return results

def search_image(image_path, index, image_metadata, top_k=5):
    img_embedding = embed_image(image_path)
    _, indices = index.search(np.array([img_embedding]), top_k)
    results = [image_metadata[i] for i in indices[0]]
    return results

# Example text query
query = "financial performance Q3 2023"
text_results = search_text(query, index_text, chunk_metadata)

# Example image query (with an image path)
# image_path = "path_to_query_image.png"
# image_results = search_image(image_path, index_image, image_metadata)

# Print results
print("Text Results:")
for result in text_results:
    print(f"Filename: {result['filename']}\nChunk Text: {result['text'][:500]}\n")

# print("Image Results:")
# for result in image_results:
#     print(f"Filename: {result['filename']}, Image Path: {result['image']}")


Text Results:
Filename: PTLO 2023 Q4 Press Release (8K).pdf
Chunk Text: 41279 plus general administrative expenses 21550 17707 78835 66892 preopening expenses 3990 2945 9019 4715 depreciation amortization 6525 5104 24313 20907 net income attributable equity method investment 391 276 1401 1083 income loss net 405 129 1035 204 restaurantlevel adjusted ebitda 45736 32049 165171 132506 restaurantlevel adjusted ebitda margin 243 212 243 226 note use 52 53week fiscal year ending sunday prior december 31 fourth quarter 2023 fiscal 2023 consisted 14 weeks 53 weeks respectiv

Filename: PTLO 2023 Q4 Press Release (8K).pdf
Chunk Text: ##ujuastitnedg e inbteirtedsta ra atensd rr oetshtaeur fraacnttolrse • vt hele imdjpuasctte odf e ubniiotndizaat mioanr agcitniv riteiecso onfc oiulira rteiostnasu arnadn dt wefoinrkiteirosn osn oreui rn ocpleurdaetidoi nns th aen ad pprpoefnitdaibxi tloit tyhi • th pere imsepnatactti oofn recent bank failures marketplace including ability access credit • risks asso