# Steps:

- Document Loading
- Text Preprocessing
- Embedding
- Storing

### Imports

impor

In [1]:
import re
import string
from nltk.corpus import stopwords
import numpy as np

In [8]:
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
import pdfplumber
import os
import faiss
import pickle

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/jean-sebastiengaultier/Desktop/UChicago/Q4/Capstone-Mosaic/.venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/jean-sebastiengaultier/Desktop/UChicago/Q4/Capstone-Mosaic/.venv/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()


In [9]:
def get_full_text(pdf_path):
    text = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            # Extract text from page
            page_text = page.extract_text()
            if page_text:
                text += page_text  # Append to overall text
    return text

In [3]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Error loading punkt_tab: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>


False

In [4]:
def get_five_sentence_chunks(text):
    sentences = nltk.sent_tokenize(text)
    chunk_size = 5
    return [" ".join(sentences[i:i + chunk_size]) for i in range(0, len(sentences), chunk_size)]

In [5]:
# Preprocessing function
def preprocess_text(text):
    text = text.lower() # Lowercase the text
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text) # Remove punctuation
    stop_words = set(stopwords.words('english'))
    text = " ".join([word for word in text.split() if word not in stop_words]) # Remove stopwords
    return text

In [6]:
# Embed the chunks
def embed_chunks(documents):
    for document in documents:
        document['chunk_embeddings'] = [{'text': chunk, 'embedding': text_embedder.encode(chunk, convert_to_tensor=True)} for chunk in document['chunks']]
    return documents

In [10]:
data_folder = '../Data/'
documents = []
for filename in os.listdir(data_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(data_folder, filename)
            text = get_full_text(pdf_path)
            raw_chunks = get_five_sentence_chunks(text)
            chunks = [preprocess_text(chunk) for chunk in raw_chunks]
            documents.append({
                    'raw_chunks': raw_chunks,
                    'chunks': chunks,
                    'filename': filename
                })

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/jean-sebastiengaultier/nltk_data'
    - '/Users/jean-sebastiengaultier/Desktop/UChicago/Q4/Capstone-Mosaic/.venv/nltk_data'
    - '/Users/jean-sebastiengaultier/Desktop/UChicago/Q4/Capstone-Mosaic/.venv/share/nltk_data'
    - '/Users/jean-sebastiengaultier/Desktop/UChicago/Q4/Capstone-Mosaic/.venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [12]:
text_embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Text embeddings

In [15]:
# Embed the chunks
def embed_chunks(documents):
    for document in documents:
        document['chunk_embeddings'] = [{'text': chunk, 'embedding': text_embedder.encode(chunk, convert_to_tensor=True), 'raw_text': raw_chunk} for chunk, raw_chunk in zip(document['chunks'], document['raw_chunks'])]
    return documents

# Embed the chunks of all PDFs
embedded_documents = embed_chunks(documents)


### FAISS Vectordatabase

In [18]:

build_folder = 'Vectordatabase/'
if not os.path.exists(build_folder):
    os.makedirs(build_folder)

# Initialize FAISS index for text chunks and image embeddings
embedding_dim_text = 384  # Dimension of the sentence transformer for text

# Create separate FAISS indices for text and image embeddings
index_text = faiss.IndexFlatL2(embedding_dim_text)

# Flatten all chunk embeddings and image embeddings and store their metadata
all_chunk_embeddings = []
chunk_metadata = []

for document in embedded_documents:
    # Add text chunk embeddings to index
    for chunk in document['chunk_embeddings']:
        all_chunk_embeddings.append(chunk['embedding'].cpu().numpy())
        chunk_metadata.append({'filename': document['filename'], 'text': chunk['text'], 'raw_text': chunk['raw_text']})
    

# Convert embeddings to numpy arrays
all_chunk_embeddings = np.array(all_chunk_embeddings)

# Add embeddings to FAISS indices
index_text.add(all_chunk_embeddings)

# Save FAISS indices and metadata
faiss.write_index(index_text, build_folder + 'financial_docs_text_index_sentences.faiss')


with open(build_folder + 'financial_chunks_metadata_sentences.pkl', 'wb') as f:
    pickle.dump(chunk_metadata, f)

