# Steps:

- Document Loading
- Text Preprocessing
- Embedding
- Storing

### Imports

impor

In [2]:
import re
import string
from nltk.corpus import stopwords
import numpy as np

In [3]:
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
import pdfplumber
import os
import faiss
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def get_full_text(pdf_path):
    text = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            # Extract text from page
            page_text = page.extract_text()
            if page_text:
                text += page_text  # Append to overall text
    return text

In [5]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
def get_five_sentence_chunks(text):
    sentences = nltk.sent_tokenize(text)
    chunk_size = 5
    return [" ".join(sentences[i:i + chunk_size]) for i in range(0, len(sentences), chunk_size)]

In [7]:
# Preprocessing function
def preprocess_text(text):
    text = text.lower() # Lowercase the text
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text) # Remove punctuation
    stop_words = set(stopwords.words('english'))
    text = " ".join([word for word in text.split() if word not in stop_words]) # Remove stopwords
    return text

In [8]:
# Embed the chunks
def embed_chunks(documents):
    for document in documents:
        document['chunk_embeddings'] = [{'text': chunk, 'embedding': text_embedder.encode(chunk, convert_to_tensor=True)} for chunk in document['chunks']]
    return documents

In [11]:
data_folder = '../Data/'
documents = []
for filename in os.listdir(data_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(data_folder, filename)
            text = get_full_text(pdf_path)
            raw_chunks = get_five_sentence_chunks(text)
            chunks = [preprocess_text(chunk) for chunk in raw_chunks]
            documents.append({
                    'raw_chunks': raw_chunks,
                    'chunks': chunks,
                    'filename': filename
                })

In [12]:
text_embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Text embeddings

In [15]:
# Embed the chunks
def embed_chunks(documents):
    for document in documents:
        document['chunk_embeddings'] = [{'text': chunk, 'embedding': text_embedder.encode(chunk, convert_to_tensor=True), 'raw_text': raw_chunk} for chunk, raw_chunk in zip(document['chunks'], document['raw_chunks'])]
    return documents

# Embed the chunks of all PDFs
embedded_documents = embed_chunks(documents)


### FAISS Vectordatabase

In [18]:

build_folder = 'Vectordatabase/'
if not os.path.exists(build_folder):
    os.makedirs(build_folder)

# Initialize FAISS index for text chunks and image embeddings
embedding_dim_text = 384  # Dimension of the sentence transformer for text

# Create separate FAISS indices for text and image embeddings
index_text = faiss.IndexFlatL2(embedding_dim_text)

# Flatten all chunk embeddings and image embeddings and store their metadata
all_chunk_embeddings = []
chunk_metadata = []

for document in embedded_documents:
    # Add text chunk embeddings to index
    for chunk in document['chunk_embeddings']:
        all_chunk_embeddings.append(chunk['embedding'].cpu().numpy())
        chunk_metadata.append({'filename': document['filename'], 'text': chunk['text'], 'raw_text': chunk['raw_text']})
    

# Convert embeddings to numpy arrays
all_chunk_embeddings = np.array(all_chunk_embeddings)

# Add embeddings to FAISS indices
index_text.add(all_chunk_embeddings)

# Save FAISS indices and metadata
faiss.write_index(index_text, build_folder + 'financial_docs_text_index_sentences.faiss')


with open(build_folder + 'financial_chunks_metadata_sentences.pkl', 'wb') as f:
    pickle.dump(chunk_metadata, f)

