# Create Vector Database

First we will create the database of vectors and words. For such a small project I don't believe we need a SQLite database, so we will simply save the text data as a CSV file.

In [1]:
# Import necessary libraries
import os
import fitz  # PyMuPDF for PDF text extraction
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from transformers import BertTokenizer
import pandas as pd


  from tqdm.autonotebook import tqdm, trange





The model we will use is a Hugging Face transformer model saved on my local machine.

In [2]:
# Initialize Sentence-BERT model and BERT tokenizer
# model_name is a relative filesystem path, so the embedding model is loaded from a local directory.
model_name = "../models/all-mpnet-base-v2"
model = SentenceTransformer(model_name)
# This tokenizer is only used to count tokens when chunking text (it is passed to chunk_text).
# NOTE(review): token counts come from 'bert-base-uncased' while the embeddings come from the
# model loaded above — confirm both agree on tokenization and on the maximum sequence length.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', clean_up_tokenization_spaces=True)

# FAISS setup for vector search
# The index must be created with the same dimensionality as the vectors the model produces.
embedding_dimension = model.get_sentence_embedding_dimension()  # Get the dimension size of the embeddings
index = faiss.IndexFlatL2(embedding_dimension)  # L2 distance for similarity search




Chunk up the text data so as not to max out the token limit.

In [3]:
# Function to extract text from PDFs using PyMuPDF
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string holding the ``page.get_text()`` output of each page,
        concatenated in page order with no separator added between pages.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raises; the document is
        closed before the exception propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)
    finally:
        # Always release the document, even if a page fails to extract.
        doc.close()
    
def chunk_text(text, tokenizer, max_tokens=512):
    """Greedily pack consecutive sentences into chunks of about ``max_tokens`` tokens.

    The text is split on ``'. '`` (the separator itself is consumed by the
    split). Sentences are accumulated in order; when adding the next sentence
    would push the running token count above ``max_tokens``, the accumulated
    sentences are emitted as one chunk, joined with a single space, and a new
    chunk is started with that sentence.

    Args:
        text: The text to split.
        tokenizer: Any object with a ``tokenize(str)`` method whose result
            supports ``len()``; only the token count is used.
        max_tokens: Token budget per chunk.

    Returns:
        A list of chunk strings in document order. A single sentence whose own
        token count exceeds ``max_tokens`` is kept whole as its own chunk, so
        such a chunk can exceed the budget.
    """
    # Split text into sentences
    sentences = text.split('. ')  # You can adjust the sentence splitter to suit the document type

    chunks = []
    current_chunk = []
    current_chunk_token_count = 0

    for sentence in sentences:
        # Only the number of tokens matters here, not the tokens themselves.
        sentence_token_count = len(tokenizer.tokenize(sentence))

        if current_chunk_token_count + sentence_token_count > max_tokens:
            # Flush the pending chunk only if it actually holds tokens; otherwise an
            # oversized first sentence would produce a spurious empty chunk.
            if current_chunk_token_count > 0:
                chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_chunk_token_count = 0

        current_chunk.append(sentence)
        current_chunk_token_count += sentence_token_count

    # Append the last chunk
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


Create a function that will process the PDFs and store their vector values in the FAISS index.

In [4]:
def process_pdfs_and_store_faiss(pdf_folder, pdf_files, tokenizer, model, faiss_index, text_chunk_df, bm25_corpus, doc_chunks):
    """Extract, chunk and embed each PDF, adding the vectors to a FAISS index.

    For every file the text is extracted, split into chunks, embedded with
    ``model.encode`` and added to ``faiss_index``. Vectors, DataFrame rows and
    BM25 entries are all appended in the same chunk order.

    Args:
        pdf_folder: Directory containing the PDFs.
        pdf_files: File names (relative to ``pdf_folder``) to process, in order.
        tokenizer: Tokenizer handed to ``chunk_text`` for token counting.
        model: Object whose ``encode(chunks, convert_to_tensor=True)`` result
            supports ``.cpu().numpy()``.
        faiss_index: Index receiving the embeddings via ``add``; mutated in place.
        text_chunk_df: DataFrame that the new ``chunk``/``file`` rows are appended
            to; the input frame is not modified, a new frame is returned.
        bm25_corpus: List extended in place with whitespace-split chunks.
        doc_chunks: List extended in place with the file name of each chunk.

    Returns:
        Tuple ``(doc_ids, all_embeddings, text_chunk_df, bm25_corpus, doc_chunks)``
        where ``doc_ids`` holds one file name per stored vector and
        ``all_embeddings`` holds one embedding array per processed PDF.
    """
    doc_ids = []  # Keep track of which document corresponds to which vectors
    all_embeddings = []
    new_rows = []  # Collected here and turned into a DataFrame once at the end

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_folder, pdf_file)
        print(f"Processing: {pdf_file}")

        # Step 1: Extract text from the PDF
        text = extract_text_from_pdf(pdf_path)

        # Step 2: Chunk text into smaller chunks based on sentences and 512 token limit.
        # Blank chunks carry no content, so they are neither embedded nor indexed.
        chunks = [chunk for chunk in chunk_text(text, tokenizer, max_tokens=512) if chunk.strip()]
        if not chunks:
            # Nothing to embed (e.g. no extractable text); avoid adding an empty batch.
            print(f"Skipping (no extractable text): {pdf_file}")
            continue

        # Step 3: Generate embeddings for each chunk
        embeddings = model.encode(chunks, convert_to_tensor=True)
        embeddings = embeddings.cpu().numpy()  # Convert to numpy for FAISS

        # Step 4: Add embeddings to FAISS index
        faiss_index.add(embeddings)

        # Keep track of document IDs and embedding count for reference
        doc_ids.extend([pdf_file] * len(embeddings))
        all_embeddings.append(embeddings)

        # One pass per chunk: record the retrieval row and the BM25 entry together
        for chunk in chunks:
            new_rows.append({"chunk": chunk, "file": pdf_file})
            bm25_corpus.append(chunk.split())  # Tokenized for BM25
            doc_chunks.append(pdf_file)  # Track which document the chunk belongs to

    # A single concat instead of one per chunk keeps this linear in the number of chunks.
    if new_rows:
        text_chunk_df = pd.concat(
            [text_chunk_df, pd.DataFrame(new_rows, columns=["chunk", "file"])],
            ignore_index=True,
        )

    return doc_ids, all_embeddings, text_chunk_df, bm25_corpus, doc_chunks


Process the PDFs by running the function.

In [9]:
# Initialize required variables
pdf_folder = '../data/raw'
# Sorted, case-insensitive listing so the processing order (and therefore the row order of
# the index and the CSV) is the same on every run and '.PDF' files are not missed.
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf'))
text_chunk_df = pd.DataFrame(columns=["chunk", "file"])  # DataFrame to store text chunks and file names
bm25_corpus = []  # To store all chunks for BM25
doc_chunks = []  # To store chunks for BM25 document references

# Start from an empty index: the index is filled in place, so re-running this cell with the
# old index would add every vector again while text_chunk_df starts over, breaking the
# row-for-row correspondence between the index and the CSV.
index = faiss.IndexFlatL2(embedding_dimension)

# Make sure the output folder exists before the slow processing step, not after it.
os.makedirs('../data/processed', exist_ok=True)

# Run the script to process PDFs
doc_ids, all_embeddings, text_chunk_df, bm25_corpus, doc_chunks = process_pdfs_and_store_faiss(
    pdf_folder, pdf_files, tokenizer, model, index, text_chunk_df, bm25_corpus, doc_chunks
)

# Vector i of the index must correspond to row i of the CSV.
assert index.ntotal == len(text_chunk_df), "FAISS index and text_chunk_df are out of sync"

# Save the FAISS index and metadata for future use
faiss.write_index(index, '../data/processed/financial_reports_faiss.index')


# Save text_chunk_df as a CSV for future querying
text_chunk_df.to_csv('../data/processed/text_chunk_df.csv', index=False)

Processing: LSE_AZN_2022.pdf
Processing: LSE_BP_2022.pdf
Processing: LSE_ULVR_2022.pdf
Processing: NASDAQ_AAPL_2022.pdf
Processing: NASDAQ_AMZN_2022.pdf
Processing: NASDAQ_INTC_2022.pdf
Processing: NASDAQ_MSFT_2022.pdf
Processing: NASDAQ_TSLA_2022.pdf
Processing: NYSE_BA_2022.pdf
Processing: NYSE_CVX_2022.pdf
Processing: NYSE_GS_2022.pdf
Processing: NYSE_HSBC_2022.pdf
Processing: NYSE_JNJ_2022.pdf
Processing: NYSE_JPM_2022.pdf
Processing: NYSE_K_2022.pdf
Processing: NYSE_MANU_2022.pdf
Processing: NYSE_PFE_2022.pdf
Processing: NYSE_V_2022.pdf
Processing: NYSE_WK_2022.pdf
Processing: NYSE_WMT_2022.pdf
Processing: NYSE_XOM_2021.pdf
Processing: OTC_NSRGY_2022.pdf
