# Query FAISS Database

This is the retrieval step of RAG (retrieval-augmented generation). Now that the database has been created, we rank the text chunks by cosine similarity or Euclidean distance relative to the vectorised query.

In [4]:
# Import necessary libraries
import os
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Locations of the artefacts this notebook reads (relative paths)
faiss_index_path = '../data/processed/financial_reports_faiss.index'
model_name = "../models/all-mpnet-base-v2"

# FAISS index created in the earlier step
index = faiss.read_index(faiss_index_path)

# Text chunk table; the retrieval functions read its "chunk" and "file" columns
# and look rows up by the positions FAISS returns
text_chunk_df = pd.read_csv('../data/processed/text_chunk_df.csv')

# Sentence-BERT model used to embed queries
model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange







Define the FAISS cosine function.

In [8]:
def faiss_only_retrieval_cosine(query, faiss_index, model, text_chunk_df, top_n=5):
    """Retrieve the top_n text chunks for a query, scored with a unit-length query vector.

    The query is embedded with ``model``, L2-normalised, and passed to
    ``faiss_index.search``. The reported scores are whatever the index returns;
    they are cosine similarities only if the index uses the inner-product metric
    over L2-normalised vectors.
    NOTE(review): confirm how the index was built.

    Args:
        query: Query string to embed.
        faiss_index: Object exposing ``search(x, k=...)`` returning ``(D, I)``.
        model: Encoder whose ``encode([query], convert_to_tensor=True)`` result
            supports ``.cpu().numpy()``.
        text_chunk_df: DataFrame with ``"chunk"`` and ``"file"`` columns, looked
            up by the row positions in ``I``.
        top_n: Number of neighbours requested from the index.

    Returns:
        A list of dicts with keys ``"file_name"``, ``"text_chunk"`` and
        ``"similarity_score"``, in the order returned by the index. Labels that
        do not map to a row of ``text_chunk_df`` are skipped, so the list may be
        shorter than ``top_n``.
    """
    # Step 1: embed the query (one row per input string; a single row here)
    query_embedding = model.encode([query], convert_to_tensor=True).cpu().numpy()

    # Normalise each row to unit length. A zero norm is replaced by 1 so an
    # all-zero embedding stays zero instead of turning into NaNs.
    norms = np.linalg.norm(query_embedding, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    # FAISS's Python search expects a C-contiguous float32 matrix.
    query_embedding = np.ascontiguousarray(query_embedding / norms, dtype=np.float32)

    # Step 2: ask FAISS for the top_n nearest entries
    D, I = faiss_index.search(query_embedding, k=top_n)

    n_chunks = len(text_chunk_df)
    results = []
    for score, chunk_idx in zip(D[0], I[0]):
        # FAISS pads the label array with -1 when it finds fewer than k
        # neighbours. A plain "< len" check lets -1 through, and iloc[-1]
        # would then return the last chunk as a bogus hit.
        if not 0 <= chunk_idx < n_chunks:
            continue
        row = text_chunk_df.iloc[int(chunk_idx)]
        results.append({
            "file_name": row["file"],
            "text_chunk": row["chunk"],
            "similarity_score": score,  # score exactly as reported by the index
        })

    return results

Define the FAISS Euclidean distance function.

In [9]:
def faiss_only_retrieval_euclidean(query, faiss_index, model, text_chunk_df, top_n=5):
    """Retrieve the top_n text chunks for a query, scored from the index distance.

    The query is embedded with ``model`` (no normalisation) and passed to
    ``faiss_index.search``. Each distance ``d`` reported by the index is mapped
    to ``1 / (1 + d)``, so a distance of 0 gives a score of 1 and larger
    distances give smaller scores.
    NOTE(review): FAISS L2 indexes report *squared* Euclidean distance; confirm
    the index metric before interpreting the score.

    Args:
        query: Query string to embed.
        faiss_index: Object exposing ``search(x, k=...)`` returning ``(D, I)``.
        model: Encoder whose ``encode([query], convert_to_tensor=True)`` result
            supports ``.cpu().numpy()``.
        text_chunk_df: DataFrame with ``"chunk"`` and ``"file"`` columns, looked
            up by the row positions in ``I``.
        top_n: Number of neighbours requested from the index.

    Returns:
        A list of dicts with keys ``"file_name"``, ``"text_chunk"`` and
        ``"similarity_score"``, in the order returned by the index. Labels that
        do not map to a row of ``text_chunk_df`` are skipped, so the list may be
        shorter than ``top_n``.
    """
    # Step 1: embed the query (one row per input string; a single row here)
    query_embedding = model.encode([query], convert_to_tensor=True).cpu().numpy()

    # Step 2: ask FAISS for the top_n nearest entries
    D, I = faiss_index.search(query_embedding, k=top_n)

    n_chunks = len(text_chunk_df)
    results = []
    for distance, chunk_idx in zip(D[0], I[0]):
        # FAISS pads the label array with -1 when it finds fewer than k
        # neighbours. A plain "< len" check lets -1 through, and iloc[-1]
        # would then return the last chunk as a bogus hit.
        if not 0 <= chunk_idx < n_chunks:
            continue
        row = text_chunk_df.iloc[int(chunk_idx)]
        results.append({
            "file_name": row["file"],
            "text_chunk": row["chunk"],
            "similarity_score": 1 / (1 + distance),  # distance -> similarity
        })

    return results


Run the query.

In [13]:
query = "What is happening to the electric car market? Is it expected to grow or shrink?"

# Distance-based retrieval is used here; to rank with the cosine variant instead:
# results = faiss_only_retrieval_cosine(query, index, model, text_chunk_df)
results = faiss_only_retrieval_euclidean(query, index, model, text_chunk_df)

# Show each hit: source file and score on one line, then the chunk and a blank line
for result in results:
    header = f"File: {result['file_name']}, Similarity: {result['similarity_score']}"
    body = f"Text Chunk: {result['text_chunk']}\n"
    print(header, body, sep="\n")

File: NASDAQ_TSLA_2022.pdf, Similarity: 0.5975007786962108
Text Chunk: As a result, the market for our vehicles could be 
negatively affected by numerous factors, such as: 
•
perceptions about electric vehicle features, quality, safety, performance and cost;
•
perceptions about the limited range over which electric vehicles may be driven on a single battery charge, and access to charging 
facilities; 
•
competition, including from other types of alternative fuel vehicles, plug-in hybrid electric vehicles and high fuel-economy internal 
combustion engine vehicles; 
•
volatility in the cost of oil, gasoline and energy, such as wide fluctuations in crude oil prices during 2020; 
•
government regulations and economic incentives and conditions; and
•
concerns about our future viability.
Finally, the target demographics for our vehicles, particularly Model 3 and Model Y, are highly competitive Sales of vehicles in the automotive 
industry tend to be cyclical in many markets, which may expose