In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import pickle
from collections import defaultdict

import os
import sys
sys.path.append("..")
from modules.extraction.preprocessing import DocumentProcessing
from modules.extraction.embedding import Embedding
from modules.retrieval.index.bruteforce import FaissBruteForce
from modules.retrieval.search import FaissSearch
from modules.generator.question_answering import QA_Generator

[nltk_data] Downloading package punkt_tab to /Users/jk/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### Task 4 

In a notebook called textwave/notebooks/demo_retrieval.ipynb, using all questions listed in `textwave/qa_resources/questions.tsv`, demonstrate your system's ability to retrieve the nearest neighbors. 

You will compare the retrieval performance of `all-MiniLM-L6-v2` embedding model and another embedding model of your choosing from the available list of [modelsLinks](https://sbert.net/docs/sentence_transformer/pretrained_models.html).  Measure the retrieval (ranking) performance of the two embedding models based on retrieved chunks being in the proper target article (see ArticleFile column in **textwave/qa_resources/questions.tsv**).

For example, given a question with an associated ArticleFile as "S08_set3_a4" a true positive would be a chunk that is extracted from S08_set3_a4.txt.clean. You may choose to use default parameters/configuration for the other modules.

> Note: you may refer to your IronClad index and search implementations. Place them in textwave/modules/retrieval/. Follow IronClad's file structure for the index.

In [2]:
# Experiment configuration shared by every cell below.

# Sentence-transformer checkpoints to compare ('all-MiniLM-L6-v2' is the one
# required by the task description).
MODEL_NAMES = [
    'all-MiniLM-L6-v2',
    'multi-qa-mpnet-base-cos-v1',
    'multi-qa-distilbert-cos-v1',
    'paraphrase-multilingual-MiniLM-L12-v2',
]

# How documents are split into chunks: 'sentence' or 'fixed-length'.
CHUNKING_STRATEGY = 'sentence'

# Directory scanned for the cleaned article files.
STORAGE_DIR = '../storage/'

# Directory where one serialized FAISS index per model is written.
FAISS_INDEX_DIR = '../storage/faiss_index/'

# Metric handed to both the index and the search wrapper.
DISTANCE_METRIC = 'cosine'

In [3]:
def load_faiss_index(filepath):
    """Deserialize and return the object pickled at ``filepath``.

    Args:
        filepath: Path of the pickle file to read (opened in binary mode).

    Returns:
        Whatever object was stored in the file.

    Note:
        Unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(filepath, 'rb') as index_file:
        return pickle.load(index_file)

def generate_embedding(inputs, embedding_model):
    """Encode each text in ``inputs`` with ``embedding_model``.

    Args:
        inputs: Iterable of texts; each one is passed to ``encode`` on its own.
        embedding_model: Object exposing an ``encode(text)`` method.

    Returns:
        A list holding one ``encode`` result per input, in input order.
    """
    return [embedding_model.encode(text) for text in inputs]

In [4]:
# Load the tab-separated question set and keep only the rows that have both
# a question and a target article file.
# NOTE(review): the task text refers to questions.tsv while this path reads
# question.tsv — confirm the file name on disk.
questions_df = (
    pd.read_csv("../qa_resources/question.tsv", sep="\t")
    .dropna(subset=['Question', 'ArticleFile'])
)
print(f"{len(questions_df)} non-empty questions/articlefiles found.")

# Preview the top of the table
print("The first 5 rows of the DataFrame:")
display(questions_df.head())


1032 non-empty questions/articlefiles found.
The first 5 rows of the DataFrame:


Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,S08_set3_a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,S08_set3_a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4


In [5]:
# Pull the question text, reference answer and target article file out of the
# DataFrame as three parallel lists.
questions, target_answers, target_files = (
    questions_df[column].tolist() for column in ('Question', 'Answer', 'ArticleFile')
)

# Every list must have exactly one entry per DataFrame row.
assert all(
    len(values) == len(questions_df)
    for values in (questions, target_answers, target_files)
), "Length of questions, answers, and files must match."

In [6]:
# Generate chunks from the documents
DOCUMENT_SUFFIX = '.txt.clean'

# Fail fast on a misconfigured strategy instead of silently producing zero chunks.
if CHUNKING_STRATEGY not in ('sentence', 'fixed-length'):
    raise ValueError(f"Unsupported CHUNKING_STRATEGY: {CHUNKING_STRATEGY!r}")

# os.listdir returns entries in arbitrary order; sort them so the chunk order
# (and therefore the positions inside the index) is reproducible across runs.
documents = sorted(
    os.path.join(STORAGE_DIR, f) for f in os.listdir(STORAGE_DIR) if f.endswith(DOCUMENT_SUFFIX)
)
print(f"Total number of documents: {len(documents)}")

# Maps article id -> list of chunks extracted from that article.
chunks_dict = defaultdict(list)

# Iterate over the documents and chunk them
for document in documents:
    # Drop the file suffix to get the article id, e.g. "S08_set3_a4".
    document_name = os.path.basename(document)[:-len(DOCUMENT_SUFFIX)]

    # Initialize DocumentProcessing class
    document_processing = DocumentProcessing()

    if CHUNKING_STRATEGY == 'sentence':
        chunks_dict[document_name].extend(document_processing.sentence_chunking(document, num_sentences=15, overlap_size=0))
    elif CHUNKING_STRATEGY == 'fixed-length':
        chunks_dict[document_name].extend(document_processing.fixed_length_chunking(document, chunk_size=256, overlap_size=0))

# Flatten into two parallel lists: chunks[i] was extracted from document_names[i].
chunks = [item for sub_chunks in chunks_dict.values() for item in sub_chunks]
document_names = [doc_name for doc_name, sub_chunks in chunks_dict.items() for _ in sub_chunks]
assert len(chunks) == len(document_names), "Mismatch between chunks and document names"

# Print the number of chunks
print(f"Total number of chunks: {len(chunks)}")


Total number of documents: 150
Total number of chunks: 2351


In [7]:
# Encode every question and every document chunk (the retrieval context) once
# per embedding model. Both dicts are keyed by model name.
question_vectors, context_vectors = {}, {}

for model_name in MODEL_NAMES:
    # A fresh Embedding wrapper is created for each model under comparison.
    embedding_model = Embedding(model_name=model_name)

    question_vectors[model_name] = generate_embedding(questions, embedding_model)
    context_vectors[model_name] = generate_embedding(chunks, embedding_model)

    print(f"With model {model_name}, {len(question_vectors[model_name])} question vectors and {len(context_vectors[model_name])} context vectors are generated.")

With model all-MiniLM-L6-v2, 1032 question vectors and 2351 context vectors are generated.
With model multi-qa-mpnet-base-cos-v1, 1032 question vectors and 2351 context vectors are generated.
With model multi-qa-distilbert-cos-v1, 1032 question vectors and 2351 context vectors are generated.
With model paraphrase-multilingual-MiniLM-L12-v2, 1032 question vectors and 2351 context vectors are generated.


In [8]:
# Store the context-chunk embeddings in one brute-force FAISS index per model.

# Create the output directory up front so saving also works on a fresh checkout.
os.makedirs(FAISS_INDEX_DIR, exist_ok=True)

for model_name in MODEL_NAMES:
    # Build the path once so the save call and the log line cannot drift apart.
    index_path = os.path.join(FAISS_INDEX_DIR, f"faiss_index_{model_name}.pkl")

    # The index dimension is taken from the first context vector of this model.
    faiss_index = FaissBruteForce(dim=len(context_vectors[model_name][0]), metric=DISTANCE_METRIC)
    faiss_index.add_embeddings(np.array(context_vectors[model_name]), metadata=document_names) # metadata is the document name
    faiss_index.save(index_path)
    print(f"FAISS index with model {model_name} is saved in {index_path}")

FAISS index with model all-MiniLM-L6-v2 is saved in ../storage/faiss_index/faiss_index_all-MiniLM-L6-v2.pkl
FAISS index with model multi-qa-mpnet-base-cos-v1 is saved in ../storage/faiss_index/faiss_index_multi-qa-mpnet-base-cos-v1.pkl
FAISS index with model multi-qa-distilbert-cos-v1 is saved in ../storage/faiss_index/faiss_index_multi-qa-distilbert-cos-v1.pkl
FAISS index with model paraphrase-multilingual-MiniLM-L12-v2 is saved in ../storage/faiss_index/faiss_index_paraphrase-multilingual-MiniLM-L12-v2.pkl


In [9]:
# Retrieve the TOP_K nearest chunks for every question with each model's index.
TOP_K = 3  # number of neighbours retrieved per question

# Collect plain records first and build the DataFrame once at the end, so that
# `results` is never a list at one point and a DataFrame at another.
result_records = []
for model_name in MODEL_NAMES:
    # Load the FAISS index
    # NOTE: load_faiss_index unpickles the file; only load indexes you trust.
    faiss_index = load_faiss_index(FAISS_INDEX_DIR + f"faiss_index_{model_name}.pkl")
    faiss_search = FaissSearch(faiss_index, metric=DISTANCE_METRIC)

    # Perform the search for each question
    for question_vector, question, target_answer, target_file in zip(question_vectors[model_name], questions, target_answers, target_files):
        # Only the metadata (document name of each hit) is needed for scoring;
        # distances and indices are discarded.
        _, _, metadata = faiss_search.search(question_vector, k=TOP_K)

        # Store the result
        result_records.append({
            'ModelName': model_name,
            'Question': question,
            'TargetAnswer': target_answer,
            'TargetFile': target_file,
            'MetaResults': metadata,
        })

# Convert the result list to a DataFrame and display the first 10 rows
results = pd.DataFrame(result_records)
display(results.head(10))

Unnamed: 0,ModelName,Question,TargetAnswer,TargetFile,MetaResults
0,all-MiniLM-L6-v2,Was Abraham Lincoln the sixteenth President of...,yes,S08_set3_a4,"[S08_set3_a4, S08_set3_a4, S08_set3_a4]"
1,all-MiniLM-L6-v2,Was Abraham Lincoln the sixteenth President of...,Yes.,S08_set3_a4,"[S08_set3_a4, S08_set3_a4, S08_set3_a4]"
2,all-MiniLM-L6-v2,Did Lincoln sign the National Banking Act of 1...,yes,S08_set3_a4,"[S08_set3_a4, S08_set3_a5, S08_set3_a4]"
3,all-MiniLM-L6-v2,Did Lincoln sign the National Banking Act of 1...,Yes.,S08_set3_a4,"[S08_set3_a4, S08_set3_a5, S08_set3_a4]"
4,all-MiniLM-L6-v2,Did his mother die of pneumonia?,no,S08_set3_a4,"[S08_set4_a4, S09_set4_a4, S08_set3_a10]"
5,all-MiniLM-L6-v2,Did his mother die of pneumonia?,No.,S08_set3_a4,"[S08_set4_a4, S09_set4_a4, S08_set3_a10]"
6,all-MiniLM-L6-v2,How many long was Lincoln's formal education?,18 months,S08_set3_a4,"[S08_set3_a4, S08_set3_a4, S08_set3_a4]"
7,all-MiniLM-L6-v2,How many long was Lincoln's formal education?,18 months.,S08_set3_a4,"[S08_set3_a4, S08_set3_a4, S08_set3_a4]"
8,all-MiniLM-L6-v2,When did Lincoln begin his political career?,1832,S08_set3_a4,"[S08_set3_a4, S08_set3_a4, S08_set3_a4]"
9,all-MiniLM-L6-v2,When did Lincoln begin his political career?,1832.,S08_set3_a4,"[S08_set3_a4, S08_set3_a4, S08_set3_a4]"


In [None]:
# Get performance metrics for each model.
# A question counts as correct when at least one of its retrieved chunks comes
# from the target article, so "accuracy" here is the hit rate over the
# retrieved results.
for model_name in MODEL_NAMES:
    model_results = results[results['ModelName'] == model_name]
    total_questions = len(model_results)

    # Count hits by walking the two columns directly instead of using
    # DataFrame.iterrows(), which builds a Series for every row.
    correct_answers = sum(
        expected_file in retrieved_files
        for expected_file, retrieved_files in zip(model_results['TargetFile'], model_results['MetaResults'])
    )

    # Guard against a model with no result rows rather than raising ZeroDivisionError.
    accuracy = correct_answers / total_questions if total_questions else float('nan')
    print(f"Model {model_name} made {correct_answers} correct answers for {total_questions} questions. Accuracy is {accuracy:.2f}")

Model all-MiniLM-L6-v2 made 823 correct answers for 1032 questions. Accuracy is 0.80
Model multi-qa-mpnet-base-cos-v1 made 864 correct answers for 1032 questions. Accuracy is 0.84
Model multi-qa-distilbert-cos-v1 made 849 correct answers for 1032 questions. Accuracy is 0.82
Model paraphrase-multilingual-MiniLM-L12-v2 made 774 correct answers for 1032 questions. Accuracy is 0.75


#### Summary
In this analysis, four embedding models are used to encode 1032 questions and the 2351 chunks extracted from 150 documents (the context). A FAISS index is then built for each model to retrieve the top 3 chunks for each question; a question counts as correct when at least one of the retrieved chunks comes from its target article. To compare the models fairly, the same chunking (by sentence), indexing (FAISS brute-force index), and retrieval (cosine metric) settings were applied to all of them.

According to the results, `multi-qa-mpnet-base-cos-v1` performs best with the highest accuracy of 0.84, followed by `multi-qa-distilbert-cos-v1` with an accuracy of 0.82 and the baseline `all-MiniLM-L6-v2` with 0.80. `paraphrase-multilingual-MiniLM-L12-v2` performs worst in this case, with an accuracy of 0.75.

The model `multi-qa-mpnet-base-cos-v1` uses the more powerful MPNet architecture, which likely explains its higher accuracy. Notably, the multilingual model `paraphrase-multilingual-MiniLM-L12-v2` shows the lowest accuracy. While its multilingual capability can be beneficial in diverse linguistic contexts, it may sacrifice some accuracy compared to language-specific or English-optimized models.
