# **Retriever Evaluation**

## **Pre-requisites**

1. You have run ```Retrieval_Experiment_1``` to get an experiment output
2. You have scored all the files inside the experiment output
3. You have zipped the experiment output to ```experiment_1_output.zip```

In [None]:
# Install the libraries used by this notebook into the running kernel's environment.
# NOTE(review): no versions are pinned and --upgrade pulls the latest releases, so a later
# run may resolve different package versions — consider pinning once a working set is known.
%pip install --quiet --upgrade bitsandbytes langchain langchain-community langchain-huggingface transformers beautifulsoup4 faiss-gpu rank_bm25 lark langchain_groq ragas

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m116.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.5/157.5 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import pandas as pd
import random
import numpy as np
from google.colab import files
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import LLMContextRecall
from ragas.llms import LangchainLLMWrapper
from langchain_groq import ChatGroq
from google.colab import userdata

## **User Action Required**

1. Run the code below to create the ```experiment_outputs``` folder

2. Upload ```retriever_evaluation.py```

3. Upload the ```experiment_1_output.zip``` file that contains the files you have scored


In [None]:
# Working directory layout: <cwd>/experiment_outputs/experiment_1_output
# The zipped, scored results of Experiment 1 are extracted into this folder.
outputs_root = os.path.join(os.getcwd(), 'experiment_outputs')
experiment_folder = os.path.join(outputs_root, 'experiment_1_output')

# Create the folder (and its parent) up front; a no-op when it already exists.
os.makedirs(experiment_folder, exist_ok=True)

In [None]:
# Upload retriever_evaluation.py
# Opens the Colab upload dialog; pick retriever_evaluation.py so that the later
# `import retriever_evaluation` cell can find it in the working directory.
# The trailing ';' suppresses the display of the value returned by upload().
files.upload();

Saving retriever_evaluation.py to retriever_evaluation.py


In [None]:
# Upload experiment_1_output.zip
# Opens the Colab upload dialog; pick the zip that holds the scored experiment files.
# The file name must be exactly experiment_1_output.zip, because the unzip cell refers to it by name.
# The trailing ';' suppresses the display of the value returned by upload().
files.upload();

Saving experiment_1_output.zip to experiment_1_output.zip


In [None]:
!unzip experiment_1_output.zip -d experiment_outputs/experiment_1_output

Archive:  experiment_1_output.zip
   creating: experiment_outputs/experiment_1_output/score_relevance/
  inflating: experiment_outputs/experiment_1_output/score_relevance/hybrid_retriever_bm25_0.0_faiss_1.0_score_relevance.csv  
  inflating: experiment_outputs/experiment_1_output/score_relevance/hybrid_retriever_bm25_0.1_faiss_0.9_score_relevance.csv  
  inflating: experiment_outputs/experiment_1_output/score_relevance/hybrid_retriever_bm25_0.2_faiss_0.8_score_relevance.csv  
  inflating: experiment_outputs/experiment_1_output/score_relevance/hybrid_retriever_bm25_0.3_faiss_0.7_score_relevance.csv  
  inflating: experiment_outputs/experiment_1_output/score_relevance/hybrid_retriever_bm25_0.4_faiss_0.6_score_relevance.csv  
  inflating: experiment_outputs/experiment_1_output/score_relevance/hybrid_retriever_bm25_0.5_faiss_0.5_score_relevance.csv  
  inflating: experiment_outputs/experiment_1_output/score_relevance/hybrid_retriever_bm25_0.6_faiss_0.4_score_relevance.csv  
  inflating: ex

In [None]:
import retriever_evaluation

## **Evaluate Experiment 1: Hybrid/Ensemble Retriever**

### **Binary Relevance: Mean Average Precision, Mean Reciprocal Rank**

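Mean Average Precision (MAP): for each query, the average precision (AP) is the mean of the precision values measured at every rank that holds a relevant document. MAP is the mean of AP over all queries: $\mathrm{MAP} = \frac{1}{|Q|}\sum_{q \in Q} \mathrm{AP}(q)$

Mean Reciprocal Rank (MRR): for each query, the reciprocal rank is $1/\mathrm{rank}_q$, where $\mathrm{rank}_q$ is the position of the first relevant document. MRR is the mean over all queries: $\mathrm{MRR} = \frac{1}{|Q|}\sum_{q \in Q} \frac{1}{\mathrm{rank}_q}$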

In [None]:
experiment_1_binary_relevance_folder = os.path.join(experiment_folder, 'binary_relevance')

# Results keyed by weight combination: {'bm25_<bm25 weight>_faiss_<faiss weight>': metric value}
experiment_1_mean_ave_precision_res = {}
experiment_1_mean_reciprocal_rank_res = {}

# Seeded generator used ONLY as a fallback when a file carries no complete manual scores,
# so that even placeholder runs are repeatable.
placeholder_rng = random.Random(0)

# One CSV per (bm25 weight, faiss weight) combination.
# Sorting gives a deterministic processing order; non-CSV entries (e.g. checkpoint folders) are skipped.
for f_name in sorted(os.listdir(experiment_1_binary_relevance_folder)):
    if not f_name.endswith('.csv'):
        continue

    # The weights sit at positions 3 and 5 of the '_'-separated file name,
    # e.g. hybrid_retriever_bm25_0.3_faiss_0.7_...
    f_name_split = f_name.split('_')
    bm25_val = f_name_split[3]
    faiss_val = f_name_split[5]
    weight_key = f'bm25_{bm25_val}_faiss_{faiss_val}'

    fp = os.path.join(experiment_1_binary_relevance_folder, f_name)
    full_df = pd.read_csv(fp)

    # Use the manually assigned scores whenever the 'relevant' column is present and fully
    # filled in. Previously the column was always overwritten with random values, which
    # discarded the real scores and made every run report different "best" weights.
    has_scores = 'relevant' in full_df.columns and full_df['relevant'].notna().all()
    if not has_scores:
        print(f'WARNING: {f_name} has no complete "relevant" column; using random placeholder scores')

    # One list of 0/1 relevance flags per query, in the row order of the file.
    relevance_scores = []
    for _, query_df in full_df.groupby('query'):
        if has_scores:
            relevance_scores.append(query_df['relevant'].tolist())
        else:
            relevance_scores.append([placeholder_rng.choice([0, 1]) for _ in range(len(query_df))])

    # Aggregate over all queries for this weight combination.
    experiment_1_mean_ave_precision_res[weight_key] = retriever_evaluation.mean_average_precision(relevance_scores)
    experiment_1_mean_reciprocal_rank_res[weight_key] = retriever_evaluation.mean_reciprocal_rank(relevance_scores)

In [None]:
# Highest mean average precision reached by any weight combination.
max_value_map = max(experiment_1_mean_ave_precision_res.values())

# Keep every combination that reaches that maximum, so ties are all reported.
best_maps = dict(
    filter(lambda item: item[1] == max_value_map, experiment_1_mean_ave_precision_res.items())
)

print('The best weightage for the mean average precision using binary relevance is:')
for combo, score in best_maps.items():
    # Keys have the form 'bm25_<weight>_faiss_<weight>'.
    parts = combo.split('_')
    bm25_weight, faiss_weight = parts[1], parts[-1]
    print(f'Weightage with bm25: {bm25_weight}, faiss: {faiss_weight} with a value of {score}')

The best weightage for the mean average precision using binary relevance is:
Weightage with bm25: 0.0, faiss: 1.0 with a value of 0.7566468253968255


In [None]:
# Highest mean reciprocal rank reached by any weight combination.
max_value_mrr = max(experiment_1_mean_reciprocal_rank_res.values())

# Collect all combinations tied at the maximum.
best_mrrs = {}
for combo, score in experiment_1_mean_reciprocal_rank_res.items():
    if score == max_value_mrr:
        best_mrrs[combo] = score

print('The best weightage for the mean reciprocal rank using binary relevance is:')
for combo, score in best_mrrs.items():
    # Keys have the form 'bm25_<weight>_faiss_<weight>'.
    parts = combo.split('_')
    bm25_weight = parts[1]
    faiss_weight = parts[-1]
    print(f'Weightage with bm25: {bm25_weight}, faiss: {faiss_weight} with a value of {score}')

The best weightage for the mean reciprocal rank using binary relevance is:
Weightage with bm25: 0.8, faiss: 0.2 with a value of 1.0


### **Score Relevance: Mean Normalised Discounted Cumulative Gain**

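For score relevance, the metric is evaluated at a cutoff of k = 5 first.

Mean Normalised Discounted Cumulative Gain (mean NDCG@k): for each query, the discounted cumulative gain of the top-k retrieved documents is divided by the discounted cumulative gain of the ideal (best possible) ordering of the same documents. The result is then averaged over all queries.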

In [None]:
k = 5  # cutoff passed to retriever_evaluation.ndcg_at_k
experiment_1_score_relevance_folder = os.path.join(experiment_folder, 'score_relevance')

# Results keyed by weight combination:
# {'bm25_<bm25 weight>_faiss_<faiss weight>': mean normalised discounted cumulative gain}
experiment_1_mean_normalised_discounted_cumulative_gain_res = {}

# Seeded generator used ONLY as a fallback when a file carries no complete manual scores,
# so that even placeholder runs are repeatable.
placeholder_rng = random.Random(0)

# One CSV per (bm25 weight, faiss weight) combination.
# Sorting gives a deterministic processing order; non-CSV entries are skipped.
for f_name in sorted(os.listdir(experiment_1_score_relevance_folder)):
    if not f_name.endswith('.csv'):
        continue

    # The weights sit at positions 3 and 5 of the '_'-separated file name,
    # e.g. hybrid_retriever_bm25_0.3_faiss_0.7_score_relevance.csv
    f_name_split = f_name.split('_')
    bm25_val = f_name_split[3]
    faiss_val = f_name_split[5]

    fp = os.path.join(experiment_1_score_relevance_folder, f_name)
    full_df = pd.read_csv(fp)

    # Use the manually assigned scores whenever the 'relevant' column is present and fully
    # filled in. Previously the column was always overwritten with random values, which
    # discarded the real scores and made every run report different "best" weights.
    has_scores = 'relevant' in full_df.columns and full_df['relevant'].notna().all()
    if not has_scores:
        print(f'WARNING: {f_name} has no complete "relevant" column; using random placeholder scores')

    # NDCG@k of each query, in the row order of the file.
    ndcg_scores = []
    for _, query_df in full_df.groupby('query'):
        if has_scores:
            graded_relevance = query_df['relevant'].tolist()
        else:
            graded_relevance = [placeholder_rng.choice([0, 1, 2, 3, 4, 5]) for _ in range(len(query_df))]
        ndcg_scores.append(retriever_evaluation.ndcg_at_k(graded_relevance, k))

    # Average over all queries for this weight combination.
    experiment_1_mean_normalised_discounted_cumulative_gain_res[f'bm25_{bm25_val}_faiss_{faiss_val}'] = np.mean(ndcg_scores)

In [None]:
# Highest mean NDCG reached by any weight combination.
# NDCG-specific names are used so that this cell no longer overwrites the MAP results
# (max_value_map / best_maps), and the loop variable no longer clobbers the NDCG cutoff `k`.
max_value_ndcg = max(experiment_1_mean_normalised_discounted_cumulative_gain_res.values())

# Keep every combination that reaches that maximum, so ties are all reported.
best_ndcgs = {
    weight_key: ndcg_value
    for weight_key, ndcg_value in experiment_1_mean_normalised_discounted_cumulative_gain_res.items()
    if ndcg_value == max_value_ndcg
}

print('The best weightage for the mean normalised discounted cumulative gain using score relevance is:')
for weight_key, ndcg_value in best_ndcgs.items():
    # Keys have the form 'bm25_<weight>_faiss_<weight>'.
    key_parts = weight_key.split('_')
    bm25_weight = key_parts[1]
    faiss_weight = key_parts[-1]
    print(f'Weightage with bm25: {bm25_weight}, faiss: {faiss_weight} with a value of {ndcg_value}')

The best weightage for the mean normalised discounted cumulative gain using score relevance is:
Weightage with bm25: 0.3, faiss: 0.7 with a value of 0.7759726857883613


### **Estimated Context Recall with RAGAS**

Calculate using
- Reference/GT answer
- Retrieved context results

To estimate context recall from the Reference/GT answer, the Reference/GT answer is broken into claims

Each claim in the Reference/GT answer is analysed by an LLM to determine if it can be attributed to the retrieved context or not

```
context_recall = number of reference claims that can be attributed to the retrieved context / number of reference claims
```



**Extract the questions**

In [None]:
# From Retrieval Experiment 1 and 2, save questions to a list
question_1 = "best food eat Finland"
question_2 = "best food eat Iceland"

**Fill in ground truth answers for each question**

In [None]:
# Reference ("ground truth") answer for question_1; it is stored in `qna` and passed as the
# `reference` when estimating context recall.
# NOTE(review): the variable name suggests this text was produced by GPT rather than
# written by hand — confirm it is acceptable as ground truth.
question_1_gpt_answer = """
Finland's culinary traditions offer a rich array of flavors, reflecting its natural resources and cultural heritage. Here are some quintessential Finnish dishes to experience:

Karjalanpiirakka (Karelian Pie)
Originating from the Karelia region, these rye crust pastries are traditionally filled with rice porridge and often topped with egg butter. They are a beloved Finnish snack, commonly enjoyed across the country.

Ruisleipä (Rye Bread)
A staple in Finnish cuisine, this dense and dark bread is made from sourdough rye. It's typically enjoyed with butter, cheese, or cold cuts, and forms an essential part of daily meals.

Kalakukko
Hailing from the Savonia region, this traditional dish consists of fish (commonly perch or salmon) and pork baked inside a thick rye bread crust, creating a hearty and portable meal.

Poronkäristys (Sautéed Reindeer)
A specialty from Lapland, this dish features thinly sliced reindeer meat sautéed with onions and butter, typically served with mashed potatoes and lingonberry jam.

Leipäjuusto (Bread Cheese)
Also known as 'squeaky cheese' due to its texture, this mild cheese is often warmed and served with cloudberry jam, offering a unique combination of flavors.

Lohikeitto (Salmon Soup)
A creamy soup made with fresh salmon, potatoes, leeks, and dill, providing a comforting and flavorful experience, especially during colder months.

Mustikkapiirakka (Blueberry Pie)
This traditional dessert features wild Finnish blueberries baked into a pie, often enjoyed with vanilla sauce or ice cream.

Exploring these dishes will provide a genuine taste of Finland's rich culinary heritage.
"""

# Reference ("ground truth") answer for question_2; it is stored in `qna` and passed as the
# `reference` when estimating context recall.
# NOTE(review): the variable name suggests this text was produced by GPT rather than
# written by hand — confirm it is acceptable as ground truth.
question_2_gpt_answer = """
Iceland's culinary scene offers a rich tapestry of traditional dishes that reflect its unique heritage and natural resources. Here are some quintessential Icelandic foods to experience:

Pylsur (Icelandic Hot Dog)
A blend of lamb, pork, and beef, served in a soft bun with toppings like ketchup, sweet mustard, remoulade, and both raw and crispy fried onions. A popular spot to try this is Bæjarins Beztu Pylsur in Reykjavík, renowned for its delicious hot dogs.

Plokkfiskur (Fish Stew)
A hearty mix of white fish (such as cod or haddock), potatoes, onions, and béchamel sauce. This comforting dish showcases Iceland's rich fishing traditions.

Hangikjöt (Smoked Lamb)
Traditionally smoked over birch or dried sheep dung, this lamb is typically served thinly sliced with flatbread or potatoes, especially during festive seasons.

Kjötsúpa (Lamb Soup)
A nourishing soup made with lamb, root vegetables, and herbs, offering warmth during Iceland's colder months.

Skyr
A thick, creamy dairy product similar to yogurt but technically a cheese. It's enjoyed plain or with added flavors like berries and is a staple in Icelandic diets.

Harðfiskur (Dried Fish)
Wind-dried fish, often cod or haddock, served with salted butter. This protein-rich snack has been a traditional staple for centuries.

Kleinur
A twisted doughnut-like pastry, deep-fried and mildly sweet, commonly enjoyed with coffee.

For a contemporary twist on traditional Icelandic cuisine, consider dining at Dill in Reykjavík. As the first Icelandic restaurant awarded a Michelin star, Dill offers innovative dishes that highlight local ingredients.

Exploring these dishes will provide a genuine taste of Iceland's culinary heritage.
"""

# Ground-truth lookup table: query text -> reference answer for that query.
qna = dict(
    zip(
        (question_1, question_2),
        (question_1_gpt_answer, question_2_gpt_answer),
    )
)



**Use RAGAS library to calculate estimated context recall**

In [None]:
# Read the Groq API key from Colab's user secrets and expose it through the environment,
# so the key is never hard-coded in the notebook.
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')
# NOTE(review): no model is passed, so the model used depends on the installed
# langchain_groq version's default — consider naming the model explicitly for reproducibility.
llm = ChatGroq()
# RAGAS context-recall metric, judged by the Groq-hosted LLM wrapped for RAGAS.
context_recall = LLMContextRecall(llm=LangchainLLMWrapper(llm))

In [None]:
experiment_1_binary_relevance_folder = os.path.join(experiment_folder,'binary_relevance')
# Initialise dictionaries: { 'bm25_bm25weight_faiss_faiss_weight' = average estimated context recall}
experiment_1_estimated_context_recall = {}
# For each bm25 weight and faiss weight combination,
# Calculate the context recall over all the queries
for f_name in os.listdir(experiment_1_binary_relevance_folder):
    f_name_split = f_name.split('_')
    bm25_val = f_name_split[3]
    faiss_val = f_name_split[5]
    fp = os.path.join(experiment_1_binary_relevance_folder, f_name)
    full_df = pd.read_csv(fp)
    df_split = [group for query, group in full_df.groupby('query')]
    context_recall_scores = []
    # For each query, calculate the estimated context recall using the retrieved contexts
    for df in df_split:
      question = df['query'].iloc[0]
      reference = qna[question]
      contexts = list(df['retrieved_doc'])
      sample = SingleTurnSample(
          user_input=question,
          response="blank",
          reference=reference,
          retrieved_contexts=contexts,
      )
      context_recall_scores.append(await context_recall.single_turn_ascore(sample))
      print(context_recall_scores[-1])
    experiment_1_estimated_context_recall[f'bm25_{bm25_val}_faiss_{faiss_val}'] = np.mean(context_recall_scores)

0.1111111111111111
0.0
0.1111111111111111
0.0
0.75
0.3333333333333333
0.1111111111111111
0.0
0.75
0.3333333333333333
0.1111111111111111
0.0
0.1111111111111111
0.0
0.75
0.3333333333333333
0.75
0.3333333333333333
0.5
0.1111111111111111
0.75
0.3333333333333333


In [None]:
# Highest estimated context recall reached by any weight combination.
max_value_map = max(experiment_1_estimated_context_recall.values())

# Keep every combination that reaches that maximum, so ties are all reported.
best_maps = dict(
    (combo, score)
    for combo, score in experiment_1_estimated_context_recall.items()
    if score == max_value_map
)

print('The best weightage for the estimated context recall:')
for combo, score in best_maps.items():
    # Keys have the form 'bm25_<weight>_faiss_<weight>'.
    parts = combo.split('_')
    bm25_weight, faiss_weight = parts[1], parts[-1]
    print(f'Weightage with bm25: {bm25_weight}, faiss: {faiss_weight} with a value of {score}')

The best weightage for the estimated context recall:
Weightage with bm25: 0.6, faiss: 0.4 with a value of 0.5416666666666666
Weightage with bm25: 0.9, faiss: 0.1 with a value of 0.5416666666666666
Weightage with bm25: 1.0, faiss: 0.0 with a value of 0.5416666666666666
Weightage with bm25: 0.8, faiss: 0.2 with a value of 0.5416666666666666
Weightage with bm25: 0.7, faiss: 0.3 with a value of 0.5416666666666666
