# Ablation Study Calculations
This notebook contains all the code used to run the ablation study and to compute its evaluation results.

In [1]:
import sys
import os
import json
# from dotenv import load_dotenv
from typing import Literal
import time

# load_dotenv()
# DATA_DIR = os.getenv("DATA_DIR_PATH")

In [2]:
# Make the project importable from this notebook.
# The notebook is expected to be started from its own directory, so the current
# working directory is used as the anchor for the relative path below.
notebook_dir = os.getcwd()

# Two levels above the notebook directory. Despite its name, `src_dir` is the
# directory that must be on sys.path for `from src...` imports to resolve.
src_dir = os.path.abspath(os.path.join(notebook_dir, '..', '..'))

# Guard the append so that re-running this cell does not add duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from src.pipelines.pipeline_runner import run_data_through_generator
from src.utils import load_json_to_pipelinedata, save_objects_as_json

  from .autonotebook import tqdm as notebook_tqdm


## Complete the ablation runs

In [7]:
# Valid ablation names. The alias has its own name so that it is not overwritten
# by the string values bound to `what_to_ablate` in the loop below.
AblationName = Literal["query-rewriter", "sparse-retriever", "reranker", "sample-questions", "one-shot"]

################################
# What do you wish to get rid of in this run?
# One complete generation run (control set first, then main set) is executed
# per entry, in order. "query-rewriter" is a valid name but is not in this list.
ablations_to_run = ["sparse-retriever", "reranker", "sample-questions", "one-shot"]
################################

GENERATOR_MODEL = "llama3-70b-8192"
MAIN_BATCH_SIZE = 30          # number of main-set items sent to the generator per batch
MAIN_BATCH_WAIT_SECONDS = 5   # pause between consecutive main-set batches

ABLATION_DATASETS_DIR = os.path.join("..", "..", "data", "ablation", "datasets")
MAIN_DATASETS_DIR = os.path.join("..", "..", "data", "main", "datasets")
COMPLETE_RUNS_DIR = os.path.join("..", "..", "data", "ablation", "complete_runs")

# ablation name -> (input directory, main input file, output file stem, ablation_params flag)
# The control input is always the main input file name prefixed with "control-",
# and the control output is the output stem suffixed with "-control".
ABLATION_RUNS = {
    "query-rewriter": (ABLATION_DATASETS_DIR, "retrieved-(no-query-rewrite).json", "ablation-llama3-no-query-rewrite", "no_rewriter"),
    "sparse-retriever": (ABLATION_DATASETS_DIR, "retrieved-(no-sparse-retrieval).json", "ablation-llama3-no-sparse-retriever", "no_sparse_retriever"),
    # NOTE(review): the reranker run reads the *no-query-rewrite* datasets, exactly as the
    # original cell did. Confirm this is intended and not a copy-paste slip.
    "reranker": (ABLATION_DATASETS_DIR, "retrieved-(no-query-rewrite).json", "ablation-llama3-no-reranker", "no_reranker"),
    "sample-questions": (MAIN_DATASETS_DIR, "test-prompts-rewritten-retrieved.json", "ablation-llama3-no-sample-questions", "no_sample_questions"),
    "one-shot": (MAIN_DATASETS_DIR, "test-prompts-rewritten-retrieved.json", "ablation-llama3-no-one-shot", "no_one_shot"),
}


def load_pipeline_data(path):
    """Read the JSON file at `path` (UTF-8) and convert it with load_json_to_pipelinedata."""
    with open(path, 'r', encoding="utf-8") as file:
        return load_json_to_pipelinedata(json.load(file))


def process_in_batches(data, batch_size, wait_time, output_file, model, ablation_params, verbose=False):
    """Run `data` through the generator in slices of `batch_size` and save each slice.

    Every batch result is passed to save_objects_as_json with rewrite=False, and the
    function sleeps `wait_time` seconds between batches (not after the last one).

    NOTE(review): rewrite=False presumably appends to `output_file`; if so, re-running
    a finished ablation would duplicate its entries. Confirm against save_objects_as_json.
    NOTE: a later cell of this notebook defines a different generator with the same name.
    """
    total_elements = len(data)
    for i in range(0, total_elements, batch_size):
        batch = data[i:i + batch_size]
        res = run_data_through_generator(batch, model, ablation_params=ablation_params, verbose=verbose)
        save_objects_as_json(res, output_file, rewrite=False)
        if i + batch_size < total_elements:
            print("Sleepy time...")
            time.sleep(wait_time)
            print("Okay I'm back now...")
            print(f"i={i}")


# Fail fast on a misspelled ablation name instead of after earlier runs have completed.
for requested in ablations_to_run:
    if requested not in ABLATION_RUNS:
        raise ValueError("You haven't specified a correct ablation to generate.")

for what_to_ablate in ablations_to_run:
    input_dir, input_name, output_stem, param_flag = ABLATION_RUNS[what_to_ablate]

    # Every flag starts disabled; only the component being ablated is switched on.
    ablation_params = {
        "no_rewriter": False,
        "no_sparse_retriever": False,
        "no_reranker": False,
        "no_sample_questions": False,
        "no_one_shot": False
    }
    ablation_params[param_flag] = True

    input_file_main = os.path.join(input_dir, input_name)
    output_file_main = os.path.join(COMPLETE_RUNS_DIR, output_stem + ".json")
    input_file_control = os.path.join(input_dir, "control-" + input_name)
    output_file_control = os.path.join(COMPLETE_RUNS_DIR, output_stem + "-control.json")

    incomplete_pipeline_data_control = load_pipeline_data(input_file_control)
    incomplete_pipeline_data_main = load_pipeline_data(input_file_main)

    # Control set: generated in a single call and written with rewrite=True.
    res = run_data_through_generator(incomplete_pipeline_data_control, GENERATOR_MODEL, ablation_params=ablation_params, verbose=False)
    save_objects_as_json(res, output_file_control, rewrite=True)

    # Main set: generated in batches with a pause in between.
    process_in_batches(incomplete_pipeline_data_main, MAIN_BATCH_SIZE, MAIN_BATCH_WAIT_SECONDS, output_file_main, GENERATOR_MODEL, ablation_params=ablation_params, verbose=False)


UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 15138459: character maps to <undefined>

## Ablation study automatic evaluation

In [8]:
from evals.automatic.test_utils import extract_eval_data
from evals.automatic.hit_ratio_and_mrr import compute_hit_ratio_and_mrr
from evals.automatic.BERTscore import bertscore_computation_pipeline
from evals.automatic.test_utils import append_to_file
from evals.automatic.embedding_similarity import cosine_similarity_pipeline_from_stored_embeddings
from evals.automatic.k_f1_plus_plus import compute_avg_kf1_score
from bert_score import BERTScorer

################################
# Valid ablation names. The alias has its own name so that it is not overwritten
# by the string selection below.
AblationName = Literal["query-rewriter", "sparse-retriever", "reranker", "sample-questions", "one-shot"]

# What do you wish to exclude?
what_to_ablate: AblationName = "one-shot"
################################

# This is where we will store all results
results_file = os.path.join("..", "..", "evals", "ablation", "results", "results.txt")

# ablation name -> file stem of the completed run (the control run adds "-control")
RUN_FILE_STEMS = {
    "query-rewriter": "ablation-llama3-no-query-rewrite",
    "sparse-retriever": "ablation-llama3-no-sparse-retriever",
    "reranker": "ablation-llama3-no-reranker",
    "sample-questions": "ablation-llama3-no-sample-questions",
    "one-shot": "ablation-llama3-no-one-shot",
}

if what_to_ablate not in RUN_FILE_STEMS:
    raise Exception("You haven't specified a correct ablation to evaluate.")

complete_runs_dir = os.path.join("..", "..", "data", "ablation", "complete_runs")
data_file_main = os.path.join(complete_runs_dir, RUN_FILE_STEMS[what_to_ablate] + ".json")
data_file_control = os.path.join(complete_runs_dir, RUN_FILE_STEMS[what_to_ablate] + "-control.json")

# Extract the data to PipelineData format
pipeline_data_main = extract_eval_data(data_file_main)
pipeline_data_control = extract_eval_data(data_file_control)

# Each metric is reported for the control set first and the main set second,
# so the lines appended to the results file stay grouped by metric.
evaluation_sets = [("CONTROL", pipeline_data_control), ("MAIN", pipeline_data_main)]

# --- Hit Rate and MRR ---
# Skipped for the "sample-questions" and "one-shot" ablations
# (presumably because those only change generation, not retrieval; confirm).
if what_to_ablate not in ["sample-questions", "one-shot"]:
    for set_name, pipeline_data in evaluation_sets:
        _, hit_rate, mrr = compute_hit_ratio_and_mrr(pipeline_data)
        append_to_file(results_file, f"{set_name}-{what_to_ablate} Hit Rate: {hit_rate}")
        append_to_file(results_file, f"{set_name}-{what_to_ablate} MRR: {mrr}")

# --- BERTScore ---
# One scorer instance is shared by both sets.
scorer = BERTScorer(model_type='bert-base-uncased')
for set_name, pipeline_data in evaluation_sets:
    _, P_content, R_content, F1_content, _, _, _ = bertscore_computation_pipeline(pipeline_data, scorer)
    append_to_file(results_file, f"{set_name}-{what_to_ablate} BERTscore Precision: {P_content}, Recall: {R_content}, F1: {F1_content}")

# --- Embedding similarity ---
for set_name, pipeline_data in evaluation_sets:
    _, similarity_content, _ = cosine_similarity_pipeline_from_stored_embeddings(pipeline_data)
    append_to_file(results_file, f"{set_name}-{what_to_ablate} all-MiniLM-l6-v2 Cosine Similarity: {similarity_content}")

# --- K-F1++ ---
for set_name, pipeline_data in evaluation_sets:
    avg_kf1_score_content, _, avg_precision_score_content, _, avg_recall_score_content, _ = compute_avg_kf1_score(pipeline_data)
    append_to_file(results_file, f"{set_name}-{what_to_ablate} K-F1++: {avg_kf1_score_content}, Precision: {avg_precision_score_content} Recall: {avg_recall_score_content}")
    

Processing data from pipeline...: 100%|██████████| 30/30 [00:20<00:00,  1.47it/s]
Processing data from pipeline...: 100%|██████████| 300/300 [03:14<00:00,  1.54it/s]
Generating cosine similarities...: 100%|██████████| 30/30 [00:00<00:00, 4128.12it/s]
Generating cosine similarities...: 100%|██████████| 300/300 [00:00<00:00, 5549.20it/s]
Computing K-F1++ scores: 100%|██████████| 30/30 [00:00<00:00, 487.26it/s]
Computing K-F1++ scores: 100%|██████████| 300/300 [00:00<00:00, 729.63it/s]


In [6]:
from evals.automatic.ragas import run_for_pipeline_5_7, compute_average_metrics_from_csv
from evals.automatic.test_utils import extract_eval_data
from evals.automatic.test_utils import append_to_file
import pandas as pd

################################
# Valid ablation names. The alias has its own name so that it is not overwritten
# by the string selection below.
AblationName = Literal["query-rewriter", "sparse-retriever", "reranker", "sample-questions", "one-shot"]

# What do you wish to exclude?
what_to_ablate: AblationName = "one-shot"

# Optional cap on how many main-run samples are scored with RAGAS.
# None keeps every sample; set e.g. 100 if scoring the full main set is too expensive.
RAGAS_MAIN_SAMPLE_LIMIT = None
################################

# This is where we will store all results
results_file = os.path.join("..", "..", "evals", "ablation", "results", "results.txt")

# ablation name -> file stem of the completed run (the control run adds "-control")
RUN_FILE_STEMS = {
    "query-rewriter": "ablation-llama3-no-query-rewrite",
    "sparse-retriever": "ablation-llama3-no-sparse-retriever",
    "reranker": "ablation-llama3-no-reranker",
    "sample-questions": "ablation-llama3-no-sample-questions",
    "one-shot": "ablation-llama3-no-one-shot",
}

if what_to_ablate not in RUN_FILE_STEMS:
    raise Exception("You haven't specified a correct ablation to evaluate.")

complete_runs_dir = os.path.join("..", "..", "data", "ablation", "complete_runs")
data_file_main = os.path.join(complete_runs_dir, RUN_FILE_STEMS[what_to_ablate] + ".json")
data_file_control = os.path.join(complete_runs_dir, RUN_FILE_STEMS[what_to_ablate] + "-control.json")

# Extract the data to PipelineData format
pipeline_data_main = extract_eval_data(data_file_main)
if RAGAS_MAIN_SAMPLE_LIMIT is not None:
    # The batching code later in this cell already slices this object, so slicing is supported.
    pipeline_data_main = pipeline_data_main[:RAGAS_MAIN_SAMPLE_LIMIT]
pipeline_data_control = extract_eval_data(data_file_control)


import time
def process_in_batches(data, batch_size, wait_time, get_context_relevance):
    """Lazily evaluate `data` in consecutive slices of `batch_size` items.

    Each slice is handed to run_for_pipeline_5_7 and the value it returns is
    yielded to the caller. When the caller asks for the next slice, the generator
    first sleeps for `wait_time` seconds and prints progress; no pause happens
    after the final slice.

    Args:
        data: sliceable sequence of items to evaluate.
        batch_size: number of items per slice.
        wait_time: seconds to sleep between two slices.
        get_context_relevance: forwarded unchanged to run_for_pipeline_5_7.

    Yields:
        Whatever run_for_pipeline_5_7 returns for each slice.
    """
    item_count = len(data)
    for i in range(0, item_count, batch_size):
        next_start = i + batch_size
        yield run_for_pipeline_5_7(data[i:next_start], get_context_relevance=get_context_relevance)

        # This was the last slice: nothing to wait for.
        if next_start >= item_count:
            continue

        print("Sleepy time...")
        time.sleep(wait_time)
        print("Okay I'm back now...")
        print(f"i={i}")

"""Compute RAGAS metrics"""
RAGAS_BATCH_SIZE = 25      # main-set items evaluated per RAGAS call
RAGAS_WAIT_SECONDS = 30    # pause between RAGAS calls

# Context relevancy is only computed when the ablation is not
# "sample-questions" or "one-shot".
with_context_relevance = what_to_ablate not in ["sample-questions", "one-shot"]

# NOTE(review): the labels start with Groundedness, Answer Relevancy while the columns
# start with 'answer_relevancy', 'faithfulness'. If compute_average_metrics_from_csv
# returns the averages in column order, the first two labels are swapped in the
# results file. Order kept exactly as in the original cell; confirm against the helper.
if with_context_relevance:
    # compute G, AR, CR, and RCR
    columns_to_average = ['answer_relevancy', 'faithfulness', 'context_relevancy', 'reverse_context_relevancy']
    metric_labels = ["Groundedness", "Answer Relevancy", "Context Relevancy", "Reverse Context Relevancy"]
else:
    # compute G, AR, RCR
    columns_to_average = ['answer_relevancy', 'faithfulness', 'reverse_context_relevancy']
    metric_labels = ["Groundedness", "Answer Relevancy", "Reverse Context Relevancy"]


def _append_ragas_averages(set_name, df):
    """Average the selected RAGAS columns of `df` and append one summary line to the results file."""
    averages = tuple(compute_average_metrics_from_csv(df, columns_to_average))
    if len(averages) != len(metric_labels):
        # Mirrors the ValueError that tuple unpacking raised on a length mismatch.
        raise ValueError(f"expected {len(metric_labels)} averages, got {len(averages)}")
    summary = ", ".join(f"{label}: {value}" for label, value in zip(metric_labels, averages))
    append_to_file(results_file, f"{set_name}-{what_to_ablate} RAGAS {summary}")


# Control set: small enough to be evaluated in a single call.
df_control = run_for_pipeline_5_7(pipeline_data_control, get_context_relevance=with_context_relevance)
_append_ragas_averages("CONTROL", df_control)

time.sleep(RAGAS_WAIT_SECONDS)

# Main set: evaluated batch by batch, then concatenated into one big dataframe.
partial_main_dfs = list(process_in_batches(pipeline_data_main, RAGAS_BATCH_SIZE, RAGAS_WAIT_SECONDS, get_context_relevance=with_context_relevance))
df_main = pd.concat(partial_main_dfs, ignore_index=True)
_append_ragas_averages("MAIN", df_main)

Evaluating: 100%|██████████| 60/60 [00:43<00:00,  1.37it/s]
Evaluating:  40%|████      | 12/30 [00:01<00:01, 14.51it/s]INFO:openai._base_client:Retrying request to /chat/completions in 0.569000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.068000 seconds
Evaluating:  50%|█████     | 15/30 [00:01<00:00, 16.61it/s]INFO:openai._base_client:Retrying request to /chat/completions in 0.096000 seconds
Evaluating:  60%|██████    | 18/30 [00:01<00:00, 17.38it/s]INFO:openai._base_client:Retrying request to /chat/completions in 0.082000 seconds
Evaluating:  83%|████████▎ | 25/30 [00:02<00:00,  8.96it/s]INFO:openai._base_client:Retrying request to /chat/completions in 0.116000 seconds
Evaluating: 100%|██████████| 30/30 [00:04<00:00,  6.29it/s]
Evaluating:  38%|███▊      | 19/50 [00:12<00:11,  2.66it/s]INFO:openai._base_client:Retrying request to /chat/completions in 0.491000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.243000 seconds
INFO

Sleepy time...
Okay I'm back now...
i=0


Evaluating:  44%|████▍     | 22/50 [00:14<00:14,  1.88it/s]INFO:openai._base_client:Retrying request to /chat/completions in 1.061000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.122000 seconds
Evaluating:  56%|█████▌    | 28/50 [00:15<00:08,  2.65it/s]INFO:openai._base_client:Retrying request to /chat/completions in 0.663000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.384000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.457000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.621000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.044000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.281000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.199000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 1.152000 seconds
INFO:openai._base_client:Retrying request to /chat/c

Sleepy time...
Okay I'm back now...
i=25


Evaluating:  36%|███▌      | 18/50 [00:15<00:26,  1.22it/s]INFO:openai._base_client:Retrying request to /chat/completions in 0.726000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.624000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.033000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.806000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.817000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.225000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.746000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.756000 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.751000 seconds
Evaluating:  58%|█████▊    | 29/50 [00:17<00:08,  2.48it/s]INFO:openai._base_client:Retrying request to /chat/completions in 0.142000 seconds
INFO:openai._base_client:Retrying request to /chat/c

Sleepy time...
Okay I'm back now...
i=50


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-6365' coro=<AsyncClient.aclose() done, defined at /Users/victoroldensand/Documents/KTH/master-thesis/codebase/twiga/.venv/lib/python3.11/site-packages/httpx/_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "/Users/victoroldensand/Documents/KTH/master-thesis/codebase/twiga/.venv/lib/python3.11/site-packages/httpx/_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "/Users/victoroldensand/Documents/KTH/master-thesis/codebase/twiga/.venv/lib/python3.11/site-packages/httpx/_transports/default.py", line 385, in aclose
    await self._pool.aclose()
  File "/Users/victoroldensand/Documents/KTH/master-thesis/codebase/twiga/.venv/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 313, in aclose
    await self._close_connections(closing_connections)
  File "/Users/