# Ablation Study Calculations
This notebook contains all my code to run the ablation study and to compute its results.

In [15]:
import sys
import os
import json
from dotenv import load_dotenv
from typing import Literal

# Pull variables from a local .env file (if any) into the process environment
# before anything below reads them.
load_dotenv()

# Location of the data directory, taken from the DATA_DIR_PATH environment
# variable; this is None when the variable is not set.
DATA_DIR = os.environ.get("DATA_DIR_PATH")

In [9]:
# Make the project's packages importable from this notebook.
# The kernel's working directory is taken as the notebook's location.
notebook_dir = os.getcwd()

# Two levels up from the notebook. Despite the variable name, this is the
# directory that must *contain* the `src` package, because the imports below
# are written as `from src... import ...`.
src_dir = os.path.abspath(os.path.join(notebook_dir, '..', '..'))

# Only add the entry once, so re-running this cell does not keep growing
# sys.path with duplicates.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from src.pipelines.pipeline_runner import run_data_through_generator
from src.utils import load_json_to_pipelinedata, save_objects_as_json

## Complete the ablation runs

In [21]:
# Generate the completed pipeline runs (control + main) for ONE ablation.
# Pick the component to remove via `what_to_ablate`; the cell then loads the
# matching input datasets, runs them through the generator with the
# corresponding ablation flag set, and saves both results as JSON.

# The component names that can be ablated. Kept as a separate alias so that
# `what_to_ablate` is annotated with it instead of being bound to the typing
# object and then immediately overwritten.
AblationTarget = Literal["query-rewriter", "sparse-retriever", "reranker", "sample-questions", "one-shot"]

################################
# What do you wish to get rid of in this run?
what_to_ablate: AblationTarget = "query-rewriter"
################################

# Model identifier passed to the generator for every run in this cell.
GENERATOR_MODEL = "llama3-70b-8192"

# All paths are relative to the notebook's working directory.
ablation_datasets_dir = os.path.join("..", "..", "data", "ablation", "datasets")
main_datasets_dir = os.path.join("..", "..", "data", "main", "datasets")
complete_runs_dir = os.path.join("..", "..", "data", "ablation", "complete_runs")

# One row per ablation:
#   (flag to switch on in ablation_params,
#    directory holding the input datasets,
#    file name of the main input dataset,
#    stem of the output file names)
# The control input is the main input file name prefixed with "control-";
# the control output is the output stem suffixed with "-control".
ablation_runs = {
    "query-rewriter": ("no_rewriter", ablation_datasets_dir, "retrieved-(no-query-rewrite).json", "ablation-llama3-no-query-rewrite"),
    "sparse-retriever": ("no_sparse_retriever", ablation_datasets_dir, "retrieved-(no-sparse-retrieval).json", "ablation-llama3-no-sparse-retriever"),
    # NOTE(review): this run reads the *no-query-rewrite* datasets, exactly as the
    # "query-rewriter" run does. That may be a copy-paste slip that would confound
    # the reranker ablation with the rewriter ablation -- confirm the intended input.
    "reranker": ("no_reranker", ablation_datasets_dir, "retrieved-(no-query-rewrite).json", "ablation-llama3-no-reranker"),
    "sample-questions": ("no_sample_questions", main_datasets_dir, "test-prompts-rewritten-retrieved.json", "ablation-llama3-no-sample-questions"),
    "one-shot": ("no_one_shot", main_datasets_dir, "test-prompts-rewritten-retrieved.json", "ablation-llama3-no-one-shot"),
}

# Every flag starts off; exactly one is switched on below.
ablation_params = {
    "no_rewriter": False,
    "no_sparse_retriever": False,
    "no_reranker": False,
    "no_sample_questions": False,
    "no_one_shot": False,
}

if what_to_ablate not in ablation_runs:
    raise ValueError("You haven't specified a correct ablation to generate.")

flag_name, input_dir, input_name, output_stem = ablation_runs[what_to_ablate]

input_file_main = os.path.join(input_dir, input_name)
input_file_control = os.path.join(input_dir, "control-" + input_name)
output_file_main = os.path.join(complete_runs_dir, output_stem + ".json")
output_file_control = os.path.join(complete_runs_dir, output_stem + "-control.json")
ablation_params[flag_name] = True

# Load both inputs up front so a missing or malformed file fails before any
# generation work starts. The encoding is explicit so decoding does not depend
# on the platform's default locale.
with open(input_file_control, 'r', encoding="utf-8") as file:
    data_control = json.load(file)
incomplete_pipeline_data_control = load_json_to_pipelinedata(data_control)

with open(input_file_main, 'r', encoding="utf-8") as file:
    data_main = json.load(file)
incomplete_pipeline_data_main = load_json_to_pipelinedata(data_main)

# Control run first, then the main run; each result is saved as soon as it is produced.
res = run_data_through_generator(incomplete_pipeline_data_control, GENERATOR_MODEL, ablation_params=ablation_params, verbose=False)
save_objects_as_json(res, output_file_control, rewrite=True)

res = run_data_through_generator(incomplete_pipeline_data_main, GENERATOR_MODEL, ablation_params=ablation_params, verbose=False)
save_objects_as_json(res, output_file_main, rewrite=True)

False


## Ablation study automatic evaluation

In [None]:
from evals.automatic.test_utils import extract_eval_data
from evals.automatic.hit_ratio_and_mrr import compute_hit_ratio_and_mrr
from evals.automatic.test_utils import append_to_file

# Evaluate the completed runs (control + main) of ONE ablation and append the
# metrics to the shared results file.

################################
# What do you wish to get rid of in this run?
what_to_ablate = "query-rewriter"
################################

# This is where we will store all results
results_file = os.path.join("..", "..", "evals", "ablation", "results", "results.txt")

# Stem of the completed-run file names for each ablation. The main run is
# "<stem>.json" and the control run is "<stem>-control.json".
complete_run_stems = {
    "query-rewriter": "ablation-llama3-no-query-rewrite",
    "sparse-retriever": "ablation-llama3-no-sparse-retriever",
    "reranker": "ablation-llama3-no-reranker",
    "sample-questions": "ablation-llama3-no-sample-questions",
    "one-shot": "ablation-llama3-no-one-shot",
}

if what_to_ablate not in complete_run_stems:
    raise ValueError("You haven't specified a correct ablation to evaluate.")

complete_runs_dir = os.path.join("..", "..", "data", "ablation", "complete_runs")
data_file_main = os.path.join(complete_runs_dir, complete_run_stems[what_to_ablate] + ".json")
data_file_control = os.path.join(complete_runs_dir, complete_run_stems[what_to_ablate] + "-control.json")

# Extract the data to PipelineData format
pipeline_data_main = extract_eval_data(data_file_main)
pipeline_data_control = extract_eval_data(data_file_control)

# Hit Rate / MRR (and, later, the CR metric) are only computed when the ablated
# component is not one of the two prompt-side ablations listed here.
compute_retrieval_metrics = what_to_ablate not in ["sample-questions", "one-shot"]

# --- Hit Rate and MRR ---
if compute_retrieval_metrics:
    # Control is reported before main so the results file keeps a stable order.
    for label, pipeline_data in (("CONTROL", pipeline_data_control), ("MAIN", pipeline_data_main)):
        # The first returned value is not needed here. A name other than `_` is
        # used so IPython's last-output variable is not overwritten.
        _unused, hit_rate, mrr = compute_hit_ratio_and_mrr(pipeline_data)
        append_to_file(results_file, f"{label}-{what_to_ablate} Hit Rate: {hit_rate}")
        append_to_file(results_file, f"{label}-{what_to_ablate} MRR: {mrr}")

# --- BERTScore ---
# TODO: not implemented yet.

# --- Embedding Similarity ---
# TODO: not implemented yet.

# --- K-F1++ ---
# TODO: not implemented yet.

# --- RAGAS metrics ---
if compute_retrieval_metrics:
    # TODO: compute G, AR, CR, and RCR
    pass
else:
    # TODO: compute G, AR, RCR
    pass

    