In [102]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported project modules are picked up without restarting the kernel.
# Re-running this cell on a live kernel is harmless: IPython only prints a notice
# that the extension is already loaded.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [103]:
import os
import pandas as pd
import plotly.express as px
from glob import glob

In [104]:
# Selectors for the result files to load. Each one is inserted into a glob
# pattern, so '*' matches everything; set an exact name to narrow the selection.
DATASET_TO_INSPECT = '*'    # dataset name, or '*' for all datasets
MODEL_TO_INSPECT = '*'      # model name, or '*' for all models
RETRIEVAL_TO_INSPECT = '*'  # retrieval name, or '*' for all retrievals

# Result files are read from "<current working directory>/../results".
RESULTS_PATH = os.path.join(os.path.abspath(''), "..", "results")

In [105]:
# Load every result file selected by the *_TO_INSPECT patterns.
# File names are expected to look like "<dataset>+<model>+<retrieval>.feather".
result_pattern = os.path.join(
    RESULTS_PATH,
    f"{DATASET_TO_INSPECT}+{MODEL_TO_INSPECT}+{RETRIEVAL_TO_INSPECT}.feather",
)
# glob returns paths in filesystem order; sort so the row order is reproducible.
result_paths = sorted(glob(result_pattern))
if not result_paths:
    # Fail early with the pattern in the message instead of letting pd.concat
    # raise "No objects to concatenate".
    raise FileNotFoundError(f"No result files match {result_pattern!r}")

result_dfs = []
for path in result_paths:
    # Drop the ".feather" extension before splitting; otherwise it ends up
    # inside the retrieval label (e.g. "bm25.feather") and in every legend.
    stem = os.path.splitext(os.path.basename(path))[0]
    name_parts = stem.split('+')
    if len(name_parts) != 3:
        raise ValueError(
            f"Cannot parse {os.path.basename(path)!r}: expected "
            "'<dataset>+<model>+<retrieval>.feather'"
        )
    dataset, model, retrieval = name_parts

    # Tag each row with the identifiers encoded in the file name.
    result_df = pd.read_feather(path).assign(
        dataset=dataset, model=model, retrieval=retrieval
    )
    result_dfs.append(result_df)

# Row indices are kept as read from each file (they are not renumbered).
full_results_df = pd.concat(result_dfs)

In [106]:
def display_results(
    full_results_df,
    value_vars = [
        "factual_correctness",
        "faithfulness",
        "context_recall",
        "semantic_similarity",
        # Further metric names, disabled by default:
        #"non_llm_context_recall",
        #"llm_context_precision_with_reference",
        #"non_llm_context_precision_with_reference",
        #"context_entity_recall",
    ],
    display_ttr = False,
    color_by = 'retrieval',
):
    """Plot mean metric values as grouped Plotly bar charts.

    Two figures are always shown:

    * "Results": the mean of every metric for each (dataset, model, retrieval)
      combination. Bars are coloured by ``color_by``; the two remaining
      identifier columns become the facet rows and facet columns, with the one
      that has more distinct values used for the rows.
    * "Cumulative Results": the mean of every metric over all rows sharing the
      same ``color_by`` value.

    When ``display_ttr`` is true, two more figures show the mean of the
    ``time`` column, first per combination and then per ``color_by`` value.

    Args:
        full_results_df: DataFrame with the columns ``dataset``, ``model`` and
            ``retrieval``, one numeric column per entry of ``value_vars`` and,
            if ``display_ttr`` is true, a numeric ``time`` column.
        value_vars: Names of the metric columns to average and plot. The
            sequence is copied and never modified.
        display_ttr: Whether to also show the two ``time`` charts.
        color_by: Identifier column used for the bar colour; one of
            ``'dataset'``, ``'model'`` or ``'retrieval'``.

    Returns:
        None. The figures are displayed through ``Figure.show()``.

    Raises:
        ValueError: If ``color_by`` is not one of the identifier columns.
        KeyError: If a required column is missing from ``full_results_df``.
    """
    id_vars = ['dataset', 'model', 'retrieval']
    if color_by not in id_vars:
        raise ValueError(f"color_by must be one of {id_vars}, got {color_by!r}")

    # Work on a copy so the (shared) default list can never be mutated.
    metrics = list(value_vars)
    # "time" is only needed for the optional timing charts; do not require it
    # otherwise, and do not duplicate it if the caller already listed it.
    value_columns = list(metrics)
    if display_ttr and "time" not in value_columns:
        value_columns.append("time")

    # The two identifiers not used for colour become facets. The one with more
    # distinct values goes first (stable sort keeps id_vars order on ties).
    facet_major, facet_minor = sorted(
        (var for var in id_vars if var != color_by),
        key=lambda var: full_results_df[var].nunique(dropna=False),
        reverse=True,
    )

    # Mean per (dataset, model, retrieval) combination.
    grouped_results = (
        full_results_df[value_columns + id_vars].groupby(id_vars).mean().reset_index()
    )
    # Mean over all rows that share the same color_by value.
    cumul_results = (
        full_results_df[value_columns + [color_by]].groupby([color_by]).mean().reset_index()
    )

    n_major = grouped_results[facet_major].nunique()
    n_minor = grouped_results[facet_minor].nunique()
    # Plotly rejects a vertical spacing above 1 / (rows - 1), so a fixed 0.1
    # fails once there are 12+ facet rows. Keep 0.1 for up to 6 rows and shrink
    # it beyond that so the gaps never use more than half the figure height.
    row_spacing = min(0.1, 0.5 / max(n_major - 1, 1))

    unpivoted_results = grouped_results.melt(
        id_vars=id_vars, value_vars=metrics, var_name='metric', value_name='value'
    )
    (
        px.bar(
            unpivoted_results,
            x='metric',
            y='value',
            color=color_by,
            facet_row=facet_major,
            facet_col=facet_minor,
            title='Results',
            barmode="group",
            facet_row_spacing=row_spacing,
            height=600 * n_major,
            width=1200 * n_minor,
        )
        .update_layout(title_font_size=24)
        .update_xaxes(showgrid=True)
    ).show()

    unpivoted_results = cumul_results.melt(
        id_vars=color_by, value_vars=metrics, var_name='metric', value_name='value'
    )
    (
        px.bar(
            unpivoted_results,
            x='metric',
            y='value',
            color=color_by,
            title='Cumulative Results',
            barmode="group",
            height=600,
            width=1200,
        )
        .update_layout(title_font_size=24)
        .update_xaxes(showgrid=True)
    ).show()

    if display_ttr:
        # Mean time per combination: facet_major on the x axis, one facet row
        # per facet_minor value.
        px.bar(
            grouped_results,
            x=facet_major,
            y='time',
            color=color_by,
            facet_row=facet_minor,
            title='Time to retrieve',
            barmode="group",
            height=600 * n_minor,
            width=1200,
        ).show()

        # Mean time per color_by value over all rows.
        px.bar(
            cumul_results,
            x=color_by,
            y='time',
            title='Time to retrieve - All Datasets and Models and Retrievals',
            color=color_by,
            barmode="relative",
            height=800,
            width=1200,
        ).show()

In [107]:
# Render all charts, including the time-to-retrieve ones, coloured by retrieval.
display_results(
    full_results_df,
    display_ttr=True,
    color_by="retrieval",
)