In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up without restarting the kernel (mode 2 = reload all modules).
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import plotly.express as px
from glob import glob

In [3]:
# Folder containing the saved result files: "<current working dir>/../results".
RESULTS_PATH = os.path.join(
    os.path.abspath(''),
    "..",
    "results",
)

# Model filter used when globbing result files:
# '*' selects every model, any other value selects only the model with that name.
MODEL_TO_INSPECT = '*'

# Dataset filter used when globbing result files:
# '*' selects every dataset, any other value selects only the dataset with that name.
DATASET_TO_INSPECT = '*'

In [4]:
# Load every matching result file and stack them into one long DataFrame,
# tagging each row with the dataset / model / retrieval parsed from the file name.
#
# File names are parsed as "<dataset>+<model>+<retrieval>.feather", so the glob
# pattern lists the filters in that same order.
# NOTE(review): the previous pattern put the model filter first, which only
# agreed with the parsing below while both filters were '*'. Confirm that the
# files on disk really are named dataset-first.
results_pattern = os.path.join(
    RESULTS_PATH, f"{DATASET_TO_INSPECT}+{MODEL_TO_INSPECT}+*.feather"
)

result_dfs = []
for path in sorted(glob(results_pattern)):  # sorted -> deterministic row order
    # Strip the ".feather" extension first so it does not end up in the
    # retrieval label (and therefore in every chart legend).
    file_stem = os.path.splitext(os.path.basename(path))[0]
    dataset, model, retrieval = file_stem.split('+')
    result_dfs.append(
        pd.read_feather(path).assign(dataset=dataset, model=model, retrieval=retrieval)
    )

# pd.concat([]) fails with an opaque "No objects to concatenate"; say what was
# actually searched for instead.
if not result_dfs:
    raise FileNotFoundError(f"No result files match {results_pattern!r}")

# ignore_index: each file brings its own 0..n index, which would otherwise repeat.
full_results_df = pd.concat(result_dfs, ignore_index=True)

In [None]:
def _show_metric_bars(long_results, title, height, width, **facet_kwargs):
    """Show one grouped bar chart of metric values, coloured by retrieval.

    ``long_results`` must have ``metric``, ``value`` and ``retrieval`` columns;
    ``facet_kwargs`` are forwarded unchanged to ``px.bar``.
    """
    (
        px.bar(
            long_results,
            x='metric',
            y='value',
            color='retrieval',
            title=title,
            barmode="group",
            height=height,
            width=width,
            **facet_kwargs,
        )
        .update_layout(title_font_size=24)
        .update_xaxes(showgrid=True)
    ).show()


def display_results(
    full_results_df,
    value_vars = (
        "factual_correctness",
        "faithfulness",
        "context_recall",
        "semantic_similarity",
        "non_llm_context_recall",
        "llm_context_precision_with_reference",
        "non_llm_context_precision_with_reference",
        "context_entity_recall",
    ),
    display_ttr = False,
):
    """Plot mean metric values as grouped bar charts.

    Two charts are always shown:

    * "Results": the mean of each metric per (dataset, model, retrieval),
      faceted by dataset (rows) and model (columns).
    * "Cumulative Results": the mean of each metric per retrieval over all rows.

    Args:
        full_results_df: DataFrame containing the ``value_vars`` columns plus
            ``time``, ``dataset``, ``model`` and ``retrieval``.
        value_vars: Names of the metric columns to average and plot. Any
            sequence of column names is accepted.
        display_ttr: When True, additionally show two charts of the mean
            ``time`` column ("Time to retrieve"), per dataset/model/retrieval
            and per retrieval.

    Returns:
        None. The figures are displayed via ``.show()``.
    """
    # Copy into a list: the default is an immutable tuple (no shared mutable
    # default), and callers may pass any sequence of names.
    metric_cols = list(value_vars)
    group_keys = ['dataset', 'model', 'retrieval']

    grouped_results = (
        full_results_df[metric_cols + ["time"] + group_keys]
        .groupby(group_keys)
        .mean()
        .reset_index()
    )
    cumul_results = (
        full_results_df[metric_cols + ["time"] + ['retrieval']]
        .groupby(['retrieval'])
        .mean()
        .reset_index()
    )

    # One facet per dataset/model pair; the figure grows with the number of facets.
    grouped_long = grouped_results.melt(
        id_vars=group_keys, value_vars=metric_cols, var_name='metric', value_name='value'
    )
    _show_metric_bars(
        grouped_long,
        title='Results',
        height=600 * grouped_long['dataset'].nunique(),
        width=1200 * grouped_long['model'].nunique(),
        facet_row='dataset',
        facet_col='model',
        facet_row_spacing=0.1,
    )

    # Same metrics averaged over every dataset and model.
    cumul_long = cumul_results.melt(
        id_vars=['retrieval'], value_vars=metric_cols, var_name='metric', value_name='value'
    )
    _show_metric_bars(cumul_long, title='Cumulative Results', height=600, width=1200)

    if display_ttr:
        px.bar(
            grouped_results,
            x='model',
            y='time',
            color='retrieval',
            facet_row='dataset',
            title='Time to retrieve',
            barmode="group",
            facet_row_spacing=0.1,
            height=600 * grouped_results['dataset'].nunique(),
            width=1200,
        ).show()

        px.bar(
            cumul_results,
            x='retrieval',
            y='time',
            title='Time to retrieve - All Datasets and Models',
            color='retrieval',
            barmode="group",
            height=600,
            width=1200,
        ).show()

In [6]:
# Show the metric charts for all loaded results, plus the "Time to retrieve" charts.
display_results(full_results_df, display_ttr=True)