In [1]:
import os
from urllib.parse import urlparse
from urllib.request import url2pathname

import mlflow
import numpy as np
import pandas as pd

# Tracking URI: the MLFLOW_TRACKING_URI environment variable wins when it is set,
# so the notebook can run on another machine without editing this cell; otherwise
# the original local file store is used.
uri = os.environ.get(
    'MLFLOW_TRACKING_URI',
    'file:///Users/vitkostejn/school/diploma_thesis/code/mlruns',
)  # Set your MLflow tracking URI here
mlflow.set_tracking_uri(uri)


In [2]:
import pickle

def get_metric_data(run, metric='NDCG20'):
    """Load one metric from the ``logs.pkl`` artifact of a run.

    Args:
        run: Mapping-like run record (e.g. one row of the frame returned by
            ``mlflow.search_runs``) that exposes ``run['artifact_uri']``.
        metric: Key to read from the unpickled object.

    Returns:
        ``data[metric]`` exactly as it was stored in ``logs.pkl``.

    Raises:
        FileNotFoundError: If ``logs.pkl`` does not exist under the artifact location.
        KeyError: If the unpickled mapping has no entry for ``metric``.
    """
    artifact_uri = run['artifact_uri']
    parsed = urlparse(artifact_uri)
    if parsed.scheme == 'file':
        # Proper file-URI -> local path conversion: decodes percent-escapes
        # (e.g. "%20") and only strips the scheme, not every "file://" substring.
        artifact_dir = url2pathname(parsed.path)
    else:
        # Not a file URI: treat the value as an ordinary filesystem path.
        artifact_dir = artifact_uri
    artifact_path = os.path.join(artifact_dir, 'logs.pkl')
    # NOTE(security): pickle.load can execute arbitrary code. Only load
    # artifacts that come from a tracking store you trust.
    with open(artifact_path, 'rb') as f:
        data = pickle.load(f)
    return data[metric]

In [3]:
# Short display labels for the SAE fusion strategies.
aggregation_mapping = dict(
    average="Avg",
    max="Max",
    common_features="ComF",
    wcom="WCom",
    topk="TopK",
)

def get_method_name(row):
    """Build the display name of the method used by a run.

    Reads ``row['params.recommender_strategy']``. For the ``"SAE"`` strategy the
    label of ``row['params.SAE_fusion_strategy']`` is appended after a dash,
    using ``aggregation_mapping`` when the strategy is listed there and the raw
    value otherwise. Underscores in the final name are replaced by dashes.
    """
    strategy = row['params.recommender_strategy']
    if strategy != "SAE":
        return strategy.replace('_', '-')

    fusion = row['params.SAE_fusion_strategy']
    label = aggregation_mapping[fusion] if fusion in aggregation_mapping else fusion
    full_name = strategy + "-" + label
    return full_name.replace('_', '-')

In [4]:

# MLflow experiment whose runs are analysed in this notebook.
experiment_id = '523100174176986081'

# Only keep runs whose `group_set` and `user_set` params are both 'test'.
test_split_filter = "params.group_set = 'test' and params.user_set = 'test'"

# One row per run, returned as a pandas DataFrame.
runs = mlflow.search_runs(
    experiment_ids=[experiment_id],
    filter_string=test_split_filter,
    output_format="pandas",
)

In [13]:
# Build a long-format table: one row per (run, group index) holding every metric.
data = []

# Column stems used in the output frame and, position by position, the keys
# under which the same metrics are stored in each run's logs.pkl.
metrics = ['NDCG20_com', 'NDCG20_min', 'NDCG20_mean', 'Popularity']
add_name_metrics = ['Group_NDCG', 'User_NDCG_Mins', 'User_NDCG_Means', 'Popularity']
for _, run in runs.iterrows():
    if run['status'] == 'FAILED':
        print(f"Skipping run {run['run_id']} due to failure.")
        continue
    method = get_method_name(run)
    print(f"Processing run: {method}")

    # Flatten every metric to 1-D up front so that each element is a true scalar;
    # calling float() on a 1-element array is deprecated in recent NumPy releases.
    metrics_data = {
        metric: np.asarray(get_metric_data(run, add_metric), dtype=float).reshape(-1)
        for metric, add_metric in zip(metrics, add_name_metrics)
    }
    n_groups = metrics_data[metrics[0]].shape[0]
    # A dedicated index name: the original inner loop reused `i` and shadowed
    # the outer loop variable.
    for group_index in range(n_groups):
        row = {
            'Method': method,
            'Group_type': run['params.group_type'],
            'Group_index': group_index,
        }
        for metric in metrics:
            # Output columns drop the "20" cutoff: NDCG_com, NDCG_min, NDCG_mean, Popularity.
            row[metric.replace('20', '')] = float(metrics_data[metric][group_index])
        data.append(row)
df = pd.DataFrame(data)
print(len(df), "rows in the dataframe")
df.to_csv("test-extended-results.csv", index=False)
# Last expression of the cell so the preview is actually rendered.
df.tail()

Skipping run 17a301fd2d14471ea08d5e39bf41c047 due to failure.
Processing run: MPL
Processing run: EPFuzzDA
Processing run: GFAR
Processing run: LMS
Processing run: ADD
Processing run: POPULAR
Processing run: MPL
Processing run: EPFuzzDA
Processing run: GFAR
Processing run: LMS
Processing run: ADD
Processing run: POPULAR
Processing run: MPL
Processing run: EPFuzzDA
Processing run: GFAR
Processing run: LMS
Processing run: ADD
Processing run: POPULAR
Processing run: SAE-WCom
Processing run: SAE-TopK
Processing run: SAE-Max
Processing run: SAE-ComF
Processing run: SAE-Avg
Processing run: SAE-WCom


  row[metric.replace('20', '')] = float(metrics_data[metric][i])


Processing run: SAE-TopK
Processing run: SAE-Max
Processing run: SAE-ComF
Processing run: SAE-Avg
Processing run: SAE-WCom
Processing run: SAE-TopK
Processing run: SAE-Max
Processing run: SAE-ComF
Processing run: SAE-Avg
Processing run: ELSA-INT
Processing run: ELSA-INT
Processing run: ELSA-INT
Processing run: ELSA
Processing run: ELSA
Processing run: ELSA
39000 rows in the dataframe


In [6]:
# Collapse the long-format frame to one row per (Method, Group_type) pair; every
# other column becomes a Python list holding that pair's values.
pair_keys = ['Method', 'Group_type']
per_pair = df.groupby(pair_keys)
grouped_df = per_pair.agg(list).reset_index()

In [None]:
from statistics import mean
from scipy import stats


group_types = ['sim', 'random', 'outlier']
metrics = ["NDCG20_com", "NDCG20_min", "NDCG20_mean", "Popularity"]
# Metrics where the smaller value is the better one. This matches the
# min / alternative='less' handling of Popularity in the per-strategy
# significance cell later in this notebook.
lower_is_better = {"Popularity"}

# (group_type, method, metric) -> one-sided p-value of the paired t-test
# between the best method and `method`.
significance_results = {}

for group_type in group_types:
    curr_runs = grouped_df[grouped_df['Group_type'] == group_type]
    for metric in metrics:
        # The results frame names its columns without the "20" cutoff
        # (e.g. "NDCG_com"); fall back to that form when the label itself is
        # not a column, instead of failing with a KeyError.
        column = metric if metric in curr_runs.columns else metric.replace('20', '')
        values_by_method = dict(zip(curr_runs['Method'], curr_runs[column]))
        if not values_by_method:
            # No runs for this group type: nothing to compare.
            continue

        if metric in lower_is_better:
            pick_best, alternative = min, 'less'
        else:
            pick_best, alternative = max, 'greater'
        best_method = pick_best(values_by_method, key=lambda name: mean(values_by_method[name]))
        best_list = values_by_method[best_method]

        for method, approach_list in values_by_method.items():
            # Paired test: both lists must have equal length and be aligned
            # element by element. Comparing the best method with itself yields nan.
            t_stat, p_value = stats.ttest_rel(best_list, approach_list, alternative=alternative)
            significance_results[(group_type, method, metric)] = p_value

for (group_type, method, metric), p_value in significance_results.items():
    print(group_type, method, metric, p_value)

sim ADD NDCG20_com 0.07485615610541298
sim ELSA NDCG20_com 2.982250416635818e-26
sim ELSA-INT NDCG20_com 4.42966209040028e-139
sim EPFuzzDA NDCG20_com 7.859371125191554e-05
sim GFAR NDCG20_com 8.279333589164189e-121
sim LMS NDCG20_com 2.658315835942324e-09
sim MPL NDCG20_com 4.7348195943930475e-206
sim POPULAR NDCG20_com 1.640203932405366e-192
sim SAE-Avg NDCG20_com 0.060860857596647326
sim SAE-ComF NDCG20_com 7.725558391970059e-33
sim SAE-Max NDCG20_com 1.9089643007368657e-14
sim SAE-TopK NDCG20_com nan
sim SAE-WCom NDCG20_com 9.134073778165553e-07
sim ADD NDCG20_min 0.12373727765298345
sim ELSA NDCG20_min 1.1172300348415418e-53
sim ELSA-INT NDCG20_min 2.063132323632808e-138
sim EPFuzzDA NDCG20_min 0.002223870170575656
sim GFAR NDCG20_min 5.798326800638472e-66
sim LMS NDCG20_min 1.4764016929082478e-23
sim MPL NDCG20_min 4.7910368678862814e-142
sim POPULAR NDCG20_min 1.225707211353363e-209
sim SAE-Avg NDCG20_min 0.4231522212728437
sim SAE-ComF NDCG20_min 5.531938918740981e-38
sim SAE-M

In [None]:
from scipy import stats

# One record per (group type, metric, strategy): mean value plus the one-sided
# paired t-test p-value against the best strategy for that metric.
significance = []

group_types = ['sim', 'random', 'outlier']

# Keys inside each run's logs.pkl and, position by position, their display labels.
add_name_metrics = ['Group_NDCG', 'User_NDCG_Mins', 'User_NDCG_Means', 'Popularity']
official_metrics = ['$NDCG_{com}$', '$NDCG_{min}$', '$NDCG_{mean}$', 'Popularity']
for group_type in group_types:
    print(f"Group Type: {group_type.upper()}")
    # `runs` is a pandas DataFrame (one row per run), so select rows by column
    # value; iterating the frame directly would yield column names. Failed runs
    # are excluded, as in the cell that builds the results table.
    group_runs = runs[(runs['params.group_type'] == group_type) & (runs['status'] != 'FAILED')]
    print(f"Number of runs: {len(group_runs)}")
    if group_runs.empty:
        continue
    for add_name_metric, official_metric in zip(add_name_metrics, official_metrics):
        # strategy name -> flat per-group values of this metric.
        # NOTE(review): if two runs map to the same strategy name, the later
        # run overwrites the earlier one - confirm each name is unique per group type.
        metric_data = {
            get_method_name(run): np.asarray(get_metric_data(run, metric=add_name_metric), dtype=float).reshape(-1)
            for _, run in group_runs.iterrows()
        }
        # Popularity is treated as lower-is-better; every other metric as higher-is-better.
        pick_best = min if official_metric == 'Popularity' else max
        best_strategy, best_values = pick_best(metric_data.items(), key=lambda item: item[1].mean())
        print(f"Best {add_name_metric} for {group_type} groups: {best_strategy} with mean {best_values.mean():.3f} and std {best_values.std():.3f}")

        # Get significance to the best run
        alternative = 'greater' if official_metric != 'Popularity' else 'less'
        for strategy, curr_results in metric_data.items():
            t_stat, p_val = stats.ttest_rel(best_values, curr_results, alternative=alternative)
            print(f"{strategy}: t = {t_stat:.3f}  p = {p_val:.4f}")
            significance.append({
                'Group Type': group_type,
                'Metric': official_metric,
                'Strategy': strategy,
                'Value': curr_results.mean(),
                'p-value': p_val,
                # The best strategy compared with itself gives nan and is never significant.
                'Significant': bool(p_val <= 0.05 and strategy != best_strategy),
            })

Group Type: SIM
Number of runs: 13
Best Group_NDCG for sim groups: SAE-TopK with mean 0.639 and std 0.175
MPL: t = 39.420  p = 0.0000
EPFuzzDA: t = 3.794  p = 0.0001
GFAR: t = 26.926  p = 0.0000
LMS: t = 5.889  p = 0.0000
ADD: t = 1.442  p = 0.0749
POPULAR: t = 37.411  p = 0.0000
SAE-WCom: t = 4.800  p = 0.0000
SAE-TopK: t = nan  p = nan
SAE-Max: t = 7.679  p = 0.0000
SAE-ComF: t = 12.313  p = 0.0000
SAE-Avg: t = 1.549  p = 0.0609
ELSA-INT: t = 29.603  p = 0.0000
ELSA: t = 10.837  p = 0.0000
Best User_NDCG_Mins for sim groups: SAE-TopK with mean 0.558 and std 0.125
MPL: t = 30.035  p = 0.0000
EPFuzzDA: t = 2.851  p = 0.0022
GFAR: t = 18.454  p = 0.0000
LMS: t = 10.187  p = 0.0000
ADD: t = 1.157  p = 0.1237
POPULAR: t = 39.956  p = 0.0000
SAE-WCom: t = 4.511  p = 0.0000
SAE-TopK: t = nan  p = nan
SAE-Max: t = 5.299  p = 0.0000
SAE-ComF: t = 13.381  p = 0.0000
SAE-Avg: t = 0.194  p = 0.4232
ELSA-INT: t = 29.505  p = 0.0000
ELSA: t = 16.341  p = 0.0000
Best User_NDCG_Means for sim groups:

In [None]:
# Tabulate the significance records, non-significant rows first.
# NOTE(review): this rebinds `df`, which earlier held the per-group results
# table; re-running the groupby cell after this one would use the wrong frame.
df = pd.DataFrame(significance).sort_values(by=['Significant'], ascending=True)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('paper', exist_ok=True)
df.to_csv('paper/significance.csv', index=False)
# Last expression of the cell so the preview is actually rendered.
df.head(20)