**_Results for the semi-synthetic dataset_**

In [17]:
import mlflow
from mlflow.tracking import MlflowClient
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

**1) Set up client**

In [18]:
mlflow.set_tracking_uri('http://localhost:3333')
client = MlflowClient()

**2) List experiments**

In [19]:
model_names = ['GT', 'CT', 'CRN', 'TECDE', 'RMSN', 'G-Net']
experiment_names = [model_name + '/tumor_generator_FINAL' for model_name in model_names]
experiments = {exp.name: exp.experiment_id for exp in client.search_experiments()}

**3) Access all runs within each experiment and extract metrics + params**

In [20]:
metric_names = ['decoder_test_rmse_2-step']

param_names = ['model/name', 'dataset/coeff', 'dataset/seed', 'dataset/num_patients/train']
data_dict = {}

for experiment_name in experiment_names:
    if experiment_name in experiments:
        experiment_id = experiments[experiment_name]
        # Get all runs for the experiment
        runs = client.search_runs(experiment_ids=[experiment_id])
        # Extract metrics and parameters
        runs_data = [
            {
                **run.data.metrics,
                **run.data.params,
                'run_id': run.info.run_id
            }
            for run in runs
        ]
        combined_df = pd.DataFrame(runs_data)
        
        if not combined_df.empty:
            # Filter the DataFrame to only include the desired metrics and parameters
            columns_to_include = ['run_id'] + metric_names + param_names
            filtered_df = combined_df[columns_to_include]
            data_dict[experiment_name.split("/")[0]] = filtered_df
        else:
            print(f"No runs found for experiment {experiment_name}.")
    else:
        print(f"Experiment {experiment_name} does not exist.")

for model in model_names:
    data_dict[model] = data_dict[model].rename(columns={'decoder_test_rmse_2-step': '2',
                                                        'dataset/coeff': 'gamma',
                                                        'dataset/num_patients/train': 'N'})
    data_dict[model]['gamma'] = pd.to_numeric(data_dict[model]['gamma'])

**4) Compute mean and std of RMSEs per model and confounding strength**

In [21]:
GT_mean_rmse = data_dict['GT'].groupby('gamma')[['2']].mean()
GT_mean_rmse['model'] = 'GT'
GT_std_rmse = data_dict['GT'].groupby('gamma')[['2']].std()
GT_std_rmse['model'] = 'GT'

GNet_mean_rmse = data_dict['G-Net'].groupby('gamma')[['2']].mean()
GNet_mean_rmse['model'] = 'G-Net'
GNet_std_rmse = data_dict['G-Net'].groupby('gamma')[['2']].std()
GNet_std_rmse['model'] = 'G-Net'

RMSN_mean_rmse = data_dict['RMSN'].groupby('gamma')[['2']].mean()
RMSN_mean_rmse['model'] = 'RMSN'
RMSN_std_rmse = data_dict['RMSN'].groupby('gamma')[['2']].std()
RMSN_std_rmse['model'] = 'RMSN'


CT_mean_rmse = data_dict['CT'].groupby('gamma')[['2']].mean()
CT_mean_rmse['model'] = 'CT'
CT_std_rmse = data_dict['CT'].groupby('gamma')[['2']].std()
CT_std_rmse['model'] = 'CT'


CRN_mean_rmse = data_dict['CRN'].groupby('gamma')[['2']].mean()
CRN_mean_rmse['model'] = 'CRN'
CRN_std_rmse = data_dict['CRN'].groupby('gamma')[['2']].std()
CRN_std_rmse['model'] = 'CRN'


TECDE_mean_rmse = data_dict['TECDE'].groupby('gamma')[['2']].mean()
TECDE_mean_rmse['model'] = 'TECDE'
TECDE_std_rmse = data_dict['TECDE'].groupby('gamma')[['2']].std()
TECDE_std_rmse['model'] = 'TECDE'

mean_rmse = pd.concat([GT_mean_rmse, GNet_mean_rmse, RMSN_mean_rmse, CT_mean_rmse, CRN_mean_rmse, TECDE_mean_rmse], axis=0).reset_index()
mean_rmse['gamma'] = mean_rmse['gamma'].astype(float)
mean_rmse = mean_rmse[['model', 'gamma', '2']].rename(columns={'2': 'mean_rmse'})
std_rmse = pd.concat([GT_std_rmse, GNet_std_rmse, RMSN_std_rmse, CT_std_rmse, CRN_std_rmse, TECDE_std_rmse], axis=0).reset_index()
std_rmse = std_rmse[['model', 'gamma', '2']].rename(columns={'2': 'std_rmse'})
std_rmse['N'] = std_rmse['gamma'].astype(float)

**Average RMSE**

In [25]:
mean_rmse=mean_rmse.sort_values(by=['model', 'gamma'])

#sns.lineplot(data=mean_rmse, x='gamma', y='mean_rmse', hue='model', marker='o')
mean_rmse

Unnamed: 0,model,gamma,mean_rmse
44,CRN,10.0,4.050165
45,CRN,11.0,5.444592
46,CRN,12.0,6.174003
47,CRN,13.0,4.979471
48,CRN,14.0,5.243874
...,...,...,...
61,TECDE,16.0,4.837271
62,TECDE,17.0,4.307843
63,TECDE,18.0,4.442143
64,TECDE,19.0,4.610445


**Standard deviation**

In [23]:
std_rmse=std_rmse.sort_values(by=['model', 'gamma'])
std_rmse

Unnamed: 0,model,gamma,std_rmse,N
44,CRN,10.0,0.545081,10.0
45,CRN,11.0,1.678152,11.0
46,CRN,12.0,1.270498,12.0
47,CRN,13.0,1.487298,13.0
48,CRN,14.0,0.325312,14.0
...,...,...,...,...
61,TECDE,16.0,0.463863,16.0
62,TECDE,17.0,0.384258,17.0
63,TECDE,18.0,0.531160,18.0
64,TECDE,19.0,0.423011,19.0


**Relative improvement** 

In [24]:
non_GT_mean_rmse = mean_rmse[mean_rmse['model'] != 'GT']

# minimum of non_GT_mean_rmse for each confounding parameter
min_non_GT_mean_rmse = non_GT_mean_rmse.groupby('gamma')[['mean_rmse']].min()

1 - np.array(GT_mean_rmse['2']) / np.array(min_non_GT_mean_rmse['mean_rmse'])


array([0.06357549, 0.07292116, 0.07872555, 0.12938869, 0.15016834,
       0.09892358, 0.1289567 , 0.13128205, 0.17447468, 0.14787989,
       0.12492065])