_**Results for the semi-synthetic dataset**_

In [6]:
import mlflow
from mlflow.tracking import MlflowClient
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc

**1) Set up client**

In [7]:
# Point MLflow at the tracking server, then create the client that the
# following cells use to query experiments and runs.
tracking_uri = 'http://localhost:3333'
mlflow.set_tracking_uri(tracking_uri)
client = MlflowClient()

**2) List experiments**

In [8]:
# Models to compare and the experiment name expected for each of them.
model_names = ['GT', 'CT', 'CRN', 'TECDE', 'RMSN', 'G-Net']
experiment_suffix = '/mimic3_synthetic_FINAL'
experiment_names = [name + experiment_suffix for name in model_names]

# Lookup table: experiment name -> experiment id, for every experiment
# returned by the tracking server.
experiments = dict(
    (exp.name, exp.experiment_id) for exp in client.search_experiments()
)

**3) Access all runs within each experiment and extract metrics + params**

In [9]:
# Logged metric name -> short column label (the step count as a string).
metric_labels = {
    'decoder_test_rmse_2-step': '2',
    'decoder_test_rmse_3-step': '3',
    'decoder_test_rmse_4-step': '4',
    'decoder_test_rmse_5-step': '5',
    'decoder_test_rmse_6-step': '6',
}
metric_names = list(metric_labels)

param_names = ['model/name', 'dataset/max_number', 'dataset/seed']

# Full rename map applied to every per-model frame: metrics get their short
# label and the 'dataset/max_number' parameter becomes 'N'.
column_labels = {**metric_labels, 'dataset/max_number': 'N'}

# model name (the part of the experiment name before "/") -> DataFrame with
# one row per run and the columns run_id, '2'..'6', 'model/name', 'N',
# 'dataset/seed'.
data_dict = {}

for experiment_name in experiment_names:
    if experiment_name not in experiments:
        print(f"Experiment {experiment_name} does not exist.")
        continue

    experiment_id = experiments[experiment_name]
    # Get all runs for the experiment.
    # NOTE(review): search_runs returns a single page of results; confirm that
    # no experiment holds more runs than the client's default page size.
    runs = client.search_runs(experiment_ids=[experiment_id])

    # One flat record per run: its metrics, its params and its run id.
    runs_data = [
        {**run.data.metrics, **run.data.params, 'run_id': run.info.run_id}
        for run in runs
    ]
    combined_df = pd.DataFrame(runs_data)

    if combined_df.empty:
        print(f"No runs found for experiment {experiment_name}.")
        continue

    # Keep only the desired metrics and parameters, then shorten the column
    # names. Renaming here (instead of in a second loop over all model names)
    # means an experiment that was reported as missing or empty above no
    # longer triggers a KeyError in this cell.
    columns_to_include = ['run_id'] + metric_names + param_names
    filtered_df = combined_df[columns_to_include]
    data_dict[experiment_name.split("/")[0]] = filtered_df.rename(columns=column_labels)

**4) Compute mean and std of RMSEs per model and sample size**

In [10]:
# Columns holding the per-step RMSE values in every frame of data_dict.
rmse_columns = ['2', '3', '4', '5', '6']


def _rmse_stats(model):
    """Return (mean, std) of the RMSE columns of ``data_dict[model]`` per value of N.

    Both returned frames are indexed by N, contain the columns '2'..'6' and
    carry an extra 'model' column set to ``model``.

    Raises KeyError if ``model`` has no entry in ``data_dict``.
    """
    grouped = data_dict[model].groupby('N')[rmse_columns]
    mean_df = grouped.mean()
    mean_df['model'] = model
    std_df = grouped.std()
    std_df['model'] = model
    return mean_df, std_df


def _to_long(wide_frames, value_name):
    """Stack per-model frames and reshape them to one row per (N, model, tau).

    The RMSE column labels become the integer column 'tau', N is cast to int
    and the values are stored under ``value_name``.
    """
    stacked = pd.concat(wide_frames, axis=0).reset_index()
    long_df = stacked.melt(id_vars=['N', 'model'], value_vars=rmse_columns,
                           var_name='tau', value_name=value_name)
    long_df['N'] = long_df['N'].astype(int)
    long_df['tau'] = long_df['tau'].astype(int)
    return long_df


GT_mean_rmse, GT_std_rmse = _rmse_stats('GT')
GNet_mean_rmse, GNet_std_rmse = _rmse_stats('G-Net')
RMSN_mean_rmse, RMSN_std_rmse = _rmse_stats('RMSN')
CT_mean_rmse, CT_std_rmse = _rmse_stats('CT')
CRN_mean_rmse, CRN_std_rmse = _rmse_stats('CRN')
TECDE_mean_rmse, TECDE_std_rmse = _rmse_stats('TECDE')

# The concatenation order fixes the row index of the long tables, so it is
# kept as GT, G-Net, RMSN, CT, CRN, TECDE.
mean_rmse = _to_long(
    [GT_mean_rmse, GNet_mean_rmse, RMSN_mean_rmse, CT_mean_rmse, CRN_mean_rmse, TECDE_mean_rmse],
    'mean_rmse',
)
std_rmse = _to_long(
    [GT_std_rmse, GNet_std_rmse, RMSN_std_rmse, CT_std_rmse, CRN_std_rmse, TECDE_std_rmse],
    'std_rmse',
)

**Average RMSE**

In [11]:
# Order the rows by model, then N, then tau, and display the table.
sort_keys = ['model', 'N', 'tau']
mean_rmse = mean_rmse.sort_values(by=sort_keys)
mean_rmse

Unnamed: 0,N,model,tau,mean_rmse
12,1000,CRN,2,0.417589
30,1000,CRN,3,0.580008
48,1000,CRN,4,0.739015
66,1000,CRN,5,0.844462
84,1000,CRN,6,0.945248
...,...,...,...,...
17,3000,TECDE,2,0.706733
35,3000,TECDE,3,0.783462
53,3000,TECDE,4,0.878594
71,3000,TECDE,5,0.944441


**Standard deviation**

In [12]:
# Order the rows by model, then N, then tau, and display the table.
sort_keys = ['model', 'N', 'tau']
std_rmse = std_rmse.sort_values(by=sort_keys)
std_rmse

Unnamed: 0,N,model,tau,std_rmse
12,1000,CRN,2,0.114793
30,1000,CRN,3,0.205470
48,1000,CRN,4,0.310860
66,1000,CRN,5,0.416592
84,1000,CRN,6,0.511582
...,...,...,...,...
17,3000,TECDE,2,0.091428
35,3000,TECDE,3,0.093194
53,3000,TECDE,4,0.108446
71,3000,TECDE,5,0.118396


**Relative improvement of GT over the best non-GT model (per N and tau)**

In [20]:
# Lowest mean RMSE among the non-GT models for every (N, tau) pair.
non_GT_mean_rmse = mean_rmse[mean_rmse['model'] != 'GT']
min_non_GT_mean_rmse = non_GT_mean_rmse.groupby(['N', 'tau'])[['mean_rmse']].min()

# Index GT's mean RMSE by (N, tau) too, so the ratio below is matched on
# labels. Dividing plain arrays pairs values by row position, which is only
# correct when mean_rmse happens to be sorted by ['model', 'N', 'tau'] already.
gt_mean_by_setting = (
    mean_rmse[mean_rmse['model'] == 'GT']
    .set_index(['N', 'tau'])['mean_rmse']
    .sort_index()
)

# 1 - GT / best non-GT model, ordered by (N, tau); shown as a plain array.
relative_improvement = 1 - gt_mean_by_setting / min_non_GT_mean_rmse['mean_rmse']
relative_improvement.to_numpy()


array([0.0952854 , 0.19655063, 0.16315255, 0.16687307, 0.10775189,
       0.15253114, 0.22537414, 0.22470995, 0.22626632, 0.25027353,
       0.26714094, 0.2404785 , 0.25182911, 0.24554943, 0.21553881])