# Imports


In [None]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import os
import plotly.express as px
import sys
sys.path.append('../personalized-diabetes')
from sigopt_functions import gMSE, Pen, gSE
import tensorflow as tf
import plotly.graph_objects as go

In [None]:
# Directory containing the prediction CSVs that load_dfs reads.
DIR = '../personalized-diabetes/preds'
# File-name pattern of the per-D prediction files (baseline, split, M, D).
# Raw string: `\d` is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape-sequence warning; the resulting value is unchanged.
# NOTE(review): not referenced in the visible cells.
REGEX = r'base_(\d)_(train|test)_M(\d+)_D(\d+).csv'
# M values ("Missingness Modulo" in the plot labels) iterated over by the helpers below.
M_global = [10, 20, 50, 100, 200, 400, 800, 1000]


In [None]:
def load_dfs(baselines):
    """Read all prediction CSVs for the requested baselines into one frame.

    Parameters
    ----------
    baselines : dict
        Maps a baseline id ``B`` to an iterable of ``D`` values. ``D == 0``
        selects the file name without a ``_D{D}`` suffix.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the files for every ``M`` in ``M_global`` and both
        splits, each tagged with ``M``, ``D``, ``B`` and ``split`` columns.
        The ``'Unnamed: 0'`` column is dropped and column ``'0'`` is renamed
        to ``'y_hat'``. Each file keeps its own row labels, so the index of
        the result is not unique.
    """
    frames = []
    for B, d_range in baselines.items():
        for M in M_global:
            for mode in ('train', 'test'):
                for D in d_range:
                    # D == 0 marks baselines stored as one file per (split, M)
                    if D == 0:
                        file_name = f'base_{B}_{mode}_M{M}.csv'
                    else:
                        file_name = f'base_{B}_{mode}_M{M}_D{D}.csv'
                    frame = pd.read_csv(os.path.join(DIR, file_name))
                    frames.append(frame.assign(M=M, D=D, B=B, split=mode))

    combined = pd.concat(frames)
    return (
        combined
        .drop(columns=['Unnamed: 0'])
        .rename(columns={'0': 'y_hat'})
    )

In [None]:
def calc_granular_mses(df, baselines_dict):
    """Compute MSE and gMSE for every (B, M, D, split) combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictions with columns ``y``, ``y_hat``, ``split``, ``D``, ``M``
        and ``B``.
    baselines_dict : dict
        Maps a baseline id ``B`` to the iterable of ``D`` values to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per (D, M, B, split) with columns ``D``, ``M``, ``B``,
        ``split``, ``mse``, ``gmse`` and ``weights`` (the number of rows the
        metrics were computed on). ``gmse`` holds whatever ``gMSE`` returns.
    """
    records = []
    for B, d_range in baselines_dict.items():
        for M in M_global:
            for D in d_range:
                in_group = (df.D == D) & (df.M == M) & (df.B == B)
                # train row first, then test row, for each combination
                for split_name in ('train', 'test'):
                    subset = df[(df.split == split_name) & in_group]
                    squared_error = np.square(subset.y - subset.y_hat)
                    records.append({
                        'D': D,
                        'M': M,
                        'B': B,
                        'split': split_name,
                        'mse': np.mean(squared_error),
                        'gmse': gMSE(subset.y, subset.y_hat),
                        'weights': subset.shape[0],
                    })
    return pd.DataFrame(records)

In [None]:
def reproduce_agg_stats(df_all, baselines_dict, verbose=True, m_values=None):
    """Aggregate granular metrics into one value per (B, M, split).

    For every baseline in ``baselines_dict`` and every M value, the ``mse``
    and ``gmse`` columns are averaged over all matching rows, once weighted by
    the ``weights`` column and once unweighted.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Granular results with columns ``M``, ``B``, ``split``, ``mse``,
        ``gmse`` and ``weights``.
    baselines_dict : dict
        Only its keys (the baseline ids) are used.
    verbose : bool, default True
        When True, print a message for every weighted average that cannot be
        computed because the weights sum to zero. When False, those cases are
        skipped silently.
    m_values : iterable, optional
        M values to aggregate over. Defaults to the module-level ``M_global``.

    Returns
    -------
    pandas.DataFrame
        Columns ``M``, ``B``, ``Weighted``, ``split``, ``mse``, ``gmse``. Per
        (B, M) the row order is: weighted train, weighted test, unweighted
        train, unweighted test. A weighted row is absent when its weights sum
        to zero (including when there are no matching rows).
    """
    if m_values is None:
        m_values = M_global

    res = []
    for B in baselines_dict.keys():
        for M in m_values:
            df = df_all[(df_all.M == M) & (df_all.B == B)]
            by_split = {name: df[df.split == name] for name in ('train', 'test')}

            for name, part in by_split.items():
                try:
                    weighted_mse = np.average(part.mse, weights=part.weights)
                    weighted_gmse = np.average(part.gmse, weights=part.weights)
                except ZeroDivisionError:
                    # np.average raises when the weights sum to zero
                    if verbose:
                        print(f'Zero division error at {name}, B{B}, M{M}')
                    continue
                res.append({'M': M, 'B': B, 'Weighted': True, 'split': name,
                            'mse': weighted_mse, 'gmse': weighted_gmse})

            for name, part in by_split.items():
                res.append({'M': M, 'B': B, 'Weighted': False, 'split': name,
                            'mse': np.average(part.mse),
                            'gmse': np.average(part.gmse)})

    return pd.DataFrame(res)
    

In [None]:
# Baselines to load, mapped to the D values of their files. [0] means a single
# file per (split, M) without a _D suffix (see load_dfs).
# Currently disabled: 11, 21, 101 (with [0]) and 5, 106, 116 (with range(1, 31)).
baselines_dict = {
    1: [0],
    3: range(1, 31),
    6: range(1, 31),
    7: [0],
}

baselines = load_dfs(baselines_dict)

# Baseline 7 is loaded from the un-suffixed files (D == 0); replace that
# placeholder with the per-row DeidentID so it can be evaluated per D.
is_b7 = baselines.B == 7
baselines.loc[is_b7, 'D'] = baselines.loc[is_b7, 'DeidentID'].astype(int)
baselines = baselines.drop(columns=['DeidentID'])

# Same baselines as loaded, but baseline 7 is now evaluated for D in 1..30.
baselines_dict = {**baselines_dict, 7: range(1, 31)}

res_df = calc_granular_mses(baselines, baselines_dict)
# gMSE results expose .numpy(); store the converted values in the frame.
res_df.gmse = res_df.gmse.apply(lambda x: x.numpy())


In [None]:
# Disabled: would write the current res_df to 'res_df.pickle'.
#res_df.to_pickle('res_df.pickle')


# Baseline ids (keys) used for aggregation from here on; reproduce_agg_stats
# only reads the keys of this dict.
baselines_dict = {
    1: [0],
    11: [0],
    21: [0],
    3: range(1,31),
    5: range(1,31),
    6: range(1,31),
    101: [0],
    106: range(1,31),
    116: range(1,31),
}

# NOTE(review): this replaces the res_df computed in the previous cell with the
# contents of 'res_df.pickle'. The file must already exist (the to_pickle call
# above is commented out), so a fresh Restart & Run All depends on it.
# Only unpickle files you created yourself.
res_df = pd.read_pickle('res_df.pickle')
# The result is discarded; the call only checks that aggregation runs for
# every baseline in baselines_dict.
_ = reproduce_agg_stats(res_df, baselines_dict, verbose=False)

In [None]:
# Granular MSE against M: one facet row per baseline, one facet column per
# split, points coloured by D.
# Optional (disabled): res_df.D = res_df.D.astype(str), presumably to get a
# discrete colour per D instead of a continuous scale.
px.scatter(
    res_df,
    x='M',
    y='mse',
    color='D',
    facet_row='B',
    facet_col='split',
)

In [None]:
# D values left out of the aggregation (use an empty list to keep all of them).
patients_to_exclude = [1, 9, 10, 12, 16, 18, 19, 21, 22, 23, 24, 25, 26, 27, 29, 30]
agg_stats = reproduce_agg_stats(
    res_df.loc[~res_df.D.isin(patients_to_exclude)],
    baselines_dict,
    verbose=False,
)

# Test-set gMSE against M, one line per baseline, weighted and unweighted side by side.
px.line(
    agg_stats[agg_stats.split == 'test'],
    x='M',
    y='gmse',
    color='B',
    facet_col='Weighted',
    markers=True,
)

In [None]:
# Weighted test-set gMSE as a B-by-M table, rounded to two decimals, rendered
# as text and split into one string per line.
x = (
    agg_stats
    .loc[(agg_stats.Weighted) & (agg_stats.split == 'test'), ['M', 'B', 'gmse']]
    .pivot(index=['B'], columns=['M'], values=['gmse'])
    .round(2)
    .to_string()
    .split('\n')
)
# Join the whitespace-separated cells of each line with '&'
# (presumably for pasting into a LaTeX table).
vals = ['&'.join(line.split()) for line in x]
print(vals)

In [None]:
# Create plots for file
# D values left out of the aggregation (use an empty list to keep all of them).
patients_to_exclude = [1, 9, 10, 12, 16, 18, 19, 21, 22, 23, 24, 25, 26, 27, 29, 30]
agg_stats = reproduce_agg_stats(
    res_df.loc[~res_df.D.isin(patients_to_exclude)],
    baselines_dict,
    verbose=False,
)

# Weighted test-set gMSE against M, one line per baseline.
fig = px.line(
    agg_stats[(agg_stats.split == 'test') & (agg_stats.Weighted)],
    x='M',
    y='gmse',
    color='B',
    markers=True,
    labels={'M': 'Missingness Modulo', 'gmse': 'gMSE', 'B': 'Baseline'},
    width=1500,
    height=1000,
)
fig.update_layout(font=dict(size=20))

# Save a static copy next to the notebook, then render inline.
fig.write_image('Model_Performances.png')
fig.show()


In [None]:
# Display res_df (at this point the frame read from 'res_df.pickle' above).
res_df

# Baselines scatter colorized

In [None]:
# Targets and predictions as float32 tensors for the sigopt_functions helpers.
Y_tensor = tf.constant(baselines.y, dtype=tf.float32)
Y_hat_tensor = tf.constant(baselines.y_hat, dtype=tf.float32)

# Add 'pen' and 'gSE' columns to baselines. Only element [0] of gSE's result
# is used -- NOTE(review): confirm what the remaining elements hold.
baselines = baselines.assign(
    pen=Pen(g=Y_tensor, g_hat=Y_hat_tensor),
    gSE=gSE(g=Y_tensor, g_hat=Y_hat_tensor)[0],
)

In [None]:
# Test-set predictions against targets at M == 1000, one panel per baseline
# (two panels per row), coloured by the 'pen' column.
fig = px.scatter(
    baselines[(baselines.M == 1000) & (baselines.split == 'test')],
    x='y',
    y='y_hat',
    color='pen',
    color_continuous_scale='RdYlGn_r',
    opacity=0.05,
    facet_col='B',
    facet_col_wrap=2,
    labels={'M': 'Missingness Modulo', 'gmse': 'gMSE', 'B': 'Baseline'},
    height=800,
    width=1200,
)
fig.update_traces(marker=dict(size=2))
fig.update_layout(plot_bgcolor='white')

# Black mirrored frame and light grey grid on both axes.
axis_style = dict(
    showline=True,
    linewidth=1,
    linecolor='black',
    mirror=True,
    showgrid=True,
    gridwidth=1,
    gridcolor='LightGrey',
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# Save a static copy next to the notebook, then render inline.
fig.write_image('scatter_pen.png')
fig.show()

In [None]:
# Test-set predictions against targets at M == 1000, one panel per baseline
# (two panels per row), coloured by the 'gSE' column on a fixed 0..10000 scale.
fig = px.scatter(
    baselines[(baselines.M == 1000) & (baselines.split == 'test')],
    x='y',
    y='y_hat',
    color='gSE',
    color_continuous_scale='RdYlGn_r',
    range_color=[0, 10000],
    opacity=0.04,
    facet_col='B',
    facet_col_wrap=2,
    labels={'M': 'Missingness Modulo', 'gmse': 'gMSE', 'B': 'Baseline'},
    height=1000,
    width=1500,
)
fig.update_traces(marker=dict(size=2))
fig.update_layout(plot_bgcolor='white')

# Black mirrored frame and light grey grid on both axes.
axis_style = dict(
    showline=True,
    linewidth=1,
    linecolor='black',
    mirror=True,
    showgrid=True,
    gridwidth=1,
    gridcolor='LightGrey',
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)
fig.update_layout(font=dict(size=20))

# Save a static copy next to the notebook, then render inline.
fig.write_image('scatter_gSE.png')
fig.show()

# Export for Leander

# Only baseline 1 is exported; [0] selects its un-suffixed files (see load_dfs).
# Disabled alternatives: 11, 21, 101 (with [0]) and 106, 116 (with range(1, 31)).
baselines_dict_leander = {
    1: [0],
}
# load_dfs reads one file per split for every M in M_global
# (expected here: 10, 20, 50, 100, 200, 400, 800, 1000).
baselines_leander = load_dfs(baselines_dict_leander)
# The row index is written as the first CSV column (pandas default).
baselines_leander.to_csv('base_1_leander.csv')

In [None]:
# Number of non-null 'run' entries per split among the M == 10 rows.
baselines.loc[baselines.M == 10].groupby('split').run.count()