In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import glob
import nibabel as nb
import itertools
import json
import sys
import statsmodels.api as sm
from statsmodels.formula.api import ols, wls
from statsmodels.formula.api import mixedlm
from patsy.contrasts import Treatment

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
from statsmodels.discrete.discrete_model import Probit, MNLogit
from statsmodels.tools import add_constant
from sklearn.feature_selection import SelectKBest, SelectPercentile, SelectFdr
from sklearn.feature_selection import f_regression, mutual_info_regression

import seaborn as sns
sns.set(style="whitegrid")

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
np.random.seed(1024)

In [2]:
# Absolute project root on the original author's machine (not referenced again in this cell).
base_dir = '/home/abhijit/Jyotirmay/my_thesis'

# Covariates shared by every regression below.
basic_feats = ['age', 'sex', 'bmi_numeric']

# Columns '0_<organ>' ... '9_<organ>' of the model reports.
spleen_sample_cols = [f'{k}_spleen' for k in range(10)]
liver_sample_cols = [f'{k}_liver' for k in range(10)]

vols_feat = ['seg_liver', 'seg_spleen']
iou_feats = ['iou_spleen', 'iou_liver', 'iou_mean']

# Column subsets kept from the model reports and from the raw dataset report.
selected_model_feats = [
    *basic_feats,
    *spleen_sample_cols,
    *liver_sample_cols,
    *vols_feat,
    *iou_feats,
    'volume_id',
    'diabetes_status',
]
selected_dataset_feats = [*basic_feats, 'volume_id', 'diabetes_status']

# One single-entry dict per report: {name: relative path of the report CSV}.
all_paths = [
    # {'full_bayesian': './projects/full_bayesian/reports/full_bayesian_KORA_v2/KORA/10_1571866968.4002764_concat_report_final.csv'},
    {
        'full_bayesian_0dot01':
            './projects/full_bayesian/reports/full_bayesian_KORA_v4/KORA/10_1572514598.527084_concat_report_final.csv'
    },
    {
        'MC_dropout_quicknat':
            './projects/MC_dropout_quicknat/reports/MC_dropout_quicknat_KORA_v2/KORA/10_1572006141.7793334_concat_report_final.csv'
    },
    {
        'probabilistic_quicknat':
            './projects/probabilistic_quicknat/reports/probabilistic_quicknat_KORA_v2/KORA/10_1571996796.7963011_concat_report_final.csv'
    },
    {
        'hierarchical_quicknat':
            './projects/hierarchical_quicknat/reports/hierarchical_quicknat_KORA_v2/KORA/10_1571905560.9377904_concat_report_final.csv'
    },
    {
        'dataset_KORA_processed':
            './dataset_groups/whole_body_datasets/KORA/test_processed_True_concat_report_final.csv'
    },
    {
        'UKB_MC_dropout_quicknat':
            './projects/MC_dropout_quicknat/reports/MC_dropout_quicknat_UKB_v2/UKB/0_0.0_concat_report_final.csv'
    },
    {
        'UKB_probabilistic_quicknat':
            './projects/probabilistic_quicknat/reports/probabilistic_quicknat_UKB_v2/UKB/10_1573834823.1121247_concat_report_final.csv'
    },
]

# Same two UKB reports as above, keyed without the 'UKB_' prefix.
ukb_paths = [
    {
        'MC_dropout_quicknat':
            './projects/MC_dropout_quicknat/reports/MC_dropout_quicknat_UKB_v2/UKB/0_0.0_concat_report_final.csv'
    },
    {
        'probabilistic_quicknat':
            './projects/probabilistic_quicknat/reports/probabilistic_quicknat_UKB_v2/UKB/10_1573834823.1121247_concat_report_final.csv'
    },
]

# Flatten the list of single-entry dicts into one {name: path} lookup.
model_report_paths = dict(pair for entry in all_paths for pair in entry.items())
# ukb_model_report_paths = {key:val for d in ukb_paths for key,val in d.items()}

In [3]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def transform_to_categorical(df, categorical_features_list):
    """Append one-hot indicator columns for each listed feature.

    For every feature name, ``pd.get_dummies`` is applied to that column
    (prefixed with the feature name) and the indicator columns are
    concatenated to the right of the frame. The original columns are kept.

    Returns the widened frame; ``df`` itself is returned unchanged when the
    feature list is empty.
    """
    widened = df
    for feature in categorical_features_list:
        indicators = pd.get_dummies(widened[feature], prefix=feature)
        widened = pd.concat([widened, indicators], axis=1)
    return widened

def rename(df, cols_map=None):
    """Rename columns of ``df`` in place and return the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed (mutated in place).
    cols_map : dict, optional
        ``{old_name: new_name}``. When omitted, a built-in mapping is used
        that replaces hyphens (and the ``/`` in ``'alcohol-g/day'``) with
        underscores for a fixed set of column names.
    """
    mapping = cols_map
    if mapping is None:
        mapping = {
            'bmi-numeric': 'bmi_numeric',
            'blood-pressure-diastolic': 'blood_pressure_diastolic',
            'blood-pressure-systolic': 'blood_pressure_systolic',
            'cholesterol-hdl': 'cholesterol_hdl',
            'cholesterol-ldl': 'cholesterol_ldl',
            'cholesterol-total': 'cholesterol_total',
            'mri-liver-fat-artifacts': 'mri_liver_fat_artifacts',
            'mri-liver-fat-lobus-dexter': 'mri_liver_fat_lobus_dexter',
            'mri-liver-fat-lobus-sinister': 'mri_liver_fat_lobus_sinister',
            'mri-liver-fat-portal-vein': 'mri_liver_fat_portal_vein',
            'meds-lipoprotein-lowering': 'meds_lipoprotein_lowering',
            'meds-antihypertensive': 'meds_antihypertensive',
            'smoker_non-smoker': 'smoker_non_smoker',
            'alcohol-g/day': 'alcohol_g_day',
        }
    # In-place rename: callers holding a reference to `df` see the new names.
    df.rename(columns=mapping, inplace=True)
    return df

def z_score_column_normalise(df, column_list):
    """Z-score each listed column independently.

    For every column ``c`` a new column ``c + '_normalised'`` is written into
    ``df`` (in place), holding ``(c - mean(c)) / std(c)`` with the population
    standard deviation (``ddof=0``).

    Returns
    -------
    (df, normalised_cols_map)
        The same frame, and a ``{original_name: normalised_name}`` dict.
    """
    normalised_cols_map = {}
    for column in column_list:
        target_name = column + '_normalised'
        normalised_cols_map[column] = target_name
        values = df[column]
        centred = values - values.mean()
        df[target_name] = centred / values.std(ddof=0)
    return df, normalised_cols_map

def z_score_group_normalise(df, cols_to_normalise):
    """Z-score several columns using one pooled mean and standard deviation.

    The mean and population standard deviation (``ddof=0``) are computed over
    all values of all listed columns together; each column ``c`` is then
    written back (in place) as ``'normalised_' + c``.

    Returns
    -------
    (df, normalised_cols_map)
        The same frame, and a ``{original_name: normalised_name}`` dict.
    """
    pooled_values = df[cols_to_normalise].values
    mean = np.mean(pooled_values)
    std = np.std(pooled_values, ddof=0)

    normalised_cols_map = {}
    for column in cols_to_normalise:
        target_name = 'normalised_' + column
        normalised_cols_map[column] = target_name
        df[target_name] = (df[column] - mean) / std
    return df, normalised_cols_map

def pre_process_for_mlm(df, to_be_transpose_cols, value_name):
    """Reshape ``df`` from wide to long over the given columns.

    The columns in ``to_be_transpose_cols`` are melted into rows: their names
    go into melt's variable column and their values into ``value_name``.
    All remaining columns are repeated as identifier columns.

    Identifier columns keep the order they have in ``df``. (Deriving them
    from a ``set`` difference made the output column order depend on hash
    ordering, i.e. vary between interpreter sessions.)

    Parameters
    ----------
    df : pandas.DataFrame
    to_be_transpose_cols : list
        Column names to unpivot.
    value_name : str
        Name of the column receiving the unpivoted values.

    Returns
    -------
    pandas.DataFrame
        The melted (long-format) frame.
    """
    melted_cols = set(to_be_transpose_cols)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    id_cols = list(dict.fromkeys(col for col in df.columns if col not in melted_cols))
    return df.melt(
        id_vars=id_cols,
        value_vars=to_be_transpose_cols,
        value_name=value_name,
    )

def model_evaluation_matrics(file_paths_dict, cols_to_fetch):
    """Average the requested metric columns of every model report.

    Parameters
    ----------
    file_paths_dict : dict
        ``{model_name: csv_path}``. Entries whose name contains ``'dataset'``
        are skipped and their files are never opened.
    cols_to_fetch : list
        Columns to average.

    Returns
    -------
    dict
        ``{model_name: pandas.Series of column means}``. The first data row
        of each CSV is excluded from the mean (``.iloc[1:]``).
    """
    return {
        model: pd.read_csv(path)[cols_to_fetch].iloc[1:].mean()
        for model, path in file_paths_dict.items()
        if 'dataset' not in model
    }

def individual_feature_stats(feats, df, target_col, categorical_feats=['diabetes_status', 'sex']):
    """Fit one OLS model per feature and collect p-values and coefficients.

    For each feature ``f`` the formula ``target_col ~ <f>`` is fitted, with
    ``f`` wrapped as a categorical term when it is listed in
    ``categorical_feats`` (see ``make_feature_string``).

    Returns
    -------
    dict
        For every model term: ``term -> p-value`` and
        ``term + '_coeff' -> coefficient``. The intercept of each fit is
        stored under ``f + '_Intercept'``. ``'fitting_score'`` holds the
        R-squared of the fit and is overwritten on every iteration, so it
        reflects the last feature that was fitted successfully.

    A failure while fitting a feature is printed and that feature is skipped.
    """
    p_values = {}
    for f in feats:
        try:
            formula_rhs = make_feature_string([f], categorical_feats)
            fit = ols(f'{target_col} ~ {formula_rhs}', df).fit()
            coeffs = fit.params

            for term, p_val in fit.pvalues.to_dict().items():
                # Disambiguate the intercepts of the per-feature fits.
                label = f + '_Intercept' if term == 'Intercept' else term
                p_values[label] = p_val
                p_values[label + '_coeff'] = coeffs[term]
            p_values['fitting_score'] = fit.rsquared
        except Exception as e:
            print(e)

    return p_values


def make_feature_string(feats, categorical_cols=[]):
    """Build the right-hand side of a formula string from feature names.

    The result always starts with the intercept term ``1``. Every entry of
    ``categorical_cols`` that is present in ``feats`` is emitted first as a
    categorical term, in the order of ``categorical_cols``; the remaining
    features follow unchanged, joined with ``+``.

    Parameters
    ----------
    feats : list of str
        Feature names. The list is not modified (earlier versions removed the
        categorical names from the caller's list as a side effect).
    categorical_cols : list
        Each entry is either a column name, emitted as
        ``C(name, Treatment)``, or a ``(name, reference)`` tuple, emitted as
        ``C(name, Treatment(reference=<reference>))``. Entries whose name is
        not in ``feats`` are skipped silently.

    Returns
    -------
    str
        e.g. ``'1+C(sex, Treatment)+age+bmi_numeric'``.
    """
    remaining = list(feats)  # private copy: never mutate the caller's list
    terms = ['1']
    for c_col in categorical_cols:
        is_pair = type(c_col) is tuple
        name = c_col[0] if is_pair else c_col
        if name not in remaining:
            continue
        if is_pair:
            terms.append(f'C({name}, Treatment(reference={c_col[1]}))')
        else:
            terms.append(f'C({name}, Treatment)')
        # Only the first occurrence is consumed, as before.
        remaining.remove(name)

    other_feats_str = '+'.join(remaining)
    feat_str = '+'.join(terms)
    if not other_feats_str:
        return feat_str
    return feat_str + '+' + other_feats_str
    



def normal_mixed_effect_model(df, target_col, features_string, group_col, L1_wt=None):
    """Fit a mixed linear model ``target_col ~ features_string`` grouped by ``group_col``.

    Parameters
    ----------
    df : pandas.DataFrame
    target_col : str
        Left-hand side of the formula.
    features_string : str
        Right-hand side of the formula.
    group_col : str
        Column of ``df`` passed as the ``groups`` argument.
    L1_wt : optional
        When given, the model is fitted with ``fit_regularized(L1_wt=L1_wt)``
        instead of the plain ``fit()``.
        NOTE(review): confirm that the installed statsmodels
        ``MixedLM.fit_regularized`` actually honours ``L1_wt``.

    Returns
    -------
    The fitted results object.
    """
    unfitted = mixedlm(f'{target_col} ~ {features_string}', df, groups=df[group_col])
    if L1_wt is not None:
        return unfitted.fit_regularized(L1_wt=L1_wt)
    return unfitted.fit()

def anova_test(ols_model):
    """Return the ``sm.stats.anova_lm`` table for a fitted model."""
    return sm.stats.anova_lm(ols_model)

def df_from_nested_dicts(dicts):
    """Turn ``{model: {target: {stat: value}}}`` into one wide DataFrame.

    Every innermost dict becomes a single column indexed by its stat names.
    Columns carry a three-level label ``(model + '_' + target, target, 0)``.
    """
    columns = {}
    for outer_key, per_target in dicts.items():
        for target_key, stats in per_target.items():
            # One-row frame transposed: stat names become the index.
            stat_column = pd.DataFrame(stats, index=[0]).T
            columns[outer_key + '_' + target_key] = pd.concat({target_key: stat_column}, axis=1)
    df = pd.concat(columns, axis=1)
    return df

def df_from_nested_dicts_group(dicts):
    """Turn ``{model: {target: {stat: value}}}`` into one wide DataFrame.

    Same construction as ``df_from_nested_dicts``: every innermost dict
    becomes a single column indexed by its stat names, labelled
    ``(model + '_' + target, target, 0)``.
    """
    columns = {}
    for outer_key, per_target in dicts.items():
        for target_key, stats in per_target.items():
            stat_column = pd.DataFrame(stats, index=[0]).T
            columns[outer_key + '_' + target_key] = pd.concat({target_key: stat_column}, axis=1)
    df = pd.concat(columns, axis=1)
    return df

def highlight_significance(df, threshold=0.05):
    """Return a Styler for ``df`` with significant cells highlighted in yellow.

    A cell is highlighted when it equals ``True`` or is smaller than
    ``threshold``. The rule is applied to every cell, so non-p-value entries
    below the threshold are highlighted as well.

    ``Styler.applymap`` was renamed to ``Styler.map`` in pandas 2.1; the new
    name is used when available, falling back to ``applymap`` on older pandas.
    """
    def _cell_css(x):
        return 'background-color : yellow' if x==True or x<threshold else ''

    styler = df.style
    elementwise = getattr(styler, 'map', None) or styler.applymap
    return elementwise(_cell_css)


def discrete_group_feature_stats(feats, df, target_col, categorical_feats=['diabetes_status', 'sex'], is_classification=False):
    """Fit a Probit model of ``target_col`` on ``feats`` and collect fit statistics.

    Parameters
    ----------
    feats : list of str
        Columns of ``df`` used as regressors; a constant column is added.
    df : pandas.DataFrame
    target_col : str
        Column of ``df`` used as the response.
    categorical_feats, is_classification
        Accepted for call compatibility; not used by this function.

    Returns
    -------
    (p_values, fii)
        ``p_values`` maps every model term to its p-value and additionally
        holds ``'aic'``, ``'bic'`` and ``'psuedo_r2'`` (``fii.prsquared``).
        ``fii`` is the fitted results object.

        Any error is printed as ``ERROR: ...`` instead of being raised. In
        that case ``p_values`` contains whatever was collected so far and
        ``fii`` is ``None`` if the fit itself did not complete (previously
        this path raised ``UnboundLocalError`` on the return statement).
    """
    p_values = {}
    fii = None  # stays None when the fit cannot be completed
    try:
        x = df[feats]
        y = df[target_col]
        x = add_constant(x)
        fii = Probit(y, x).fit()

        p_values.update(fii.pvalues.to_dict())
        p_values['aic'] = fii.aic
        p_values['bic'] = fii.bic
        p_values['psuedo_r2'] = fii.prsquared
    except Exception as e:
        print('ERROR:', e)

    return p_values, fii

# def weighted_group_feats(df, target_col, features_string, alpha_col):
#     alpha = df[alpha_col].values
#     model = wls(f'{target_col} ~ {features_string}', df, weights=(1/(1-alpha))).fit()
#     return model

def discrete_weighted_group_feats(df, target_col, feats, alpha_col):
    """Fit a binomial GLM with probit link, weighting observations by ``alpha_col``.

    Parameters
    ----------
    df : pandas.DataFrame
    target_col : str
        Response column.
    feats : list of str
        Regressor columns; a constant column is added.
    alpha_col : str
        Column whose values are passed to the GLM as ``var_weights``.

    Returns
    -------
    (p_values, fii)
        ``p_values`` maps every model term to its p-value and additionally
        holds ``'aic'``, ``'bic'`` and ``'psuedo_r2'``. The latter is computed
        here as ``1 - SSE/SST`` from ``fii.resid_response`` and the deviations
        of the response from its mean (the weights are not used in it).
        ``fii`` is the fitted results object.
    """
    alpha = df[alpha_col].values
    x = add_constant(df[feats])
    y = df[target_col]

    # NOTE(review): the link is passed as a class, not an instance - confirm
    # this matches the statsmodels version in use.
    family = sm.families.Binomial(sm.families.links.probit)
    fii = sm.GLM(y, x, family=family, var_weights=alpha).fit()

    p_values = {}
    p_values.update(fii.pvalues.to_dict())
    p_values['aic'] = fii.aic
    p_values['bic'] = fii.bic

    sst_val = sum(np.power(deviation, 2) for deviation in y - np.mean(y))
    sse_val = sum(np.power(residual, 2) for residual in fii.resid_response)
    p_values['psuedo_r2'] = 1 - (sse_val / sst_val)
    return p_values, fii

def normal_group_fit(df, target_col, features_string):
    """Fit OLS for ``target_col ~ features_string`` and collect fit statistics.

    Returns
    -------
    (p_values, fii)
        ``p_values`` maps every model term to its p-value and additionally
        holds ``'aic'``, ``'bic'`` and ``'psuedo_r2'`` (here the ordinary
        ``fii.rsquared``). ``fii`` is the fitted results object.
    """
    fii = ols(f'{target_col} ~ {features_string}', df).fit()

    p_values = dict(fii.pvalues.to_dict())
    p_values['aic'] = fii.aic
    p_values['bic'] = fii.bic
    p_values['psuedo_r2'] = fii.rsquared
    return p_values, fii

def weighted_group_feats(df, target_col, features_string, alpha_col):
    """Fit WLS for ``target_col ~ features_string`` weighted by ``alpha_col``.

    The values of ``df[alpha_col]`` are passed as the ``weights`` argument.

    Returns
    -------
    (p_values, fii)
        ``p_values`` maps every model term to its p-value and additionally
        holds ``'aic'``, ``'bic'`` and ``'psuedo_r2'`` (here ``fii.rsquared``).
        ``fii`` is the fitted results object.
    """
    alpha = df[alpha_col].values
    fii = wls(f'{target_col} ~ {features_string}', df, weights=alpha).fit()

    p_values = dict(fii.pvalues.to_dict())
    p_values['aic'] = fii.aic
    p_values['bic'] = fii.bic
    p_values['psuedo_r2'] = fii.rsquared
    return p_values, fii

In [4]:
# model.summary()

In [5]:
# import statsmodels.api as sm
# y = df[target_col]
# x = df[feats]
# logmodel=sm.GLM(y, x, family=sm.families.Binomial(sm.families.links.probit)).fit()
# logmodel.summary()

# model = Probit(y, x).fit()

In [6]:
# sst_val = sum(map(lambda x: np.power(x,2),y-np.mean(y))) 
# sse_val = sum(map(lambda x: np.power(x,2),model.resid_response)) 
# r2 = 1-(sse_val/sst_val)

In [7]:
# r2

In [8]:
# print(logmodel.resid_deviance, logmodel.null_deviance, model.resid_response, logmodel.resid_response)

In [9]:
# Segmentation-quality metrics averaged per model (one row per model below).
cols_to_fetch = [
    'sncc', 'ged',
    'iou_spleen', 'iou_liver',
    'dice_spleen', 'dice_liver',
    'surface_distance_avg_spleen', 'surface_distance_avg_liver',
]
dicts = model_evaluation_matrics(model_report_paths, cols_to_fetch)
# from_dict yields one column per model; transpose so that models become rows.
df_model_eval = pd.DataFrame.from_dict(dicts).T
df_model_eval

Unnamed: 0,sncc,ged,iou_spleen,iou_liver,dice_spleen,dice_liver,surface_distance_avg_spleen,surface_distance_avg_liver
full_bayesian_0dot01,0.440058,0.195726,0.77843,0.843838,0.876213,0.921414,0.878335,0.817338
MC_dropout_quicknat,0.400336,0.166126,0.821658,0.878743,0.897251,0.937764,0.915783,0.871014
probabilistic_quicknat,0.164762,0.228213,0.988126,0.992441,0.873734,0.925382,0.875181,0.827286
hierarchical_quicknat,0.188499,0.209206,0.983803,0.989027,0.883031,0.932817,0.89713,0.85492
UKB_MC_dropout_quicknat,,,0.730973,0.807269,,,,
UKB_probabilistic_quicknat,,,0.934133,0.967882,,,,


# Post Process reports

In [10]:
dfs = {}
common_vols = None

# Pass 1: load every KORA model report and intersect the volumes whose mean
# IoU exceeds 0.51 across all of them.
for model, path in model_report_paths.items():
    if 'dataset' in model or 'UKB' in model:
        continue
    print(model)
    df = rename(pd.read_csv(path))
    dfs[model] = df
    vols = df.loc[df['iou_mean'] > 0.51, 'volume_id'].values
    common_vols = vols if common_vols is None else np.intersect1d(common_vols, vols)
    print(common_vols.shape)

# Pass 2: restrict each report to the common volumes and selected columns,
# then z-score the two segmentation volumes. The explicit copy makes the
# column additions operate on an independent frame rather than on a slice.
for model, df in dfs.items():
    model_df = df.loc[df.volume_id.isin(common_vols), selected_model_feats].copy()
    model_df, normalised_cols = z_score_column_normalise(model_df, ['seg_spleen', 'seg_liver'])
    dfs[model] = model_df
    print('model shape:', dfs[model].shape)

# Raw dataset report. It must be filtered from `df_dataset` itself; filtering
# the leftover loop variable `df` would silently reuse the last model's report.
df_dataset = rename(pd.read_csv(model_report_paths['dataset_KORA_processed']))
dfs['dataset_KORA_processed'] = df_dataset.loc[
    df_dataset.volume_id.isin(common_vols), selected_dataset_feats
].copy()
print(dfs['dataset_KORA_processed'].shape)

## Change pre-diabetic state to diabetic
for model, df in dfs.items():
    df.loc[df['diabetes_status'] == 2, 'diabetes_status'] = 1
    dfs[model] = df
    dfs[model].to_csv(f'final_feats_{model}.csv', index=False)

full_bayesian_0dot01
(148,)
MC_dropout_quicknat
(147,)
probabilistic_quicknat
(147,)
hierarchical_quicknat
(147,)
model shape: (147, 32)
model shape: (147, 32)
model shape: (147, 32)
model shape: (147, 32)
(147, 5)


In [11]:
dfs_ukb = {}
common_vols = None

# Pass 1: load the UKB model reports and intersect the volumes whose mean IoU
# exceeds 0.51 across all of them.
for model, path in model_report_paths.items():
    if 'dataset' in model or 'UKB' not in model:
        continue
    print(model)
    df = rename(pd.read_csv(path))
    dfs_ukb[model] = df
    vols = df[df['iou_mean']>0.51].volume_id.values
    common_vols = vols if common_vols is None else np.intersect1d(common_vols, vols)
    print(common_vols.shape)

# Pass 2: keep the common volumes and selected columns, then z-score the two
# segmentation volumes.
for model in list(dfs_ukb):
    report = dfs_ukb[model]
    dfs_ukb[model] = report[report.volume_id.isin(common_vols)][selected_model_feats]
    df, normalised_cols = z_score_column_normalise(dfs_ukb[model], ['seg_spleen', 'seg_liver'])
    dfs_ukb[model] = df
    print('model shape:', dfs_ukb[model].shape)

# df_dataset = pd.read_csv(model_report_paths['dataset_KORA_processed'])
# df_dataset = rename(df_dataset)
# dfs['dataset_KORA_processed'] = df[df.volume_id.isin(common_vols)][selected_dataset_feats]
# print(dfs['dataset_KORA_processed'].shape)

## Change pre-diabetic state to diabetic
for model, df in dfs_ukb.items():
    is_pre_diabetic = df['diabetes_status']==2
    df.loc[is_pre_diabetic, 'diabetes_status'] = 1
    dfs_ukb[model] = df
    df.to_csv(f'final_feats_{model}.csv', index=False)

UKB_MC_dropout_quicknat
(18209,)
UKB_probabilistic_quicknat
(2265,)
model shape: (2265, 32)
model shape: (2265, 32)


# Equations

Eq-1: Diabetes_status = a0 +a1*age + a2*sex + a3*BMI

Eq-2: Diabetes_status = a0 +a1*age + a2*sex + a3*BMI + a4*seg_<spleen|liver>

Eq-3: Diabetes_status = a0 +a1*age + a2*sex + a3*BMI + a4*seg_<spleen|liver> + a5*iou_<spleen|liver>

Eq-4: Diabetes_status = a0 + a1*age + a2*sex + a3*BMI + a4*seg_<spleen|liver>, with every observation weighted by its IoU score (iou_<spleen|liver>)

Eq-5: Diabetes_status = a0 + a1*age + a2*sex + a3*BMI + a4*(seg_<spleen|liver>)_i

# Group features (eq 1, eq 2, eq 3 and eq 4)

In [12]:
feats = basic_feats
dicts = {}
anova_test_dicts = {}

# Regressor sets for the probit fits on the KORA reports.
seg_volume_feats = basic_feats + ['seg_spleen_normalised', 'seg_liver_normalised']
seg_and_iou_feats = seg_volume_feats + ['iou_spleen', 'iou_liver']
probit_specs = [
    ('_with_seg_volumes', seg_volume_feats),   # Equation 2
    ('_with_seg_and_iou', seg_and_iou_feats),  # Equation 3
]

for model, df in dfs.items():
    target_col = 'diabetes_status'
    model_stats = {}
    dicts[model] = model_stats

    if 'dataset' in model:
        # Equation 1: basic covariates only, fitted on the dataset entry.
        p_values, statsmodel = discrete_group_feature_stats(basic_feats, df, target_col, ['sex'], True)
        model_stats[target_col] = p_values
        continue

    # Equations 2 and 3: unweighted probit fits.
    for suffix, feats in probit_specs:
        p_values, statsmodel = discrete_group_feature_stats(feats, df, target_col, ['sex'], True)
        model_stats[target_col + suffix] = p_values

    # Equation 4: same regressors as Equation 2, observations weighted by iou_mean.
    feats = seg_volume_feats
    p_values, statsmodel = discrete_weighted_group_feats(df, target_col, feats, 'iou_mean')
    model_stats[target_col + '_with_regularised_seg_volumes'] = p_values

p_value_df = df_from_nested_dicts(dicts).T
p_value_df_styler = highlight_significance(p_value_df, 0.05)
p_value_df_styler

Optimization terminated successfully.
         Current function value: 0.453328
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.423270
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.438315
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.416056
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.446703
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.418020
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.447388
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.426626
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.466512
         Iterations 6


Unnamed: 0,Unnamed: 1,Unnamed: 2,age,aic,bic,bmi_numeric,const,iou_liver,iou_spleen,psuedo_r2,seg_liver_normalised,seg_spleen_normalised,sex
full_bayesian_0dot01_diabetes_status_with_seg_volumes,diabetes_status_with_seg_volumes,0,6.42342e-05,145.279,163.221,0.00216086,2.22206e-09,,,0.292918,0.0553561,0.401713,0.123875
full_bayesian_0dot01_diabetes_status_with_seg_and_iou,diabetes_status_with_seg_and_iou,0,3.17202e-05,140.441,164.365,0.0161631,0.00735714,0.0508004,0.0130456,0.339802,0.0177676,0.162197,0.245023
full_bayesian_0dot01_diabetes_status_with_regularised_seg_volumes,diabetes_status_with_regularised_seg_volumes,0,0.000236618,122.597,-593.054,0.00596156,1.42822e-07,,,0.356826,0.0885591,0.515395,0.171234
MC_dropout_quicknat_diabetes_status_with_seg_volumes,diabetes_status_with_seg_volumes,0,4.14119e-05,140.865,158.807,0.005537,9.64899e-09,,,0.316336,0.0128543,0.0863165,0.206989
MC_dropout_quicknat_diabetes_status_with_seg_and_iou,diabetes_status_with_seg_and_iou,0,5.58475e-05,138.321,162.244,0.00630988,0.10392,0.262835,0.0304617,0.351054,0.0125101,0.0267151,0.214105
MC_dropout_quicknat_diabetes_status_with_regularised_seg_volumes,diabetes_status_with_regularised_seg_volumes,0,0.000128616,123.442,-592.209,0.00988928,1.98462e-07,,,0.37671,0.0174434,0.137046,0.24153
probabilistic_quicknat_diabetes_status_with_seg_volumes,diabetes_status_with_seg_volumes,0,3.98198e-05,143.331,161.273,0.00468571,7.28572e-09,,,0.303252,0.0199145,0.372123,0.238113
probabilistic_quicknat_diabetes_status_with_seg_and_iou,diabetes_status_with_seg_and_iou,0,6.77854e-05,138.898,162.821,0.00532531,0.739283,0.0797722,0.0108944,0.34799,0.014919,0.18579,0.427834
probabilistic_quicknat_diabetes_status_with_regularised_seg_volumes,diabetes_status_with_regularised_seg_volumes,0,4.13435e-05,142.221,-573.43,0.00438872,1.88696e-08,,,0.363349,0.017079,0.392107,0.237873
hierarchical_quicknat_diabetes_status_with_seg_volumes,diabetes_status_with_seg_volumes,0,4.15443e-05,143.532,161.475,0.00850952,8.65908e-09,,,0.302184,0.0236925,0.371974,0.231841


# Outputs

A: Comparison of the base statsmodel with the DL-based statsmodels.

1. (SIMILARITY) As in the statsmodel trained on the raw dataset, all other statsmodels trained on the DL models' outputs (segmentation maps) show that features like AGE and BMI have a significant relation to DIABETES_STATUS.
2. (DIFFERENCE) Although SEX has a significant relation with DIABETES_STATUS in the statsmodel trained on the raw dataset, the statsmodels based on deep-learning outputs do not show the same.

B: Statsmodel with segmentations output analysis:

1. Full_bayesian does not show any significant relation between the segmentations and DIABETES_STATUS.
2. MC_dropout and Hierarchical_quicknat show a significant relation of SEG_LIVER_NORMALISED, but not of SEG_SPLEEN_NORMALISED, with DIABETES_STATUS.
3. Probabilistic_quicknat shows a significant relation of SEG_SPLEEN_NORMALISED, but not of SEG_LIVER_NORMALISED, with DIABETES_STATUS.
4. Considering the above results, 2/4 models show a significant relation between SEG_LIVER_NORMALISED and DIABETES_STATUS (MAJORITY).

C: Statsmodel with segmentations and iou output analysis.

1. With IOU_SPLEEN and IOU_LIVER added as input features alongside the features above, full_bayesian now shows a significant relation of SEG_LIVER_NORMALISED with DIABETES_STATUS.
2. MC_dropout and Hierarchical_quicknat strengthen their significance with DIABETES_STATUS.
3. Probabilistic_quicknat still does not show a significant relation of SEG_LIVER_NORMALISED, but its significance output strengthens.

D: Comparison between statsmodels with and without segmentations, and with and without iou_scores.

1. Compared to the model without spleen and liver segmentations, the models with segmentations are a better fit for predicting binomial diabetes_status, as per the pseudo-R-squared values.
2. As per the AIC score, all models with segmentation as a feature score lower than the model without segmentations as a feature, except full_bayesian, which misses slightly.
3. With the inclusion of the IOU_SCORES features, all models now show a better fit as per the AIC score, including full_bayesian (unlike in point 2).
4. Similarly, with the inclusion of IOU_SCORES, the pseudo-R-squared value improves by 4% with respect to the statsmodel with segmentation output only, and by 7-8% in comparison to the raw statsmodel.


In [13]:
feats = basic_feats
dicts = {}
anova_test_dicts = {}

# Regressor sets for the probit fits on the UKB reports.
seg_volume_feats = basic_feats + ['seg_spleen_normalised', 'seg_liver_normalised']
seg_and_iou_feats = seg_volume_feats + ['iou_spleen', 'iou_liver']
probit_specs = [
    ('_with_seg_volumes', seg_volume_feats),   # Equation 2
    ('_with_seg_and_iou', seg_and_iou_feats),  # Equation 3
]

for model, df in dfs_ukb.items():
    if 'UKB' not in model:
        continue
    target_col = 'diabetes_status'
    model_stats = {}
    dicts[model] = model_stats

    # Equation 1: basic covariates only, fitted for every UKB model.
    p_values, statsmodel = discrete_group_feature_stats(basic_feats, df, target_col, ['sex'], True)
    model_stats[target_col] = p_values
    if 'dataset' in model:
        continue

    # Equations 2 and 3: unweighted probit fits.
    for suffix, feats in probit_specs:
        p_values, statsmodel = discrete_group_feature_stats(feats, df, target_col, ['sex'], True)
        model_stats[target_col + suffix] = p_values

    # Equation 4.0: same regressors as Equation 2, observations weighted by iou_mean.
    feats = seg_volume_feats
    p_values, statsmodel = discrete_weighted_group_feats(df, target_col, feats, 'iou_mean')
    model_stats[target_col + '_reg_iou_mean'] = p_values

# #     Equation 4.1 process
#     feats = basic_feats + [ 'seg_spleen_normalised']
#     p_values, statsmodel = discrete_weighted_group_feats(df, target_col, feats, 'iou_spleen')
#     dicts[model][target_col+'_reg_iou_spleen'] = p_values

#     #     Equation 4.2 process
#     feats = basic_feats + [ 'seg_liver_normalised']
#     p_values, statsmodel = discrete_weighted_group_feats(df, target_col, feats, 'iou_liver')
#     dicts[model][target_col+'_reg_iou_liver'] = p_values

p_value_df = df_from_nested_dicts(dicts).T
p_value_df_styler = highlight_significance(p_value_df, 0.05)
p_value_df_styler

Optimization terminated successfully.
         Current function value: 0.192564
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.191015
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.190466
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.192564
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.191813
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.191789
         Iterations 7


Unnamed: 0,Unnamed: 1,Unnamed: 2,age,aic,bic,bmi_numeric,const,iou_liver,iou_spleen,psuedo_r2,seg_liver_normalised,seg_spleen_normalised,sex
UKB_MC_dropout_quicknat_diabetes_status,diabetes_status,0,0.00787408,880.317,903.218,1.12824e-13,1.24414e-21,,,0.0818751,,,0.000445689
UKB_MC_dropout_quicknat_diabetes_status_with_seg_volumes,diabetes_status_with_seg_volumes,0,0.00538947,877.299,911.651,1.24473e-11,7.60097e-21,,,0.089261,0.00926347,0.273414,0.00146554
UKB_MC_dropout_quicknat_diabetes_status_with_seg_and_iou,diabetes_status_with_seg_and_iou,0,0.00364919,878.812,924.614,8.96841e-12,7.13055e-11,0.782104,0.155872,0.0918794,0.00626578,0.14924,0.00150214
UKB_MC_dropout_quicknat_diabetes_status_reg_iou_mean,diabetes_status_reg_iou_mean,0,0.0121909,681.608,-16781.9,1.99368e-09,8.41616e-17,,,0.0404311,0.0202262,0.308807,0.00733467
UKB_probabilistic_quicknat_diabetes_status,diabetes_status,0,0.00787408,880.317,903.218,1.12824e-13,1.24414e-21,,,0.0818751,,,0.000445689
UKB_probabilistic_quicknat_diabetes_status_with_seg_volumes,diabetes_status_with_seg_volumes,0,0.00777941,880.915,915.267,1.30272e-13,1.2696199999999999e-21,,,0.0854554,0.076499,0.240173,0.000371224
UKB_probabilistic_quicknat_diabetes_status_with_seg_and_iou,diabetes_status_with_seg_and_iou,0,0.00764768,884.802,930.605,1.28522e-13,0.0129681,0.739094,0.846921,0.085574,0.133264,0.232979,0.000376546
UKB_probabilistic_quicknat_diabetes_status_reg_iou_mean,diabetes_status_reg_iou_mean,0,0.0118563,837.064,-16626.5,5.85401e-13,1.3220899999999998e-20,,,0.0378012,0.0805444,0.241515,0.000532664


# Regularised Group Feats (eq 4)

In [14]:
feats = basic_feats
dicts = {}
anova_test_dicts = {}

categorical = ['sex', 'diabetes_status']
# Each normalised segmentation volume is paired with the IoU column of its organ.
iou_col_for_target = {
    'seg_spleen_normalised': 'iou_spleen',
    'seg_liver_normalised': 'iou_liver',
}

for model, df in dfs.items():
    if 'dataset' in model:
        print('dataset cannot be processed!')
        continue

    model_stats = {}
    dicts[model] = model_stats
    for target_col, iou_col in iou_col_for_target.items():
        # Equation 2: OLS on the basic covariates plus diabetes status.
        # A fresh list is built for every call because make_feature_string
        # may consume entries of the list it is given.
        feats = basic_feats + ['diabetes_status']
        feats_string = make_feature_string(feats, categorical)
        p_values, statsmodel = normal_group_fit(df, target_col, feats_string)
        model_stats[target_col + '_with_seg_volumes'] = p_values

        # Equation 3: additionally include the organ's IoU as a regressor.
        feats = basic_feats + ['diabetes_status', iou_col]
        feats_string = make_feature_string(feats, categorical)
        p_values, statsmodel = normal_group_fit(df, target_col, feats_string)
        model_stats[target_col + '_with_seg_and_iou'] = p_values

        # Equation 4: regressors of Equation 2, observations weighted by the organ's IoU.
        feats = basic_feats + ['diabetes_status']
        feats_string = make_feature_string(feats, categorical)
        p_values, statsmodel = weighted_group_feats(df, target_col, feats_string, iou_col)
        model_stats[target_col + '_with_regularised_seg_volumes'] = p_values

p_value_df = df_from_nested_dicts(dicts).T
p_value_df.to_csv('to_seg.csv')
p_value_df_styler = highlight_significance(p_value_df, 0.05)
p_value_df_styler

dataset cannot be processed!


Unnamed: 0,Unnamed: 1,Unnamed: 2,"C(diabetes_status, Treatment)[T.1]","C(sex, Treatment)[T.1]",Intercept,age,aic,bic,bmi_numeric,iou_liver,iou_spleen,psuedo_r2
full_bayesian_0dot01_seg_spleen_normalised_with_seg_volumes,seg_spleen_normalised_with_seg_volumes,0,0.831708,7.10993e-05,0.0629043,0.0133766,377.617,392.57,2.21994e-05,,,0.286147
full_bayesian_0dot01_seg_spleen_normalised_with_seg_and_iou,seg_spleen_normalised_with_seg_and_iou,0,0.494864,0.000373559,3.67396e-05,0.0190193,362.011,379.954,2.5561e-06,,4.09161e-05,0.366723
full_bayesian_0dot01_seg_spleen_normalised_with_regularised_seg_volumes,seg_spleen_normalised_with_regularised_seg_volumes,0,0.901738,0.000132591,0.0832539,0.0200232,376.832,391.784,7.04509e-05,,,0.260111
full_bayesian_0dot01_seg_liver_normalised_with_seg_volumes,seg_liver_normalised_with_seg_volumes,0,0.053071,7.56286e-07,0.0669612,8.81387e-08,315.672,330.624,1.14521e-10,,,0.531621
full_bayesian_0dot01_seg_liver_normalised_with_seg_and_iou,seg_liver_normalised_with_seg_and_iou,0,0.0206533,5.34982e-07,0.0001141,2.45802e-08,305.174,323.117,1.55131e-12,0.000547783,,0.569796
full_bayesian_0dot01_seg_liver_normalised_with_regularised_seg_volumes,seg_liver_normalised_with_regularised_seg_volumes,0,0.0480241,7.82313e-08,0.0211651,2.05766e-07,304.592,319.544,1.45701e-11,,,0.549149
MC_dropout_quicknat_seg_spleen_normalised_with_seg_volumes,seg_spleen_normalised_with_seg_volumes,0,0.298486,5.04164e-05,0.00308023,0.0611889,371.304,386.256,1.82299e-07,,,0.316159
MC_dropout_quicknat_seg_spleen_normalised_with_seg_and_iou,seg_spleen_normalised_with_seg_and_iou,0,0.145872,0.000169981,1.83463e-05,0.117201,363.121,381.064,1.01471e-07,,0.00181038,0.361924
MC_dropout_quicknat_seg_spleen_normalised_with_regularised_seg_volumes,seg_spleen_normalised_with_regularised_seg_volumes,0,0.304442,4.23498e-05,0.00351615,0.0805364,368.962,383.914,3.64626e-07,,,0.30491
MC_dropout_quicknat_seg_liver_normalised_with_seg_volumes,seg_liver_normalised_with_seg_volumes,0,0.0140865,4.40084e-08,0.00951843,1.10787e-07,298.141,313.093,9.88919e-13,,,0.584276


In [15]:
feats = basic_feats
dicts = {}
anova_test_dicts = {}

categorical = ['sex', 'diabetes_status']
# Each normalised segmentation volume is paired with the IoU column of its organ.
iou_col_for_target = {
    'seg_spleen_normalised': 'iou_spleen',
    'seg_liver_normalised': 'iou_liver',
}

for model, df in dfs_ukb.items():
    if 'dataset' in model:
        print('dataset cannot be processed!')
        continue
    if 'UKB' not in model:
        continue

    model_stats = {}
    dicts[model] = model_stats
    for target_col, iou_col in iou_col_for_target.items():
        # Equation 1: OLS on the basic covariates plus diabetes status.
        # A fresh list is built for every call because make_feature_string
        # may consume entries of the list it is given.
        feats = basic_feats + ['diabetes_status']
        feats_string = make_feature_string(feats, categorical)
        p_values, statsmodel = normal_group_fit(df, target_col, feats_string)
        model_stats[target_col] = p_values

        # Equation 3: additionally include the organ's IoU as a regressor.
        feats = basic_feats + ['diabetes_status', iou_col]
        feats_string = make_feature_string(feats, categorical)
        p_values, statsmodel = normal_group_fit(df, target_col, feats_string)
        model_stats[target_col + '_with_iou'] = p_values

        # Equation 4: regressors of Equation 1, observations weighted by the organ's IoU.
        feats = basic_feats + ['diabetes_status']
        feats_string = make_feature_string(feats, categorical)
        p_values, statsmodel = weighted_group_feats(df, target_col, feats_string, iou_col)
        model_stats[target_col + '_with_regularised_seg_volumes'] = p_values

p_value_df = df_from_nested_dicts(dicts).T
# NOTE(review): the KORA cell (In [14]) writes the same file name, so this
# call replaces the KORA table on disk - confirm that is intended.
p_value_df.to_csv('to_seg.csv')
p_value_df_styler = highlight_significance(p_value_df, 0.05)
p_value_df_styler

Unnamed: 0,Unnamed: 1,Unnamed: 2,"C(diabetes_status, Treatment)[T.1.0]","C(sex, Treatment)[T.1]",Intercept,age,aic,bic,bmi_numeric,iou_liver,iou_spleen,psuedo_r2
UKB_MC_dropout_quicknat_seg_spleen_normalised,seg_spleen_normalised,0,0.713705,2.155e-10,0.0303443,0.00328474,6338.62,6367.25,2.59751e-10,,,0.0428401
UKB_MC_dropout_quicknat_seg_spleen_normalised_with_iou,seg_spleen_normalised_with_iou,0,0.934849,5.42973e-09,2.8569599999999997e-20,0.0340601,6171.77,6206.12,1.58441e-11,,1.6816899999999998e-38,0.1116
UKB_MC_dropout_quicknat_seg_spleen_normalised_with_regularised_seg_volumes,seg_spleen_normalised_with_regularised_seg_volumes,0,0.725957,9.13141e-11,0.0778206,0.00173398,6401.5,6430.12,8.61393e-10,,,0.0430317
UKB_MC_dropout_quicknat_seg_liver_normalised,seg_liver_normalised,0,0.011574,2.15691e-09,0.00025549,0.000362379,6269.87,6298.5,2.4094999999999998e-21,,,0.0714566
UKB_MC_dropout_quicknat_seg_liver_normalised_with_iou,seg_liver_normalised_with_iou,0,0.0121305,1.22577e-09,2.02018e-06,0.000493217,6261.97,6296.32,5.75449e-22,0.0016793,,0.075506
UKB_MC_dropout_quicknat_seg_liver_normalised_with_regularised_seg_volumes,seg_liver_normalised_with_regularised_seg_volumes,0,0.0116755,1.32315e-09,0.000312037,0.000449238,6247.29,6275.92,1.10012e-20,,,0.0701431
UKB_probabilistic_quicknat_seg_spleen_normalised,seg_spleen_normalised,0,0.563799,0.964382,0.1576,0.0116883,6430.07,6458.7,0.337278,,,0.00340279
UKB_probabilistic_quicknat_seg_spleen_normalised_with_iou,seg_spleen_normalised_with_iou,0,0.965901,0.537936,1.63078e-99,0.00675073,5394.52,5428.88,0.34264,,5.47643e-227,0.369654
UKB_probabilistic_quicknat_seg_spleen_normalised_with_regularised_seg_volumes,seg_spleen_normalised_with_regularised_seg_volumes,0,0.642308,0.927368,0.0752974,0.00946912,6365.08,6393.71,0.336813,,,0.00354168
UKB_probabilistic_quicknat_seg_liver_normalised,seg_liver_normalised,0,0.112436,0.36936,0.681506,0.118024,6430.85,6459.47,0.204856,,,0.00306205
