In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import glob
import nibabel as nb
import itertools
import json
import sys
import statsmodels.api as sm
from statsmodels.formula.api import ols, wls
from statsmodels.formula.api import mixedlm
from patsy.contrasts import Treatment

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
from statsmodels.discrete.discrete_model import Probit, MNLogit
from statsmodels.tools import add_constant
from sklearn.feature_selection import SelectKBest, SelectPercentile, SelectFdr
from sklearn.feature_selection import f_regression, mutual_info_regression

import seaborn as sns
sns.set(style="whitegrid")

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
np.random.seed(1024)

In [5]:
# NOTE(review): absolute, machine-specific path. It is not referenced in the cells
# visible here (the report paths defined below are relative to the working
# directory) — confirm whether it is still needed, or derive it from configuration.
base_dir = '/home/jyotirmay/Desktop/my_thesis'

In [6]:
# !pip install 

In [7]:
# Feature-name groups, organised by topic.
# NOTE(review): presumably these are column names of the merged report CSVs; the
# hyphenated spellings match the keys that rename() maps to underscore form — confirm.
smoking_feats = ['smoker_former', 'smoker_irregular', 'smoker_non_smoker', 'smoker_regular', 'smoking-packages']
# NOTE(review): 'bmi-who_pre-obisety' is spelled this way on purpose only if the
# CSV column is spelled the same — verify before "fixing" the typo.
bmi_feats = ['bmi-who_normal', 'bmi-who_obesity class I', 'bmi-who_obesity class II', 'bmi-who_obesity class III',
            'bmi_numeric', 'bmi-who_pre-obisety']
blood_pressure_feats = ['blood-pressure-diastolic', 'blood-pressure-systolic']
cholesterol_feats = ['cholesterol-hdl', 'cholesterol-ldl', 'cholesterol-total']
mri_feats = ['mri-liver-fat-artifacts', 'mri-liver-fat-lobus-dexter', 
             'mri-liver-fat-lobus-sinister', 'mri-liver-fat-portal-vein']
alcohol_feats = ['alcohol-g/day']
hbalc_feats = ['hba1c-mmol/mol', 'hba1c-percentage']
medication_feats = ['meds-antidiabetic', 'meds-antihypertensive', 'meds-incretin-mimetics', 'meds-insulin-therapy',
                    'meds-lipoprotein-lowering', 'meds-oral-antidiabetic']
triglyceride = ['triglyceride']
hypertension = ['hypertension']
basic_feats = ['age', 'height', 'sex', 'weight']

# Segmentation-derived columns: one aggregate column per organ plus ten
# numbered per-organ columns ('0_spleen' ... '9_spleen', '0_liver' ... '9_liver').
vols_feat = ['seg_liver', 'seg_spleen']
spleen_sample_cols = ['0_spleen','1_spleen','2_spleen','3_spleen','4_spleen','5_spleen','6_spleen','7_spleen','8_spleen','9_spleen']
liver_sample_cols = ['0_liver','1_liver','2_liver','3_liver','4_liver','5_liver','6_liver','7_liver','8_liver','9_liver']

# Nested feature groups (a list of lists) using the underscore names that rename()
# produces; flatten() turns a nested group into a single flat list.
# NOTE(review): 'diabetes_status_0/1/2' presumably refer to the dummy columns that
# transform_to_categorical() creates for 'diabetes_status' — confirm.
feats_from_paper_for_group_test = [['age', 'sex', 'bmi_numeric'],
             ['diabetes_status_0', 'diabetes_status_1', 'diabetes_status_2'], ['hypertension'], ['triglyceride'],
             ['cholesterol_hdl', 'cholesterol_ldl'],
             ['mri_liver_fat_artifacts', 'mri_liver_fat_lobus_dexter', 
              'mri_liver_fat_lobus_sinister', 'mri_liver_fat_portal_vein'],
             ['meds_lipoprotein_lowering'],
             ['smoker_former', 'smoker_non_smoker', 'smoker_regular']]

# Same groups as above, but with the single 'diabetes_status' column instead of
# its three dummy columns.
feats_from_paper_for_group_test_no_categorisation = [['age', 'sex', 'bmi_numeric'],
             ['diabetes_status'], ['hypertension'], ['triglyceride'],
             ['cholesterol_hdl', 'cholesterol_ldl'],
             ['mri_liver_fat_artifacts', 'mri_liver_fat_lobus_dexter', 
              'mri_liver_fat_lobus_sinister', 'mri_liver_fat_portal_vein'],
             ['meds_lipoprotein_lowering'],
             ['smoker_former', 'smoker_non_smoker', 'smoker_regular']]

# Wider set for per-feature tests: additionally lists blood pressure, total
# cholesterol, antihypertensive medication and alcohol intake.
feats_from_paper_for_individual_test = [['age', 'sex', 'bmi_numeric'],
             ['diabetes_status_0', 'diabetes_status_1', 'diabetes_status_2'], ['hypertension'], ['triglyceride'],
             ['blood_pressure_diastolic', 'blood_pressure_systolic'],
             ['cholesterol_hdl', 'cholesterol_ldl', 'cholesterol_total'],
             ['mri_liver_fat_artifacts', 'mri_liver_fat_lobus_dexter', 
              'mri_liver_fat_lobus_sinister', 'mri_liver_fat_portal_vein'],
             ['meds_lipoprotein_lowering', 'meds_antihypertensive'],
             ['smoker_former', 'smoker_non_smoker', 'smoker_regular'], ['alcohol_g_day']]

# Reduced feature set for the UKB data.
feats_from_paper_for_individual_test_ukb = [['age', 'sex', 'bmi_numeric'],
             ['diabetes_status']]

# NOTE(review): presumably the paper the "feats_from_paper_*" groups were taken from — confirm.
paper_link = 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0177154&type=printable'

In [8]:
# Each list below holds single-entry dicts of the form {run_name: csv_path}.
# All paths are relative to the current working directory.

# "concat_report_final" CSV per segmentation model.
model_merged_feats_path = [
    {'full_bayesian': './projects/full_bayesian/reports/full_bayesian_KORA_v2/KORA/10_1571866968.4002764_concat_report_final.csv'},
    {'full_bayesian_0dot01': './projects/full_bayesian/reports/full_bayesian_KORA_v4/KORA/10_1572514598.527084_concat_report_final.csv'},
    {'MC_dropout_quicknat': './projects/MC_dropout_quicknat/reports/MC_dropout_quicknat_KORA_v2/KORA/10_1572006141.7793334_concat_report_final.csv'}, 
    {'probabilistic_quicknat': './projects/probabilistic_quicknat/reports/probabilistic_quicknat_KORA_v2/KORA/10_1571996796.7963011_concat_report_final.csv'}, 
    {'hierarchical_quicknat': './projects/hierarchical_quicknat/reports/hierarchical_quicknat_KORA_v2/KORA/10_1571905560.9377904_concat_report_final.csv'}
]
# Dataset-level reports for the whole KORA set ("all_...").
all_dataset_merged_feats_path = [
    {'all_KORA_processed_False': './dataset_groups/whole_body_datasets/KORA/all_processed_False_concat_report_final.csv'}, 
    {'all_KORA_processed_True': './dataset_groups/whole_body_datasets/KORA/all_processed_True_concat_report_final.csv'}
]

# Dataset-level reports for the KORA test split ("test_...").
test_dataset_merged_feats_path = [
    {'test_KORA_processed_False': './dataset_groups/whole_body_datasets/KORA/test_processed_False_concat_report_final.csv'}, 
    {'test_KORA_processed_True': './dataset_groups/whole_body_datasets/KORA/test_processed_True_concat_report_final.csv'}
]

# The list actually iterated by the analysis cells: it repeats the five model
# entries of model_merged_feats_path and adds 'test_KORA_processed_True'.
# The commented-out entries point at a different machine's absolute paths.
all_paths = [
    {'full_bayesian': './projects/full_bayesian/reports/full_bayesian_KORA_v2/KORA/10_1571866968.4002764_concat_report_final.csv'},
    {'full_bayesian_0dot01': './projects/full_bayesian/reports/full_bayesian_KORA_v4/KORA/10_1572514598.527084_concat_report_final.csv'},
    {'MC_dropout_quicknat': './projects/MC_dropout_quicknat/reports/MC_dropout_quicknat_KORA_v2/KORA/10_1572006141.7793334_concat_report_final.csv'}, 
    {'probabilistic_quicknat': './projects/probabilistic_quicknat/reports/probabilistic_quicknat_KORA_v2/KORA/10_1571996796.7963011_concat_report_final.csv'}, 
    {'hierarchical_quicknat': './projects/hierarchical_quicknat/reports/hierarchical_quicknat_KORA_v2/KORA/10_1571905560.9377904_concat_report_final.csv'},
#     {'all_KORA_processed_False': '/home/abhijit/Jyotirmay/my_thesis/dataset_groups/whole_body_datasets/KORA/all_processed_False_concat_report_final.csv'}, 
#     {'all_KORA_processed_True': '/home/abhijit/Jyotirmay/my_thesis/dataset_groups/whole_body_datasets/KORA/all_processed_True_concat_report_final.csv'},
#     {'test_KORA_processed_False': '/home/abhijit/Jyotirmay/my_thesis/dataset_groups/whole_body_datasets/KORA/test_processed_False_concat_report_final.csv'}, 
    {'test_KORA_processed_True': './dataset_groups/whole_body_datasets/KORA/test_processed_True_concat_report_final.csv'}
]

# "final_report" CSV per segmentation model (same runs as above, different file suffix).
final_model_report_path = [
    {'full_bayesian': './projects/full_bayesian/reports/full_bayesian_KORA_v2/KORA/10_1571866968.4002764_final_report.csv'},
    {'full_bayesian_0dot01': './projects/full_bayesian/reports/full_bayesian_KORA_v4/KORA/10_1572514598.527084_final_report.csv'},
    {'MC_dropout_quicknat': './projects/MC_dropout_quicknat/reports/MC_dropout_quicknat_KORA_v2/KORA/10_1572006141.7793334_final_report.csv'}, 
    {'probabilistic_quicknat': './projects/probabilistic_quicknat/reports/probabilistic_quicknat_KORA_v2/KORA/10_1571996796.7963011_final_report.csv'}, 
    {'hierarchical_quicknat': './projects/hierarchical_quicknat/reports/hierarchical_quicknat_KORA_v2/KORA/10_1571905560.9377904_final_report.csv'}
]

# Report for the UKB dataset (only the MC-dropout model is listed).
ukb_paths = [
    {'MC_dropout_quicknat': './projects/MC_dropout_quicknat/reports/MC_dropout_quicknat_UKB_v2/UKB/10_1573078374.453554_concat_report_final.csv'}
]

# Short aliases for the run names used as dict keys above.
fb = 'full_bayesian'
fb01 = 'full_bayesian_0dot01'
mc = 'MC_dropout_quicknat'
pq = 'probabilistic_quicknat'
hq = 'hierarchical_quicknat'
af = 'all_KORA_processed_False'
at = 'all_KORA_processed_True'
tf = 'test_KORA_processed_False'
tt = 'test_KORA_processed_True'

In [9]:
def flatten(l):
    """Return one flat list with the items of every sub-iterable of ``l`` (one level deep)."""
    return list(itertools.chain.from_iterable(l))

def transform_to_categorical(df, categorical_features_list):
    """Append one-hot (dummy) columns for each listed feature.

    For every feature ``f`` the dummy columns are named ``f_<value>`` and are
    concatenated to the right of the frame; the original column is kept.
    Returns the widened frame (the input frame itself if the list is empty).
    """
    widened = df
    for feature in categorical_features_list:
        indicator_cols = pd.get_dummies(widened[feature], prefix=feature)
        widened = pd.concat([widened, indicator_cols], axis=1)
    return widened

def rename(df, cols_map=None):
    """Rename columns of ``df`` in place and return the same frame.

    When ``cols_map`` is None a built-in mapping is used that converts the
    hyphen/slash-containing column names to underscore names; otherwise the
    given ``{old: new}`` mapping is applied. Columns not in the mapping are
    left untouched.
    """
    if cols_map is not None:
        mapping = cols_map
    else:
        mapping = {
            'bmi-numeric': 'bmi_numeric',
            'blood-pressure-diastolic': 'blood_pressure_diastolic',
            'blood-pressure-systolic': 'blood_pressure_systolic',
            'cholesterol-hdl': 'cholesterol_hdl',
            'cholesterol-ldl': 'cholesterol_ldl',
            'cholesterol-total': 'cholesterol_total',
            'mri-liver-fat-artifacts': 'mri_liver_fat_artifacts',
            'mri-liver-fat-lobus-dexter': 'mri_liver_fat_lobus_dexter',
            'mri-liver-fat-lobus-sinister': 'mri_liver_fat_lobus_sinister',
            'mri-liver-fat-portal-vein': 'mri_liver_fat_portal_vein',
            'meds-lipoprotein-lowering': 'meds_lipoprotein_lowering',
            'meds-antihypertensive': 'meds_antihypertensive',
            'smoker_non-smoker': 'smoker_non_smoker',
            'alcohol-g/day': 'alcohol_g_day',
        }
    # In-place on purpose: callers may hold a reference to the same frame.
    df.rename(columns=mapping, inplace=True)
    return df

def z_score_column_normalise(df, column_list):
    """Z-score each listed column independently.

    For every column ``c`` a new column ``c_normalised`` is written into ``df``
    (in place) holding ``(c - mean(c)) / std(c)`` with the population standard
    deviation (``ddof=0``).

    Returns ``(df, {original_name: normalised_name})``.
    """
    normalised_cols_map = {name: name + '_normalised' for name in column_list}
    for source_name, target_name in normalised_cols_map.items():
        series = df[source_name]
        centred = series - series.mean()
        df[target_name] = centred / series.std(ddof=0)
    return df, normalised_cols_map

def z_score_group_normalise(df, cols_to_normalise):
    """Z-score the listed columns using one mean/std pooled over all of them.

    The mean and population standard deviation (``ddof=0``) are computed over
    every value in the listed columns together; each column ``c`` then gets a
    companion column ``normalised_c`` written into ``df`` (in place).

    Returns ``(df, {original_name: normalised_name})``.
    """
    pooled_values = df[cols_to_normalise].values
    mean = np.mean(pooled_values)
    std = np.std(pooled_values, ddof=0)

    normalised_cols_map = {}
    for name in cols_to_normalise:
        target_name = 'normalised_' + name
        normalised_cols_map[name] = target_name
        df[target_name] = (df[name] - mean) / std
    return df, normalised_cols_map

def pre_process_for_mlm(df, to_be_transpose_cols, value_name):
    """Reshape ``df`` from wide to long format for a mixed-effects model.

    The columns in ``to_be_transpose_cols`` are melted into two columns:
    ``variable`` (the original column name) and ``value_name`` (its value).
    Every other column is kept as an identifier column.

    The identifier columns keep the order they have in ``df``. Deriving them
    from a set difference made the output column order depend on hash
    ordering and therefore vary between interpreter runs.

    Parameters
    ----------
    df : pandas.DataFrame
    to_be_transpose_cols : list of str
        Columns to stack into long format.
    value_name : str
        Name of the resulting value column.

    Returns
    -------
    pandas.DataFrame
        The melted frame.
    """
    melted_cols = set(to_be_transpose_cols)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    id_cols = [col for col in dict.fromkeys(df.columns) if col not in melted_cols]
    return df.melt(id_vars=id_cols,
                   value_vars=to_be_transpose_cols,
                   value_name=value_name)

def split_diabetes_state(df):
    """Split rows by ``diabetes_status`` and give each group its own prefixed columns.

    Rows with status 0, 1 and 2 get every column name prefixed with
    ``normal_``, ``pre_diabetic_`` and ``diabetic_`` respectively; the three
    groups are then stacked with ``pd.concat``, so each row only has values in
    the columns of its own group.
    """
    status_prefixes = ((0, 'normal_'), (1, 'pre_diabetic_'), (2, 'diabetic_'))
    prefixed_groups = []
    for status, prefix in status_prefixes:
        group = df[df['diabetes_status'] == status]
        # Bind the prefix as a default argument so each lambda keeps its own value.
        prefixed_groups.append(group.rename(columns=lambda name, p=prefix: p + name))
    return pd.concat(prefixed_groups)

def plot_and_ttest(df, cols):
    """Box-plot and t-test every unordered pair of the given columns.

    For each pair the pair label is printed, a box plot of the two columns is
    drawn, an independent two-sample t-test is run on the non-null values, and
    the statistic and p-value are printed.

    Returns a dict mapping ``'<col_a> vs <col_b>'`` to the p-value.
    """
    p_value_by_pair = {}
    for first, second in itertools.combinations(cols, 2):
        pair_label = f'{first} vs {second}'
        print(pair_label)
        df[[first, second]].boxplot(rot=45)
        first_values = df[first].dropna().values
        second_values = df[second].dropna().values
        t_stat, p_val = stats.ttest_ind(first_values, second_values)

        print('ttest_score:', t_stat)
        print('p_value:', p_val)
        print('\n')
        p_value_by_pair[pair_label] = p_val
        plt.show()
    return p_value_by_pair

def model_evaluation_matrics(file_paths_dict, cols_to_fetch):
    """Average the requested metric columns of each model's report CSV.

    ``file_paths_dict`` maps a model name to a CSV path. For each file the
    first data row is skipped (``iloc[1:]``) and the column-wise mean of
    ``cols_to_fetch`` is taken.

    Returns a dict mapping model name to a Series of means.
    """
    return {
        model_name: pd.read_csv(report_path)[cols_to_fetch].iloc[1:].mean()
        for model_name, report_path in file_paths_dict.items()
    }

def individual_feature_stats(feats, df, target_col, categorical_feats=['diabetes_status', 'sex']):
    """Fit one OLS model per feature and collect p-values and coefficients.

    For each feature ``f`` the formula ``target_col ~ <f>`` is built with
    ``make_feature_string`` and fitted. Each term's p-value is stored under
    the term name (the intercept under ``f_Intercept``) and its coefficient
    under ``<key>_coeff``. ``fitting_score`` holds the R-squared of the most
    recently fitted feature, as it is overwritten on every iteration.

    A feature whose fit raises is reported with ``print`` and skipped.
    """
    p_values = {}
    for f in feats:
        try:
            formula_rhs = make_feature_string([f], categorical_feats)
            fitted = ols(f'{target_col} ~ {formula_rhs}', df).fit()
            term_pvalues = fitted.pvalues.to_dict()
            coefficients = fitted.params

            for term, p_val in term_pvalues.items():
                label = f + '_Intercept' if term == 'Intercept' else term
                p_values[label] = p_val
                p_values[label + '_coeff'] = coefficients[term]
            p_values['fitting_score'] = fitted.rsquared
        except Exception as e:
            print(e)

    return p_values

def make_feature_string(feats, categorical_cols=()):
    """Build the right-hand side of a patsy/statsmodels formula.

    The result always starts with the intercept term ``1``. Every entry of
    ``categorical_cols`` that is present in ``feats`` is emitted as a
    ``C(...)`` term, in the order of ``categorical_cols``; the remaining
    features follow as plain terms in their original order.

    Parameters
    ----------
    feats : list of str
        Feature (column) names. The list is NOT modified; it used to be
        stripped of its categorical entries as a side effect.
    categorical_cols : iterable of str or (str, reference) tuples
        A bare name produces ``C(name, Treatment)``; a tuple produces
        ``C(name, Treatment(reference=<reference>))`` with the reference
        interpolated verbatim.

    Returns
    -------
    str
        e.g. ``'1+C(sex, Treatment)+age+bmi_numeric'``.
    """
    # Work on a copy so the caller's list survives the call.
    remaining = list(feats)
    terms = ['1']
    for c_col in categorical_cols:
        if type(c_col) is tuple:
            name, reference = c_col[0], c_col[1]
            term = f'C({name}, Treatment(reference={reference}))'
        else:
            name = c_col
            term = f'C({name}, Treatment)'
        if name not in remaining:
            # Categorical column not requested by the caller: skip it.
            continue
        terms.append(term)
        remaining.remove(name)

    # Joining the parts avoids the old `is ''` identity test on a string literal.
    return '+'.join(terms + remaining)
    

def normal_group_fit(df, target_col, features_string):
    """Fit an OLS model ``target_col ~ features_string`` and plot its output.

    The fitted model is passed to ``plot_model_outputs`` together with the
    ``bmi_numeric`` column and the target values, labelled ``'OLS'``.
    Returns the fitted model.
    """
    formula = f'{target_col} ~ {features_string}'
    fitted = ols(formula, df).fit()
    bmi_values = df['bmi_numeric'].values
    target_values = df[target_col].values
    plot_model_outputs(bmi_values, target_values, fitted, 'OLS')
    return fitted

def weighted_group_feats(df, target_col, features_string, alpha_col):
    """Fit a weighted least-squares model ``target_col ~ features_string``.

    Each row is weighted by ``1 / (1 - alpha)`` where ``alpha`` is the row's
    value in ``alpha_col``. Returns the fitted model.
    """
    observation_weights = 1 / (1 - df[alpha_col].values)
    formula = f'{target_col} ~ {features_string}'
    return wls(formula, df, weights=observation_weights).fit()

def normal_mixed_effect_model(df, target_col, features_string, group_col, L1_wt=None):
    """Fit a linear mixed-effects model grouped by ``group_col``.

    With ``L1_wt=None`` the model is fitted with ``fit()``; otherwise
    ``fit_regularized(L1_wt=L1_wt)`` is used. Returns the fit result.
    """
    unfitted = mixedlm(f'{target_col} ~ {features_string}', df, groups=df[group_col])
    if L1_wt is not None:
        return unfitted.fit_regularized(L1_wt=L1_wt)
    return unfitted.fit()

def anova_test(ols_model):
    """Return the statsmodels ANOVA table (``anova_lm``) for a fitted OLS model."""
    return sm.stats.anova_lm(ols_model)

def df_from_nested_dicts(dicts):
    """Turn ``{outer: {inner: {stat: value}}}`` into one wide DataFrame.

    Every ``(outer, inner)`` pair becomes one column whose three-level label is
    ``(outer_inner, inner, 0)``; the rows are the stat names.
    """
    columns_by_label = {}
    for outer_key, inner_dicts in dicts.items():
        for inner_key, stat_values in inner_dicts.items():
            # One-row frame transposed: stats become the index, 0 the only column.
            stat_column = pd.DataFrame(stat_values, index=[0]).T
            columns_by_label[outer_key + '_' + inner_key] = pd.concat({inner_key: stat_column}, axis=1)
    return pd.concat(columns_by_label, axis=1)

def df_from_nested_dicts_group(dicts):
    """Turn ``{outer: {inner: {stat: value}}}`` into one wide DataFrame.

    Every ``(outer, inner)`` pair becomes one column labelled
    ``(outer_inner, inner, 0)``; the rows are the stat names.
    """
    labelled_columns = {}
    for outer_key, inner_dicts in dicts.items():
        for inner_key, stat_values in inner_dicts.items():
            label = outer_key + '_' + inner_key
            transposed = pd.DataFrame(stat_values, index=[0]).T
            labelled_columns[label] = pd.concat({inner_key: transposed}, axis=1)
    return pd.concat(labelled_columns, axis=1)

def highlight_significance(df, threshold=0.05):
    return df.style.applymap(lambda x: 'background-color : yellow' if x==True or x<threshold else '')

def choose_best_features(df, feats, target_col, percentile=50):
    """Select the features that pass an FDR-controlled univariate F-test.

    Fits ``SelectFdr(f_regression, alpha=0.05)`` on ``df[feats]`` against
    ``df[target_col]`` and returns the array of selected column names.
    ``percentile`` is accepted but not used by this implementation.
    """
    predictors = df[feats]
    selector = SelectFdr(f_regression, alpha=0.05)
    selector.fit(predictors, df[target_col])
    selected_mask = selector.get_support()
    return predictors.columns.values[selected_mask]
    

In [10]:
import statsmodels
def discrete_individual_feature_stats(feats, df, target_col, categorical_feats=['diabetes_status', 'sex'], is_classification=False):
    """Fit one multinomial logit per feature and collect p-values and coefficients.

    For each feature ``f`` an ``MNLogit`` of ``target_col`` on ``f`` plus a
    constant is fitted; the fit summary and the marginal-effects summary are
    printed. Results are stored under ``'<outcome>_<term>'`` (p-value) and
    ``'<outcome>_<term>_coeff'`` (coefficient), with the constant term named
    ``'<f>_Intercept'``. ``aic`` and ``bic`` are overwritten on every
    iteration, so they describe the most recently fitted feature.

    ``categorical_feats`` and ``is_classification`` are accepted but not used.
    A feature whose fit raises is reported with ``print`` and skipped.
    """
    p_values = {}
    for f in feats:
        try:
            predictor = df[f]
            response = df[target_col]
            design = add_constant(predictor)
            fitted = MNLogit(response, design).fit()
            marginal_effects = fitted.get_margeff()
            print(fitted.summary())
            print(marginal_effects.summary())
            pvalues_by_outcome = fitted.pvalues.to_dict()
            coefficients = fitted.params

            for outcome, term_pvalues in pvalues_by_outcome.items():
                for term, p_val in term_pvalues.items():
                    label = f + '_Intercept' if term == 'const' else term
                    p_values[str(outcome) + '_' + label] = p_val
                    p_values[str(outcome) + '_' + label + '_coeff'] = coefficients[outcome][term]

            p_values['aic'] = fitted.aic
            p_values['bic'] = fitted.bic
        except Exception as e:
            print('ERROR:', e)

    return p_values

def discrete_group_feature_stats(feats, df, target_col, categorical_feats=['diabetes_status', 'sex'], is_classification=False):
    """Fit one multinomial logit on all ``feats`` jointly and collect its statistics.

    An ``MNLogit`` of ``target_col`` on ``feats`` plus a constant is fitted;
    the fit summary and the marginal-effects summary are printed.

    Parameters
    ----------
    feats : list of str
        Predictor columns of ``df``.
    df : pandas.DataFrame
    target_col : str
        Outcome column of ``df``.
    categorical_feats, is_classification
        Accepted for signature compatibility; not used.

    Returns
    -------
    (dict, fit result or None)
        The dict maps ``'<outcome>_<term>'`` to the term's p-value and
        ``'<outcome>_<term>_coeff'`` to its coefficient, plus ``'aic'`` and
        ``'bic'``. If the fit fails the error is printed and ``({}, None)``
        (or the partially filled dict) is returned; the result variable was
        previously unbound in that case and the ``return`` raised
        ``UnboundLocalError``.
    """
    p_values = {}
    fii = None  # stays None if anything below fails before the fit completes
    try:
        x = df[feats]
        y = df[target_col]
        x = add_constant(x)
        model = MNLogit(y, x)
        fii = model.fit()
        marginal_effects = fii.get_margeff()
        print(fii.summary())
        print(marginal_effects.summary())
        feat_dict = fii.pvalues.to_dict()
        coeffs = fii.params

        for outcome, term_pvalues in feat_dict.items():
            for term, p_val in term_pvalues.items():
                p_values[str(outcome)+'_'+term] = p_val
                p_values[str(outcome)+'_'+term+'_coeff'] = coeffs[outcome][term]

        p_values['aic'] = fii.aic
        p_values['bic'] = fii.bic
    except Exception as e:
        print('ERROR:', e)

    return p_values, fii

def discrete_weighted_group_feats(df, target_col, feats, alpha_col):
    """Fit an MNLogit of ``target_col`` on ``feats`` plus a constant, print its summary and return the fit.

    ``1/(1-alpha)`` is passed to ``MNLogit`` as ``weights``, where ``alpha``
    holds the values of ``alpha_col``.

    NOTE(review): the outputs recorded in this notebook for the weighted fits
    are numerically identical to those of the unweighted fits on the same
    features, which suggests ``MNLogit`` does not use the ``weights`` keyword
    — confirm against the statsmodels documentation before relying on these
    results as "weighted".
    NOTE(review): a row with ``alpha == 1`` makes the weight expression divide
    by zero — confirm the value range of ``alpha_col``.
    """
    alpha = df[alpha_col].values
    x = df[feats]
    y = df[target_col]
    # Adds the intercept column ('const') to the design matrix.
    x = add_constant(x)
    model = MNLogit(y, x, weights=(1/(1-alpha))).fit()
    print(model.summary())
    return model

def significance_check(x):
    """Return True unless ``x`` is greater than 0.05 (so 0.05 itself counts as significant)."""
    return not x > 0.05
def group_feature_stats(features_string, df, target_col, return_model=False):
    """Fit one OLS model on all features jointly and collect p-values and coefficients.

    Parameters
    ----------
    features_string : str
        Right-hand side of the formula (e.g. from ``make_feature_string``).
    df : pandas.DataFrame
    target_col : str
        Left-hand side of the formula.
    return_model : bool, default False
        Also return the unfitted model object.

    Returns
    -------
    (dict, fit) or (dict, fit, model)
        The dict maps each term to its p-value and ``'<term>_coeff'`` to its
        coefficient. If building or fitting the model fails, the error is
        printed and ``{}`` is returned with ``None`` for whatever was not
        created; ``model`` was previously unbound in that case and
        ``return_model=True`` raised ``UnboundLocalError``.
    """
    fii, model = None, None
    dicts = {}
    try:
        model = ols(f'{target_col} ~ {features_string}', df)
        fii = model.fit()
        p_values = fii.pvalues.to_dict()
        coeffs = fii.params.to_dict()
        for term, p_val in p_values.items():
            dicts[term] = p_val
            dicts[term+'_coeff'] = coeffs[term]

    except Exception as e:
        print(e)
    if return_model:
        return dicts, fii, model
    return dicts, fii

In [11]:
# Collapse each list of single-entry {run_name: path} dicts into one flat
# {run_name: path} dict (a later duplicate key would override an earlier one).
model_merged_feats_path_combined = dict(pair for entry in all_paths for pair in entry.items())
only_models_final_report_path = dict(pair for entry in final_model_report_path for pair in entry.items())
ukb_paths_conbined = dict(pair for entry in ukb_paths for pair in entry.items())

In [13]:
# Metric columns to read from each model's final report.
cols_to_fetch = ['sncc', 'ged', 'iou_spleen', 'iou_liver', 'dice_spleen',
       'dice_liver', 'surface_distance_avg_spleen', 'surface_distance_avg_liver']
# Per-model mean of every metric (the helper skips the first row of each report).
dicts = model_evaluation_matrics(only_models_final_report_path, cols_to_fetch)
df_model_eval = pd.DataFrame.from_dict(dicts)
# Transpose so that each row is a model and each column a metric.
df_model_eval = df_model_eval.T
df_model_eval

Unnamed: 0,sncc,ged,iou_spleen,iou_liver,dice_spleen,dice_liver,surface_distance_avg_spleen,surface_distance_avg_liver
full_bayesian,0.604205,0.223997,0.524697,0.66866,0.828703,0.894704,0.809946,0.736798
full_bayesian_0dot01,0.439956,0.195701,0.778342,0.84361,0.87628,0.921408,0.87853,0.817305
MC_dropout_quicknat,0.399954,0.166135,0.821329,0.878295,0.897294,0.937717,0.915982,0.870747
probabilistic_quicknat,0.163864,0.228308,0.988114,0.992439,0.873666,0.925428,0.875153,0.827462
hierarchical_quicknat,0.187984,0.209171,0.983818,0.988998,0.883146,0.932802,0.897351,0.854833


# Baseline Stats (with input data)

# Group features Stats

In [14]:
# NOTE: the flattened paper feature list is immediately replaced by the
# three-feature list on the next line, so only age/sex/bmi_numeric are used.
feats = flatten(feats_from_paper_for_group_test_no_categorisation)
feats = ['age', 'sex', 'bmi_numeric']
dicts = {}
# Initialised here but not filled in this cell.
anova_test_dicts = {}
for key, value in model_merged_feats_path_combined.items():
#     if 'KORA' in key:
#         print('dataset cannot be processed!')
#         continue
    df = pd.read_csv(value)
    df = rename(df)
    # Adds 'seg_spleen_normalised' and 'seg_liver_normalised'.
    df, normalised_cols = z_score_column_normalise(df, ['seg_spleen', 'seg_liver'])
    # Missing values are replaced by 0 before fitting.
    df = df.fillna(0)

    dicts[key] = {}
    
    target_col = 'diabetes_status'
    best_feats_spleen = feats 
    # Model 1: diabetes_status ~ age + sex + bmi_numeric (no segmentation volumes).
    p_value_dict_spleen, model = discrete_group_feature_stats(best_feats_spleen, df, target_col, ['sex'], True)

    dicts[key][target_col] = p_value_dict_spleen
    
    # Model 2: same predictors plus the two normalised segmentation volumes.
    feats_ = feats + [ 'seg_spleen_normalised', 'seg_liver_normalised']
    p_value_dict_spleen, model = discrete_group_feature_stats(feats_, df, target_col, ['sex'], True)

    dicts[key][target_col+'_with_seg_volumes'] = p_value_dict_spleen

# One row per (run, model variant); cells below the 0.05 threshold are highlighted.
p_value_df = df_from_nested_dicts(dicts).T
p_value_df_styler = highlight_significance(p_value_df, 0.05)
p_value_df_styler

Optimization terminated successfully.
         Current function value: 0.683834
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:        diabetes_status   No. Observations:                  153
Model:                        MNLogit   Df Residuals:                      145
Method:                           MLE   Df Model:                            6
Date:                Thu, 28 Nov 2019   Pseudo R-squ.:                  0.2151
Time:                        13:18:07   Log-Likelihood:                -104.63
converged:                       True   LL-Null:                       -133.30
Covariance Type:            nonrobust   LLR p-value:                 1.555e-10
diabetes_status=1       coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const               -12.6741      2.359     -5.372      0.000     -17.298      -8.050
age     

                          MNLogit Regression Results                          
Dep. Variable:        diabetes_status   No. Observations:                  153
Model:                        MNLogit   Df Residuals:                      145
Method:                           MLE   Df Model:                            6
Date:                Thu, 28 Nov 2019   Pseudo R-squ.:                  0.2151
Time:                        13:18:07   Log-Likelihood:                -104.63
converged:                       True   LL-Null:                       -133.30
Covariance Type:            nonrobust   LLR p-value:                 1.555e-10
diabetes_status=1       coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const               -12.6741      2.359     -5.372      0.000     -17.298      -8.050
age                   0.0865      0.026      3.284      0.001       0.035       0.138
sex                   0.

Optimization terminated successfully.
         Current function value: 0.683834
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:        diabetes_status   No. Observations:                  153
Model:                        MNLogit   Df Residuals:                      145
Method:                           MLE   Df Model:                            6
Date:                Thu, 28 Nov 2019   Pseudo R-squ.:                  0.2151
Time:                        13:18:07   Log-Likelihood:                -104.63
converged:                       True   LL-Null:                       -133.30
Covariance Type:            nonrobust   LLR p-value:                 1.555e-10
diabetes_status=1       coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const               -12.6741      2.359     -5.372      0.000     -17.298      -8.050
age     

Optimization terminated successfully.
         Current function value: 0.734952
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:        diabetes_status   No. Observations:                  155
Model:                        MNLogit   Df Residuals:                      147
Method:                           MLE   Df Model:                            6
Date:                Thu, 28 Nov 2019   Pseudo R-squ.:                  0.1771
Time:                        13:18:07   Log-Likelihood:                -113.92
converged:                       True   LL-Null:                       -138.44
Covariance Type:            nonrobust   LLR p-value:                 7.291e-09
diabetes_status=1       coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const               -10.4646      2.394     -4.371      0.000     -15.157      -5.772
age     

Unnamed: 0,Unnamed: 1,Unnamed: 2,0_age,0_age_coeff,0_bmi_numeric,0_bmi_numeric_coeff,0_const,0_const_coeff,0_seg_liver_normalised,0_seg_liver_normalised_coeff,0_seg_spleen_normalised,0_seg_spleen_normalised_coeff,0_sex,0_sex_coeff,1_age,1_age_coeff,1_bmi_numeric,1_bmi_numeric_coeff,1_const,1_const_coeff,1_seg_liver_normalised,1_seg_liver_normalised_coeff,1_seg_spleen_normalised,1_seg_spleen_normalised_coeff,1_sex,1_sex_coeff,aic,bic
full_bayesian_diabetes_status,diabetes_status,0,0.00102185,0.0864784,1.33576e-05,0.215348,7.76792e-08,-12.6741,,,,,0.0747735,0.882643,0.00273775,0.0990378,4.98666e-05,0.246388,4.98628e-07,-15.6458,,,,,0.0172142,1.73911,225.253,249.497
full_bayesian_diabetes_status_with_seg_volumes,diabetes_status_with_seg_volumes,0,0.00103513,0.0952656,0.000748393,0.191317,2.16153e-07,-12.413,0.28744,0.356579,0.679883,-0.107057,0.155577,0.761931,0.00273092,0.109231,0.00114728,0.22229,9.03486e-07,-15.4421,0.350017,0.344292,0.849347,-0.0568916,0.0403522,1.58231,231.838,268.204
full_bayesian_0dot01_diabetes_status,diabetes_status,0,0.00102185,0.0864784,1.33576e-05,0.215348,7.76792e-08,-12.6741,,,,,0.0747735,0.882643,0.00273775,0.0990378,4.98666e-05,0.246388,4.98628e-07,-15.6458,,,,,0.0172142,1.73911,225.253,249.497
full_bayesian_0dot01_diabetes_status_with_seg_volumes,diabetes_status_with_seg_volumes,0,0.00163936,0.0950628,0.00175447,0.185088,2.71138e-07,-12.1578,0.215139,0.431851,0.531631,-0.173756,0.211894,0.676663,0.000537572,0.149809,0.0476895,0.149992,3.79952e-06,-15.6013,0.0106133,1.16935,0.334279,-0.318377,0.152776,1.13772,225.825,262.19
MC_dropout_quicknat_diabetes_status,diabetes_status,0,0.00102185,0.0864784,1.33576e-05,0.215348,7.76792e-08,-12.6741,,,,,0.0747735,0.882643,0.00273775,0.0990378,4.98666e-05,0.246388,4.98628e-07,-15.6458,,,,,0.0172142,1.73911,225.253,249.497
MC_dropout_quicknat_diabetes_status_with_seg_volumes,diabetes_status_with_seg_volumes,0,0.001813,0.0979487,0.002014,0.194832,3.8542e-07,-12.5944,0.0765,0.700763,0.0717982,-0.500433,0.252937,0.65975,0.000225926,0.171521,0.141591,0.122617,8.66985e-06,-15.9048,0.00107712,1.66542,0.108415,-0.526242,0.377906,0.733379,218.838,255.203
probabilistic_quicknat_diabetes_status,diabetes_status,0,0.00102185,0.0864784,1.33576e-05,0.215348,7.76792e-08,-12.6741,,,,,0.0747735,0.882643,0.00273775,0.0990378,4.98666e-05,0.246388,4.98628e-07,-15.6458,,,,,0.0172142,1.73911,225.253,249.497
probabilistic_quicknat_diabetes_status_with_seg_volumes,diabetes_status_with_seg_volumes,0,0.00112891,0.0981565,0.00217581,0.177966,3.93095e-07,-12.0697,0.355624,-0.253859,0.10102,0.590105,0.316961,0.566935,0.00047654,0.147536,0.0391392,0.152045,4.74571e-06,-15.3487,0.305199,-0.332692,0.00571058,1.26311,0.303087,0.846478,224.284,260.649
hierarchical_quicknat_diabetes_status,diabetes_status,0,0.00102185,0.0864784,1.33576e-05,0.215348,7.76792e-08,-12.6741,,,,,0.0747735,0.882643,0.00273775,0.0990378,4.98666e-05,0.246388,4.98628e-07,-15.6458,,,,,0.0172142,1.73911,225.253,249.497
hierarchical_quicknat_diabetes_status_with_seg_volumes,diabetes_status_with_seg_volumes,0,0.0015468,0.0973214,0.00381125,0.175636,4.89593e-07,-11.9554,0.141602,0.560296,0.497264,-0.185467,0.293764,0.589163,0.000274582,0.1679,0.159679,0.112362,9.01007e-06,-15.4295,0.0020321,1.54347,0.221091,-0.398987,0.333521,0.790126,221.772,258.137


# Regularised (weighted) group feature stats test

In [15]:
# NOTE: the flattened paper feature list is immediately replaced by the
# three-feature list on the next line, so only age/sex/bmi_numeric are used.
feats = flatten(feats_from_paper_for_group_test_no_categorisation)
feats = ['age', 'sex', 'bmi_numeric']
dicts = {}
# Initialised per run below but never filled with results in this cell.
anova_test_dicts = {}
for key, value in model_merged_feats_path_combined.items():
    # Skip the dataset-level entries (keys containing 'KORA'); only model runs are fitted.
    if 'KORA' in key:
#         print('dataset cannot be processed!')
        continue

    df = pd.read_csv(value)
    df = rename(df)
    # Adds dummy columns for diabetes_status and sex.
    # NOTE(review): the fits below use the original columns, not these dummies — confirm they are needed.
    df = transform_to_categorical(df, ['diabetes_status', 'sex'])
    df, normalised_cols = z_score_column_normalise(df, ['seg_spleen', 'seg_liver'])
    # Missing values are replaced by 0 before fitting.
    df = df.fillna(0)

    dicts[key] = {}
    anova_test_dicts[key] = {}
    
    target_col = 'diabetes_status'
    best_feats_spleen = feats
    # A copy is passed because make_feature_string removes entries from the list it receives.
    # NOTE: feature_string is not used afterwards in this cell.
    feature_string =  make_feature_string(list(best_feats_spleen), [ 'sex'])

    # Model 1: age/sex/bmi_numeric, with weights derived from 'iou_spleen'.
    model = discrete_weighted_group_feats(df, target_col, feats , 'iou_spleen')
    feat_dict = model.pvalues.to_dict()
    coeffs = model.params
    dicts[key][target_col] = {}
    # Store '<outcome>_<term>' -> p-value and '<outcome>_<term>_coeff' -> coefficient.
    for ko, vo in feat_dict.items():
             for k, v in vo.items():
                orig_key = k
#                 if k == 'const':
#                     k = f+'_Intercept'
                dicts[key][target_col][str(ko)+'_'+k] = v
                dicts[key][target_col][str(ko)+'_'+k+'_coeff'] = coeffs[ko][orig_key]

    dicts[key][target_col]['aic'] = model.aic
    dicts[key][target_col]['bic'] = model.bic
    
    # Model 2: same predictors plus the normalised volumes, with weights derived from 'iou_liver'.
    feats_ = feats + [ 'seg_spleen_normalised', 'seg_liver_normalised']
    print(feats_)
    model = discrete_weighted_group_feats(df, target_col, feats_ , 'iou_liver')
#     result = anova_test(model)
#     dicts[key][target_col] = model.pvalues.to_dict()
    feat_dict = model.pvalues.to_dict()
    coeffs = model.params
    dicts[key][target_col+'_seg_volumes'] = {}
    # Same extraction as for model 1, stored under the '<target>_seg_volumes' key.
    for ko, vo in feat_dict.items():
             for k, v in vo.items():
                orig_key = k
#                 if k == 'const':
#                     k = f+'_Intercept'
                dicts[key][target_col+'_seg_volumes'][str(ko)+'_'+k] = v
                dicts[key][target_col+'_seg_volumes'][str(ko)+'_'+k+'_coeff'] = coeffs[ko][orig_key]

    dicts[key][target_col+'_seg_volumes']['aic'] = model.aic
    dicts[key][target_col+'_seg_volumes']['bic'] = model.bic

# NOTE(review): the recorded table below matches the unweighted table of the
# previous section value for value — check that the weights are actually applied.
p_value_df = df_from_nested_dicts(dicts).T
p_value_df_styler = highlight_significance(p_value_df, 0.05)
p_value_df_styler

Optimization terminated successfully.
         Current function value: 0.683834
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:        diabetes_status   No. Observations:                  153
Model:                        MNLogit   Df Residuals:                      145
Method:                           MLE   Df Model:                            6
Date:                Thu, 28 Nov 2019   Pseudo R-squ.:                  0.2151
Time:                        13:18:13   Log-Likelihood:                -104.63
converged:                       True   LL-Null:                       -133.30
Covariance Type:            nonrobust   LLR p-value:                 1.555e-10
diabetes_status=1       coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const               -12.6741      2.359     -5.372      0.000     -17.298      -8.050
age     

                          MNLogit Regression Results                          
Dep. Variable:        diabetes_status   No. Observations:                  153
Model:                        MNLogit   Df Residuals:                      141
Method:                           MLE   Df Model:                           10
Date:                Thu, 28 Nov 2019   Pseudo R-squ.:                  0.2692
Time:                        13:18:13   Log-Likelihood:                -97.419
converged:                       True   LL-Null:                       -133.30
Covariance Type:            nonrobust   LLR p-value:                 2.024e-11
    diabetes_status=1       coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                   -12.5944      2.481     -5.076      0.000     -17.457      -7.731
age                       0.0979      0.031      3.119      0.002       0.036       0.159
sex     

Unnamed: 0,Unnamed: 1,Unnamed: 2,0_age,0_age_coeff,0_bmi_numeric,0_bmi_numeric_coeff,0_const,0_const_coeff,0_seg_liver_normalised,0_seg_liver_normalised_coeff,0_seg_spleen_normalised,0_seg_spleen_normalised_coeff,0_sex,0_sex_coeff,1_age,1_age_coeff,1_bmi_numeric,1_bmi_numeric_coeff,1_const,1_const_coeff,1_seg_liver_normalised,1_seg_liver_normalised_coeff,1_seg_spleen_normalised,1_seg_spleen_normalised_coeff,1_sex,1_sex_coeff,aic,bic
full_bayesian_diabetes_status,diabetes_status,0,0.00102185,0.0864784,1.33576e-05,0.215348,7.76792e-08,-12.6741,,,,,0.0747735,0.882643,0.00273775,0.0990378,4.98666e-05,0.246388,4.98628e-07,-15.6458,,,,,0.0172142,1.73911,225.253,249.497
full_bayesian_diabetes_status_seg_volumes,diabetes_status_seg_volumes,0,0.00103513,0.0952656,0.000748393,0.191317,2.16153e-07,-12.413,0.28744,0.356579,0.679883,-0.107057,0.155577,0.761931,0.00273092,0.109231,0.00114728,0.22229,9.03486e-07,-15.4421,0.350017,0.344292,0.849347,-0.0568916,0.0403522,1.58231,231.838,268.204
full_bayesian_0dot01_diabetes_status,diabetes_status,0,0.00102185,0.0864784,1.33576e-05,0.215348,7.76792e-08,-12.6741,,,,,0.0747735,0.882643,0.00273775,0.0990378,4.98666e-05,0.246388,4.98628e-07,-15.6458,,,,,0.0172142,1.73911,225.253,249.497
full_bayesian_0dot01_diabetes_status_seg_volumes,diabetes_status_seg_volumes,0,0.00163936,0.0950628,0.00175447,0.185088,2.71138e-07,-12.1578,0.215139,0.431851,0.531631,-0.173756,0.211894,0.676663,0.000537572,0.149809,0.0476895,0.149992,3.79952e-06,-15.6013,0.0106133,1.16935,0.334279,-0.318377,0.152776,1.13772,225.825,262.19
MC_dropout_quicknat_diabetes_status,diabetes_status,0,0.00102185,0.0864784,1.33576e-05,0.215348,7.76792e-08,-12.6741,,,,,0.0747735,0.882643,0.00273775,0.0990378,4.98666e-05,0.246388,4.98628e-07,-15.6458,,,,,0.0172142,1.73911,225.253,249.497
MC_dropout_quicknat_diabetes_status_seg_volumes,diabetes_status_seg_volumes,0,0.001813,0.0979487,0.002014,0.194832,3.8542e-07,-12.5944,0.0765,0.700763,0.0717982,-0.500433,0.252937,0.65975,0.000225926,0.171521,0.141591,0.122617,8.66985e-06,-15.9048,0.00107712,1.66542,0.108415,-0.526242,0.377906,0.733379,218.838,255.203
probabilistic_quicknat_diabetes_status,diabetes_status,0,0.00102185,0.0864784,1.33576e-05,0.215348,7.76792e-08,-12.6741,,,,,0.0747735,0.882643,0.00273775,0.0990378,4.98666e-05,0.246388,4.98628e-07,-15.6458,,,,,0.0172142,1.73911,225.253,249.497
probabilistic_quicknat_diabetes_status_seg_volumes,diabetes_status_seg_volumes,0,0.00112891,0.0981565,0.00217581,0.177966,3.93095e-07,-12.0697,0.355624,-0.253859,0.10102,0.590105,0.316961,0.566935,0.00047654,0.147536,0.0391392,0.152045,4.74571e-06,-15.3487,0.305199,-0.332692,0.00571058,1.26311,0.303087,0.846478,224.284,260.649
hierarchical_quicknat_diabetes_status,diabetes_status,0,0.00102185,0.0864784,1.33576e-05,0.215348,7.76792e-08,-12.6741,,,,,0.0747735,0.882643,0.00273775,0.0990378,4.98666e-05,0.246388,4.98628e-07,-15.6458,,,,,0.0172142,1.73911,225.253,249.497
hierarchical_quicknat_diabetes_status_seg_volumes,diabetes_status_seg_volumes,0,0.0015468,0.0973214,0.00381125,0.175636,4.89593e-07,-11.9554,0.141602,0.560296,0.497264,-0.185467,0.293764,0.589163,0.000274582,0.1679,0.159679,0.112362,9.01007e-06,-15.4295,0.0020321,1.54347,0.221091,-0.398987,0.333521,0.790126,221.772,258.137


# Normalised Mixed Effect Model

In [16]:
# Fit one model per segmentation-model dataset on the z-score-normalised
# spleen and liver sample columns plus the base covariates, and collect the
# per-term statistics and the fit's R^2 into one table.

# Base covariates used in every fit. (The earlier assignment from
# feats_from_paper_for_group_test_no_categorisation was overwritten on the
# very next line and never used, so it has been dropped.)
feats = ['age', 'sex', 'bmi_numeric']
target_col = 'diabetes_status'  # loop-invariant, so set once

dicts = {}
for key, value in model_merged_feats_path_combined.items():
    if 'KORA' in key:
        # Entries whose key mentions KORA are skipped by this analysis.
        print('dataset cannot be processed!')
        continue

    # NOTE(review): every missing value is replaced with 0 before fitting,
    # including covariates such as age and bmi_numeric - confirm that this
    # imputation is intended.
    df = rename(pd.read_csv(value)).fillna(0)

    # Normalise the spleen sample columns first, then the liver sample
    # columns on the returned frame; df_liver is the frame handed to the fit.
    # Presumably it carries both sets of normalised columns - confirm against
    # z_score_group_normalise.
    df_spleen, spleen_normalised_cols_map = z_score_group_normalise(df, spleen_sample_cols)
    df_liver, liver_normalised_cols_map = z_score_group_normalise(df_spleen, liver_sample_cols)

    # Despite the "_spleen" suffix (kept so later cells that read these
    # notebook-global names keep working), this list holds the base
    # covariates plus BOTH the spleen and the liver normalised columns.
    best_feats_spleen = (
        feats
        + list(spleen_normalised_cols_map.values())
        + list(liver_normalised_cols_map.values())
    )
    # 'sex' is passed separately; the fitted terms show it entering the
    # formula as C(sex, Treatment).
    feature_string = make_feature_string(list(best_feats_spleen), ['sex'])
    p_value_dict_spleen, model = group_feature_stats(feature_string, df_liver, target_col)

    # Store the model's R^2 next to the per-term statistics.
    p_value_dict_spleen['fitting_score'] = model.rsquared
    dicts[key] = {target_col: p_value_dict_spleen}

p_value_df = df_from_nested_dicts(dicts).T
# highlight_significance is defined elsewhere; presumably it styles the
# entries that fall below the 0.05 threshold - TODO confirm.
p_value_df_styler = highlight_significance(p_value_df, 0.05)

p_value_df_styler

dataset cannot be processed!


Unnamed: 0,Unnamed: 1,Unnamed: 2,Intercept,Intercept_coeff,"C(sex, Treatment)[T.1]","C(sex, Treatment)[T.1]_coeff",age,age_coeff,bmi_numeric,bmi_numeric_coeff,normalised_0_spleen,normalised_0_spleen_coeff,normalised_1_spleen,normalised_1_spleen_coeff,normalised_2_spleen,normalised_2_spleen_coeff,normalised_3_spleen,normalised_3_spleen_coeff,normalised_4_spleen,normalised_4_spleen_coeff,normalised_5_spleen,normalised_5_spleen_coeff,normalised_6_spleen,normalised_6_spleen_coeff,normalised_7_spleen,normalised_7_spleen_coeff,normalised_8_spleen,normalised_8_spleen_coeff,normalised_9_spleen,normalised_9_spleen_coeff,normalised_0_liver,normalised_0_liver_coeff,normalised_1_liver,normalised_1_liver_coeff,normalised_2_liver,normalised_2_liver_coeff,normalised_3_liver,normalised_3_liver_coeff,normalised_4_liver,normalised_4_liver_coeff,normalised_5_liver,normalised_5_liver_coeff,normalised_6_liver,normalised_6_liver_coeff,normalised_7_liver,normalised_7_liver_coeff,normalised_8_liver,normalised_8_liver_coeff,normalised_9_liver,normalised_9_liver_coeff,fitting_score
full_bayesian_diabetes_status,diabetes_status,0,1.579e-07,-2.37409,0.0736579,0.215228,0.000219631,0.0237856,0.000256959,0.0495324,0.645824,-0.463013,0.0710246,-1.46715,0.937044,-0.0856173,0.748839,0.336683,0.709611,0.32281,0.600073,-0.498329,0.575585,-0.599488,0.841716,0.204004,0.0575819,1.81216,0.63778,0.461304,0.779418,-0.489726,0.316216,-1.91647,0.0226111,4.33434,0.928419,0.178388,0.639054,1.01101,0.140071,-2.50025,0.82932,0.432946,0.247878,-2.35539,0.84419,0.322895,0.557225,1.00614,0.371511
full_bayesian_0dot01_diabetes_status,diabetes_status,0,8.10262e-07,-2.23463,0.156042,0.165253,6.72719e-06,0.0282494,0.00489771,0.0382534,0.0131142,4.98458,0.00341352,-4.51984,0.316141,2.0645,0.664029,0.93047,0.530059,1.07586,0.912526,-0.194055,0.530292,1.18338,0.450634,-1.40011,0.115156,-3.28456,0.470542,-0.90197,0.668501,0.95491,0.75318,-0.896631,0.368854,2.26029,0.383934,-2.44384,0.121328,-2.4986,0.0823426,5.32694,0.267818,2.88613,0.757676,-0.748468,0.401773,-2.19288,0.27147,-2.41427,0.444527
MC_dropout_quicknat_diabetes_status,diabetes_status,0,8.53675e-06,-2.00587,0.619163,0.0584572,3.6976e-07,0.0318887,0.0829341,0.0241316,0.840563,-0.412758,0.408386,1.7835,0.696128,0.795633,0.793635,-0.682922,0.365716,-1.89718,0.328717,2.56302,0.236158,2.38377,0.384131,-1.82305,0.576859,-1.18617,0.337262,-1.52745,0.891983,0.479912,0.138538,4.64162,0.0337057,-5.56347,0.152745,4.52128,0.38819,2.71729,0.917757,0.307744,0.905744,0.310393,0.464038,-2.11661,0.103161,-4.87923,0.960641,-0.150101,0.441679
probabilistic_quicknat_diabetes_status,diabetes_status,0,6.5804e-05,-1.6892,0.191722,0.154096,6.84119e-05,0.0235995,0.0397974,0.0267377,0.911042,2.70666,0.579974,13.2387,0.00230365,77.8105,0.479607,-16.5116,0.678624,10.6743,0.433625,-22.1131,0.0102782,-62.9684,0.100492,40.2445,0.00850325,-67.2456,0.375527,24.1201,0.0301058,-61.1965,0.429318,-15.5416,0.0915146,45.0726,0.931481,2.31149,0.323248,22.7445,0.764112,8.04499,0.388042,21.2576,0.139453,41.9339,0.383443,-23.5208,0.122442,-40.8891,0.488884
hierarchical_quicknat_diabetes_status,diabetes_status,0,6.48496e-06,-2.1113,0.207746,0.161976,2.92213e-06,0.0282517,0.0231618,0.0322092,0.340186,-21.0446,0.276604,19.9073,0.551586,14.7998,0.501539,15.3862,0.928646,1.56862,0.0116686,-32.3202,0.109401,38.0499,0.239465,-27.4715,0.192206,20.8508,0.204798,-29.7619,0.0956474,-52.4163,0.137246,55.2685,0.468305,21.8465,0.790716,-7.83728,0.240134,-38.0697,0.0993681,43.6059,0.914771,-3.91276,0.330932,-33.1986,0.840064,5.61316,0.740884,9.27106,0.435307
