In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import glob
import nibabel as nb
import itertools
import json
import sys
import statsmodels.api as sm
from statsmodels.formula.api import ols, wls
from statsmodels.formula.api import mixedlm
from patsy.contrasts import Treatment

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
import sklearn
import statsmodels
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
from statsmodels.discrete.discrete_model import Probit, MNLogit
from statsmodels.tools import add_constant
from sklearn.feature_selection import SelectKBest, SelectPercentile, SelectFdr
from sklearn.feature_selection import f_regression, mutual_info_regression

import seaborn as sns
sns.set(style="whitegrid")

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
np.random.seed(1024)

In [2]:
# Root directory of the thesis project (absolute, machine-specific path).
base_dir = '/home/abhijit/Jyotirmay/my_thesis'

In [3]:
# Column-name groups, one list per measurement family.
# NOTE(review): these look like the raw (hyphenated) column names of the merged
# report CSVs, i.e. before `rename()` is applied -- confirm against the data.
smoking_feats = ['smoker_former', 'smoker_irregular', 'smoker_non_smoker', 'smoker_regular', 'smoking-packages']
bmi_feats = ['bmi-who_normal', 'bmi-who_obesity class I', 'bmi-who_obesity class II', 'bmi-who_obesity class III',
            'bmi_numeric', 'bmi-who_pre-obisety']
blood_pressure_feats = ['blood-pressure-diastolic', 'blood-pressure-systolic']
cholesterol_feats = ['cholesterol-hdl', 'cholesterol-ldl', 'cholesterol-total']
mri_feats = ['mri-liver-fat-artifacts', 'mri-liver-fat-lobus-dexter', 
             'mri-liver-fat-lobus-sinister', 'mri-liver-fat-portal-vein']
alcohol_feats = ['alcohol-g/day']
hbalc_feats = ['hba1c-mmol/mol', 'hba1c-percentage']
medication_feats = ['meds-antidiabetic', 'meds-antihypertensive', 'meds-incretin-mimetics', 'meds-insulin-therapy',
                    'meds-lipoprotein-lowering', 'meds-oral-antidiabetic']
triglyceride = ['triglyceride']
hypertension = ['hypertension']
basic_feats = ['age', 'height', 'sex', 'weight']

# Segmentation volume columns; these are the columns z-score normalised in the analysis cells.
vols_feat = ['seg_liver', 'seg_spleen']
# Per-sample volume columns named '<sample index>_<organ>' for ten samples (indices 0-9).
spleen_sample_cols = ['0_spleen','1_spleen','2_spleen','3_spleen','4_spleen','5_spleen','6_spleen','7_spleen','8_spleen','9_spleen']
liver_sample_cols = ['0_liver','1_liver','2_liver','3_liver','4_liver','5_liver','6_liver','7_liver','8_liver','9_liver']

# Covariate sets as nested lists (one inner list per covariate family); the analysis
# cells flatten them with `flatten`. Names use the underscore form produced by `rename()`.
# NOTE(review): the name suggests these follow the study referenced in `paper_link` -- confirm.

# Group test with diabetes status expanded into one dummy column per state.
feats_from_paper_for_group_test = [['age', 'sex', 'bmi_numeric'],
             ['diabetes_status_0', 'diabetes_status_1', 'diabetes_status_2'], ['hypertension'], ['triglyceride'],
             ['cholesterol_hdl', 'cholesterol_ldl'],
             ['mri_liver_fat_artifacts', 'mri_liver_fat_lobus_dexter', 
              'mri_liver_fat_lobus_sinister', 'mri_liver_fat_portal_vein'],
             ['meds_lipoprotein_lowering'],
             ['smoker_former', 'smoker_non_smoker', 'smoker_regular']]

# Same as above but with the single 'diabetes_status' column instead of its dummy columns.
feats_from_paper_for_group_test_no_categorisation = [['age', 'sex', 'bmi_numeric'],
             ['diabetes_status'], ['hypertension'], ['triglyceride'],
             ['cholesterol_hdl', 'cholesterol_ldl'],
             ['mri_liver_fat_artifacts', 'mri_liver_fat_lobus_dexter', 
              'mri_liver_fat_lobus_sinister', 'mri_liver_fat_portal_vein'],
             ['meds_lipoprotein_lowering'],
             ['smoker_former', 'smoker_non_smoker', 'smoker_regular']]

# Larger set for per-feature tests: adds blood pressure, total cholesterol,
# antihypertensive medication and alcohol intake.
feats_from_paper_for_individual_test = [['age', 'sex', 'bmi_numeric'],
             ['diabetes_status_0', 'diabetes_status_1', 'diabetes_status_2'], ['hypertension'], ['triglyceride'],
             ['blood_pressure_diastolic', 'blood_pressure_systolic'],
             ['cholesterol_hdl', 'cholesterol_ldl', 'cholesterol_total'],
             ['mri_liver_fat_artifacts', 'mri_liver_fat_lobus_dexter', 
              'mri_liver_fat_lobus_sinister', 'mri_liver_fat_portal_vein'],
             ['meds_lipoprotein_lowering', 'meds_antihypertensive'],
             ['smoker_former', 'smoker_non_smoker', 'smoker_regular'], ['alcohol_g_day']]

# Reduced covariate set for the UKB reports.
feats_from_paper_for_individual_test_ukb = [['age', 'sex', 'bmi_numeric'],
             ['diabetes_status']]

# URL of the PLOS ONE article these feature sets are named after.
paper_link = 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0177154&type=printable'

In [4]:
# Each list below holds single-entry {name: csv_path} dicts; later cells merge a
# list into one flat dict.

# Per-model "concat report" CSVs. This list is what
# `get_common_vols_based_on_given_col` reads to find the volumes shared by all models.
model_merged_feats_path = [
#     {'full_bayesian': './projects/full_bayesian/reports/full_bayesian_KORA_v2/KORA/10_1571866968.4002764_concat_report_final.csv'},
    {'full_bayesian_0dot01': './projects/full_bayesian/reports/full_bayesian_KORA_v4/KORA/10_1572514598.527084_concat_report_final.csv'},
    {'MC_dropout_quicknat': './projects/MC_dropout_quicknat/reports/MC_dropout_quicknat_KORA_v2/KORA/10_1572006141.7793334_concat_report_final.csv'}, 
    {'probabilistic_quicknat': './projects/probabilistic_quicknat/reports/probabilistic_quicknat_KORA_v2/KORA/10_1571996796.7963011_concat_report_final.csv'}, 
    {'hierarchical_quicknat': './projects/hierarchical_quicknat/reports/hierarchical_quicknat_KORA_v2/KORA/10_1571905560.9377904_concat_report_final.csv'}
]
# Dataset-level reports for the whole KORA set (absolute paths), with processed False/True variants.
all_dataset_merged_feats_path = [
    {'all_KORA_processed_False': '/home/abhijit/Jyotirmay/my_thesis/dataset_groups/whole_body_datasets/KORA/all_processed_False_concat_report_final.csv'}, 
    {'all_KORA_processed_True': '/home/abhijit/Jyotirmay/my_thesis/dataset_groups/whole_body_datasets/KORA/all_processed_True_concat_report_final.csv'}
]

# Dataset-level reports for the KORA test split (absolute paths), with processed False/True variants.
test_dataset_merged_feats_path = [
    {'test_KORA_processed_False': '/home/abhijit/Jyotirmay/my_thesis/dataset_groups/whole_body_datasets/KORA/test_processed_False_concat_report_final.csv'}, 
    {'test_KORA_processed_True': '/home/abhijit/Jyotirmay/my_thesis/dataset_groups/whole_body_datasets/KORA/test_processed_True_concat_report_final.csv'}
]

# Model reports plus the processed KORA test-split report. The analysis cells
# iterate over this list (after merging it into one dict) and skip keys containing
# 'KORA' where only model reports are wanted.
# Note: unlike `model_merged_feats_path`, the 'full_bayesian' entry is active here.
all_paths = [
    {'full_bayesian': './projects/full_bayesian/reports/full_bayesian_KORA_v2/KORA/10_1571866968.4002764_concat_report_final.csv'},
    {'full_bayesian_0dot01': './projects/full_bayesian/reports/full_bayesian_KORA_v4/KORA/10_1572514598.527084_concat_report_final.csv'},
    {'MC_dropout_quicknat': './projects/MC_dropout_quicknat/reports/MC_dropout_quicknat_KORA_v2/KORA/10_1572006141.7793334_concat_report_final.csv'}, 
    {'probabilistic_quicknat': './projects/probabilistic_quicknat/reports/probabilistic_quicknat_KORA_v2/KORA/10_1571996796.7963011_concat_report_final.csv'}, 
    {'hierarchical_quicknat': './projects/hierarchical_quicknat/reports/hierarchical_quicknat_KORA_v2/KORA/10_1571905560.9377904_concat_report_final.csv'},
#     {'all_KORA_processed_False': '/home/abhijit/Jyotirmay/my_thesis/dataset_groups/whole_body_datasets/KORA/all_processed_False_concat_report_final.csv'}, 
#     {'all_KORA_processed_True': '/home/abhijit/Jyotirmay/my_thesis/dataset_groups/whole_body_datasets/KORA/all_processed_True_concat_report_final.csv'},
#     {'test_KORA_processed_False': '/home/abhijit/Jyotirmay/my_thesis/dataset_groups/whole_body_datasets/KORA/test_processed_False_concat_report_final.csv'}, 
    {'test_KORA_processed_True': '/home/abhijit/Jyotirmay/my_thesis/dataset_groups/whole_body_datasets/KORA/test_processed_True_concat_report_final.csv'}
]
# Already a flat {name: path} dict (not a list of dicts); used by the baseline cell.
only_data_paths =  {'test_KORA_processed_True': '/home/abhijit/Jyotirmay/my_thesis/dataset_groups/whole_body_datasets/KORA/test_processed_True_concat_report_final.csv'}
# Per-model "final report" CSVs; these feed the evaluation-metric summary table.
final_model_report_path = [
    {'full_bayesian': './projects/full_bayesian/reports/full_bayesian_KORA_v2/KORA/10_1571866968.4002764_final_report.csv'},
    {'full_bayesian_0dot01': './projects/full_bayesian/reports/full_bayesian_KORA_v4/KORA/10_1572514598.527084_final_report.csv'},
    {'MC_dropout_quicknat': './projects/MC_dropout_quicknat/reports/MC_dropout_quicknat_KORA_v2/KORA/10_1572006141.7793334_final_report.csv'}, 
    {'probabilistic_quicknat': './projects/probabilistic_quicknat/reports/probabilistic_quicknat_KORA_v2/KORA/10_1571996796.7963011_final_report.csv'}, 
    {'hierarchical_quicknat': './projects/hierarchical_quicknat/reports/hierarchical_quicknat_KORA_v2/KORA/10_1571905560.9377904_final_report.csv'}
]

# Concat report of the MC-dropout model on the UKB dataset.
ukb_paths = [
    {'MC_dropout_quicknat': './projects/MC_dropout_quicknat/reports/MC_dropout_quicknat_UKB_v2/UKB/10_1573078374.453554_concat_report_final.csv'}
]

# Short aliases for the dictionary keys used in the path lists above.
fb = 'full_bayesian'
fb01 = 'full_bayesian_0dot01'
mc = 'MC_dropout_quicknat'
pq = 'probabilistic_quicknat'
hq = 'hierarchical_quicknat'
af = 'all_KORA_processed_False'
at = 'all_KORA_processed_True'
tf = 'test_KORA_processed_False'
tt = 'test_KORA_processed_True'

In [5]:
def flatten(l):
    """Return one flat list with the items of every sub-iterable in ``l`` (one level deep)."""
    flat = []
    for sublist in l:
        for item in sublist:
            flat.append(item)
    return flat

def transform_to_categorical(df, categorical_features_list):
    """Append one-hot (dummy) columns for each listed feature.

    For every feature name, `pd.get_dummies` is applied to that column with the
    feature name as prefix, and the resulting columns are concatenated to the
    right of the frame. The original columns are kept.

    Returns the widened frame (the input frame itself when the list is empty).
    """
    for feature in categorical_features_list:
        one_hot = pd.get_dummies(df[feature], prefix=feature)
        df = pd.concat([df, one_hot], axis=1)
    return df

def rename(df, cols_map=None):
    """Rename columns of ``df`` in place and return the same frame.

    When ``cols_map`` is None a built-in mapping is used that converts the
    hyphenated / slashed column names to underscore names.
    Columns that are not in the mapping are left untouched.
    """
    default_cols_map = {
        'bmi-numeric': 'bmi_numeric',
        'blood-pressure-diastolic': 'blood_pressure_diastolic',
        'blood-pressure-systolic': 'blood_pressure_systolic',
        'cholesterol-hdl': 'cholesterol_hdl',
        'cholesterol-ldl': 'cholesterol_ldl',
        'cholesterol-total': 'cholesterol_total',
        'mri-liver-fat-artifacts': 'mri_liver_fat_artifacts',
        'mri-liver-fat-lobus-dexter': 'mri_liver_fat_lobus_dexter',
        'mri-liver-fat-lobus-sinister': 'mri_liver_fat_lobus_sinister',
        'mri-liver-fat-portal-vein': 'mri_liver_fat_portal_vein',
        'meds-lipoprotein-lowering': 'meds_lipoprotein_lowering',
        'meds-antihypertensive': 'meds_antihypertensive',
        'smoker_non-smoker': 'smoker_non_smoker',
        'alcohol-g/day': 'alcohol_g_day',
    }
    mapping = default_cols_map if cols_map is None else cols_map
    # In-place on purpose: the caller's frame is modified and handed back.
    df.rename(columns=mapping, inplace=True)
    return df

def z_score_column_normalise(df, column_list):
    """Z-score each listed column independently and store it as '<column>_normalised'.

    Each column uses its own mean and population standard deviation (ddof=0).
    The new columns are added to ``df`` in place.

    Returns ``(df, mapping)`` where ``mapping`` maps each source column name to
    the name of its normalised column.
    """
    normalised_cols_map = {}
    for source_col in column_list:
        target_col = source_col + '_normalised'
        normalised_cols_map[source_col] = target_col
        values = df[source_col]
        df[target_col] = (values - values.mean()) / values.std(ddof=0)
    return df, normalised_cols_map

def z_score_group_normalise(df, cols_to_normalise):
    """Z-score a group of columns with one shared mean and standard deviation.

    The mean and population standard deviation (ddof=0) are computed over all
    values of all listed columns pooled together. Each column is then
    normalised with those shared statistics and stored in place as
    'normalised_<column>'.

    Returns ``(df, mapping)`` where ``mapping`` maps source column names to the
    names of the normalised columns.
    """
    pooled_values = df[cols_to_normalise].values
    mean = np.mean(pooled_values)
    std = np.std(pooled_values, ddof=0)
    normalised_cols_map = {column: 'normalised_' + column for column in cols_to_normalise}
    for column, new_name in normalised_cols_map.items():
        df[new_name] = (df[column] - mean) / std
    return df, normalised_cols_map

def pre_process_for_mlm(df, to_be_transpose_cols, value_name):
    """Reshape ``df`` from wide to long format with `DataFrame.melt`.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame.
    to_be_transpose_cols : list
        Columns that are stacked into rows (the melt ``value_vars``).
    value_name : str
        Name of the column that receives the stacked values.

    Returns
    -------
    pandas.DataFrame
        Long frame: every other column is kept as an identifier column, in its
        original order, followed by melt's 'variable' column and ``value_name``.
    """
    melted_cols = set(to_be_transpose_cols)
    # Keep the frame's own column order. A set difference would return the
    # identifier columns in an arbitrary order that can change between runs.
    id_cols = [col for col in df.columns if col not in melted_cols]
    return df.melt(id_vars=id_cols,
                   value_vars=to_be_transpose_cols,
                   value_name=value_name)

def split_diabetes_state(df):
    """Split rows by 'diabetes_status' and give each subset its own column prefix.

    Rows with status 0, 1 and 2 get the column prefixes 'normal_',
    'pre_diabetic_' and 'diabetic_' respectively. The three prefixed subsets
    are then concatenated row-wise, so each row only has values in the columns
    of its own group.
    """
    status_prefixes = ((0, 'normal_'), (1, 'pre_diabetic_'), (2, 'diabetic_'))
    prefixed_parts = []
    for status, prefix in status_prefixes:
        subset = df[df['diabetes_status'] == status]
        # Bind the prefix as a default argument so each lambda keeps its own value.
        prefixed_parts.append(subset.rename(columns=lambda name, prefix=prefix: prefix + name))
    return pd.concat(prefixed_parts)

def plot_and_ttest(df, cols):
    """Box-plot and t-test every pair of columns in ``cols``.

    For each unordered pair a box plot of the two columns is shown and an
    independent two-sample t-test (`scipy.stats.ttest_ind`) is run on their
    non-null values. The statistic and p-value are printed.

    Returns a dict mapping '<col_a> vs <col_b>' to the p-value.
    """
    p_value_by_pair = {}
    for first, second in itertools.combinations(cols, 2):
        label = f'{first} vs {second}'
        print(label)
        df[[first, second]].boxplot(rot=45)
        t, p = stats.ttest_ind(df[first].dropna().values, df[second].dropna().values)

        print('ttest_score:', t)
        print('p_value:', p)
        print('\n')
        p_value_by_pair[label] = p
        plt.show()
    return p_value_by_pair

def model_evaluation_matrics(file_paths_dict, cols_to_fetch):
    """Average the requested metric columns of each model's report CSV.

    ``file_paths_dict`` maps a model name to a CSV path. For every model the
    columns in ``cols_to_fetch`` are read and averaged over all rows except the
    first one.
    NOTE(review): the first data row is skipped via ``iloc[1:]`` -- presumably
    it is not a regular per-volume row; confirm against the report format.

    Returns a dict mapping model name to a Series of column means.
    """
    return {
        model: pd.read_csv(path)[cols_to_fetch].iloc[1:].mean()
        for model, path in file_paths_dict.items()
    }

def individual_feature_stats(feats, df, target_col, categorical_feats=['diabetes_status', 'sex']):
    """Fit one OLS model per feature and collect p-values and coefficients.

    For each feature ``f`` the formula ``target_col ~ <f>`` is fitted. ``f`` is
    wrapped as categorical when it appears in ``categorical_feats``. Each model
    term contributes two entries: ``<term>`` (p-value) and ``<term>_coeff``
    (coefficient). The intercept is stored under ``<f>_Intercept``.

    'fitting_score' holds the R-squared of the most recently fitted model, so
    after the loop it reflects only the last feature that fitted successfully.
    A failure for one feature is printed and does not stop the loop.
    """
    p_values = {}
    for f in feats:
        try:
            formula_rhs = make_feature_string([f], categorical_feats)
            fitted = ols(f'{target_col} ~ {formula_rhs}', df).fit()
            coeffs = fitted.params

            for term, p_value in fitted.pvalues.to_dict().items():
                # The intercept exists in every model, so qualify it with the feature name.
                label = (f + '_Intercept') if term == 'Intercept' else term
                p_values[label] = p_value
                p_values[label + '_coeff'] = coeffs[term]
            p_values['fitting_score'] = fitted.rsquared
        except Exception as e:
            print(e)

    return p_values

def make_feature_string(feats, categorical_cols=[]):
    """Build the right-hand side of a patsy/statsmodels formula.

    Parameters
    ----------
    feats : iterable of str
        Feature (column) names to include. The argument is not modified.
    categorical_cols : iterable
        Names of features to wrap as ``C(name, Treatment)``. An entry may also
        be a ``(name, reference)`` tuple, which produces
        ``C(name, Treatment(reference=reference))``. Entries that are not in
        ``feats`` are skipped.

    Returns
    -------
    str
        ``'1+<categorical terms>+<remaining features>'``: the intercept, then
        the categorical terms in ``categorical_cols`` order, then the remaining
        features in their original order.
    """
    # Work on a copy so the caller's list keeps its categorical entries.
    remaining = list(feats)
    terms = ['1']
    for c_col in categorical_cols:
        if isinstance(c_col, tuple):
            name = c_col[0]
            if name not in remaining:
                continue
            terms.append(f'C({name}, Treatment(reference={c_col[1]}))')
        else:
            name = c_col
            if name not in remaining:
                continue
            terms.append(f'C({name}, Treatment)')
        remaining.remove(name)

    # Joining all terms at once avoids the trailing '+' clean-up and the
    # identity comparison against '' that it needed.
    return '+'.join(terms + remaining)
    

def normal_group_fit(df, target_col, features_string):
    """Fit an OLS model ``target_col ~ features_string`` and plot it.

    The fitted model is passed to `plot_model_outputs` together with the
    'bmi_numeric' column and the target values, labelled 'OLS'.
    NOTE(review): `plot_model_outputs` is not defined in this part of the
    notebook -- confirm it exists before calling this function.

    Returns the fitted results object.
    """
    formula = f'{target_col} ~ {features_string}'
    fitted = ols(formula, df).fit()
    plot_model_outputs(df['bmi_numeric'].values, df[target_col].values, fitted, 'OLS')
    return fitted

def weighted_group_feats(df, target_col, features_string, alpha_col):
    """Fit a weighted least squares model ``target_col ~ features_string``.

    Each row is weighted by ``1 / (1 - alpha)``, where ``alpha`` is the row's
    value in ``alpha_col``. Rows whose alpha is closer to 1 get larger weights.
    NOTE(review): a value of exactly 1 in ``alpha_col`` gives a division by
    zero (infinite weight) -- confirm the column never reaches 1.

    Returns the fitted results object.
    """
    alpha = df[alpha_col].values
    row_weights = 1 / (1 - alpha)
    formula = f'{target_col} ~ {features_string}'
    return wls(formula, df, weights=row_weights).fit()

def normal_mixed_effect_model(df, target_col, features_string, group_col, L1_wt=None):
    """Fit a linear mixed-effects model ``target_col ~ features_string``.

    Rows are grouped by ``df[group_col]``. With ``L1_wt`` left at None the model
    is fitted with ``fit()``; otherwise ``fit_regularized(L1_wt=L1_wt)`` is used.

    Returns the fitted results object.
    """
    mixed_model = mixedlm(f'{target_col} ~ {features_string}', df, groups=df[group_col])
    if L1_wt is None:
        return mixed_model.fit()
    return mixed_model.fit_regularized(L1_wt=L1_wt)

def anova_test(ols_model):
    """Return the ANOVA table computed by ``sm.stats.anova_lm`` for a fitted model."""
    return sm.stats.anova_lm(ols_model)

def df_from_nested_dicts(dicts):
    """Turn ``{outer: {inner: {stat: value}}}`` into one wide DataFrame.

    Every ``(outer, inner)`` pair becomes one column whose rows are the stat
    names. Columns carry a three-level label:
    ``('<outer>_<inner>', '<inner>', 0)``.
    """
    columns_by_label = {}
    for outer_key, inner in dicts.items():
        for inner_key, stats_dict in inner.items():
            # One row of scalars, transposed into a single column indexed by stat name.
            single_column = pd.DataFrame(stats_dict, index=[0]).T
            columns_by_label[outer_key + '_' + inner_key] = pd.concat({inner_key: single_column}, axis=1)
    return pd.concat(columns_by_label, axis=1)

def df_from_nested_dicts_group(dicts):
    """Turn ``{outer: {inner: {stat: value}}}`` into one wide DataFrame.

    Every ``(outer, inner)`` pair becomes one column whose rows are the stat
    names, labelled ``('<outer>_<inner>', '<inner>', 0)``.
    """
    def _as_labelled_column(label, stats_dict):
        # Scalars -> single-row frame -> transposed to one column, nested under ``label``.
        return pd.concat({label: pd.DataFrame(stats_dict, index=[0]).T}, axis=1)

    labelled = {
        outer_key + '_' + inner_key: _as_labelled_column(inner_key, stats_dict)
        for outer_key, inner in dicts.items()
        for inner_key, stats_dict in inner.items()
    }
    return pd.concat(labelled, axis=1)

def highlight_significance(df, threshold=0.05):
    return df.style.applymap(lambda x: 'background-color : yellow' if x==True or x<threshold else '')

def choose_best_features(df, feats, target_col, percentile=50):
    """Select features with `SelectFdr` using the `f_regression` score.

    The selector is fitted on ``df[feats]`` against ``df[target_col]`` with
    ``alpha=0.05``. ``percentile`` is accepted but not used.

    Returns an array with the names of the selected feature columns.
    """
    X = df[feats]
    y = df[target_col]
    selector = SelectFdr(f_regression, alpha=0.05).fit(X, y)
    return X.columns.values[selector.get_support()]
    

In [6]:

def discrete_individual_feature_stats(feats, df, target_col, categorical_feats=['diabetes_status', 'sex'], is_classification=False):
    """Fit one multinomial logit model per feature and collect its statistics.

    For each feature ``f`` an `MNLogit` of ``df[target_col]`` on a constant plus
    ``df[f]`` is fitted, and the model summary and marginal-effects summary are
    printed. For every outcome column ``o`` and model term ``t`` the result holds
    ``'<o>_<t>'`` (p-value) and ``'<o>_<t>_coeff'`` (coefficient). The constant
    term is stored as ``'<o>_<f>_Intercept'``.

    'aic' and 'bic' are overwritten on each iteration, so they describe the last
    feature that fitted successfully. A failure for one feature is printed and
    does not stop the loop. ``categorical_feats`` and ``is_classification`` are
    accepted but not used.
    """
    p_values = {}
    for f in feats:
        try:
            predictor = df[f]
            response = df[target_col]
            fitted = MNLogit(response, add_constant(predictor)).fit()
            margins = fitted.get_margeff()
            print(fitted.summary())
            print(margins.summary())
            coeffs = fitted.params

            for outcome, term_p_values in fitted.pvalues.to_dict().items():
                for term, p_value in term_p_values.items():
                    # The constant exists in every model, so qualify it with the feature name.
                    label = (f + '_Intercept') if term == 'const' else term
                    key = str(outcome) + '_' + label
                    p_values[key] = p_value
                    p_values[key + '_coeff'] = coeffs[outcome][term]

            p_values['aic'] = fitted.aic
            p_values['bic'] = fitted.bic
        except Exception as e:
            print('ERROR:', e)

    return p_values

def discrete_group_feature_stats(feats, df, target_col, categorical_feats=['diabetes_status', 'sex'], is_classification=False):
    """Fit one multinomial logit model on all features and collect its statistics.

    An `MNLogit` of ``df[target_col]`` on a constant plus ``df[feats]`` is
    fitted, and the model summary and marginal-effects summary are printed.

    Parameters
    ----------
    feats : list of str
        Predictor columns.
    df : pandas.DataFrame
        Data frame holding predictors and target.
    target_col : str
        Name of the discrete target column.
    categorical_feats, is_classification
        Accepted for signature compatibility; not used.

    Returns
    -------
    (dict, results or None)
        The dict holds, for every outcome column ``o`` and term ``t``,
        ``'<o>_<t>'`` (p-value) and ``'<o>_<t>_coeff'`` (coefficient), plus
        'aic' and 'bic'. The second item is the fitted results object. If
        anything fails the error is printed and whatever was collected so far
        is returned, with ``None`` in place of the results object when the fit
        itself did not complete.
    """
    p_values = {}
    # Bind the result up front: without this, an error raised before the fit
    # made the final `return` fail with UnboundLocalError instead of returning.
    fii = None
    try:
        x = df[feats]
        y = df[target_col]
        x = add_constant(x)
        model = MNLogit(y, x)
        fii = model.fit()
        fii_ = fii.get_margeff()
        print(fii.summary())
        print(fii_.summary())
        feat_dict = fii.pvalues.to_dict()
        coeffs = fii.params

        for ko, vo in feat_dict.items():
            for k, v in vo.items():
                p_values[str(ko)+'_'+k] = v
                p_values[str(ko)+'_'+k+'_coeff'] = coeffs[ko][k]

        p_values['aic'] = fii.aic
        p_values['bic'] = fii.bic
    except Exception as e:
        print('ERROR:', e)

    return p_values, fii

def discrete_weighted_group_feats(df, target_col, feats, alpha_col):
    """Fit a multinomial logit model, passing ``1 / (1 - alpha)`` as ``weights``.

    The model regresses ``df[target_col]`` on a constant plus ``df[feats]``;
    ``alpha`` is the row's value in ``alpha_col``. The fitted model's summary is
    printed and the results object is returned.

    NOTE(review): `MNLogit`'s documented signature has no ``weights`` parameter,
    so these weights may be ignored by the fit -- confirm against the installed
    statsmodels version. A value of exactly 1 in ``alpha_col`` also divides by zero.
    """
    alpha = df[alpha_col].values
    predictors = df[feats]
    response = df[target_col]
    design = add_constant(predictors)
    row_weights = 1 / (1 - alpha)
    fitted = MNLogit(response, design, weights=row_weights).fit()
    print(fitted.summary())
    return fitted

def significance_check(x):
    """Return False when ``x`` is greater than 0.05, True otherwise."""
    return not x > 0.05
def group_feature_stats(features_string, df, target_col, return_model=False):
    """Fit an OLS model ``target_col ~ features_string`` and collect its statistics.

    Parameters
    ----------
    features_string : str
        Right-hand side of the formula (see `make_feature_string`).
    df : pandas.DataFrame
        Data frame holding predictors and target.
    target_col : str
        Name of the target column.
    return_model : bool
        When True the unfitted model object is returned as a third item.

    Returns
    -------
    (dict, results or None) or (dict, results or None, model or None)
        The dict holds ``<term>`` (p-value) and ``<term>_coeff`` (coefficient)
        for every model term. If building or fitting the model fails the error
        is printed and the items not yet produced are returned as an empty dict
        / ``None``.
    """
    # Bind both results up front so the failure path can still return them.
    # Previously `model` was unbound when `ols(...)` itself raised, which made
    # the `return_model=True` branch fail with UnboundLocalError.
    fii, model = None, None
    dicts = {}
    try:
        model = ols(f'{target_col} ~ {features_string}', df)
        fii = model.fit()
        coeffs = fii.params.to_dict()
        for term, p_value in fii.pvalues.to_dict().items():
            dicts[term] = p_value
            dicts[term + '_coeff'] = coeffs[term]

    except Exception as e:
        print(e)
    if return_model:
        return dicts, fii, model
    return dicts, fii

In [7]:
def get_common_vols_based_on_given_col(reference_col='iou_mean', threshold_value=0.51, merged_feats_paths=None):
    """Return the volume ids that pass a quality threshold in every model report.

    Parameters
    ----------
    reference_col : str
        Column compared against the threshold.
    threshold_value : float
        A volume is kept for a model when its ``reference_col`` value is
        strictly greater than this.
    merged_feats_paths : list of dict, optional
        Single-entry ``{model_name: csv_path}`` dicts. Defaults to the
        module-level ``model_merged_feats_path``.

    Returns
    -------
    numpy.ndarray
        ``volume_id`` values present (after filtering) in all reports; empty
        when the reports share none or when no report is given.
    """
    if merged_feats_paths is None:
        merged_feats_paths = model_merged_feats_path
    combined_paths = {key: val for d in merged_feats_paths for key, val in d.items()}

    # `None` marks "no report seen yet". Using an empty result as that marker
    # would restart the intersection once it became empty, so a later model's
    # volumes would wrongly be reported as common.
    common_vols = None
    for key, value in combined_paths.items():
        df = pd.read_csv(value)
        filtered_vols = df[df[reference_col] > threshold_value].volume_id.values
        print(key, filtered_vols.shape)
        if common_vols is None:
            common_vols = filtered_vols
        else:
            common_vols = np.intersect1d(common_vols, filtered_vols)

    if common_vols is None:
        # No report at all: still return an array so callers can use `.shape`.
        return np.array([])
    return common_vols

# Volumes that pass the default `iou_mean` threshold in every model report.
# The later analysis cells restrict their data frames to these volumes, so stop
# right here when there is nothing in common.
common_vols = get_common_vols_based_on_given_col()
if common_vols.shape[0] < 1:
    raise Exception('no common volumes among models')
else:
    print('---------------------------------------------')
    print('Common volumes size:', common_vols.shape)

full_bayesian_0dot01 (148,)
MC_dropout_quicknat (150,)
probabilistic_quicknat (153,)
hierarchical_quicknat (153,)
---------------------------------------------
Common volumes size: (147,)


In [8]:
# Merge each list of single-entry {name: path} dicts into one flat {name: path} dict.
# Note: `model_merged_feats_path_combined` is built from `all_paths` (which also
# contains the KORA test-split report), not from `model_merged_feats_path`.
model_merged_feats_path_combined = {key:val for d in all_paths for key,val in d.items()}
only_models_final_report_path = {key:val for d in final_model_report_path for key,val in d.items()}
ukb_paths_conbined = {key:val for d in ukb_paths for key,val in d.items()}

In [9]:
# Evaluation-metric columns to summarise from each model's final report.
cols_to_fetch = ['sncc', 'ged', 'iou_spleen', 'iou_liver', 'dice_spleen',
       'dice_liver', 'surface_distance_avg_spleen', 'surface_distance_avg_liver']
# One Series of column means per model (see `model_evaluation_matrics`).
dicts = model_evaluation_matrics(only_models_final_report_path, cols_to_fetch)
df_model_eval = pd.DataFrame.from_dict(dicts)
# Transpose so that each row is a model and each column a metric.
df_model_eval = df_model_eval.T
df_model_eval

Unnamed: 0,sncc,ged,iou_spleen,iou_liver,dice_spleen,dice_liver,surface_distance_avg_spleen,surface_distance_avg_liver
full_bayesian,0.604205,0.223997,0.524697,0.66866,0.828703,0.894704,0.809946,0.736798
full_bayesian_0dot01,0.439956,0.195701,0.778342,0.84361,0.87628,0.921408,0.87853,0.817305
MC_dropout_quicknat,0.399954,0.166135,0.821329,0.878295,0.897294,0.937717,0.915982,0.870747
probabilistic_quicknat,0.163864,0.228308,0.988114,0.992439,0.873666,0.925428,0.875153,0.827462
hierarchical_quicknat,0.187984,0.209171,0.983818,0.988998,0.883146,0.932802,0.897351,0.854833


# Baseline Stats (with input data)

In [12]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
def logistic_regression(features, df, target_col, weights=None, multi_class='ovr'):
    """Fit a degree-2 polynomial logistic-regression pipeline.

    The pipeline expands ``df[features]`` with `PolynomialFeatures(degree=2)`
    and fits a `LogisticRegression` (solver 'newton-cg') on ``df[target_col]``.
    ``multi_class`` is forwarded to the classifier (e.g. 'ovr' or
    'multinomial'). ``weights`` is accepted but not used.

    Returns the fitted pipeline.
    """
    classifier = LogisticRegression(multi_class=multi_class, solver='newton-cg')
    pipeline = Pipeline([('poly', PolynomialFeatures(degree=2)),
                         ('linear', classifier)])
    design = df[features].values
    labels = df[target_col].values
    return pipeline.fit(design, labels)

# class Logistic_Regression(linear_model.LogisticRegression):
#     """
#     LinearRegression class after sklearn's, but calculate t-statistics
#     and p-values for model coefficients (betas).
#     Additional attributes available after .fit()
#     are `t` and `p` which are of the shape (y.shape[1], X.shape[1])
#     which is (n_features, n_coefs)
#     This class sets the intercept to 0 by default, since usually we include it
#     in X.
#     """

#     def __init__(self, *args, **kwargs):
#         if not "fit_intercept" in kwargs:
#             kwargs['fit_intercept'] = False
#         super(Logistic_Regression, self)\
#                 .__init__(*args, **kwargs)

#     def fit(self, X, y, n_jobs=1):
#         self = super(Logistic_Regression, self).fit(X, y, n_jobs)

#         sse = np.sum((self.predict(X) - y) ** 2, axis=0) / float(X.shape[0] - X.shape[1])
#         se = np.array([
#             np.sqrt(np.diagonal(sse[i] * np.linalg.inv(np.dot(X.T, X))))
#                                                     for i in range(sse.shape[0])
#                     ])

#         self.t = self.coef_ / se
#         self.p = 2 * (1 - stats.t.cdf(np.abs(self.t), y.shape[0] - X.shape[1]))
#         return self   

In [18]:
# Baseline: predict diabetes status from basic covariates using the dataset-level report only.
# Note: the flattened paper feature set is overwritten by the short list on the next line.
feats = flatten(feats_from_paper_for_group_test_no_categorisation)
feats = ['age', 'sex', 'bmi_numeric']
dicts = {}
best_feats_spleen, best_feats_liver = None, None
for key, value in only_data_paths.items():
    df = pd.read_csv(value)
#     df['volume_id'] = ['KORA']+df['mrt-image-id'].values
    print(df.columns)
    # Keep only the volumes shared by all models (computed in an earlier cell).
    df = df[df.volume_id.isin(common_vols)]
    print(df.shape)
    df = rename(df)
#     df = transform_to_categorical(df, ['diabetes_status'])
    df, normalised_cols = z_score_column_normalise(df, ['seg_spleen', 'seg_liver'])
    # Missing values are replaced by 0 before fitting.
    df = df.fillna(0)
    
    dicts[key] = {}
    target_col = 'diabetes_status'
    # NOTE(review): despite its name, `p_value_dict` receives the fitted pipeline
    # returned by `logistic_regression`, not a dict of p-values.
    p_value_dict = logistic_regression(feats, df, target_col, multi_class='multinomial')
#     dicts[key][target_col] = p_value_dict
    

# p_value_df = df_from_nested_dicts(dicts).T
# p_value_df_styler = highlight_significance(p_value_df, threshold=0.01)
# p_value_df_styler

TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('<U21') dtype('<U21') dtype('<U21')

# Group feature stats

In [None]:
# For every report: fit a multinomial logit of diabetes status on the basic covariates,
# then again with the normalised segmentation volumes added, and tabulate the p-values.
# Note: the flattened paper feature set is overwritten by the short list on the next line.
feats = flatten(feats_from_paper_for_group_test_no_categorisation)
feats = ['age', 'sex', 'bmi_numeric']
dicts = {}
anova_test_dicts = {}
for key, value in model_merged_feats_path_combined.items():
#     if 'KORA' in key:
#         print('dataset cannot be processed!')
#         continue
    df = pd.read_csv(value)
    # Keep only the volumes shared by all models (computed in an earlier cell).
    df = df[df.volume_id.isin(common_vols)]
    print(df.shape)
    df = rename(df)
    df, normalised_cols = z_score_column_normalise(df, ['seg_spleen', 'seg_liver'])
    # Missing values are replaced by 0 before fitting.
    df = df.fillna(0)

    dicts[key] = {}
    
    target_col = 'diabetes_status'
    best_feats_spleen = feats 
    # Model 1: covariates only.
    p_value_dict_spleen, model = discrete_group_feature_stats(best_feats_spleen, df, target_col, ['sex'], True)

    dicts[key][target_col] = p_value_dict_spleen
    
    # Model 2: covariates plus the z-scored spleen and liver volumes.
    feats_ = feats + [ 'seg_spleen_normalised', 'seg_liver_normalised']
    p_value_dict_spleen, model = discrete_group_feature_stats(feats_, df, target_col, ['sex'], True)

    dicts[key][target_col+'_with_seg_volumes'] = p_value_dict_spleen

# One row per (report, model variant); cells below the 0.05 threshold are highlighted.
p_value_df = df_from_nested_dicts(dicts).T
p_value_df_styler = highlight_significance(p_value_df, 0.05)
p_value_df_styler

# Regularised group feature stats test

In [None]:
# For every model report: fit the multinomial logit through
# `discrete_weighted_group_feats` (weights derived from an IoU column), first on
# the covariates and then with the normalised segmentation volumes added.
# Note: the flattened paper feature set is overwritten by the short list on the next line.
feats = flatten(feats_from_paper_for_group_test_no_categorisation)
feats = ['age', 'sex', 'bmi_numeric']
dicts = {}
anova_test_dicts = {}
for key, value in model_merged_feats_path_combined.items():
    # Skip the dataset-level entries (keys containing 'KORA'); only model reports are used here.
    if 'KORA' in key:
#         print('dataset cannot be processed!')
        continue

    df = pd.read_csv(value)
    df = rename(df)
    df = transform_to_categorical(df, ['diabetes_status', 'sex'])
    df, normalised_cols = z_score_column_normalise(df, ['seg_spleen', 'seg_liver'])
    # Missing values are replaced by 0 before fitting.
    df = df.fillna(0)

    dicts[key] = {}
    anova_test_dicts[key] = {}
    
    target_col = 'diabetes_status'
    best_feats_spleen = feats
    # NOTE(review): `feature_string` is built but not used below -- the fit takes `feats` directly.
    feature_string =  make_feature_string(list(best_feats_spleen), [ 'sex'])

    # Model 1: covariates only, weights taken from the 'iou_spleen' column.
    model = discrete_weighted_group_feats(df, target_col, feats , 'iou_spleen')
    feat_dict = model.pvalues.to_dict()
    coeffs = model.params
    dicts[key][target_col] = {}
    # Store '<outcome>_<term>' (p-value) and '<outcome>_<term>_coeff' (coefficient).
    for ko, vo in feat_dict.items():
             for k, v in vo.items():
                orig_key = k
#                 if k == 'const':
#                     k = f+'_Intercept'
                dicts[key][target_col][str(ko)+'_'+k] = v
                dicts[key][target_col][str(ko)+'_'+k+'_coeff'] = coeffs[ko][orig_key]

    dicts[key][target_col]['aic'] = model.aic
    dicts[key][target_col]['bic'] = model.bic
    
    # Model 2: covariates plus z-scored volumes, weights taken from the 'iou_liver' column.
    feats_ = feats + [ 'seg_spleen_normalised', 'seg_liver_normalised']
    print(feats_)
    model = discrete_weighted_group_feats(df, target_col, feats_ , 'iou_liver')
#     result = anova_test(model)
#     dicts[key][target_col] = model.pvalues.to_dict()
    feat_dict = model.pvalues.to_dict()
    coeffs = model.params
    dicts[key][target_col+'_seg_volumes'] = {}
    # Same collection as for model 1, stored under the '_seg_volumes' key.
    for ko, vo in feat_dict.items():
             for k, v in vo.items():
                orig_key = k
#                 if k == 'const':
#                     k = f+'_Intercept'
                dicts[key][target_col+'_seg_volumes'][str(ko)+'_'+k] = v
                dicts[key][target_col+'_seg_volumes'][str(ko)+'_'+k+'_coeff'] = coeffs[ko][orig_key]

    dicts[key][target_col+'_seg_volumes']['aic'] = model.aic
    dicts[key][target_col+'_seg_volumes']['bic'] = model.bic

# One row per (report, model variant); cells below the 0.05 threshold are highlighted.
p_value_df = df_from_nested_dicts(dicts).T
p_value_df_styler = highlight_significance(p_value_df, 0.05)
p_value_df_styler

# Normalised Mixed Effect Model

In [None]:
# For every model report: regress diabetes status on the covariates plus the
# group-normalised per-sample spleen and liver volumes.
# NOTE(review): the section is titled "Mixed Effect Model", but the fit below goes
# through `group_feature_stats`, which fits an OLS model -- confirm this is intended.
# Note: the flattened paper feature set is overwritten by the short list on the next line.
feats = flatten(feats_from_paper_for_group_test_no_categorisation)
feats = ['age', 'sex', 'bmi_numeric']
dicts = {}
for key, value in model_merged_feats_path_combined.items():
    # Skip the dataset-level entries (keys containing 'KORA'); only model reports are used here.
    if 'KORA' in key:
        print('dataset cannot be processed!')
        continue
    df = pd.read_csv(value)
    df = rename(df)
    # Missing values are replaced by 0 before fitting.
    df = df.fillna(0)
    # NOTE(review): `df_s` and `df_l` are copied here but not used afterwards in this cell.
    df_s = df.copy()
    df_l = df.copy()

    dicts[key] = {}
    
    target_col = 'diabetes_status'
    # `z_score_group_normalise` adds its columns to the frame in place and returns it,
    # so `df_spleen` and `df_liver` both refer to `df` with all normalised columns.
    df_spleen, spleen_normalised_cols_map = z_score_group_normalise(df, spleen_sample_cols)
    df_liver, liver_normalised_cols_map = z_score_group_normalise(df_spleen, liver_sample_cols)
    best_feats_spleen = feats + list(spleen_normalised_cols_map.values()) + list(liver_normalised_cols_map.values()) # choose_best_features(df_spleen, feats, target_col)
    feature_string =  make_feature_string(list(best_feats_spleen), ['sex'])
    p_value_dict_spleen, model = group_feature_stats(feature_string, df_liver, target_col)
    dicts[key][target_col] = p_value_dict_spleen
    # NOTE(review): `model` is None when the fit failed inside `group_feature_stats`;
    # the attribute access below would then raise.
    dicts[key][target_col]['fitting_score'] = model.rsquared

# One row per report; cells below the 0.05 threshold are highlighted.
p_value_df = df_from_nested_dicts(dicts).T
p_value_df_styler = highlight_significance(p_value_df, 0.05)

p_value_df_styler