In [1]:
#Setup
import pickle
import logging
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import openpyxl
import sklearn
from aidp.data.groupings import AdVsDlbGrouping, AdVsAllGrouping, DlbVsAllGrouping, ConVsAllGrouping, AdVsConGrouping, DlbVsConGrouping
from abc import ABC, abstractmethod
from sklearn.metrics import roc_auc_score, confusion_matrix

#Version Control
# Record the scikit-learn version that is active in this kernel.
skvers = sklearn.__version__
#NEED to use the same ML conda version (scikit-learn==0.19.0)
# NOTE(review): the check below compares against "0.21.3" while the comment above and the
# install line both name 0.19.0 -- confirm which version the pickled models actually require.
if skvers != "0.21.3" :
    !pip install scikit-learn==0.19.0
    # NOTE(review): sklearn was already imported at the top of this cell, so this import
    # presumably does not pick up the freshly installed version; a kernel restart is
    # likely needed -- TODO confirm.
    import sklearn

#Paths
# Model key and results folder used to build every path below.
mk="ADDLB021023" #Same as V2 and Verbose
folder_name="ADDLB_021023_finalModel" #contains training and testing
script_path="/media/mcuser/Data1/RChen/SupportVectorMachine"
mod_path=script_path+'/resources/models/'+mk+'/dmri'
# Built from script_path and folder_name so the workbook folder and the report folder
# (script_path/folder_name/mlreport) always share one root.
xl_path=script_path+'/'+folder_name
# The .pkl files are opened by bare file name in later cells, so work from the model directory.
os.chdir(mod_path)
# CONDA_DEFAULT_ENV is only set inside an activated conda environment; fall back to a
# placeholder instead of raising KeyError when the notebook runs elsewhere.
print("Current kernel is", os.environ.get('CONDA_DEFAULT_ENV', '<CONDA_DEFAULT_ENV not set>'))

#Load Dataframes
def _read_results(workbook):
    """Read one results workbook stored under xl_path, indexed by the Subject column."""
    return pd.read_excel(xl_path+workbook, header=0, index_col="Subject")

df_tr=_read_results('/r_ADDLB_NP_train_021023_ADDLB021023.xlsx')
df_te=_read_results('/r_ADDLB_NP_test_021023_ADDLB021023.xlsx')
# GroupID plus the six dmri_* probability columns; every other column is dropped.
keep_columns=[
    "GroupID",
    "dmri_ad_v_dlb (AD Probability)",
    "dmri_ad_v_con (AD Probability)",
    "dmri_dlb_v_con (DLB Probability)",
    "dmri_ad_v_all (AD Probability)",
    "dmri_dlb_v_all (DLB Probability)",
    "dmri_con_v_dem (CON Probability)",
]
df_tr_trim=df_tr.loc[:, keep_columns]
df_te_trim=df_te.loc[:, keep_columns]

#Dictionary for label redefining (1 for 'positive' class (higher probability), 0 for 'negative' class)
# Each map is a nested dict for DataFrame.replace, so only the GroupID column is recoded.
# Group codes as used by the pairings below: 1=AD, 2=DLB, 3=CON.
avd_dict = dict(GroupID={2: 0})  #drop 3, 1=AD, 0=DLB
dvc_dict = dict(GroupID={3: 0, 2: 1})  #drop 1, 1=DLB, 0=CON
avc_dict = dict(GroupID={3: 0})  #drop 2, 1=AD, 0=CON
avb_dict = dict(GroupID={2: 0, 3: 0})  #keep all, 1=AD, 0=DLB or CON
# The two one-vs-rest maps below go through the letters "A"/"B" first and are finished by
# tf_dict ("B" -> 1, "A" -> 0).
# NOTE(review): presumably this avoids 1 being both a source and a target value within a
# single replace call -- confirm.
dvb_dict = dict(GroupID={1: "A", 3: "A", 2: "B"})  #"B"=DLB, "A"=AD or CON
cvb_dict = dict(GroupID={1: "A", 2: "A", 3: "B"})  #"B"=CON, "A"=AD or DLB
tf_dict = dict(GroupID={"B": 1, "A": 0})
# Positional lookup used by the comparison cells: indices 0-5 are the per-comparison maps,
# index 6 is the letter-to-number finishing map.
dict_list = [
    avd_dict,
    dvc_dict,
    avc_dict,
    avb_dict,
    dvb_dict,
    cvb_dict,
    tf_dict,
]

def conmatscores(y_true, y_pred):
    """Compute binary classification metrics from true labels and predicted probabilities.

    Parameters
    ----------
    y_true : array-like
        True labels coded as 0 (negative) and 1 (positive). A single-column
        DataFrame / (n, 1) array is accepted and flattened.
    y_pred : array-like
        Predicted probability of the positive class, same length as ``y_true``.
        Probabilities are rounded with ``np.round`` to obtain the predicted class.

    Returns
    -------
    tuple
        ``(accuracy, sens, spec, ppv, npv, auc)``. A ratio whose denominator is
        zero (for example PPV when nothing is predicted positive) is returned as NaN.

    Raises
    ------
    ValueError
        Propagated from scikit-learn, e.g. when AUC is undefined for the labels given.
    """
    def _flatten(values):
        # Single-column inputs arrive as (n, 1); the metrics below expect 1-D vectors.
        arr = np.asarray(values)
        if arr.ndim == 2 and arr.shape[1] == 1:
            arr = arr.ravel()
        return arr

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as NaN explicitly instead of through a numpy divide warning.
        return numerator / denominator if denominator else float("nan")

    labels_true = _flatten(y_true)
    scores = _flatten(y_pred)
    predicted = np.round(scores).astype(int)

    # Fixing labels=[0, 1] guarantees a 2x2 matrix in (tn, fp, fn, tp) order even when a
    # class is missing from the predictions or from the labels.
    cm = confusion_matrix(labels_true, predicted, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    total = tn + fp + fn + tp
    accuracy = _ratio(tp + tn, total)
    sens = _ratio(tp, tp + fn)
    spec = _ratio(tn, fp + tn)
    ppv = _ratio(tp, tp + fp)
    npv = _ratio(tn, tn + fn)
    # AUC is computed from the raw probabilities, not the rounded classes.
    auc = roc_auc_score(labels_true, scores)
    return accuracy, sens, spec, ppv, npv, auc

def bootstrapper(Y_true, Y_pred, savename: str, r_seed=42, n_bootstraps=1000):
    """Bootstrap the conmatscores metrics and write every resample's scores to Excel.

    Parameters
    ----------
    Y_true : array-like
        True 0/1 labels.
    Y_pred : array-like
        Predicted probabilities of the positive class, aligned with ``Y_true``.
    savename : str
        Prefix of the output workbook, written to
        ``script_path/folder_name/mlreport/<savename>_report.xlsx``.
    r_seed : int
        Seed for the ``numpy.random.RandomState`` that draws the resamples.
    n_bootstraps : int
        Number of resamples drawn. Resamples containing a single class are skipped,
        so the report can have fewer rows than this.

    Returns
    -------
    None
        The result is the workbook on disk, one row per kept resample with the columns
        Accuracy, Sensitivity, Specificity, PPV, NPV and AUC.
    """
    # Positional indexing with an integer array is needed below; a pandas Series would be
    # indexed by label instead, so convert up front.
    Y_true = np.asarray(Y_true)
    Y_pred = np.asarray(Y_pred)
    n_samples = len(Y_pred)

    metric_names = ["Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "AUC"]
    rows = []
    rng = np.random.RandomState(r_seed)
    for _ in range(n_bootstraps):
        # Sample n_samples indices with replacement.
        indices = rng.randint(0, n_samples, n_samples)
        # AUC needs both classes present in the resample.
        if len(np.unique(Y_true[indices])) < 2:
            continue
        rows.append(conmatscores(Y_true[indices], Y_pred[indices]))

    # Every column, AUC included, holds the per-resample values so that downstream
    # percentile intervals are computed over the full bootstrap distribution.
    df_report = pd.DataFrame(rows, columns=metric_names)
    df_report.to_excel(script_path+'/'+folder_name+'/mlreport/'+savename+'_report.xlsx', index=False)


def report_mean_ci(dfname: str):
    """Summarise a bootstrap report as mean and lower/upper bounds per metric.

    Reads ``<dfname>_report.xlsx`` from ``script_path/folder_name/mlreport/`` and writes
    ``<dfname>_report_summary.xlsx`` next to it. For each column, the values are sorted;
    "Lower" is the element at position ``int(.025 * n)`` and "Upper" the element at
    position ``int(0.975 * n)`` of the sorted values, and "Mean" is their arithmetic mean.
    """
    report_dir = script_path+'/'+folder_name+'/mlreport/'
    scores = pd.read_excel(report_dir+dfname+'_report.xlsx')
    summary = pd.DataFrame(index=scores.columns, columns=["Mean", "Lower", "Upper"])
    for name in scores.columns:
        # Sorted copy of the column; the bounds are read off by position.
        ordered = np.sort(np.array(scores[name]))
        n_values = len(ordered)
        summary.loc[str(name)] = [
            np.mean(ordered),
            ordered[int(.025*n_values)],
            ordered[int(0.975*n_values)],
        ]

    summary.to_excel(report_dir+dfname+'_report_summary.xlsx')
    
def test_scores(Y_true, Y_pred):
    """Print the conmatscores metrics for one set of labels and probabilities.

    Accuracy, sensitivity, specificity, PPV and NPV are printed multiplied by 100;
    AUC is printed as returned.
    """
    *rates, auc_value = conmatscores(Y_true, Y_pred)
    names = ("accuracy", "sens", "spec", "ppv", "npv")
    fields = []
    for name, rate in zip(names, rates):
        fields.extend((name, rate*100))
    print(*fields, "auc", auc_value)

Current kernel is ML


In [4]:
#### PATH
# Score the AD-vs-DLB probabilities in the "path" workbook against its GroupID labels.
xl_path2='/media/mcuser/Data1/RChen/SupportVectorMachine'
df_path=pd.read_excel(xl_path+'/r_ADDLB_path_021023_ADDLB021023.xlsx', header=0, index_col="Subject")
# Single-column frames (not Series) are kept for both labels and probabilities.
df_path_Y_true=df_path.loc[:, ["GroupID"]]
# Recode GroupID 2 to 0 so the labels become 0/1 with 1 as the positive class.
df_path2_Y_true=df_path_Y_true.replace(2, 0)
path_label_values=np.unique(df_path2_Y_true)
print(path_label_values)
df_path2_Y_pred=df_path.loc[:, ["dmri_ad_v_dlb (AD Probability)"]]

path_scores=conmatscores(df_path2_Y_true, df_path2_Y_pred)
print(path_scores)

[0 1]
(0.8840579710144928, 0.9230769230769231, 0.25, 0.9523809523809523, 0.16666666666666666, 0.7692307692307693)


In [174]:
# testdf = pd.read_excel(script_path+'/'+folder_name+'/mlreport/'+'addlb_train'+'_report.xlsx')
# testdf.columns
# score_summary = []
# df_sum = pd.DataFrame(index=testdf.columns, columns=["Mean", "Lower", "Upper"])
# for metric in testdf.columns:
#     extracted_metric=np.array(testdf[metric])
#     extracted_metric.sort()
#     mean_met = np.mean(extracted_metric)
#     lower_b = extracted_metric[int(.025*len(extracted_metric))]
#     upper_b = extracted_metric[int(.975*len(extracted_metric))]
#     df_sum.loc[str(metric)] = [mean_met, lower_b, upper_b]
#     # display(df_sum)
#     print(metric, mean_met, lower_b, upper_b)
# df_sum.to_excel(script_path+'/'+folder_name+'/mlreport/'+'addlb_train'+'_report_summary.xlsx')

In [180]:
##### AD vs DLB
# Load the pickled object for this comparison; the context manager guarantees the file
# handle is closed once loading finishes. Only unpickle files from a trusted source.
# NOTE(review): the loaded object is not used anywhere else in this cell.
with open('ad_v_dlb.pkl', 'rb') as model_file:
    advdlb=pickle.load(model_file)

# Training set: exclude GroupID 3 (not part of this comparison), then recode GroupID
# with dict_list[0] (2 -> 0) so that 1=AD and 0=DLB.
df_tr_trim_addlb = df_tr_trim[df_tr_trim.GroupID!=3]
df_tr_trim_addlb_redef = df_tr_trim_addlb.replace(dict_list[0])
Y_true_tr_addlb = np.array(df_tr_trim_addlb_redef.GroupID)
Y_pred_tr_addlb = np.array(df_tr_trim_addlb_redef["dmri_ad_v_dlb (AD Probability)"])

# Test set: same filtering and recoding.
df_te_trim_addlb = df_te_trim[df_te_trim.GroupID!=3]
df_te_trim_addlb_redef = df_te_trim_addlb.replace(dict_list[0])
Y_true_te_addlb = np.array(df_te_trim_addlb_redef.GroupID)
Y_pred_te_addlb = np.array(df_te_trim_addlb_redef["dmri_ad_v_dlb (AD Probability)"])

# Write the bootstrap reports first; report_mean_ci reads those workbooks back.
bootstrapper(Y_true_tr_addlb, Y_pred_tr_addlb, savename="addlb_train")
bootstrapper(Y_true_te_addlb, Y_pred_te_addlb, savename="addlb_test")
report_mean_ci(dfname="addlb_train")
report_mean_ci(dfname="addlb_test")

# Point estimates on the un-resampled test set.
test_scores(Y_true_te_addlb, Y_pred_te_addlb)

accuracy 85.71428571428571 sens 78.57142857142857 spec 92.85714285714286 ppv 91.66666666666666 npv 81.25 auc 0.9591836734693877


In [181]:
##### DLB vs CON
# Load the pickled object for this comparison; the context manager guarantees the file
# handle is closed once loading finishes. Only unpickle files from a trusted source.
# NOTE(review): the loaded object is not used anywhere else in this cell.
with open('dlb_v_con.pkl', 'rb') as model_file:
    dlbvcon=pickle.load(model_file)

# Training set: exclude GroupID 1 (not part of this comparison), then recode GroupID
# with dict_list[1] (3 -> 0, 2 -> 1) so that 1=DLB and 0=CON.
df_tr_trim_dlbcon = df_tr_trim[df_tr_trim.GroupID!=1]
df_tr_trim_dlbcon_redef = df_tr_trim_dlbcon.replace(dict_list[1])
Y_true_tr_dlbcon = np.array(df_tr_trim_dlbcon_redef.GroupID)
Y_pred_tr_dlbcon = np.array(df_tr_trim_dlbcon_redef["dmri_dlb_v_con (DLB Probability)"])

# Test set: same filtering and recoding.
df_te_trim_dlbcon = df_te_trim[df_te_trim.GroupID!=1]
df_te_trim_dlbcon_redef = df_te_trim_dlbcon.replace(dict_list[1])
Y_true_te_dlbcon = np.array(df_te_trim_dlbcon_redef.GroupID)
Y_pred_te_dlbcon = np.array(df_te_trim_dlbcon_redef["dmri_dlb_v_con (DLB Probability)"])

# Write the bootstrap reports first; report_mean_ci reads those workbooks back.
bootstrapper(Y_true_tr_dlbcon, Y_pred_tr_dlbcon, savename="dlbcon_train")
bootstrapper(Y_true_te_dlbcon, Y_pred_te_dlbcon, savename="dlbcon_test")
report_mean_ci(dfname="dlbcon_train")
report_mean_ci(dfname="dlbcon_test")

# Point estimates on the un-resampled test set.
test_scores(Y_true_te_dlbcon, Y_pred_te_dlbcon)

accuracy 82.45614035087719 sens 75.0 spec 89.65517241379311 ppv 87.5 npv 78.78787878787878 auc 0.9298029556650247


In [182]:
##### AD vs CON
# Load the pickled object for this comparison; the context manager guarantees the file
# handle is closed once loading finishes. Only unpickle files from a trusted source.
# NOTE(review): the loaded object is not used anywhere else in this cell.
with open('ad_v_con.pkl', 'rb') as model_file:
    advcon=pickle.load(model_file)

# Training set: exclude GroupID 2 (not part of this comparison), then recode GroupID
# with dict_list[2] (3 -> 0) so that 1=AD and 0=CON.
df_tr_trim_adcon = df_tr_trim[df_tr_trim.GroupID!=2]
df_tr_trim_adcon_redef = df_tr_trim_adcon.replace(dict_list[2])
Y_true_tr_adcon = np.array(df_tr_trim_adcon_redef.GroupID)
Y_pred_tr_adcon = np.array(df_tr_trim_adcon_redef["dmri_ad_v_con (AD Probability)"])

# Test set: same filtering and recoding.
df_te_trim_adcon = df_te_trim[df_te_trim.GroupID!=2]
df_te_trim_adcon_redef = df_te_trim_adcon.replace(dict_list[2])
Y_true_te_adcon = np.array(df_te_trim_adcon_redef.GroupID)
Y_pred_te_adcon = np.array(df_te_trim_adcon_redef["dmri_ad_v_con (AD Probability)"])

# Write the bootstrap reports first; report_mean_ci reads those workbooks back.
bootstrapper(Y_true_tr_adcon, Y_pred_tr_adcon, savename="adcon_train")
bootstrapper(Y_true_te_adcon, Y_pred_te_adcon, savename="adcon_test")
report_mean_ci(dfname="adcon_train")
report_mean_ci(dfname="adcon_test")

# Point estimates on the un-resampled test set.
test_scores(Y_true_te_adcon, Y_pred_te_adcon)

accuracy 94.73684210526315 sens 92.85714285714286 spec 96.55172413793103 ppv 96.29629629629629 npv 93.33333333333333 auc 0.9581280788177341


In [183]:
##### AD vs DLB/CON
# Load the pickled object for this comparison; the context manager guarantees the file
# handle is closed once loading finishes. Only unpickle files from a trusted source.
# NOTE(review): the loaded object is not used anywhere else in this cell.
with open('ad_v_all.pkl', 'rb') as model_file:
    advall=pickle.load(model_file)

# Training set: all groups are kept; dict_list[3] recodes GroupID 2 and 3 to 0,
# leaving 1=AD as the positive class.
df_tr_trim_advb = df_tr_trim
df_tr_trim_advb_redef = df_tr_trim_advb.replace(dict_list[3])
Y_true_tr_advb = np.array(df_tr_trim_advb_redef.GroupID)
Y_pred_tr_advb = np.array(df_tr_trim_advb_redef["dmri_ad_v_all (AD Probability)"])
# print(Y_true_tr_advb.shape, Y_pred_tr_advb.shape)

# Test set: same recoding.
df_te_trim_advb = df_te_trim
df_te_trim_advb_redef = df_te_trim_advb.replace(dict_list[3])
Y_true_te_advb = np.array(df_te_trim_advb_redef.GroupID)
Y_pred_te_advb = np.array(df_te_trim_advb_redef["dmri_ad_v_all (AD Probability)"])

# Write the bootstrap reports first; report_mean_ci reads those workbooks back.
bootstrapper(Y_true_tr_advb, Y_pred_tr_advb, savename="advb_train")
bootstrapper(Y_true_te_advb, Y_pred_te_advb, savename="advb_test")
report_mean_ci(dfname="advb_train")
report_mean_ci(dfname="advb_test")

# Point estimates on the un-resampled test set.
test_scores(Y_true_te_advb, Y_pred_te_advb)

accuracy 92.94117647058823 sens 82.14285714285714 spec 98.24561403508771 ppv 95.83333333333334 npv 91.80327868852459 auc 0.9730576441102756


In [184]:
##### DLB vs AD/CON
# Load the pickled object for this comparison; the context manager guarantees the file
# handle is closed once loading finishes. Only unpickle files from a trusted source.
# NOTE(review): the loaded object is not used anywhere else in this cell.
with open('dlb_v_all.pkl', 'rb') as model_file:
    dlbvall=pickle.load(model_file)

# Training set: all groups are kept. GroupID is recoded in two passes: dict_list[4]
# maps 2 -> "B" and 1, 3 -> "A", then dict_list[6] maps "B" -> 1 and "A" -> 0,
# leaving 1=DLB as the positive class.
df_tr_trim_dlbvb = df_tr_trim
df_tr_trim_dlbvb_redef = df_tr_trim_dlbvb.replace(dict_list[4]).replace(dict_list[6])
Y_true_tr_dlbvb = np.array(df_tr_trim_dlbvb_redef.GroupID)
Y_pred_tr_dlbvb = np.array(df_tr_trim_dlbvb_redef["dmri_dlb_v_all (DLB Probability)"])

# Test set: same two-pass recoding.
df_te_trim_dlbvb = df_te_trim
df_te_trim_dlbvb_redef = df_te_trim_dlbvb.replace(dict_list[4]).replace(dict_list[6])
Y_true_te_dlbvb = np.array(df_te_trim_dlbvb_redef.GroupID)
Y_pred_te_dlbvb = np.array(df_te_trim_dlbvb_redef["dmri_dlb_v_all (DLB Probability)"])

# Write the bootstrap reports first; report_mean_ci reads those workbooks back.
bootstrapper(Y_true_tr_dlbvb, Y_pred_tr_dlbvb, savename="dlbvb_train")
bootstrapper(Y_true_te_dlbvb, Y_pred_te_dlbvb, savename="dlbvb_test")
report_mean_ci(dfname="dlbvb_train")
report_mean_ci(dfname="dlbvb_test")

# Point estimates on the un-resampled test set.
test_scores(Y_true_te_dlbvb, Y_pred_te_dlbvb)

accuracy 72.94117647058823 sens 25.0 spec 96.49122807017544 ppv 77.77777777777779 npv 72.36842105263158 auc 0.9191729323308271


In [185]:
##### CON vs AD/DLB
# Load the pickled object for this comparison; the context manager guarantees the file
# handle is closed once loading finishes. Only unpickle files from a trusted source.
# NOTE(review): the loaded object is not used anywhere else in this cell.
with open('con_v_dem.pkl', 'rb') as model_file:
    convall=pickle.load(model_file)

# Training set: all groups are kept. GroupID is recoded in two passes: dict_list[5]
# maps 3 -> "B" and 1, 2 -> "A", then dict_list[6] maps "B" -> 1 and "A" -> 0,
# leaving 1=CON as the positive class.
df_tr_trim_convb = df_tr_trim
df_tr_trim_convb_redef = df_tr_trim_convb.replace(dict_list[5]).replace(dict_list[6])
Y_true_tr_convb = np.array(df_tr_trim_convb_redef.GroupID)
Y_pred_tr_convb = np.array(df_tr_trim_convb_redef["dmri_con_v_dem (CON Probability)"])

# Test set: same two-pass recoding.
df_te_trim_convb = df_te_trim
df_te_trim_convb_redef = df_te_trim_convb.replace(dict_list[5]).replace(dict_list[6])
Y_true_te_convb = np.array(df_te_trim_convb_redef.GroupID)
Y_pred_te_convb = np.array(df_te_trim_convb_redef["dmri_con_v_dem (CON Probability)"])

# Write the bootstrap reports first; report_mean_ci reads those workbooks back.
bootstrapper(Y_true_tr_convb, Y_pred_tr_convb, savename="convb_train")
bootstrapper(Y_true_te_convb, Y_pred_te_convb, savename="convb_test")
report_mean_ci(dfname="convb_train")
report_mean_ci(dfname="convb_test")

# Point estimates on the un-resampled test set.
test_scores(Y_true_te_convb, Y_pred_te_convb)

accuracy 85.88235294117646 sens 79.3103448275862 spec 89.28571428571429 ppv 79.3103448275862 npv 89.28571428571429 auc 0.9470443349753694
