In [180]:
# first make sure the input data are good 
import numpy as np 
import os 
import sys
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, recall_score, make_scorer, f1_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_validate, LeaveOneOut, StratifiedKFold
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import shap
from scipy import stats

if sys.platform == 'darwin':
    print("Current system is macOS")
    main_fold_path = '/Users/shanxiafeng/Documents/Project/Research/fnirs-prognosis/code/fnirs-treatment-response-prediction'
elif sys.platform == 'linux':
    print("Current system is Ubuntu")
    main_fold_path = '/home/jy/Documents/fnirs/treatment_response/fnirs-depression-deeplearning'
else:
    print("Current system is neither macOS nor Ubuntu")
    
sys.path.append(main_fold_path)    
os.chdir(main_fold_path)
from utils.hyperopt_utils import get_best_hyperparameters, get_best_hyperparameters_skf_inside_loocv_monitoring_recall_bacc
from utils.fnirs_utils import print_md_table_val_test_AUC

from scripts.fusion_model.fusion_model_utils import derive_average_MMDR_score
from scripts.fusion_model.fusion_model_utils import replace_nan_with_mean
from scripts.fusion_model.fusion_model_utils import impute_nan_data
from scripts.fusion_model.fusion_model_utils import process_with_nan_using_imputation_zscore
from scripts.fusion_model.fusion_model_utils import read_base_T2_SDS_CGI 
from scripts.fusion_model.fusion_model_utils import read_pychiatry
from scripts.fusion_model.fusion_model_utils import read_HAMD_score
from scripts.fusion_model.fusion_model_utils import read_demographic
from scripts.fusion_model.fusion_model_utils import plot_avg_auc
from scripts.fusion_model.fusion_model_utils import train_xgboost_shuffle_feature 
from scripts.fusion_model.fusion_model_utils import save_shap
from scripts.fusion_model.fusion_model_utils import read_dose_information
import time


import time
start_time = time.time()

K_FOLD = 5
fold_path = 'allData/prognosis_mix_hb/pretreatment_response'
MMDR_path = 'allData/prognosis_mix_hb/pretreatment_response/MDDR/MDDR_derived_from_load_evaluate.npy'
HAMD_path = 'allData/prognosis_mix_hb/pretreatment_response/label_hamd.npy'

base_T2_SDS_CGI = read_base_T2_SDS_CGI(fold_path)
pyschiatry = read_pychiatry(fold_path)
HAMD_score = np.load(HAMD_path, allow_pickle=True)
demographic = read_demographic(fold_path)
dose = read_dose_information(fold_path)



pro_base_T2_SDS_CGI = process_with_nan_using_imputation_zscore(base_T2_SDS_CGI)
pro_pyschiatry = process_with_nan_using_imputation_zscore(pyschiatry)
pro_pyschiatry = np.concatenate((pro_pyschiatry[:, :-3], pro_pyschiatry[:, -2:]), axis=1) # must remove the -3rd column, because its existen will cause nan value of that column which is On antidpressant(s) ONLY
# pro_pyschiatry = np.concatenate((pro_pyschiatry[:, :1], pro_pyschiatry[:, 2:]), axis=1) # delete Current psychiatric comorbidities — Binary because already have Current psychiatric comorbidities — Coded
pro_HAMD_score = HAMD_score# process_with_nan_using_imputation_zscore(HAMD_score)
pro_demographic = process_with_nan_using_imputation_zscore(demographic)
pro_dose = process_with_nan_using_imputation_zscore(dose)


label = np.load(fold_path + '/label.npy', allow_pickle=True)


Current system is macOS


In [181]:
np.mean(HAMD_score)

17.40625

In [182]:
# Process nan value 


dose_0 = dose[:, 0].copy()
dose_0 = np.nan_to_num(dose_0)
median_dose_0 = np.median(dose_0)
dose_0 = dose[:, 0].copy()
dose_0 = np.nan_to_num(dose_0, nan=median_dose_0)

dose_1 = dose[:, 1].copy()  
dose_1 = np.nan_to_num(dose_1)
mean_dose_1 = np.mean(dose_1)
dose_1 = dose[:, 1].copy()
dose_1 = np.nan_to_num(dose_1, nan=mean_dose_1)

pys_6 = pyschiatry[:, 6].copy()
pys_6 = np.nan_to_num(pys_6)
pys_6[11]=0

mean_pys_6 = np.mean(pys_6)
pys_6[11] = mean_pys_6
# print(mean_pys_6)
# pys_6 = pyschiatry[:, 6].copy()
# pys_6 = np.nan_to_num(pys_6, nan=mean_pys_6)
pyschiatry[:, 6] = pys_6
print(pyschiatry[:, 6])


dem_3 = demographic[:, 3].copy()
dem_3 = np.nan_to_num(dem_3)
print(dem_3)
for i in range(len(dem_3)):
    if type(dem_3[i]) in [str, int]:
        pass
    else:
        print(i, dem_3[i])
        dem_3[i] = 1
dem_3 = np.array(dem_3, dtype=int)        
demographic[:, 3] = dem_3


# hamd_1 = HAMD_score[:, 1].copy()
# hamd_1 = np.nan_to_num(hamd_1)
# for i in range(len(hamd_1)):
#     if type(hamd_1[i]) in [str, int]:
#         pass
#     else:
#         hamd_1[i] = 0
# mean_hamd_1 = np.mean(hamd_1)

# hamd_1 = HAMD_score[:, 1]
# for i in range(len(hamd_1)):
#     if type(hamd_1[i]) in [str, int]:
#         pass
#     else:
#         hamd_1[i] = mean_hamd_1

[84 68 108 2 12 5 1 6 18 74 12 56.328125 12 48 276 168 121 4 48 48 3 120 6
 10 96 26 36 6 0 72 144 3 24 24 15 60 108 96 22 84 180 63 24 144 6 120 60
 192 120 60 36 48 8 96 72 48 180 36 3 12 3 0 0 24]
[3 1 1 1 1 1 1 1 1 3 1 1 1 1 3 1 1 1 nan 1 3 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1
 1 1 1 3 1 1 3 1 1 1 3 1 3 1 '1' 3 '1' '1' '1' '1' '1' '1' 3 '1' '1' '1'
 '1' '1']
18 nan


In [183]:
name_to_val = {}

chi_analysis_name = ['Current psychiatric comorbidities - coded', ]


chi_analysis_arr = ['Perceived social support', 
            'Past trauma', 
            'Current psychiatric comorbidities - coded',
            'Family history of psychiatric illness',
            'Past EmD visit(s) because of depression',
            'Type of episode',
            'Antidepressant',
            'Sex',
            'Ethnicity',
            'Handedness'] 


ttest_analysis_arr = ['Age of depression onset (years)',
            'Duration of depression (years)',
            'Duration of untreated depression (months)',
            'Fluoxetine equivalent dose (mg/day)',
            'Age (years)',
            'Education (years)',
            'Baseline HAM-D score',
            'HAM-D score at 6 month'] 

name_to_val['Age (years)'] = demographic[:, 0] # ttest
name_to_val['Sex'] = demographic[:, 1] # chi 
name_to_val['Ethnicity'] = demographic[:, 2] # chi 
name_to_val['Handedness'] = demographic[:, 3] # chi 
name_to_val['Education (years)'] = demographic[:, 4] # ttest
name_to_val['Baseline HAM-D score'] = HAMD_score[:, 0] # chi 
name_to_val['HAM-D score at 6 month'] = HAMD_score[:, 1] # chi 

# print(name_to_val)


name_to_val['Perceived social support'] = demographic[:, -1] # chi 
name_to_val['Past trauma'] = pyschiatry[:, 0] # chi
name_to_val['Current psychiatric comorbidities - coded'] = pyschiatry[:, 2] # chi 
name_to_val['Family history of psychiatric illness'] = pyschiatry[:, 3] # chi 
name_to_val['Age of depression onset (years)'] = pyschiatry[:, 4] # ttest
name_to_val['Duration of depression (years)'] = pyschiatry[:, 5] # ttest
name_to_val['Duration of untreated depression (months)'] = pyschiatry[:, 6] # ttest
name_to_val['Past EmD visit(s) because of depression'] = pyschiatry[:, 7] # chi
name_to_val['Type of episode'] = pyschiatry[:, 8] # chi
name_to_val['Antidepressant'] = dose_0 # pyschiatry[:, ] # chi
name_to_val['Fluoxetine equivalent dose (mg/day)'] = dose_1 # pyschiatry[:, ] # ttest




In [174]:
HAMD_score

array([[25, 12, -0.52],
       [21, 14, -0.3333333333333333],
       [27, 17, -0.37037037037037035],
       [20, 20, 0.0],
       [17, 18, 0.058823529411764705],
       [10, 8, -0.2],
       [15, 17, 0.13333333333333333],
       [21, 21, 0.0],
       [22, 19, -0.13636363636363635],
       [21, 11, -0.47619047619047616],
       [21, 21, 0.0],
       [21, 23, 0.09523809523809523],
       [19, 19, 0.0],
       [15, 16, 0.06666666666666667],
       [23, 15, -0.34782608695652173],
       [21, 18, -0.14285714285714285],
       [19, 21, 0.10526315789473684],
       [27, 21, -0.2222222222222222],
       [15, 11, -0.26666666666666666],
       [19, 25, 0.3157894736842105],
       [15, 19, 0.26666666666666666],
       [25, 22, -0.12],
       [22, 11, -0.5],
       [24, 16, -0.3333333333333333],
       [30, 22, -0.26666666666666666],
       [25, 24, -0.04],
       [26, 28, 0.07692307692307693],
       [24, 16, -0.3333333333333333],
       [26, 26, 0.0],
       [20, 9, -0.55],
       [23, 26, 0.130

In [189]:

def ttest_anlysis(key, value, label):
    """ compute statistics of values using ttest_in from scipy.stats 
    Args: 
        key: str, the name of the value
        value: np.array, the value to be analyzed
        label: np.array, the label of the value, either 0 or 1
    
    Returns:
        None
    
    Outputs:
        | key | all_mean | all_std | non_responder_mean | non_responder_std | responder_mean | responder_std | p_value |
    
    """
    all_mean = np.mean(value)
    all_std = np.std(value)
    
    responder_value = value[label == 1].tolist()
    non_responder_value = value[label == 0].tolist()
    
    responder_mean = np.mean(responder_value)
    responder_std = np.std(responder_value)
    
    non_responder_mean = np.mean(non_responder_value)
    non_responder_std = np.std(non_responder_value)

    _, p_value = stats.ttest_ind(responder_value, non_responder_value)
    
    print(f' | {key} | {all_mean:.2f} | {all_std:.2f} | {non_responder_mean:.2f} | {non_responder_std:.2f} | {responder_mean:.2f} | {responder_std:.2f} | {p_value:.4f} |')

    # # check if the data is normal distribution
    # plt.figure()
    # plt.title(key)
    # plt.hist(value)

def chi_anlysis(key, value, label):
    """ compute statistics of values using chi2_contingency from scipy.stats 
    Args: 
        key: str, the name of the value
        value: np.array, the value to be analyzed
        label: np.array, the label of the value, either 0 or 1
    
    Returns:
        None
    
    Outputs:
        | key | all_mean | all_std | non_responder_mean | non_responder_std | responder_mean | responder_std | p_value |
    
    """
    
    # make sure the value is integer
    value = value.astype(int)
    
    responder_value = value[label == 1]
    non_responder_value = value[label == 0]
    
    unique = np.unique(value)
    if len(unique) == 2:
        # pass
        max_value = np.max(unique)
        
        value_max_arr = value[value == max_value]
        N_value_max_arr = len(value_max_arr)
        percent_value_max_arr = N_value_max_arr / len(value) * 100
        
        responder_max_arr = responder_value[responder_value == max_value]
        N_responder_value_max_arr = len(responder_max_arr)
        percent_responder_value_max_arr = N_responder_value_max_arr / len(responder_value) * 100
        
        non_responder_max_arr = non_responder_value[non_responder_value == max_value]
        N_non_responder_value_max_arr = len(non_responder_max_arr)
        percent_non_responder_value_max_arr = N_non_responder_value_max_arr / len(non_responder_value) * 100
        
        if max_value == 1:
            responder_value +=1 
            non_responder_value += 1
        p_value = stats.chi2_contingency(responder_value.tolist(), non_responder_value.tolist())
        print(f' | {key} | {N_value_max_arr} | {percent_value_max_arr:.2f} | {N_non_responder_value_max_arr} | {percent_non_responder_value_max_arr:.2f} | {N_responder_value_max_arr} | {percent_responder_value_max_arr:.2f} | {p_value[1]:.4f} |')
    else:
        
        if np.min(unique) == 0:
            s_responder_value = responder_value+1
            s_non_responder_value = non_responder_value+1    
        else:
            s_responder_value = responder_value
            s_non_responder_value = non_responder_value
        _, p_value, _, _ = stats.chi2_contingency(s_responder_value.tolist(), s_non_responder_value.tolist())
        
        print(f' | {key} | | | | | | | {p_value:.4f} |')
        for unique_val in unique:
            value_unique = value[value == unique_val]
            N_value_unique = len(value_unique)
            percent_value_unique = N_value_unique / len(value) * 100
            
            responder_unique = responder_value[responder_value == unique_val]
            N_responder_value_unique = len(responder_unique)
            percent_responder_value_unique = N_responder_value_unique / len(responder_value) * 100
            
            non_responder_unique = non_responder_value[non_responder_value == unique_val]
            N_non_responder_value_unique = len(non_responder_unique)
            percent_non_responder_value_unique = N_non_responder_value_unique / len(non_responder_value) * 100
            
            print(f' | {unique_val} | {N_value_unique} | {percent_value_unique:.2f} | {N_non_responder_value_unique} | {percent_non_responder_value_unique:.2f} | {N_responder_value_unique} | {percent_responder_value_unique:.2f} |  |')
        
        
    all_mean = np.mean(value)
    all_std = np.std(value)
    

    
    responder_mean = np.mean(responder_value)
    responder_std = np.std(responder_value)
    
    non_responder_mean = np.mean(non_responder_value)
    non_responder_std = np.std(non_responder_value)

    # _, p_value = stats.chi2_contingency(responder_value, non_responder_value)
    
    # print(f' | {key} | {all_mean:.2f} | {all_std:.2f} | {non_responder_mean:.2f} | {non_responder_std:.2f} | {responder_mean:.2f} | {responder_std:.2f} | {p_value:.4f} |')
    

for key, value in name_to_val.items():

    if key in chi_analysis_arr:
        chi_anlysis(key, value, label)
    elif key in ttest_analysis_arr:
        # print(f' {key} is using ttest analysis')
        # ttest_anlysis(key, value, label)
        pass
    else:
        print(f'- error - {key}' * 100)
print('-' * 100)
for key, value in name_to_val.items():
    mean_val = np.mean(value)
    # print(f' | {key} |')
    # print(mean_val)
    # make sure all values are process 
    if key in chi_analysis_arr:
        pass
        # chi_anlysis(key, value, label)
        # print(f' {key} is using chi analysis')
    elif key in ttest_analysis_arr:
        # print(f' {key} is using ttest analysis')
        ttest_anlysis(key, value, label)
        pass
        
    else:
        print(f'- error - {key}' * 100)
        

    

        


 | Sex | 50 | 78.12 | 39 | 78.00 | 11 | 78.57 | 1.0000 |
 | Ethnicity | | | | | | | 1.0000 |
 | 1 | 44 | 68.75 | 35 | 70.00 | 9 | 64.29 |  |
 | 2 | 12 | 18.75 | 9 | 18.00 | 3 | 21.43 |  |
 | 3 | 7 | 10.94 | 5 | 10.00 | 2 | 14.29 |  |
 | 4 | 1 | 1.56 | 1 | 2.00 | 0 | 0.00 |  |
 | Handedness | 11 | 17.19 | 7 | 14.00 | 4 | 28.57 | 1.0000 |
 | Perceived social support | | | | | | | 1.0000 |
 | 1 | 15 | 23.44 | 10 | 20.00 | 5 | 35.71 |  |
 | 2 | 41 | 64.06 | 35 | 70.00 | 6 | 42.86 |  |
 | 3 | 8 | 12.50 | 5 | 10.00 | 3 | 21.43 |  |
 | Past trauma | 31 | 48.44 | 25 | 50.00 | 6 | 42.86 | 1.0000 |
 | Current psychiatric comorbidities - coded | | | | | | | 1.0000 |
 | 0 | 45 | 70.31 | 32 | 64.00 | 13 | 92.86 |  |
 | 1 | 10 | 15.62 | 10 | 20.00 | 0 | 0.00 |  |
 | 2 | 5 | 7.81 | 5 | 10.00 | 0 | 0.00 |  |
 | 3 | 2 | 3.12 | 2 | 4.00 | 0 | 0.00 |  |
 | 4 | 2 | 3.12 | 1 | 2.00 | 1 | 7.14 |  |
 | Family history of psychiatric illness | 28 | 43.75 | 21 | 42.00 | 7 | 50.00 | 1.0000 |
 | Past EmD visit(s)

SyntaxError: invalid syntax (2565330058.py, line 1)