In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import os
import shap


# DATASET

In [6]:
# Working-directory-relative locations used throughout the notebook.
# Every value keeps a trailing path separator because later cells build file
# names by plain string concatenation (e.g. graphsStreamPath + model_name + ...).
# os.path.join(..., "") appends the separator of the current platform, so the
# paths are no longer tied to the Windows "\\" separator.
_base_dir = os.getcwd()
dataStreamPath = os.path.join(_base_dir, "")
graphsStreamPath = os.path.join(_base_dir, "graphs", "")
shapvaluesStreamPath = os.path.join(_base_dir, "shapvalues", "")
oneheadmodelsStreamPath = os.path.join(_base_dir, "onehead_models", "")
baselinemodelsStreamPath = os.path.join(_base_dir, "baseline_models", "")
protoformsStreamPath = os.path.join(_base_dir, "protoforms_small", "")

# Figures and SHAP tables are written into these two folders further down;
# create them up front so a fresh checkout survives "Restart & Run All".
for _out_dir in (graphsStreamPath, shapvaluesStreamPath):
    os.makedirs(_out_dir, exist_ok=True)

## Data loading

In [7]:
# Train / test tables are read from CSV files located in the current working directory
train_csv = "df_train_noise.csv"
test_csv = "df_test_noise.csv"
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

## Data preparation

In [8]:
# Inclusive, label-based column ranges shared by the train and test frames
feature_span = slice('pcm_LOGenergy_sma', 'pcm_fftMag_mfcc_12_')
symptom_span = slice('anxiety', 'suicide')

# Categorical state label -> integer class id
label_coding = {
    'euthymia': 0,
    'depression': 1,
    'mania': 2,
    'mixed': 3,
}

# ---- train split ----
X_train = df_train.loc[:, feature_span]
y_train_symptoms = df_train.loc[:, symptom_span]
y_train_states = df_train.loc[:, 'hamd_ymrs']
train_state_codes = y_train_states.map(label_coding).astype(int)
y_train_states_encoded = np.array(train_state_codes)

# ---- test split ----
X_test = df_test.loc[:, feature_span]
y_test_symptoms = df_test.loc[:, symptom_span]
y_test_states = df_test.loc[:, 'hamd_ymrs']
test_state_codes = y_test_states.map(label_coding).astype(int)
y_test_states_encoded = np.array(test_state_codes)

# ---- standardization ----
# The scaler is fitted on the training features only and then applied to both splits
scaler = preprocessing.StandardScaler().fit(X_train.values)
X_train_scaled = scaler.transform(X_train.values)
X_test_scaled = scaler.transform(X_test.values)

# XGBOOST

In [None]:
# State names in the order they are declared in label_coding
classes_names_states = [*label_coding]

feature_names = X_test.columns

# Standardized training features, re-labelled with the original column names
X_train_scale_df = pd.DataFrame(data=X_train_scaled, columns=feature_names)

# XGBoost hyperparameters (max_depth was previously 10; learning_rate, subsample,
# reg_alpha and scale_pos_weight are left at the library defaults)
xgb_params = dict(
    n_estimators=500,
    max_depth=3,
    objective='multi:softprob',
    num_class=4,
)

# Fit the classifier on the standardized features and the integer state codes
xgb_model = XGBClassifier(use_label_encoder=False, **xgb_params)
xgb_model.fit(X_train_scale_df, y_train_states_encoded)

In [None]:
# The classifier was fitted on standardized features (X_train_scale_df), so the
# test features must go through the same scaler before prediction. They are
# wrapped in a DataFrame with the original column names so the model receives
# the same feature names it was trained with.
X_test_scale_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)
y_pred_xgb = xgb_model.predict(X_test_scale_df)

# Confusion matrix rows/columns follow the class order learned by the model
xgb_cm = confusion_matrix(y_test_states_encoded, y_pred_xgb, labels=xgb_model.classes_)
xgb_cr = classification_report(y_test_states_encoded, y_pred_xgb)

print(xgb_cm)
print(xgb_cr)


# 1. Baseline approach

## 1.a) Neural Network

In [None]:
def build_model(n_features=86, n_classes=4, hidden_units=64, dropout_rate=0.2):
    """Build the baseline MLP: Dense(relu) -> Dropout -> Dense(softmax).

    Parameters
    ----------
    n_features : int
        Width of the input vector (default 86, the value originally hard-coded).
    n_classes : int
        Number of softmax outputs (default 4).
    hidden_units : int
        Size of the single hidden layer (default 64).
    dropout_rate : float
        Dropout rate applied after the hidden layer (default 0.2).

    Returns
    -------
    tf.keras.models.Sequential
        The built, not yet compiled, model.
    """
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(hidden_units, input_shape=(n_features,), activation='relu', name='dense'),
        tf.keras.layers.Dropout(dropout_rate, name='dropout'),
        tf.keras.layers.Dense(n_classes, activation='softmax', name='output')])
    model.build()
    return model

model_name = "baseline"
# Derive the layer sizes from the prepared data instead of repeating literals,
# so the network always matches the feature matrix and the label coding.
baseline = build_model(n_features=X_train_scaled.shape[1], n_classes=len(label_coding))

baseline.summary()

In [None]:
# Targets are integer class ids and the output layer already applies softmax,
# hence sparse categorical cross-entropy on probabilities (from_logits=False).
state_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
baseline.compile(optimizer='adam', loss=state_loss, metrics=['accuracy'])

# Early stopping watches validation accuracy with a patience of 3 epochs.
# NOTE(review): the test split doubles as the validation set here - confirm this is intended.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)

baseline.fit(
    X_train_scaled,
    y_train_states_encoded,
    epochs=15,
    validation_data=(X_test_scaled, y_test_states_encoded),
    callbacks=[early_stopping],
)

# Predicted class = index of the largest softmax output for each test row
y_pred_states = np.argmax(baseline.predict(X_test_scaled), axis=1)

In [None]:
# Baseline evaluation on the test split: confusion matrix and per-class report
cm_base = confusion_matrix(y_true=y_test_states_encoded, y_pred=y_pred_states)
cr_base = classification_report(y_true=y_test_states_encoded, y_pred=y_pred_states)

for report in (cm_base, cr_base):
    print(report)

## 1.b) SHAP

In [None]:
# Inputs for SHAP on the baseline model
# Background set for the explainer: 100 rows sampled from the scaled training data
X_train_summary = shap.sample(X_train_scaled, 100)
# Number of test rows; used as the stop index when sub-sampling rows to explain
end = X_test_scaled.shape[0]
feature_names = X_test.columns
# State names in the order they are declared in label_coding
classes_names_states = [*label_coding]

In [None]:
# SHAP values for the baseline model.
# Only every 100th test row (starting at index 1) is explained.
shap_rows = X_test_scaled[1:end:100, :]
explainer = shap.KernelExplainer(baseline.predict, X_train_summary)
shap_values = explainer.shap_values(shap_rows)
# The explained rows as a labelled table
data_shap_base = pd.DataFrame(shap_rows, columns=feature_names)

In [None]:
# PLOTS
# Rows that the SHAP values were computed for. The same slice must be passed to
# every summary_plot call so the feature matrix lines up with shap_values
# (the bar plot previously used a stride of 10 instead of 100).
explained_rows = X_test_scaled[1:end:100, :]

# Global bar summary over all state classes, for several feature counts
for max_features in [20, 40, 86]:
    shap.summary_plot(shap_values, explained_rows, plot_type="bar",
                      class_names=classes_names_states, feature_names=feature_names,
                      max_display=max_features, show=False)
    figname = graphsStreamPath + model_name + '_global_allclasses_states_' + str(max_features) + '.png'
    plt.savefig(figname, dpi=150, bbox_inches='tight')
    plt.clf()  # start the next figure from a clean canvas

# One summary plot per class (numeric class id), again for several feature counts
for class_id in range(len(shap_values)):
    for max_features in [20, 40, 86]:
        shap.summary_plot(shap_values[class_id], explained_rows, feature_names=feature_names,
                          max_display=max_features, show=False)
        figname = graphsStreamPath + model_name + '_global_class' + str(class_id) + '_features' + str(max_features) + '.png'
        plt.savefig(figname, dpi=150, bbox_inches='tight')
        plt.clf()

# Per-class SHAP values as labelled tables (one column per feature)
shap_values_0_class_base = pd.DataFrame(shap_values[0], columns=feature_names)
shap_values_1_class_base = pd.DataFrame(shap_values[1], columns=feature_names)
shap_values_2_class_base = pd.DataFrame(shap_values[2], columns=feature_names)
shap_values_3_class_base = pd.DataFrame(shap_values[3], columns=feature_names)

# 2. Compositional MLP approach

## 2.a) Neural Network

In [None]:

model_name = 'one_head'

# Compositional network: the 10-unit symptom layer is both a model output and
# the only input of the 4-way softmax state layer.
# NOTE(review): `input` shadows the Python builtin; the name is kept so that any
# other cell referring to it keeps working.
input = tf.keras.layers.Input(shape=(86,), name='input')
hidden = tf.keras.layers.Dense(64, activation='relu', name='dense')(input)
dropout = tf.keras.layers.Dropout(0.2, name='dropout')(hidden)
symptom_output = tf.keras.layers.Dense(10, name='symptom_output')(dropout)
state_output = tf.keras.layers.Dense(4, activation='softmax', name='state_output')(symptom_output)

one_head = tf.keras.Model(inputs=input,
                          outputs=[symptom_output, state_output],
                          name='one-head-model')

one_head.summary()

# Losses are listed in output order: MAE for the symptom output, sparse
# categorical cross-entropy for the state output, equally weighted.
one_head.compile(optimizer='adam',
                 loss=[tf.keras.losses.MeanAbsoluteError(),
                       tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)],
                 loss_weights=[0.5, 0.5],
                 metrics=['mae', 'accuracy'])

# Monitor the *validation* accuracy of the state output, consistent with the
# baseline model (which monitors 'val_accuracy'). Monitoring the training
# metric would ignore the validation data passed to fit().
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_state_output_accuracy', patience=3)

one_head.fit(X_train_scaled, [y_train_symptoms, y_train_states_encoded], epochs=15,
             validation_data=(X_test_scaled, [y_test_symptoms, y_test_states_encoded]),
             callbacks=[early_stopping])


In [None]:
model_name = 'one_head'

# predict() returns one array per model output, in the order
# [symptom_output, state_output]; index 1 holds the state probabilities.
state_probabilities = one_head.predict(X_test_scaled)[1]
y_pred_states = np.argmax(state_probabilities, axis=1)

# State-classification quality of the one-head model on the test split
cm_oh_class = confusion_matrix(y_true=y_test_states_encoded, y_pred=y_pred_states)
cr_oh_class = classification_report(y_true=y_test_states_encoded, y_pred=y_pred_states)

for report in (cm_oh_class, cr_oh_class):
    print(report)

## 2.b) SHAP with states (4 classes)

In [None]:
# Inputs for SHAP on the one-head model (state output)
# Background set for the explainer: 100 rows sampled from the scaled training data
X_train_summary = shap.sample(X_train_scaled, 100)
# Number of test rows; used as the stop index when sub-sampling rows to explain
end = X_test_scaled.shape[0]
feature_names = X_test.columns
# State names in the order they are declared in label_coding
classes_names_states = [*label_coding]

# Every 100th test row (starting at index 1) as a labelled table.
# Export to shapvaluesStreamPath + "/data_shap.csv" is currently disabled.
shap_rows = X_test_scaled[1:end:100, :]
data_shap_df = pd.DataFrame(shap_rows, columns=feature_names)

In [None]:
def f_states(X):
    """Return only the state output (second model output) of `one_head` for X."""
    symptom_and_state = one_head.predict(X)
    return symptom_and_state[1]

# Explain the state output on every 100th test row (starting at index 1)
explainer = shap.KernelExplainer(f_states, X_train_summary)
rows_to_explain = X_test_scaled[1:end:100, :]
shap_values = explainer.shap_values(rows_to_explain)

In [None]:
# Decode every encoded test label back to its state name.
# NOTE(review): the result has one entry per test sample, not one per class.
code_to_state = {code: state for state, code in label_coding.items()}
classes_names_states = [code_to_state[label] for label in y_test_states_encoded]

In [None]:
model_name = 'one_head'

# shap.summary_plot labels model output i with class_names[i], so it needs the
# state names ordered by their integer code - not a per-sample list of labels.
state_class_names = sorted(label_coding, key=label_coding.get)

# Rows that the SHAP values were computed for
explained_rows = X_test_scaled[1:end:100, :]

# Global bar summary over all state classes, for several feature counts
for max_features in [20, 40, 86]:
    shap.summary_plot(shap_values, explained_rows, plot_type="bar",
                      class_names=state_class_names, feature_names=feature_names,
                      max_display=max_features, show=False)
    figname = graphsStreamPath + model_name + '_global_allclasses_states_' + str(max_features) + '.png'
    plt.savefig(figname, dpi=150, bbox_inches='tight')
    plt.clf()  # start the next figure from a clean canvas

# One summary plot per class (numeric class id), again for several feature counts
for class_id in range(len(shap_values)):
    for max_features in [20, 40, 86]:
        shap.summary_plot(shap_values[class_id], explained_rows, feature_names=feature_names,
                          max_display=max_features, show=False)
        figname = graphsStreamPath + model_name + '_global_class' + str(class_id) + '_features' + str(max_features) + '.png'
        plt.savefig(figname, dpi=150, bbox_inches='tight')
        plt.clf()
    


In [None]:

# Per-class SHAP values of the one-head state output as labelled tables
(shap_values_0_class_oh,
 shap_values_1_class_oh,
 shap_values_2_class_oh,
 shap_values_3_class_oh) = (
    pd.DataFrame(shap_values[class_id], columns=feature_names)
    for class_id in range(4)
)

## 2.c) SHAP with symptoms (10 classes)

In [None]:
# Inputs for SHAP on the one-head model (symptom output)
# Background set for the explainer: 100 rows sampled from the scaled training data
X_train_summary = shap.sample(X_train_scaled, 100)
# Number of test rows; used as the stop index when sub-sampling rows to explain
end = len(X_test_scaled)
feature_names = X_test.columns
classes_names_states = list(label_coding.keys())

# Every 100th test row (starting at index 1) as a labelled table
data_shap_oh = pd.DataFrame(X_test_scaled[1:end:100, :], columns=feature_names)

# Make sure the output folder exists, and let os.path.join handle the separator
# instead of gluing "/" onto a path that already ends with one.
os.makedirs(shapvaluesStreamPath, exist_ok=True)
data_shap_oh.to_csv(os.path.join(shapvaluesStreamPath, "data_shap_onehead.csv"), index=False)

In [None]:

def f_symptoms(X):
    """Return only the symptom output (first model output) of `one_head` for X."""
    symptom_and_state = one_head.predict(X)
    return symptom_and_state[0]

# Explain the symptom output on every 100th test row (starting at index 1)
explainer = shap.KernelExplainer(f_symptoms, X_train_summary)
rows_to_explain = X_test_scaled[1:end:100, :]
shap_values = explainer.shap_values(rows_to_explain)
# Symptom names, taken from the columns of the symptom target table
classes_names = y_test_symptoms.columns


In [None]:
max_features = 20  # number of features shown in every plot of this cell

# Rows that the SHAP values were computed for
explained_rows = X_test_scaled[1:end:100, :]

# Global bar summary across all symptom outputs
shap.summary_plot(shap_values, explained_rows, plot_type="bar",
                  class_names=classes_names, max_display=max_features,
                  feature_names=feature_names, show=False)
figname = graphsStreamPath + model_name + '_global_allclasses.png'
plt.savefig(figname, dpi=150, bbox_inches='tight')
plt.clf()  # start the next figure from a clean canvas

# One summary plot per symptom output; the symptom name goes into the file name
for symptom_idx, symptom_name in enumerate(classes_names):
    shap.summary_plot(shap_values[symptom_idx], explained_rows, feature_names=feature_names,
                      max_display=max_features, show=False)
    figname = graphsStreamPath + model_name + '_global_allclasses_' + symptom_name + '.png'
    plt.savefig(figname, dpi=150, bbox_inches='tight')
    plt.clf()

In [None]:
# Per-symptom SHAP values of the one-head symptom output as labelled tables
(shap_values_0_symptom_oh,
 shap_values_1_symptom_oh,
 shap_values_2_symptom_oh,
 shap_values_3_symptom_oh,
 shap_values_4_symptom_oh,
 shap_values_5_symptom_oh,
 shap_values_6_symptom_oh,
 shap_values_7_symptom_oh,
 shap_values_8_symptom_oh,
 shap_values_9_symptom_oh) = (
    pd.DataFrame(shap_values[symptom_id], columns=feature_names)
    for symptom_id in range(10)
)