In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px
%matplotlib inline
import io

In [2]:
#hide_output
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import shap

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization

In [3]:
# Load the cleaned source table, the saved principal-component frames, and the
# feature/label tables used by the models in this notebook.
df = pd.read_csv('data/cleaned_ca.csv')
# index_col=0 makes the first CSV column the row index. Reading the file directly
# avoids parsing each file twice (read -> to_csv -> StringIO -> read) for the same frame.
# NOTE(review): with a blank header on that first column the index name is now None
# rather than "Unnamed: 0"; nothing visible in this notebook reads the index name.
p_df_75 = pd.read_csv('data/principal_component_df.csv', index_col=0)
p_df_100 = pd.read_csv('data/principal_component_df100.csv', index_col=0)
X = pd.read_csv('data/X.csv')
y = pd.read_csv('data/y.csv')

## Random Forest 2

In [4]:
#hide_output
# Shuffle and hold out 20% of the rows of the principal-component features
# (and their labels) for testing; the fixed seed keeps the partition repeatable.
Xp75_train, Xp75_test, yp75_train, yp75_test = train_test_split(
    p_df_75,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
# Sanity check: each partition has as many label rows as feature rows.
assert Xp75_train.shape[0] == yp75_train.shape[0]
assert Xp75_test.shape[0] == yp75_test.shape[0]

In [5]:
# Show (rows, columns) of the training feature matrix as the cell output.
Xp75_train.shape

(8005, 75)

In [None]:
#hide_input
# Random forest trained on the principal-component training split.
# n_jobs=-1 uses every available core; random_state pins the forest's randomness.
rnd_clf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=27,
)
rnd_clf.fit(X=Xp75_train, y=yp75_train)

In [None]:
# Hard class predictions for the held-out rows. The name `y_test_pred` is kept
# because the confusion-matrix cell reads it.
y_test_pred = rnd_clf.predict(Xp75_test)

# Fraction of correct predictions per label column. The original referenced an
# undefined name (`yp75_test_pred`); reducing with an explicit axis=0 also keeps the
# result a per-column Series regardless of how np.sum treats DataFrames in the
# installed pandas version.
# NOTE(review): assumes `y` has one column per label so predictions are 2-D - confirm.
test_acc = (yp75_test == y_test_pred).mean(axis=0)

# NOTE(review): this AUC is computed from hard class predictions, not from
# predicted probabilities - confirm that is the intended metric.
print ("\nAUC - ROC : ", roc_auc_score(yp75_test, y_test_pred))
print("test accuracy: "+str(test_acc))

In [None]:
# Explain the fitted random forest with SHAP's tree explainer, evaluated on the
# held-out feature rows.
explainer = shap.TreeExplainer(rnd_clf)
shap_values = explainer.shap_values(Xp75_test)
# NOTE(review): `[1]` presumably picks one class/output out of a list of SHAP
# arrays; the exact layout depends on the shap version and on how the forest was
# fitted - confirm this selects the intended slice.
shap.summary_plot(shap_values[1], Xp75_test)

In [None]:
#hide_input
# Per-response confusion matrices for the random forest, their element-wise sum
# as a heatmap, and a text dump of each individual matrix.

# `yp75_test` comes from splitting a DataFrame, and `DataFrame[:, k]` is not valid
# pandas indexing, so take positional columns from NumPy arrays instead.
y_true_arr = yp75_test.to_numpy()
y_pred_arr = np.asarray(y_test_pred)

# Response name -> column position, using the same positions the cell hard-coded
# before (resp=3, resp_1=0, resp_2=1, resp_3=2, resp_4=4).
# NOTE(review): presumes the label columns are ordered resp_1, resp_2, resp_3,
# resp, resp_4 - confirm against data/y.csv.
resp_columns = {"resp": 3, "resp_1": 0, "resp_2": 1, "resp_3": 2, "resp_4": 4}
resp_conf_mats = {
    label: confusion_matrix(y_true_arr[:, col], y_pred_arr[:, col])
    for label, col in resp_columns.items()
}

# Keep the individual names bound for any other cell that reads them.
conf_mat = resp_conf_mats["resp"]
resp_1 = resp_conf_mats["resp_1"]
resp_2 = resp_conf_mats["resp_2"]
resp_3 = resp_conf_mats["resp_3"]
resp_4 = resp_conf_mats["resp_4"]
all_resp = np.sum(list(resp_conf_mats.values()), axis=0)

fig, ax = plt.subplots()
# fmt="d" prints the integer counts in full instead of the default 2-significant-digit format.
sns.heatmap(all_resp, cmap="RdYlGn", annot=True, fmt="d", ax=ax)
ax.set_title("All resp summed confusion matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

for resp_name, resp_matrix in resp_conf_mats.items():
    print(resp_name)
    print(resp_matrix)

## Neural Network

In [None]:
def create_mlp(
    num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate
):
    """Build and compile a fully connected sigmoid-output classifier.

    Architecture: BatchNormalization and Dropout on the raw inputs, then for every
    entry of ``hidden_units`` a Dense -> BatchNormalization -> swish -> Dropout
    stack, and finally a Dense layer with ``num_labels`` units followed by a
    sigmoid activation.

    Args:
        num_columns: Number of input features (width of the input layer).
        num_labels: Number of output units.
        hidden_units: Sequence with the width of each hidden Dense layer.
        dropout_rates: Sequence of dropout rates; element 0 is applied to the
            inputs and element ``i + 1`` after hidden layer ``i``, so it needs at
            least ``len(hidden_units) + 1`` entries. Extra entries are ignored.
        label_smoothing: Passed to ``BinaryCrossentropy(label_smoothing=...)``.
        learning_rate: Passed to ``Adam(learning_rate=...)``.

    Returns:
        The compiled ``tf.keras`` model, tracking an AUC metric named ``"AUC"``.

    Raises:
        ValueError: If ``dropout_rates`` has fewer than ``len(hidden_units) + 1``
            entries.
    """
    # Fail up front with a clear message instead of an IndexError halfway
    # through building the graph.
    required_rates = len(hidden_units) + 1
    if len(dropout_rates) < required_rates:
        raise ValueError(
            "dropout_rates needs at least %d entries (one for the input plus one "
            "per hidden layer), got %d" % (required_rates, len(dropout_rates))
        )

    inp = tf.keras.layers.Input(shape=(num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(dropout_rates[0])(x)
    # dropout_rates[0] was consumed by the input dropout; pair the rest with the layers.
    for units, rate in zip(hidden_units, dropout_rates[1:]):
        x = tf.keras.layers.Dense(units)(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
        x = tf.keras.layers.Dropout(rate)(x)
    x = tf.keras.layers.Dense(num_labels)(x)
    out = tf.keras.layers.Activation("sigmoid")(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        # `metrics` is documented as a list; pass one explicitly rather than a bare metric.
        metrics=[tf.keras.metrics.AUC(name="AUC")],
    )
    return model

In [None]:
#hide_output
# Hyper-parameters for the MLP trained on the principal-component training split.
batch_size = 124
hidden_units = [150, 150, 150]
# One rate for the input dropout plus one per hidden layer.
dropout_rates = [0.20, 0.20, 0.20, 0.20]
label_smoothing = 1e-2
learning_rate = 3e-3

# Size the output layer from the label frame instead of hard-coding 5, so the
# model always matches the targets it is fitted on.
clf = create_mlp(
    num_columns=Xp75_train.shape[1],
    num_labels=yp75_train.shape[1],
    hidden_units=hidden_units,
    dropout_rates=dropout_rates,
    label_smoothing=label_smoothing,
    learning_rate=learning_rate,
)

clf.fit(Xp75_train, yp75_train, epochs=10, batch_size=batch_size)

# No earlier cell creates `models`, so `models.append(clf)` raised NameError on a
# fresh kernel. Start the collection here; this also keeps the cell idempotent
# when re-run.
models = [clf]