In [7]:
import pandas as pd
import numpy as np
import re
import sys
import tqdm
sys.path.append('../src')
import matplotlib.pyplot as plt
import shap
import os
import json
from pathlib import Path
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

from skopt import BayesSearchCV
from skopt.space import Real, Integer

import xgboost as xgb

from tensorflow import keras

from sklearn.metrics import confusion_matrix,accuracy_score, ConfusionMatrixDisplay, roc_auc_score, roc_curve, classification_report, RocCurveDisplay
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.dummy import DummyClassifier

from skmultilearn.model_selection import IterativeStratification 

from gutatlas.features import clean_feature_names, dead_features




In [8]:
training_set = pd.read_parquet('../data/processed/gi_multilabel_training.parquet')
training_set = training_set.rename({col:clean_feature_names(col) for col in training_set.columns}, axis = 1)

X_train,X_test,y_train,y_test = train_test_split(training_set.iloc[:,13:],training_set.iloc[:,1:13], random_state=42)

# true multilabel

## random forest

In [9]:
X = training_set.iloc[:,13:]
y = training_set.iloc[:,1:13]

#### Why is this classifier so good? I do not trust it....

In [13]:
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    n_jobs=-1
    )
cv = IterativeStratification(6)
macro_scores = []
micro_scores = []
for train,val in cv.split(X,y):
    rf.fit(X.iloc[train],y.iloc[train])
    y_probs = rf.predict_proba(X.iloc[val])
    y_pred = np.column_stack([feature[:,1] for feature in y_probs])
    macro_roc_auc = roc_auc_score(y.iloc[val],y_pred, average='macro')
    macro_scores.append(macro_roc_auc)
    micro_roc_auc = roc_auc_score(y.iloc[val],y_pred, average='micro')
    micro_scores.append(micro_roc_auc)

print('average macro:',np.mean(macro_scores),'+/-',np.std(macro_scores))
print('average micro:',np.mean(micro_scores),'+/-',np.std(micro_scores))



average macro: 0.9595096382552494 +/- 0.005006209960008563
average micro: 0.9788719983684414 +/- 0.001680745421649347


## dummy classifier

In [12]:
dummy = DummyClassifier(strategy="most_frequent", random_state=42)
dummy.fit(X_train, y_train)

y_proba = dummy.predict_proba(X_test)

y_pred = np.column_stack([feature[:,1] for feature in y_proba])
dummy_roc_auc = roc_auc_score(y_test,y_pred, average='micro')
dummy_roc_auc

0.8642678260731079

## neural net

In [None]:
model = keras.Sequential([
    keras.layers.Input((X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32,activation='relu'),
    keras.layers.Dense(16,activation='relu'),
    keras.layers.Dense(12,activation='sigmoid'),
])
model.compile(loss = 'binary_crossentropy',optimizer=keras.optimizers.Adam(learning_rate=0.001))


history = model.fit(X_train,y_train,epochs = 100, batch_size=1028,validation_split=0.1, verbose = 1)



Epoch 1/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step - loss: 0.6911 - val_loss: 0.6844
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.6831 - val_loss: 0.6779
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.6770 - val_loss: 0.6721
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.6713 - val_loss: 0.6657
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.6647 - val_loss: 0.6582
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.6570 - val_loss: 0.6493
Epoch 7/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.6481 - val_loss: 0.6392
Epoch 8/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.6379 - val_loss: 0.6274
Epoch 9/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [38]:
preds = model.predict(X_test)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [39]:
nn_roc_auc_macro = roc_auc_score(y_test,preds, average='macro')
nn_roc_auc_micro = roc_auc_score(y_test,preds, average='micro')

print('macro:',nn_roc_auc_macro)
print('micro:',nn_roc_auc_micro)

macro: 0.8241276410518933
micro: 0.9682014159021123
