# Load in data

In [1]:
import pandas as pd
import numpy as np

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer

import matplotlib.pyplot as plt
import seaborn as sns

# sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

from xgboost import XGBClassifier

# tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import L2
from tensorflow.keras.activations import relu, sigmoid
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.wrappers import scikit_learn
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import backend

from scikeras.wrappers import KerasClassifier

pd.set_option('display.max_columns', 100)

In [2]:
# make sure GPU is available
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [3]:
# memory management of GPU
gpu = tf.config.experimental.list_physical_devices('GPU')
if gpu:
    try:
        tf.config.experimental.set_memory_growth(gpu[0], True)
    except RuntimeError as e:
        print(e)

In [4]:
# load in data, 'converters' turns str dtype -> list
df = pd.read_csv('../data/train_cleaned.csv', converters={'cleaned': pd.eval}, index_col='Unnamed: 0')

# join lists as single string
df['words'] = df.cleaned.apply(lambda x: " ".join(x))

df.head(2)

Unnamed: 0,cleaned,class,words
0,"[catheterization, laboratory, event, hospital,...",3,catheterization laboratory event hospital outc...
1,"[renal, abscess, child, three, renal, abscess,...",4,renal abscess child three renal abscess child ...


In [5]:
# drop class 4
df2 = df.loc[df['class'] < 4]

# NN architecture

In [7]:
# function to build a keras model
def build_model(input_dim, node_list):
    '''
    Build and return a keras NN with a specified node count for each hidden layer.
    
    Parameters:
    node_list - list where nth index value corresponds to nth hidden layer node count
    
    Returns:
    model - a keras NN
    
    '''
    
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    
    # add Dense hidden layers
    for node in node_list:
        # dense layer
        model.add(Dense(node, activation=relu))
        
        # standardize layer's weights
        model.add(BatchNormalization())
        
        # 50% dropout
        model.add(Dropout(0.5))
        
    # output layer - 4 possible classes
    model.add(Dense(4, activation='softmax'))

    model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

# Split data

In [8]:
# global random state for reproducibility
random_state = 0

In [9]:
# split train and test 80/20
X_train, X_test, y_train, y_test = train_test_split(df2.words, df2['class'], test_size=0.2, random_state=random_state)

# Pipeline

In [10]:
# build stacking classifier
stack = StackingClassifier([
    ('logreg', KNeighborsClassifier(n_neighbors=20)),
    ('lr', LogisticRegression(max_iter=1000, penalty=None))
])

In [12]:
pipe = Pipeline([
    ('vec', TfidfVectorizer()),
    ('svd', TruncatedSVD(n_components=100, random_state=random_state)),
    ('stack', stack)
])

In [13]:
# train
pipe.fit(X_train, y_train)

In [14]:
# get scores
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.8348040487931482, 0.8142189932537623)

- our best stacker achieved 81.4% testing accuracy

# Disease deducer

In [17]:
# get pipeline prediction probs
train_preds = pipe.predict_proba(X_train)
test_preds = pipe.predict_proba(X_test)

# Calculate the residuals
train_residuals = to_categorical(y_train) - train_preds
test_residuals = to_categorical(y_test) - test_preds

In [196]:
# add early stopping -> if model doesn't decrease val_loss every 5 epochs, exit the fitting process
# early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1)

In [18]:
# create the KerasClassifier, use residuals as input data
nn = KerasClassifier(model=build_model(train_residuals.shape[1], [64, 64, 32, 32]), epochs=20, batch_size=64, optimizer=Adam(),
                     validation_split=0.2, verbose=1, loss='sparse_categorical_crossentropy')

In [19]:
# fit nn
nn.fit(train_residuals, y_train)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


- ...
- This model achieved 100% accuracy on both training and validation sets!

In [211]:
# train acc, test acc
nn.score(train_residuals, y_train), nn.score(test_residuals, y_test)



(1.0, 1.0)