# Load in data

In [185]:
import pandas as pd
import numpy as np

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer

import matplotlib.pyplot as plt
import seaborn as sns

# sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

from xgboost import XGBClassifier

# tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import L2
from tensorflow.keras.activations import relu, sigmoid
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.wrappers import scikit_learn
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import backend

from scikeras.wrappers import KerasClassifier

pd.set_option('display.max_columns', 100)

In [186]:
# make sure GPU is available
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [187]:
# memory management of GPU
gpu = tf.config.experimental.list_physical_devices('GPU')
if gpu:
    try:
        tf.config.experimental.set_memory_growth(gpu[0], True)
    except RuntimeError as e:
        print(e)

In [189]:
# load in data, 'converters' turns str dtype -> list
df = pd.read_csv('../data/train_cleaned.csv', converters={'cleaned': pd.eval}, index_col='Unnamed: 0')

# join lists as single string
df['words'] = df.cleaned.apply(lambda x: " ".join(x))

df.head(2)

Unnamed: 0,cleaned,class,words
0,"[catheterization, laboratory, event, hospital,...",3,catheterization laboratory event hospital outc...
1,"[renal, abscess, child, three, renal, abscess,...",4,renal abscess child three renal abscess child ...


# Target Variable

In [191]:
# look at class distribution
df['class'].value_counts(normalize=True)

class
4    0.332802
0    0.219075
3    0.211317
2    0.133329
1    0.103477
Name: proportion, dtype: float64

- 0: Neoplasms
- 1: Digestive system diseases
- 2: Nervous system diseases
- 3: Cardiovascular diseases
- 4: General pathological conditions

We chose to drop class 4, as the 'general' category hindered our model's ability to generalize.

In [192]:
# drop class 4
df2 = df.loc[df['class'] < 4]

In [193]:
# shapes of data
df.shape[0], df2.shape[0]

(14438, 9633)

- This reduced sample size from 14.4k to 9.6k
- We feel like this drop was for the best, as it allowed our models to classify documents with much greater accuracy

# Custom functions

### Look at input features

In [19]:
# vectorize
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df2.words)

# shape of input data
X.shape

(9633, 27268)

- Over 27k unique unigrams in our dataset
- This number will change based on params passed into the vectorizer
- We used SVD (Singular Value Decomposition) to reduce the dimensionality of our vectorized data, reducing our feature space to around 100

### models df

In [None]:
# empty df to store NN results and params
keras_df = pd.DataFrame(columns=['train_acc', 'train_loss', 'test_acc', 'test_loss', 'num_layers', 'shape', 'optim', 
                                 'epochs', 'batch_size', 'vec_name', 'vec_feats', 'vec_min', 'vec_max'])
keras_df

### Custom functions

In [None]:
# train NN, return acc, loss, and model parameters
# def fit_eval(node_list, vectorizer, data=df, results_df=keras_df):

#     '''
    
#     Parameters:
#     node_list - a list containing node counts for hidden layers

#     Returns:
#     results_df - df (passed in) with the results appended from the nn, params of NN also included
    
#     '''
    
#     # split train and test data
#     X_train, X_test, y_train, y_test = train_test_split(data.words, data['class'], test_size=0.2, random_state=0)
    
#     # preprocess data
#     X_train = vectorizer.fit_transform(X_train).toarray()
#     X_test = vectorizer.transform(X_test).toarray()
    
#     # further split the training data into training and validation sets
#     X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=0)
    
#     # one-hot encode our labels
#     y_train = to_categorical(y_train)
#     y_val = to_categorical(y_val)
#     y_test = to_categorical(y_test)
    
#     # add early stopping -> if model doesn't decrease val_loss every 5 epochs, exit the fitting process
#     early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1)

#     # create the KerasClassifier, use build_model (defined in cell below) as build function
#     nn = KerasClassifier(model=build_model(X_train.shape[1], node_list), epochs=1000, batch_size=32, optimizer=Adam(), 
#                          validation_split=0.2, verbose=0, loss='categorical_crossentropy', callbacks=[early_stopping])
    
#     # fit nn
#     nn.fit(X_train, y_train, validation_data=(X_val, y_val))
    
#     # accuracy
#     train_acc = nn.score(X_train, y_train)
#     test_acc = nn.score(X_test, y_test)
    
#     # prediction probabilities
#     train_preds = nn.predict_proba(X_train)
#     test_preds = nn.predict_proba(X_test)
    
#     # loss
#     train_loss = categorical_crossentropy(y_train, train_preds)
#     test_loss = categorical_crossentropy(y_test, test_preds)

#     # ========================================= get model/vectorizer params =========================================
    
#     # get number of layers
#     num_layers = len(nn.model.layers)

#     # get shape of nn
#     nn_shape = []
#     for i, layer in enumerate(nn.model.layers):
# #         if i % 2 == 0:
#         nn_shape.append(layer.units)

#     # optimizer, epochs, batch_size
#     optim = str(nn.optimizer).split()[0].split('.')[-1]
#     epochs = nn.current_epoch
#     batch_size = nn.batch_size
    
#     # vectorizer name, some parameters
#     vec_name = str(vec)[:-2]
#     vec_feats = vectorizer.max_features
#     vec_min = vectorizer.min_df
#     vec_max = vectorizer.max_df
    
    
#     # append results to the df
#     results_df.loc[len(results_df.index)] = [train_acc, train_loss, test_acc, test_loss, num_layers, nn_shape, optim, 
#                                              epochs, batch_size, vec_name, vec_feats, vec_min, vec_max]
    
    
    
#     return results_df

In [194]:
# function to build a keras model
def build_model(input_dim, node_list):
    '''
    Build and return a keras NN with a specified node count for each hidden layer.
    
    Parameters:
    node_list - list where nth index value corresponds to nth hidden layer node count
    
    Returns:
    model - a keras NN
    
    '''
    
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    
    # add Dense hidden layers
    for node in node_list:
        model.add(Dense(node, activation=relu))
        model.add(BatchNormalization())
        model.add(Dropout(0.5))
        
    # output layer - 4 possible classes
    model.add(Dense(4, activation='softmax'))

    model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

# Format and split data

In [195]:
# global random state for reproducibility
random_state = 0

In [123]:
# create our vectorizer
vectorizer = TfidfVectorizer()

# split train and test
X_train, X_test, y_train, y_test = train_test_split(df2.words, df2['class'], test_size=0.2, random_state=random_state)

# LSA
svd = TruncatedSVD(n_components=100, random_state=random_state)

# vectorize and SVD transform the input data
X_train = svd.fit_transform(vectorizer.fit_transform(X_train).toarray())
X_test = svd.transform(vectorizer.transform(X_test).toarray())

# further split the training df2 into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=random_state)

# NN

In [196]:
# add early stopping -> if model doesn't decrease val_loss every 5 epochs, exit the fitting process
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1)

# create the KerasClassifier, use build_model to create our model
nn = KerasClassifier(model=build_model(X_train.shape[1], [100, 100]), epochs=100, batch_size=16, optimizer=Adam(),
                     callbacks=[early_stopping], validation_split=0.2, verbose=1, loss='sparse_categorical_crossentropy')

In [198]:
# fit nn
nn.fit(X_train, y_train, validation_data=(X_val, y_val))

# free up GPU memory
backend.clear_session()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 26: early stopping


- our best NN achieved a val_loss of about 0.45 with a val_acc of 83.2%

In [199]:
y_preds = nn.predict(X_test)
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84       638
           1       0.82      0.78      0.80       304
           2       0.72      0.75      0.73       382
           3       0.89      0.85      0.87       603

    accuracy                           0.82      1927
   macro avg       0.81      0.81      0.81      1927
weighted avg       0.82      0.82      0.82      1927



- noteicably worse recall on classes 1 and 2
- will use class_weight to attempt to improve

### Add class weights

In [127]:
# what 'balanced' class weights would look like
y_train.shape[0] / (4 * np.bincount(y_train))

array([0.76704828, 1.59358842, 1.24074074, 0.79188078])

In [136]:
# less-intense class weights than the 'balanced' approach
class_weight={0: 0.8, 1: 1.5, 2: 1.2, 3: 0.8}

# create the KerasClassifier
nn = KerasClassifier(model=build_model(X_train.shape[1], [100, 100]), epochs=100, batch_size=16, optimizer=Adam(), 
                     callbacks=[early_stopping], class_weight=class_weight, validation_split=0.2, verbose=1, 
                     loss='sparse_categorical_crossentropy')

In [200]:
# fit nn
nn.fit(X_train, y_train, validation_data=(X_val, y_val))

# free up GPU memory
backend.clear_session()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 12: early stopping


In [138]:
y_preds = nn.predict(X_test)
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.86      0.78      0.82       638
           1       0.75      0.83      0.78       304
           2       0.70      0.78      0.74       382
           3       0.88      0.86      0.87       603

    accuracy                           0.81      1927
   macro avg       0.80      0.81      0.80      1927
weighted avg       0.82      0.81      0.81      1927



- class weight did up recall for classes 1 and 2, and kept overall acc very similar

# Stacking classifier

In [201]:
# build stacking classifier
stack = StackingClassifier([
    ('logreg', KNeighborsClassifier(n_neighbors=20)),
    ('lr', LogisticRegression(max_iter=1000, penalty=None))
])

In [202]:
# train
stack.fit(X_train, y_train)

In [203]:
# get scores
stack.score(X_train, y_train), stack.score(X_test, y_test)

(0.8348475016223231, 0.8111053450960042)

- our best stacker achieved 81.1% testing accuracy

# Manual stacking
- we are using the prediction probs from our best stacking classifier as input to a NN

### putting ONLY preds from stack into NN

In [204]:
# get pred probs from the best StackingClassifier
train_preds = stack.predict_proba(X_train)
val_preds = stack.predict_proba(X_val)
test_preds = stack.predict_proba(X_test)

# create the KerasClassifier, input is stacker's preds
nn = KerasClassifier(model=build_model(train_preds.shape[1], [100, 100]), epochs=100, batch_size=16,
                     optimizer=Adam(), callbacks=[early_stopping],
                     validation_split=0.2, verbose=1, loss='sparse_categorical_crossentropy')

In [205]:
# fit nn
nn.fit(train_preds, y_train, validation_data=(val_preds, y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 21: early stopping


- still at an 83% val acc

### putting preds from stack + input data into NN

In [207]:
# combine pred probs with input data
train_full = np.concatenate((X_train, train_preds), axis=1)
val_full = np.concatenate((X_val, val_preds), axis=1)
test_full = np.concatenate((X_test, test_preds), axis=1)

# create the KerasClassifier, pred probs + input data
nn = KerasClassifier(model=build_model(train_full.shape[1], [104, 104]), epochs=100, batch_size=64, optimizer=Adam(), 
                     callbacks=[early_stopping], validation_split=0.2, verbose=1, loss='sparse_categorical_crossentropy')

In [208]:
# fit nn
nn.fit(train_full, y_train, validation_data=(val_full, y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 37: early stopping


- val acc still at 83%

### putting residuals from stack into NN

In [209]:
# Calculate the residuals
# convert target vars to one-hot-encoded rows using to_categorical
train_residuals = to_categorical(y_train) - train_preds
val_residuals = to_categorical(y_val) - val_preds
test_residuals = to_categorical(y_test) - test_preds

# create the KerasClassifier, use residuals as input data
nn = KerasClassifier(model=build_model(train_residuals.shape[1], [64, 64, 32, 32]), epochs=100, batch_size=64, optimizer=Adam(), 
                     callbacks=[early_stopping], validation_split=0.2, verbose=1, loss='sparse_categorical_crossentropy')

In [210]:
# fit nn
nn.fit(train_residuals, y_train, validation_data=(val_residuals, y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 49: early stopping


- ...
- This model achieved 100% accuracy on both training and validation sets!

In [211]:
# train acc, test acc
nn.score(train_residuals, y_train), nn.score(test_residuals, y_test)



(1.0, 1.0)