## Multi-Class Prediction of Cirrhosis Outcomes
@misc{playground-series-s3e26,
    author = {Walter Reade, Ashley Chow},
    title = {Multi-Class Prediction of Cirrhosis Outcomes},
    publisher = {Kaggle},
    year = {2023},
    url = {https://kaggle.com/competitions/playground-series-s3e26}
}

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)


In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s3e26/train.csv',
        '/kaggle/input/playground-series-s3e26/test.csv',
    )
)

In [None]:
# info() prints its report directly and returns None, so call it on its own line;
# describe() is left as the last expression so the notebook renders it as a table
# instead of the plain-text repr of a (DataFrame, None) tuple.
train_df.info()
train_df.describe()

In [None]:
# Dataset dimensions as (rows, columns). Printed explicitly: only the LAST
# expression of a cell is displayed automatically, so a bare tuple here is lost.
print(train_df.shape, test_df.shape)


def _columns_with_missing(df):
    """Return the per-column count of nulls, restricted to columns that have
    at least one null, sorted from most to fewest."""
    null_counts = df.isnull().sum()
    return null_counts[null_counts > 0].sort_values(ascending=False)


missing_vals_train = _columns_with_missing(train_df)
missing_vals_test = _columns_with_missing(test_df)

# number of columns containing missing values in train / test
# (0, 0) means neither frame has any missing values
len(missing_vals_train), len(missing_vals_test)

<quote>
    According to the dataset description at [https://www.kaggle.com/datasets/joebeachcapital/cirrhosis-patient-survival-prediction](https://www.kaggle.com/datasets/joebeachcapital/cirrhosis-patient-survival-prediction),
    'Stage' is a categorical feature, so it is converted to a categorical dtype below.
</quote>

In [None]:
# Treat 'Stage' as a categorical feature in both the train and the test frame.
for frame in (train_df, test_df):
    frame['Stage'] = frame['Stage'].astype('category')

In [None]:
# Feature names grouped by kind; used for one-hot encoding and for scaling.
_categorical_dtypes = ['object', 'category']
category_col_list = train_df.select_dtypes(include=_categorical_dtypes).columns.tolist()
numeric_col_list = test_df.select_dtypes(exclude=_categorical_dtypes).columns.tolist()

# drop 'Status' from category_col_list
category_col_list.remove('Status')

# drop 'id' from numeric_col_list
numeric_col_list.remove('id')
print(category_col_list, numeric_col_list)

In [None]:
# shuffle the dataset and then remove 'id' for further processing
# seeded random generator, reused for the shuffle here and for the later split
rast = np.random.RandomState(42)
from sklearn.model_selection import train_test_split

# since the training dataset is less than 8k rows
# we shuffle it and later split it into training and validation sets.
# The permutation is drawn from the seeded generator `rast`; calling
# np.random.permutation would use NumPy's global, unseeded state and give a
# different row order on every kernel restart.
shuffeT = rast.permutation(train_df.shape[0])
train_shuffle = train_df.iloc[shuffeT, :]

# drop ID column from training and test sets
train_shuffle = train_shuffle.drop(['id'], axis=1)

# keep the test ids aside; they are needed again for the submission file
test_id = test_df['id']
test_X = test_df.drop(['id'], axis=1)
test_X.shape, train_shuffle.shape

In [None]:
# Standardise the numeric features using the mean / std of the training set.
# The first 18 columns of train_shuffle are taken as the features; .copy() gives
# an independent frame so the assignment below does not write into a slice of
# train_shuffle (which triggers pandas' SettingWithCopyWarning).
train_X = train_shuffle.iloc[:, :18].copy()
mean = train_X[numeric_col_list].mean(axis=0)
std = train_X[numeric_col_list].std(axis=0)

train_X[numeric_col_list] = (train_X[numeric_col_list] - mean) / std
print(train_X[numeric_col_list].head(5))

# Scale the test features with the *training* statistics. The raw values are read
# from test_df (never modified), so re-running this cell cannot scale test_X twice.
test_X[numeric_col_list] = (test_df[numeric_col_list] - mean) / std
print(test_X[numeric_col_list].head(5))

In [None]:
# Strip every '-' character from the 'Drug' values of the train and test sets.
import re


def _drop_hyphens(value):
    """Return `value` with all '-' characters removed."""
    return re.sub("-", "", value)


train_X['Drug'] = train_X['Drug'].map(_drop_hyphens)
test_X['Drug'] = test_X['Drug'].map(_drop_hyphens)

In [None]:
# Separate the target from the shuffled training frame.
# train_X is deliberately NOT re-sliced from train_shuffle here: it already holds
# the feature columns with the scaling and 'Drug' clean-up applied in the cells
# above. Re-slicing would throw that preprocessing away, so the model would be
# trained on raw features while test_X stays standardised.
train_y = train_shuffle.iloc[:, 18:]

# features and target must still describe the same rows, in the same order
assert train_X.index.equals(train_y.index), "train_X and train_y rows are out of sync"

In [None]:
# One-hot encode the target with pandas get_dummies (integer 0/1 columns).
train_y = pd.get_dummies(data=train_y, dtype='int')

In [None]:
# One-hot encode the categorical feature columns of the train and test frames
# with identical settings (integer 0/1 indicator columns).
train_X_1, test_X_1 = (
    pd.get_dummies(frame, columns=category_col_list, dtype=int)
    for frame in (train_X, test_X)
)

In [None]:
# Quick look at the encoded frames: their shapes, then rows 0-4 / columns 10-14 of each.
print(train_X_1.shape, test_X_1.shape)
for encoded_frame in (train_X_1, test_X_1):
    print(encoded_frame.iloc[:5, 10:15])

In [None]:
# hold out 30% of the encoded training data as a validation set
X_train, X_val, y_train, y_val = train_test_split(
    train_X_1,
    train_y,
    test_size=0.3,
    random_state=rast,
)

In [None]:
# Shapes of the four pieces produced by the train/validation split.
print(*(part.shape for part in (X_train, X_val, y_train, y_val)))

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from keras.layers import Dense, Dropout, Input, BatchNormalization
from keras.models import Sequential, Model
from keras.optimizers import SGD, Adam, RMSprop
from keras.callbacks import EarlyStopping, BackupAndRestore, Callback

import keras.backend as K

In [None]:
# Checkpoint callback that keeps only the model with the best validation accuracy.
# The path names a single file ending in `.keras`: a bare directory-style name
# such as 'checkpoint_dir' is rejected by recent Keras versions for full-model
# checkpoints.
checkpoint_filepath = 'best_model.keras'
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,   # file the best model is written to
    monitor='val_accuracy',         # metric that decides what "best" means
    mode='max',                     # higher accuracy is better
    save_best_only=True)            # overwrite only when the metric improves

In [None]:
# callback to stop early
# Monitor the *validation* loss: the training loss keeps falling while the model
# overfits, so stopping (and restoring "best" weights) on it would select the
# most overfitted epoch rather than the one that generalises best.
callback = EarlyStopping(monitor='val_loss',
                        min_delta=0.02,             # smaller changes do not count as an improvement
                        patience=3,                 # epochs without improvement before stopping
                        mode='auto',
                        verbose=1,
                        restore_best_weights=True,  # roll back to the best monitored epoch
                        )

In [None]:
class callback_interrupts(Callback):
    """Logging callback that also caps the number of training epochs.

    After `max_epochs` epochs have completed, training is stopped by setting
    `self.model.stop_training = True`. Unlike raising an exception from inside
    the callback, this lets `model.fit` return normally, so the History object
    is still assigned and the model remains usable afterwards.

    Parameters
    ----------
    max_epochs : int, default 30
        Number of completed epochs after which training is stopped.
    """

    def __init__(self, max_epochs=30):
        super().__init__()
        self.max_epochs = max_epochs

    @staticmethod
    def _log_keys(logs):
        """Return the keys of `logs` as a list, treating None as an empty dict."""
        return list((logs or {}).keys())

    def on_epoch_end(self, epoch, logs=None):
        print("End of epoch #:{} log keys: {}".format(epoch, self._log_keys(logs)))
        # `epoch` is a zero-based index, so epoch + 1 epochs have completed here
        if epoch + 1 >= self.max_epochs:
            print("Reached epoch #:{} - running too long, stopping training".format(epoch + 1))
            self.model.stop_training = True

    def on_predict_begin(self, logs=None):
        print("Starting prediction:- log keys: {}".format(self._log_keys(logs)))

    def on_predict_end(self, logs=None):
        print("End prediction:- log keys: {}".format(self._log_keys(logs)))

In [None]:
# create TensorFlow Keras Functional-API model
def create_model(inpt, print_sum=False, n_classes=3):
    """Build and compile a fully connected multi-class classifier.

    Parameters
    ----------
    inpt : object with a 2-D ``.shape``
        Feature matrix; only ``inpt.shape[1]`` (the number of features) is used
        to size the input layer.
    print_sum : bool, default False
        When True, print the Keras model summary before returning.
    n_classes : int, default 3
        Number of output classes, i.e. units in the final layer.

    Returns
    -------
    keras Model
        Model compiled with categorical cross-entropy loss, the Adam optimizer
        and the accuracy metric.
    """
    keras.backend.clear_session()    # reset Keras' global state before building a fresh model

    inp = Input(shape=(inpt.shape[1],))
    x = Dense(512, activation='relu')(inp)
    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = Dense(1024, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = Dense(512, activation='relu')(x)
    x = Dense(512, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.4)(x)
    x = Dense(64, activation='relu')(x)

    # softmax (not sigmoid): the classes are mutually exclusive and the loss is
    # categorical cross-entropy, so the outputs must form one probability
    # distribution that sums to 1 across the classes.
    output = Dense(n_classes, activation='softmax')(x)
    model = Model(inp, output)

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    if print_sum:
        model.summary()
    return model

In [None]:
# Build the network (print_sum=True prints its summary) and train it, using the
# early-stopping callback and the custom callback_interrupts callback to limit run time.
x_model = create_model(X_train, print_sum=True)
training_callbacks = [callback, callback_interrupts()]
history = x_model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=200,
    validation_data=(X_val, y_val),
    callbacks=training_callbacks,
)

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
for curve in ('loss', 'val_loss'):
    ax.plot(history.history[curve])
ax.set_title("Model Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend(['train', 'val'], loc='upper right')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
for curve in ('accuracy', 'val_accuracy'):
    ax.plot(history.history[curve])
ax.set_title("Model Accuracy")
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.legend(['train', 'val'], loc='upper right')
plt.show()

In [None]:
# Model outputs for the held-out validation features.
val_pred_model = x_model.predict(x=X_val)

# Model outputs for the encoded test features; these are written to CSV later.
test_model_predictions = x_model.predict(x=test_X_1)


In [None]:
# create DataFrame to write CSV file
# NOTE(review): assumes the model's three outputs follow the column order of the
# one-hot encoded target (Status_C, Status_CL, Status_D) - confirm against train_y.columns
test_model_data = pd.DataFrame(test_model_predictions, columns=['Status_C','Status_CL','Status_D'])
# DataFrame.round returns a NEW frame; without the assignment the rounding is lost
test_model_data = test_model_data.round(5)
# put the test ids in front of the prediction columns
test_model_data.insert(0, 'id', test_id)
test_model_data.to_csv('NeuralNetwork_Multi-Class_Prediction_JY.csv', index = False)

In [None]:
# Show the first 20 rows of the submission frame as a sanity check.
test_model_data.iloc[:20]

In [None]:
# Write the submission file and show a download link for it as the cell output.
from IPython.display import FileLink

test_model_data.to_csv('submission.csv', index=False)
FileLink('submission.csv')