In [1]:
### Import dependancies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
os.listdir()

['.DS_Store',
 'structured_data_classifier',
 'Categorical Summary.docx',
 'Project Draft 1.ipynb',
 'Data Sets',
 'healthcareClanedTrain3.csv',
 'healthcareClanedTrain2.csv',
 'Modeling .ipynb',
 'Decision and Naives - healthcareClanedTrain3.ipynb',
 '.ipynb_checkpoints',
 'MIS 776 Group Project.pdf',
 '11.16.2020 Dannica Update.ipynb',
 'healthcareClanedTrain.csv',
 'Ryan Script.ipynb']

In [3]:
#### Load the dataset 

health = pd.read_csv('healthcareClanedTrain3.csv')
health.head()

Unnamed: 0,hosp_code,hosp_type,city_code,hosp_region,rooms_available,department,ward_type,ward_code,bed_grade,patient_id,admission_type,severity,num_visitors,age,deposit,stay
0,8,0,3,0,3,0,0,0,2,31397,0,0,2,5,4911,0
1,2,0,5,0,2,0,1,0,2,31397,1,0,2,5,5954,4
2,10,1,1,1,2,1,1,1,2,31397,1,0,2,5,4745,3
3,26,2,2,2,2,0,0,2,2,31397,1,0,2,5,7272,4
4,26,2,2,2,2,0,1,2,2,31397,1,0,2,5,5558,4


In [4]:
health.columns

Index(['hosp_code', 'hosp_type', 'city_code', 'hosp_region', 'rooms_available',
       'department', 'ward_type', 'ward_code', 'bed_grade', 'patient_id',
       'admission_type', 'severity', 'num_visitors', 'age', 'deposit', 'stay'],
      dtype='object')

In [5]:
## Define X and y variables 
X = health.drop('stay', axis=1)
y = health.stay

In [6]:
X.head()

Unnamed: 0,hosp_code,hosp_type,city_code,hosp_region,rooms_available,department,ward_type,ward_code,bed_grade,patient_id,admission_type,severity,num_visitors,age,deposit
0,8,0,3,0,3,0,0,0,2,31397,0,0,2,5,4911
1,2,0,5,0,2,0,1,0,2,31397,1,0,2,5,5954
2,10,1,1,1,2,1,1,1,2,31397,1,0,2,5,4745
3,26,2,2,2,2,0,0,2,2,31397,1,0,2,5,7272
4,26,2,2,2,2,0,1,2,2,31397,1,0,2,5,5558


In [None]:
### Split the data

##from sklearn.model_selection import train_test_split

##X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1, random_state=1)

In [None]:
### Reshape the target data to include the levels 
from keras.utils import to_categorical

y = to_categorical(y)

print('y shape: ', y.shape)

### Train a neural network with the entire dataset 
   **without standardizing any variables**

In [None]:
### Creating Neural Network 
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization

In [None]:
# Define input shape
input_shape = X.shape[1]

# Instantiate the Sequential model 
nn1 = Sequential()

# Add Layers 
nn1.add(Dense(32, activation='relu',input_shape=(input_shape,)))
nn1.add(Dense(16, activation='relu'))
nn1.add(BatchNormalization())
nn1.add(Dense(8, activation='relu'))

# Add output layer 
nn1.add(Dense(11, activation='relu'))

In [None]:
### Compile the model 
nn1.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

In [None]:
## Check model summary
nn1.summary()

In [None]:
## Fit the model 
batch_size = 25
n_epochs = 15

hist = nn1.fit(X, y, epochs=n_epochs, batch_size=batch_size, validation_split=0.1)

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(np.log(hist.history['loss']), label='Training Loss')
ax.plot(np.log(hist.history['val_loss']), label='Validation Loss')
ax.set_title("log(Loss) vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();

- Both Training Loss and Validation Loss seem to decline at every epoch. 
- Training loss is a bit lower than validation loss (not unusual). 


In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(np.log(hist.history['accuracy']), label='Training Accuracy')
ax.plot(np.log(hist.history['val_accuracy']), label='Validation Accuracy')
ax.set_title("Accuracy vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();

***The model is not performing well. Both accuracy and vaidation accuracy are on 30%. The model needs to be improved a lot.***

### Train a new model with standardized numeric data

In [None]:
X1 = X.copy()
y1 = health.stay

In [None]:
### Standardize the numerical data 
from sklearn.preprocessing import StandardScaler

numeraical_data = ['rooms_available','num_visitors','deposit', 'patient_id']

scaler = StandardScaler()
X1.loc[:,numeraical_data] = scaler.fit_transform(X1[numeraical_data])

In [None]:
### Reshape the target data to include the levels 
from keras.utils import to_categorical

y1 = to_categorical(y1)

print('y1 shape: ', y1.shape)

In [None]:
X1.head()

In [None]:
## Create a new neural network 
# Define input shape
input_shape = X1.shape[1]

# Instantiate the Sequential model 
nn2 = Sequential()

# Add Layers 
nn2.add(Dense(128, activation='relu',input_shape=(input_shape,)))
nn2.add(BatchNormalization())
nn2.add(Dense(64, activation='relu'))
nn2.add(Dense(32, activation='relu'))
nn2.add(Dense(16, activation='relu'))
nn2.add(Dense(8, activation='relu'))

# Add output layer 
nn2.add(Dense(11, activation='relu'))

In [None]:
### Compile the model 
nn2.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

nn2.summary()

In [None]:
## Fit the model 
batch_size = 50
n_epochs = 20

hist = nn2.fit(X1, y1, epochs=n_epochs, batch_size=batch_size, validation_split=0.2)

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(np.log(hist.history['loss']), label='Training Loss')
ax.plot(np.log(hist.history['val_loss']), label='Validation Loss')
ax.set_title("log(Loss) vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(np.log(hist.history['accuracy']), label='Training Accuracy')
ax.plot(np.log(hist.history['val_accuracy']), label='Validation Accuracy')
ax.set_title("Accuracy vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();

- There is a massive gap between training loss and validation loss. 
- Both training loss and valoidation loss seem to decline. 
- Both training and validation accuracies did increase. 
- After 17 epochs validation accuracy seem to start dipping. 
- The overfitting is getting better. 

#### The model needs to be much more improved. Accuracy scores are stil in the 30s and loss is high. 

In [None]:
### Train a new network with changed architecture 

## Create a new neural network 
# Define input shape
input_shape = X1.shape[1]

# Instantiate the Sequential model 
nn3 = Sequential()

# Add Layers 
nn3.add(Dense(64, activation='relu', input_shape=(input_shape,)))
nn3.add(Dense(32, activation='relu'))
nn3.add(Dense(16, activation='relu'))
nn3.add(Dense(8, activation='relu'))

# Add output layer 
nn3.add(Dense(11, activation='relu'))

In [None]:
### Compile the model 
nn3.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

nn3.summary()

In [None]:
## Fit the model 
batch_size = 50
n_epochs = 20

hist2 = nn3.fit(X1, y1, epochs=n_epochs, batch_size=batch_size, validation_split=0.2)

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(np.log(hist2.history['loss']), label='Training Loss')
ax.plot(np.log(hist2.history['val_loss']), label='Validation Loss')
ax.set_title("log(Loss) vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(hist2.history['accuracy'], label='Training Accuracy')
ax.plot(hist2.history['val_accuracy'], label='Validation Accuracy')
ax.set_title("Accuracy vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();

- Both the accuracy and loss seem to have been improved a lot. 
- However accuracy is still below 40%
- The model needs to be much more improved. 

#### Retrain a new network with different parameters

In [None]:
from sklearn.utils import shuffle
from tensorflow import keras

X1, y1 = shuffle(X1, y1)

In [None]:
### Train a new network with changed architecture 

## Create a new neural network 
# Define input shape
input_shape = X1.shape[1]

# Instantiate the Sequential model 
nn4 = Sequential()

# Add Layers 
nn4.add(Dense(32, activation='relu',kernel_initializer='he_uniform', input_shape=(input_shape,)))
nn4.add(Dense(32, activation='relu'))
nn4.add(Dense(32, activation='relu'))
nn4.add(Dense(32, activation='relu'))

# Add output layer 
nn4.add(Dense(11, activation='softmax'))

In [None]:
### Compile the model

# Define a custom optimizer 
opt = keras.optimizers.Adam(lr=0.01)
nn4.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=opt)

nn4.summary()

In [None]:
## Fit the model 
batch_size = 100
n_epochs = 20

hist3 = nn4.fit(X1, y1, epochs=n_epochs, batch_size=batch_size, validation_split=0.2)

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(np.log(hist3.history['loss']), label='Training Loss')
ax.plot(np.log(hist3.history['val_loss']), label='Validation Loss')
ax.set_title("log(Loss) vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(hist3.history['accuracy'], label='Training Accuracy')
ax.plot(hist3.history['val_accuracy'], label='Validation Accuracy')
ax.set_title("Accuracy vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();

#### Still no improvement 

### Scale all features and retrain 

In [None]:
# Create new datasets

X2 = X.copy()
y2 = health.stay

In [None]:
### Standardize the numerical data 
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X2.loc[:,:] = scaler.fit_transform(X1)

In [None]:
X2.head()

In [None]:
y2 = to_categorical(y2)

print('y2 shape: ', y2.shape)

In [None]:
## Create a new neural network 
# Define input shape
input_shape = X2.shape[1]

# Instantiate the Sequential model 
nn5 = Sequential()

# Add Layers 
nn5.add(Dense(32, activation='relu',kernel_initializer='he_uniform', input_shape=(input_shape,)))
nn5.add(Dense(32, activation='relu'))
nn5.add(Dense(32, activation='relu'))
nn5.add(Dense(32, activation='relu'))

# Add output layer 
nn5.add(Dense(11, activation='softmax'))

In [None]:
### Compile the model

# Define a custom optimizer 
opt = keras.optimizers.Adam(lr=0.01)
nn5.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=opt)

nn5.summary()

In [None]:
## Fit the model 
batch_size = 100
n_epochs = 20

hist4 = nn5.fit(X2, y2, epochs=n_epochs, batch_size=batch_size, validation_split=0.2)

***The results are much worse.***

In [None]:
# Shuffle the data 
X2, y2 = shuffle(X2, y2)

In [None]:
## Create a new neural network 
# Define input shape
input_shape = X2.shape[1]

# Instantiate the Sequential model 
nn6 = Sequential()

# Add Layers 
nn6.add(Dense(64, activation='relu',kernel_initializer='he_uniform', input_shape=(input_shape,)))
nn6.add(Dense(32, activation='relu'))
nn6.add(Dense(16, activation='relu'))
nn6.add(Dense(8, activation='relu'))

# Add output layer 
nn6.add(Dense(11, activation='relu'))

In [None]:
### Compile the model

# Define a custom optimizer 
opt = keras.optimizers.Adam(lr=0.001)

nn6.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=opt)

nn6.summary()

In [None]:
## Fit the model 
batch_size = 100
n_epochs = 20

hist5 = nn6.fit(X2, y2, epochs=n_epochs, batch_size=batch_size, validation_split=0.2)

- The model performs worse than before. Both the accuracy and  the loss values are not optimal. 
- **Standardizing all values seems to be less effective**
- **Standardizing only the  numeric values seem to bring better results**

### Rescale all the data using a Robust Scaler to deal with outliers and retrain a new model 

In [None]:
# Create new datasets

X4 = X.copy()
y4 = health.stay

In [None]:
### Standardize the numerical data using a robust scaler 
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
X4.loc[:,:] = scaler.fit_transform(X4)

In [None]:
X4.head()

In [None]:
y4 = to_categorical(y4)

print('y4 shape: ', y4.shape)

In [None]:
## Create a new neural network 
# Define input shape
input_shape = X4.shape[1]

# Instantiate the Sequential model 
nn7 = Sequential()

# Add Layers 
nn7.add(Dense(64, activation='relu', input_shape=(input_shape,)))
nn7.add(Dense(32, activation='relu'))
nn7.add(Dense(16, activation='relu'))
nn7.add(Dense(8, activation='relu'))

# Add output layer 
nn7.add(Dense(11, activation='relu'))

In [None]:
### Compile the model

# Define a custom optimizer 
#opt = keras.optimizers.Adam(lr=0.001)
nn7.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
nn7.summary()

In [None]:
## Fit the model 
batch_size = 75
n_epochs = 20

hist6 = nn7.fit(X4, y4, epochs=n_epochs, batch_size=batch_size, validation_split=0.2)

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(np.log(hist6.history['loss']), label='Training Loss')
ax.plot(np.log(hist6.history['val_loss']), label='Validation Loss')
ax.set_title("log(Loss) vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(hist6.history['accuracy'], label='Training Accuracy')
ax.plot(hist6.history['val_accuracy'], label='Validation Accuracy')
ax.set_title("Accuracy vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();

### Use the robust scaler but only scale the numeric columns 

In [None]:
# Create new datasets

X5 = X.copy()
y5 = health.stay

In [None]:
### Standardize the numerical data using a robust scaler 
from sklearn.preprocessing import RobustScaler

numerical_data = ['rooms_available','num_visitors','deposit', 'patient_id']

scalerR = RobustScaler()
X5.loc[:,numerical_data] = scalerR.fit_transform(X5[numerical_data])

In [None]:
# reshape target variable
y5 = to_categorical(y5)

print('y5 shape: ', y5.shape)

In [None]:
## Create a new neural network 
# Define input shape
input_shape = X5.shape[1]

# Instantiate the Sequential model 
nn8 = Sequential()

# Add Layers 
nn8.add(Dense(64, activation='relu', input_shape=(input_shape,)))
nn8.add(Dense(32, activation='relu'))
nn8.add(Dense(16, activation='relu'))
nn8.add(Dense(8, activation='relu'))

# Add output layer 
nn8.add(Dense(11, activation='relu'))

In [None]:
### Compile the model

# Define a custom optimizer 
#opt = keras.optimizers.Adam(lr=0.001)
nn8.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
nn8.summary()

In [None]:
## Fit the model 
batch_size = 75
n_epochs = 20

hist7 = nn8.fit(X5, y5, epochs=n_epochs, batch_size=batch_size, validation_split=0.2)

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(np.log(hist7.history['loss']), label='Training Loss')
ax.plot(np.log(hist7.history['val_loss']), label='Validation Loss')
ax.set_title("log(Loss) vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(hist7.history['accuracy'], label='Training Accuracy')
ax.plot(hist7.history['val_accuracy'], label='Validation Accuracy')
ax.set_title("Accuracy vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();

### Retarain with different parameters

In [None]:
# Shuffle the data 
X5, y5 = shuffle(X5,y5)

print(X5.shape)
print(y5.shape)

In [None]:
# Convert the X5 dataframe into floats

X5 = X5.astype('float32')

In [None]:
X5.info()

In [None]:
## Create a new neural network 
# Define input shape
input_shape = X5.shape[1]

# Instantiate the Sequential model 
nn9 = Sequential()

# Add Layers 
nn9.add(Dense(32, activation='relu', input_shape=(input_shape,)))
nn9.add(Dense(16, activation='relu'))


# Add output layer 
nn9.add(Dense(11, activation='relu'))

In [None]:
### Compile the model

# Define a custom optimizer 
#opt = keras.optimizers.Adam(lr=0.001)
nn9.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

nn9.summary()

In [None]:
## Fit the model 
batch_size = 75
n_epochs = 20

hist8 = nn9.fit(X5, y5, epochs=n_epochs, batch_size=batch_size, validation_split=0.2)

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(np.log(hist8.history['loss']), label='Training Loss')
ax.plot(np.log(hist8.history['val_loss']), label='Validation Loss')
ax.set_title("log(Loss) vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(hist8.history['accuracy'], label='Training Accuracy')
ax.plot(hist8.history['val_accuracy'], label='Validation Accuracy')
ax.set_title("Accuracy vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();