In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pathlib
import tqdm
import tensorflow as tf
sns.set(font_scale=1.5, style='darkgrid')

# Fix the NumPy and TensorFlow global seeds so that a Restart & Run All
# gives repeatable weight initialisation (and therefore comparable curves).
SEED = 123
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Import Data

We use the Cleveland heart disease dataset

**Context**

This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them. In particular, the Cleveland database is the only one that has been used by ML researchers to this date. The "goal" field refers to the presence of heart disease in the patient. It is integer-valued from 0 (no presence) to 4.

**Content**


*Attribute Information:*
  * age 
  * sex 
  * chest pain type (4 values) 
  * resting blood pressure 
  * serum cholesterol in mg/dl 
  * fasting blood sugar > 120 mg/dl
  * resting electrocardiographic results (values 0,1,2)
  * maximum heart rate achieved 
  * exercise induced angina 
  * oldpeak = ST depression induced by exercise relative to rest 
  * the slope of the peak exercise ST segment 
  * number of major vessels (0-3) colored by fluoroscopy 
  * thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

**Acknowledgements**

*Creators:*

Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D.
University Hospital, Zurich, Switzerland: William Steinbrunn, M.D.
University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D.
V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: Robert Detrano, M.D., Ph.D.
Donor: 
David W. Aha (aha '@' ics.uci.edu) (714) 856-8779

We import the data using kaggle. You can also find the dataset in the UCI repository.

In [None]:
import kaggle
# NOTE(review): presumably picks up Kaggle API credentials from the local
# environment — confirm the expected setup before running.
kaggle.api.authenticate()

# Fetch the dataset into ./data (a later cell reads ./data/heart.csv).
# NOTE(review): unzip=True / force=False look like "extract the archive" and
# "do not re-download if already present" — verify against the Kaggle API docs.
kaggle.api.dataset_download_files(
    'ronitf/heart-disease-uci',
    path='./data',
    quiet=False,
    unzip=True,
    force=False,
)

In [None]:
# Load the CSV that the Kaggle download step placed under ./data
data = pd.read_csv('./data/heart.csv')

In [None]:
# Preview the first 10 rows of the raw table
data.head(10)

There are some columns containing multi-category data. In this case, we one-hot encode them with `pd.get_dummies`.

In [None]:
# Columns holding category codes rather than quantities; each is expanded
# into one indicator column per category.
categorical_columns = ['cp', 'restecg', 'slope', 'ca', 'thal']
data = pd.get_dummies(data, columns=categorical_columns)

In [None]:
# Display the table after one-hot encoding
data

Do a train-test split and scale the data.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Select instead of pop: `data` is left untouched, so this cell can be re-run
# without a KeyError on the already-removed 'target' column.
y = data['target'].values
# Cast explicitly to float: recent pandas versions emit boolean dummy columns,
# which would otherwise make `.values` an object-dtype array.
x = data.drop(columns='target').to_numpy(dtype=float)
# 70/30 split with a fixed random_state so the partition is repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=123)

Remember, only fit the scaler on the training set so that no information about the test set enters the training process at all

In [None]:
# Learn the per-feature min/max from the training split only, then apply the
# same transformation to both splits.
scaler = MinMaxScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

# Baseline Model

We will write some simple functions to build, train and evaluate models, so that we avoid repeating code as much as possible.

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tqdm.keras import TqdmCallback

In [None]:
def build_model(*, input_shape=(27, ), hidden_units=(256, 128), **layer_kwargs):
    """
    Build the (uncompiled) fully connected binary classifier.

    input_shape: shape of a single input sample. Defaults to (27, ), the
        value that used to be hard-coded here.
    hidden_units: widths of the ReLU hidden layers, in order. Defaults to
        (256, 128), the original architecture.
    **layer_kwargs: forwarded unchanged to every Dense layer, including the
        sigmoid output layer (e.g. kernel_regularizer=...).

    Returns a Sequential model: the hidden layers followed by a single
    sigmoid unit.
    """
    layer_specs = [(units, 'relu') for units in hidden_units] + [(1, 'sigmoid')]

    model = Sequential()
    for position, (units, activation) in enumerate(layer_specs):
        # Only the first layer declares the input shape (this also covers the
        # case of no hidden layers, where the output layer comes first).
        shape_kwargs = {'input_shape': input_shape} if position == 0 else {}
        model.add(Dense(units, activation=activation, **shape_kwargs, **layer_kwargs))
    return model

def train_and_save(model, path, force=True, **kwargs):
    """
    Compile `model`, then either restore it from `path` or train it and
    cache the result there.

    model: Keras model to compile (binary cross-entropy, Adam, binary accuracy).
    path: directory for the cached weights and `history.json`; created if missing.
    force: if True (default) always retrain; if False, load the cached
        weights and history when both exist.
    **kwargs: optional training overrides, read only when training happens:
        x, y            - training data (default: notebook globals x_train / y_train)
        batch_size      - default 128
        epochs          - default 500
        callbacks       - default [TqdmCallback(verbose=0)]
        validation_data - default (x_test, y_test); pass None to disable
        validation_split- default 0

    Returns a DataFrame with one row per epoch and one column per logged metric.
    """
    model_save_dir = pathlib.Path(path)
    model_save_dir.mkdir(parents=True, exist_ok=True)
    # Keras 3 `save_weights` rejects filenames that do not end in `.weights.h5`;
    # older Keras accepts this name too because it still ends in `.h5`.
    model_path = model_save_dir.joinpath('model.weights.h5')
    # Weights cached by earlier versions of this function remain loadable.
    legacy_model_path = model_save_dir.joinpath('model.h5')
    history_path = model_save_dir.joinpath('history.json')

    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['binary_accuracy'],
    )

    cached_weights = next(
        (candidate for candidate in (model_path, legacy_model_path) if candidate.exists()),
        None,
    )

    if cached_weights is not None and history_path.exists() and not force:
        model.load_weights(str(cached_weights))
        history = pd.read_json(history_path)
    else:
        # Notebook globals are looked up only when the caller did not supply
        # the value, so the function is usable on other data as well.
        x = kwargs['x'] if 'x' in kwargs else x_train
        y = kwargs['y'] if 'y' in kwargs else y_train
        if 'validation_data' in kwargs:
            # An explicit None is honoured (needed when validation_split is used)
            validation_data = kwargs['validation_data']
        else:
            validation_data = (x_test, y_test)
        if 'callbacks' in kwargs:
            callbacks = kwargs['callbacks']
        else:
            callbacks = [TqdmCallback(verbose=0)]

        history = model.fit(
            x=x,
            y=y,
            batch_size=kwargs.get('batch_size', 128),
            validation_data=validation_data,
            validation_split=kwargs.get('validation_split', 0),
            verbose=0,
            epochs=kwargs.get('epochs', 500),
            callbacks=callbacks,
        )
        model.save_weights(str(model_path))
        history = pd.DataFrame(history.history)
        history.to_json(history_path)
    return history

def evaluate(model, train_data, test_data):
    """
    Print loss and accuracy of `model` on the train and test sets.

    train_data / test_data: (inputs, targets) tuples unpacked into
    `model.evaluate`. Nothing is returned; the two summary lines are printed.
    """
    # Evaluate both splits first, then report, so the two lines print together
    reports = [
        (label, model.evaluate(*split, verbose=0))
        for label, split in (('Train', train_data), ('Test', test_data))
    ]
    for label, metrics in reports:
        loss_value, accuracy = metrics[0], metrics[1]
        print(f'{label} - loss = {loss_value:.3f}, acc = {accuracy:.3f} ')

In [None]:
# Baseline: default architecture with no extra layer arguments (no regularisation)
model = build_model()

In [None]:
# Uses train_and_save's defaults; force defaults to True, so this always
# retrains and rewrites the files under ./baseline
history = train_and_save(model=model, path='baseline')

In [None]:
evaluate(model=model, train_data=(x_train, y_train), test_data=(x_test, y_test))
# Title and label each figure so it can be read on its own (one row of
# `history` per epoch, hence the x-axis label).
loss_ax = history.plot(x=None, y=['loss', 'val_loss'])
loss_ax.set(title='Baseline: loss', xlabel='Epoch', ylabel='Loss')
acc_ax = history.plot(x=None, y=['binary_accuracy', 'val_binary_accuracy'])
acc_ax.set(title='Baseline: accuracy', xlabel='Epoch', ylabel='Accuracy');

# Parameter Norm Penalties

## $L^2$ Regularized Model

In [None]:
from tensorflow.keras.regularizers import l2

In [None]:
# alpha is the coefficient handed to the l2 regularizer; build_model forwards
# kernel_regularizer to every Dense layer, including the output layer
alpha = 0.01
model = build_model(kernel_regularizer=l2(alpha))

In [None]:
# Always retrains (force defaults to True) and caches the result under ./l2_regularized
history = train_and_save(model=model, path='l2_regularized')

In [None]:
evaluate(model=model, train_data=(x_train, y_train), test_data=(x_test, y_test))
# Title and label each figure so it can be read on its own (one row of
# `history` per epoch, hence the x-axis label).
loss_ax = history.plot(x=None, y=['loss', 'val_loss'])
loss_ax.set(title='L2 regularized: loss', xlabel='Epoch', ylabel='Loss')
acc_ax = history.plot(x=None, y=['binary_accuracy', 'val_binary_accuracy'])
acc_ax.set(title='L2 regularized: accuracy', xlabel='Epoch', ylabel='Accuracy');

## $L^1$ Regularized Model

In [None]:
from tensorflow.keras.regularizers import l1

In [None]:
# alpha is the coefficient handed to the l1 regularizer; build_model forwards
# kernel_regularizer to every Dense layer, including the output layer
alpha = 0.001
model = build_model(kernel_regularizer=l1(alpha))

In [None]:
# force=True is spelled out here (it is also the default): always retrain and
# rewrite the files under ./l1_regularized
history = train_and_save(model=model, path='l1_regularized', force=True)

In [None]:
evaluate(model=model, train_data=(x_train, y_train), test_data=(x_test, y_test))
# Title and label each figure so it can be read on its own (one row of
# `history` per epoch, hence the x-axis label).
loss_ax = history.plot(x=None, y=['loss', 'val_loss'])
loss_ax.set(title='L1 regularized: loss', xlabel='Epoch', ylabel='Loss')
acc_ax = history.plot(x=None, y=['binary_accuracy', 'val_binary_accuracy'])
acc_ax.set(title='L1 regularized: accuracy', xlabel='Epoch', ylabel='Accuracy');

### Sparsity

In [None]:
def plot_kernel_dist(model, path):
    """
    Plot a histogram of the absolute kernel weights of `model`.

    `train_and_save(model, path, force=False)` is called first, so the
    weights cached under `path` are loaded into `model` (or the model is
    trained if no cache exists). The y-axis is log-scaled and the plot is
    titled with `path`.

    Returns the flattened array of absolute kernel values.
    """
    train_and_save(model=model, path=path, force=False)

    # Keep only kernel tensors (biases are skipped), flattened to 1-D
    flattened_kernels = [
        weight.numpy().ravel()
        for weight in model.weights
        if 'kernel' in weight.name
    ]
    kernel_abs_values = np.abs(np.concatenate(flattened_kernels))

    ax = sns.histplot(kernel_abs_values, bins=50)
    ax.set(
        yscale='log',
        xlabel='Weight Abs Value',
        ylabel='Frequency',
        title=path,
    )

    return kernel_abs_values

In [None]:
# `model` is the L1-regularized network at this point; plot_kernel_dist calls
# train_and_save with force=False, so cached weights are loaded when present.
# Assigning the returned array keeps it out of the cell output.
aa = plot_kernel_dist(model, 'l1_regularized')

Compare with the $L^2$ case

In [None]:
# Use a throw-away L2 model instead of reusing `model`: plot_kernel_dist loads
# the cached weights into whatever it is given (and would train it on a cache
# miss), so passing the L1 model would overwrite its weights. 0.01 matches the
# coefficient used for the L2 run. The trailing `;` hides the returned array.
plot_kernel_dist(build_model(kernel_regularizer=l2(0.01)), 'l2_regularized');

Compare with baseline

In [None]:
# Use a throw-away unregularized model instead of reusing `model`, so the
# cached baseline weights are not loaded into (or, on a cache miss, trained
# with) a regularized network. The trailing `;` hides the returned array.
plot_kernel_dist(build_model(), 'baseline');

# Early Stopping

Here we will implement the early stopping method. This is supported in keras via the `EarlyStopping` callback. 

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Stop once the monitored validation loss has gone `patience` epochs without
# improving; min_delta=0 means any decrease counts as an improvement.
early_stopper = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    verbose=1,
)

In [None]:
# Fresh unregularized network for the early-stopping run
model = build_model()

Here, we need to be careful about how to perform early stopping. First, we will define a validation set to use for early stopping.

In [None]:
# validation_data=None overrides train_and_save's default of validating on
# the test set; validation_split=0.1 makes Keras hold out part of the
# training data instead, so the stopper never sees the test set.
history = train_and_save(
    model=model,
    path='early_stopping',
    callbacks=[TqdmCallback(verbose=0), early_stopper],
    validation_data=None,
    validation_split=0.1,
)

Now, we retrain on the whole training set with the number of epochs equal to the stopping point.

In [None]:
# Start again from freshly initialised weights for the retraining run
model = build_model()

In [None]:
# `history` still holds the early-stopped run at this point, so len(history)
# is the number of epochs that run completed. No validation arguments are
# passed, so train_and_save validates on the test set again. The same path is
# reused, so the files written by the early-stopped run are overwritten.
# NOTE(review): when the stopper fired, len(history) includes the final
# `patience` epochs without improvement — consider retraining for the
# best epoch instead; confirm intent.
history = train_and_save(
    model=model,
    path='early_stopping',
    callbacks=[TqdmCallback(verbose=0)],
    epochs=len(history),
)

In [None]:
evaluate(model=model, train_data=(x_train, y_train), test_data=(x_test, y_test))
# Title and label each figure so it can be read on its own (one row of
# `history` per epoch, hence the x-axis label).
loss_ax = history.plot(x=None, y=['loss', 'val_loss'])
loss_ax.set(title='Early stopping (retrained): loss', xlabel='Epoch', ylabel='Loss')
acc_ax = history.plot(x=None, y=['binary_accuracy', 'val_binary_accuracy'])
acc_ax.set(title='Early stopping (retrained): accuracy', xlabel='Epoch', ylabel='Accuracy');

# Adding Noise

Now, we consider adding noise to the inputs and hidden features using the `GaussianNoise` layer, which adds i.i.d. Gaussian noise to each input/hidden feature.

In [None]:
from tensorflow.keras.layers import GaussianNoise

In [None]:
# Same Dense stack as the baseline, with a GaussianNoise(0.5) layer in front
# of every Dense layer (i.e. on the inputs and on both hidden activations).
model = Sequential([
    GaussianNoise(0.5, input_shape=(27, )),
    Dense(256, activation='relu'),
    GaussianNoise(0.5),
    Dense(128, activation='relu'),
    GaussianNoise(0.5),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Always retrains (force defaults to True) and caches the result under ./adding_noise
history = train_and_save(model=model, path='adding_noise')

In [None]:
evaluate(model=model, train_data=(x_train, y_train), test_data=(x_test, y_test))
# Title and label each figure so it can be read on its own (one row of
# `history` per epoch, hence the x-axis label).
loss_ax = history.plot(x=None, y=['loss', 'val_loss'])
loss_ax.set(title='Gaussian noise: loss', xlabel='Epoch', ylabel='Loss')
acc_ax = history.plot(x=None, y=['binary_accuracy', 'val_binary_accuracy'])
acc_ax.set(title='Gaussian noise: accuracy', xlabel='Epoch', ylabel='Accuracy');

# Dropout

We will implement the efficient model-ensembling technique called dropout. This can be easily done using the pre-defined `Dropout` layer in `keras`. 

In [None]:
from tensorflow.keras.layers import Dropout

In [None]:
# Same Dense stack as the baseline, with a Dropout(rate=0.95) layer after
# each hidden layer.
model = Sequential([
    Dense(256, activation='relu', input_shape=(27, )),
    Dropout(rate=0.95),
    Dense(128, activation='relu'),
    Dropout(rate=0.95),
    Dense(1, activation='sigmoid'),
])

Here we use a rather large dropout rate because of data scarcity. Usually, rates of 0.4-0.6 are used. 

In [None]:
# Always retrains (force defaults to True) and caches the result under ./dropout
history = train_and_save(model=model, path='dropout')

In [None]:
evaluate(model=model, train_data=(x_train, y_train), test_data=(x_test, y_test))
# Title and label each figure so it can be read on its own (one row of
# `history` per epoch, hence the x-axis label).
loss_ax = history.plot(x=None, y=['loss', 'val_loss'])
loss_ax.set(title='Dropout: loss', xlabel='Epoch', ylabel='Loss')
acc_ax = history.plot(x=None, y=['binary_accuracy', 'val_binary_accuracy'])
acc_ax.set(title='Dropout: accuracy', xlabel='Epoch', ylabel='Accuracy');

# Exercises

1. Combine the different techniques above to further improve performance.
2. Explore how the hyper-parameters, such as regularization strengths, patience parameters or dropout rates affect the performance.