# Episode "24-keras-classify.md" workbook

This notebook is meant to be a literal work-through of the Python code in the "24-keras-classify" episode of the NN lesson.
It should contain no code other than what appears on the official lesson page, *plus* a few necessary hacks!


FIXME:

* Some scratch cells are temporarily kept below; they should be moved to a separate place where such trials/experiments belong.


In [None]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Tools for machine learning:
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# for evaluating model performance
from sklearn.metrics import accuracy_score, confusion_matrix
# classic machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Tools for deep learning:
import tensorflow as tf
import tensorflow.keras as keras

# Import key Keras objects
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [None]:
%matplotlib inline

In [None]:
# Environment report: for each key package, show the module object
# (which reveals where it was loaded from) followed by its version.
print(pd, pd.__version__, sep="\n")
print(sklearn, sklearn.__version__, sep="\n")
print(tf, tf.__version__, sep="\n")

In [None]:
# Show the Keras module reachable through TensorFlow, then its version string
print(tf.keras)
print(tf.keras.__version__)

## Initial Exploration

In [None]:
# Read the "18 apps" table; the first CSV column is used as the row index
df = pd.read_csv("sherlock/sherlock_18apps.csv", index_col=0)

# Overview of the dataset: dimensions, per-column info, summary statistics
print("* shape:", df.shape)
print()

print("* info::\n")
df.info()
print()

# One row per column makes the statistics table easier to read
print("* describe::\n")
print(df.describe().transpose())
print()

In [None]:
#EXTRAS
df.head(10)

In [None]:
#EXTRAS
print(df.head(10))

In [None]:
df['state'].count()

### Exploration of the Labels

In [None]:
df

In [None]:
# Number of records per application name
app_frequencies = df.loc[:, 'ApplicationName'].value_counts()
print(app_frequencies)
print('Total num of apps = ', len(app_frequencies))

In [None]:
# Same tally built step by step: group on the label column, count the rows
# in each group, then order the groups from largest to smallest.
app_frequencies2 = (
    df.groupby('ApplicationName')['ApplicationName']
    .count()
    .sort_values(ascending=False)
)
print(app_frequencies2)
print("Total num of apps = ", len(app_frequencies2))

## Data Cleaning and Preprocessing

Cleaning (comprehensive)

In [None]:
# Missing data or bad data or irrelevant data
del_features_bad = [
    'cminflt', # all-missing feature
    'guest_time', # all-flat feature
]
df2 = df.drop(del_features_bad, axis=1)

print("Cleaning:")
print("- dropped %d columns: %s" % (len(del_features_bad), del_features_bad))

In [None]:
print("- remaining missing data (per feature):")

# Count the NaN entries column by column; report only columns that have any
isna_counts = df2.isna().sum()
print(isna_counts.loc[isna_counts.gt(0)])
print("- dropping the rest of missing data")

# Remove, in place, every row that still contains a missing value
df2.dropna(inplace=True)

print("- remaining shape: %s" % (df2.shape,))

### Extracting & Preprocessing the Labels

Separating labels from features

In [None]:
# The application name is the classification target ...
labels = df2.loc[:, 'ApplicationName']
# ... and every other column is kept as an input feature
df_features = df2.drop(columns=['ApplicationName'])

One-hot encoding: labels

In [None]:
# Expand the label column into one indicator column per distinct application name
df_labels_onehot = pd.get_dummies(labels)

In [None]:
labels.head()

In [None]:
df_labels_onehot.head()

In [None]:
print(df_labels_onehot.head(5))

### Preprocess Features

In [None]:
df_features.head()

In [None]:
df_features['state'].value_counts()

In [None]:
"""Perform one-hot encoding for **all** categorical features."""
print("Step: Converting all non-numerical features to one-hot encoding.")
# This will be explained later
# Called without a `columns` argument, get_dummies() converts the
# object/string/category columns and passes the other columns through.
df_features = pd.get_dummies(df_features)

In [None]:
df_features.head()

#### Feature scaling with StandardScaler

In [None]:
print("Step: Feature scaling with StandardScaler")

# Hold on to the pre-scaling table under its own name
df_features_unscaled = df_features
scaler = preprocessing.StandardScaler()

# Fit the scaler and apply it in one call, then wrap the resulting array
# back into a DataFrame carrying the original row and column labels.
df_features = pd.DataFrame(scaler.fit_transform(df_features_unscaled),
                           index=df_features_unscaled.index,
                           columns=df_features_unscaled.columns)
print("After scaling:")
print(df_features.head(10))
print()

### Train-test split

Step: Perform train-test split on the master dataset.
This should be the last step before constructing & training the model.

In [None]:
# fraction (0.2 = 20%) of the records reserved for the validation dataset
val_size = 0.2
# fixed seed for this notebook (reproducibility)
random_state = 34
# for lesson:
#random_state = np.random.randint(1000000)

print("Step: Train-validation split  val_size=%s  random_state=%s"
      % (val_size, random_state))

# Features and one-hot labels are split together so their rows stay aligned
(train_features, val_features,
 train_L_onehot, val_L_onehot) = train_test_split(
    df_features, df_labels_onehot,
    test_size=val_size, random_state=random_state)

print("- training dataset:   %d records" % (len(train_features),))
print("- validation dataset: %d records" % (len(val_features),))
print("Now the data is ready for machine learning!")
sys.stdout.flush()

In [None]:
train_features.head()

## Model 0: No Hidden Layer

### Learner-facing model

This is the model-definition function that will be presented to the learners:

In [None]:
def NN_Model_no_hidden(learning_rate):
    """Build and compile a neural network that has no hidden layer.

    The network is a single Dense layer taking 19 input features and
    producing 18 softmax outputs.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model (categorical cross-entropy
        loss, accuracy metric).
    """
    # (these imports are redundant if the names were already imported earlier)
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense
    from tensorflow.keras.optimizers import Adam

    # The only layer is also the output layer
    output_layer = Dense(18, activation='softmax', input_shape=(19,),
                         kernel_initializer='random_normal')
    model = Sequential([output_layer])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate,
                       beta_1=0.9, beta_2=0.999,
                       amsgrad=False),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

### (Developer's version of model 0)

In [None]:
# Instantiate the no-hidden-layer model and print its layer/parameter table
model_0 = NN_Model_no_hidden(learning_rate=0.0003)
model_0.summary()

In [None]:
# Train for 5 epochs; the validation set is evaluated after every epoch and
# verbose=2 prints one summary line per epoch.
history_0 = model_0.fit(
    x=train_features,
    y=train_L_onehot,
    batch_size=32,
    epochs=5,
    validation_data=(val_features, val_L_onehot),
    verbose=2,
)

Original training outputs, see Git commit 7034350b26f810e4a46228553c05d506da55738a dated 2024-07-24.
Generated with TensorFlow/Keras 2.6.0 on an Intel x86_64 CPU:

```
Epoch 1/5
6827/6827 - 5s - loss: 1.6855 - accuracy: 0.5588 - val_loss: 1.2626 - val_accuracy: 0.7095
Epoch 2/5
6827/6827 - 6s - loss: 1.1122 - accuracy: 0.7378 - val_loss: 1.0053 - val_accuracy: 0.7692
Epoch 3/5
6827/6827 - 5s - loss: 0.9280 - accuracy: 0.7854 - val_loss: 0.8720 - val_accuracy: 0.7924
Epoch 4/5
6827/6827 - 5s - loss: 0.8196 - accuracy: 0.8032 - val_loss: 0.7852 - val_accuracy: 0.8079
Epoch 5/5
6827/6827 - 5s - loss: 0.7454 - accuracy: 0.8167 - val_loss: 0.7229 - val_accuracy: 0.8214
```

### Reviewing Training History

In [None]:
print(history_0)

In [None]:
history_0.history

In [None]:
print(history_0.history)

In [None]:
print(history_0.epoch)

In [None]:
df_history_0 = pd.DataFrame(data=history_0.history, index=history_0.epoch)

In [None]:
df_history_0

Plot the loss value w/o a plotting function:

In [None]:
epochs_0 = np.array(history_0.epoch)
plt.plot(epochs_0, history_0.history['loss'])
plt.plot(epochs_0+1, history_0.history['val_loss'])
plt.show()

In [None]:
def plot_loss(model_history):
    """Draw the training and validation loss curves and show the plot.

    Args:
        model_history: Object with an ``epoch`` sequence and a ``history``
            dict holding 'loss' and 'val_loss' (e.g. the return value of
            ``model.fit``).

    Returns:
        The matplotlib figure the curves were drawn on.
    """
    curves = model_history.history
    train_x = np.array(model_history.epoch)
    # the validation curve is placed one epoch to the right of the training one
    val_x = train_x + 1

    for xs, key in ((train_x, 'loss'), (val_x, 'val_loss')):
        plt.plot(xs, curves[key])

    plt.title('Model Loss')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['train', 'val'], loc='upper right')

    # grab the figure handle before show() so it can be returned to the caller
    fig = plt.gcf()
    plt.show()
    return fig

def plot_acc(model_history):
    """Draw the training and validation accuracy curves and show the plot.

    Args:
        model_history: Object with an ``epoch`` sequence and a ``history``
            dict holding 'accuracy' and 'val_accuracy' (e.g. the return
            value of ``model.fit``).

    Returns:
        The matplotlib figure the curves were drawn on.
    """
    epoch_index = np.array(model_history.epoch)
    train_acc = model_history.history['accuracy']
    val_acc = model_history.history['val_accuracy']

    plt.plot(epoch_index, train_acc)
    # the validation curve is placed one epoch to the right of the training one
    plt.plot(epoch_index + 1, val_acc)

    plt.title('Model Accuracy')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend(['train', 'val'], loc='upper left')

    # grab the figure handle before show() so it can be returned to the caller
    fig = plt.gcf()
    plt.show()
    return fig

In [None]:
fig_loss = plot_loss(history_0)
fig_acc = plot_acc(history_0)

## Model 1: One Hidden Layer

### Learner-facing model

In [None]:
def NN_Model_1H(hidden_neurons, learning_rate):
    """Build and compile a network with a single dense hidden layer.

    Layout: 19 input features -> ``hidden_neurons`` ReLU units ->
    18 softmax outputs.

    Args:
        hidden_neurons: Number of units in the hidden layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model (categorical cross-entropy
        loss, accuracy metric).
    """
    # (these imports are redundant if the names were already imported earlier)
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense
    from tensorflow.keras.optimizers import Adam

    # network layers, listed from input side to output side
    layers = [
        Dense(hidden_neurons, activation='relu',
              input_shape=(19,),
              kernel_initializer='random_normal'),
        Dense(18, activation='softmax',
              kernel_initializer='random_normal'),
    ]
    model = Sequential(layers)

    # optimization algorithm, loss function and reported metric
    model.compile(
        optimizer=Adam(learning_rate=learning_rate,
                       beta_1=0.9, beta_2=0.999,
                       amsgrad=False),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

### (Developer's version of model 1H)

In [None]:
# One-hidden-layer model with 18 hidden neurons, trained for the first 10 epochs
model_1 = NN_Model_1H(hidden_neurons=18, learning_rate=0.0003)
model_1.summary()
history_1 = model_1.fit(
    x=train_features,
    y=train_L_onehot,
    batch_size=32,
    epochs=10,
    validation_data=(val_features, val_L_onehot),
    verbose=2,
)

In [None]:
fig_loss_1 = plot_loss(history_1)
fig_acc_1 = plot_acc(history_1)

In [None]:
# savefig() does not create missing directories, so make sure the output
# folder exists first (a no-op when it is already there).
os.makedirs("figs/18apps", exist_ok=True)
fig_loss_1.savefig("figs/18apps/model_1H_trn1_plot_loss.png")
fig_acc_1.savefig("figs/18apps/model_1H_trn1_plot_acc.png")

In [None]:
df_history_1 = pd.DataFrame(data=history_1.history, index=history_1.epoch)

In [None]:
df_history_1

### Saving Model

In [None]:
model_1.save("model_1_e10.h5")

### Reloading Model

In [None]:
model_1_reload = tf.keras.models.load_model("model_1_e10.h5")

In [None]:
model_1_reload.summary()

### Inference

Example one-point inference, using the first datapoint of the training data

In [None]:
sample0 = train_features.iloc[[0], :]
sample0

In [None]:
print(sample0)

In [None]:
pred0 = model_1_reload.predict(sample0)
pred0

In [None]:
print(pred0)

In [None]:
pred0.argmax()

In [None]:
label0 = train_L_onehot.iloc[0]
print(label0)

In [None]:
train_L_onehot.iloc[0].argmax()

In [None]:
label0.argmax()

In [None]:
pred0.argmax() == train_L_onehot.iloc[0].argmax()

In [None]:
sample1 = val_features.iloc[[0], :]
sample1

In [None]:
pred1 = model_1_reload.predict(sample1)
pred1.argmax()

In [None]:
val_L_onehot.iloc[0].argmax()

Batch prediction & validation

In [None]:
samples_b1 = val_features.head(10)
samples_b1

In [None]:
print(samples_b1)

In [None]:
# Run the reloaded model on the whole batch in samples_b1 at once
preds_b1_onehot = model_1_reload.predict(samples_b1)
print(preds_b1_onehot.shape)

# Temporarily switch to compact printing (3 decimals, no scientific
# notation, wide lines) so the prediction rows are easier to scan.
with np.printoptions(precision=3, suppress=True, linewidth=150):
    print(preds_b1_onehot)

In [None]:
preds_b1 = preds_b1_onehot.argmax(axis=1)
print(preds_b1)

In [None]:
val_L_onehot.head(10)

In [None]:
labels_b1_onehot = val_L_onehot.head(10)
print(labels_b1_onehot)

In [None]:
labels_b1 = labels_b1_onehot.values.argmax(axis=1)
print(labels_b1)

In [None]:
print(np.equal(preds_b1, labels_b1))

Comment: Nine right predictions and one wrong prediction.

### Checking Convergence



In [None]:
h1_val_accuracy = np.array(history_1.history['val_accuracy'])

In [None]:
# Change in validation accuracy between consecutive entries:
# element i minus element i-1, for i = 6..9 (the last four steps of a 10-epoch run)
h1_val_accuracy[6:10] - h1_val_accuracy[5:9]

### Better Plotting Routines

In [None]:
def plot_loss2(model_history, epoch_shifts=None, show=True):
    """Plot training and validation loss against shifted epoch numbers.

    Args:
        model_history: Object with an ``epoch`` sequence and a ``history``
            dict holding 'loss' and 'val_loss'.
        epoch_shifts: Indexable pair; element 0 is added to the epoch
            indices of the training curve, element 1 to those of the
            validation curve. ``None`` means ``(0, 1)``.
        show: If true, call ``plt.show()`` before returning.

    Returns:
        The current matplotlib figure, for further manipulation or saving.
    """
    shifts = (0, 1) if epoch_shifts is None else epoch_shifts

    # x positions of the two curves
    base_epochs = np.array(model_history.epoch)
    x_train = base_epochs + shifts[0]
    x_val = base_epochs + shifts[1]

    # training curve: circle markers; validation curve: cross markers
    losses = model_history.history
    plt.plot(x_train, losses['loss'], '-o', label='Train Loss')
    plt.plot(x_val, losses['val_loss'], '-x', label='Val Loss')

    plt.title('Model Loss', fontsize=14)
    plt.xlabel('Epoch', fontsize=14)
    plt.ylabel('Loss', fontsize=14)

    # make the x-axis span exactly the epochs covered by both curves
    x_low = min(np.min(x_train), np.min(x_val))
    x_high = max(np.max(x_train), np.max(x_val))
    plt.xlim([x_low, x_high])

    plt.legend(loc='upper right')

    # larger tick labels on both axes
    for axis_name in ('x', 'y'):
        plt.tick_params(axis=axis_name, labelsize=14)

    fig = plt.gcf()
    if show:
        plt.show()
    return fig

In [None]:
def plot_acc2(model_history, epoch_shifts=None, show=True):
    """Plot training and validation accuracy against shifted epoch numbers.

    Args:
        model_history: Object with an ``epoch`` sequence and a ``history``
            dict holding 'accuracy' and 'val_accuracy'.
        epoch_shifts: Indexable pair; element 0 is added to the epoch
            indices of the training curve, element 1 to those of the
            validation curve. ``None`` means ``(0, 1)``.
        show: If true, call ``plt.show()`` before returning.

    Returns:
        The current matplotlib figure, for further manipulation or saving.
    """
    if epoch_shifts is None:
        epoch_shifts = (0, 1)
    shift_train = epoch_shifts[0]
    shift_val = epoch_shifts[1]

    # x positions of the two curves
    epoch_index = np.array(model_history.epoch)
    x_train = epoch_index + shift_train
    x_val = epoch_index + shift_val

    # training curve: circle markers; validation curve: cross markers
    plt.plot(x_train, model_history.history['accuracy'],
             '-o', label='Train Accuracy')
    plt.plot(x_val, model_history.history['val_accuracy'],
             '-x', label='Val Accuracy')

    plt.title('Model Accuracy', fontsize=14)
    plt.xlabel('Epoch', fontsize=14)
    plt.ylabel('Accuracy', fontsize=14)

    # make the x-axis span exactly the epochs covered by both curves
    left_edge = min(np.min(x_train), np.min(x_val))
    right_edge = max(np.max(x_train), np.max(x_val))
    plt.xlim([left_edge, right_edge])

    plt.legend(loc='lower right')

    # larger tick labels on both axes
    plt.tick_params(axis='x', labelsize=14)
    plt.tick_params(axis='y', labelsize=14)

    fig = plt.gcf()
    if show:
        plt.show()
    return fig

In [None]:
def combine_plots(model_history,
                  plot_loss_func,
                  plot_acc_func,
                  figsize=(10.0, 5.0),
                  loss_epoch_shifts=None,
                  acc_epoch_shifts=None,
                  show=True,
                  wspace=0.4):
    """Draw the loss and accuracy plots side by side in one new figure.

    Args:
        model_history: History object forwarded to both plotting functions.
        plot_loss_func: Callable drawing the loss plot (left panel); called
            as ``f(model_history, epoch_shifts=..., show=False)``.
        plot_acc_func: Callable drawing the accuracy plot (right panel);
            called the same way.
        figsize: Size passed to ``plt.figure``.
        loss_epoch_shifts: ``epoch_shifts`` value given to ``plot_loss_func``.
        acc_epoch_shifts: ``epoch_shifts`` value given to ``plot_acc_func``.
        show: If true, call ``plt.show()`` before returning.
        wspace: Horizontal spacing between the two panels, passed to
            ``plt.subplots_adjust``.

    Returns:
        The combined matplotlib figure.
    """
    plt.figure(figsize=figsize)

    # (drawing function, epoch shifts) for the left and the right panel
    panels = (
        (plot_loss_func, loss_epoch_shifts),
        (plot_acc_func, acc_epoch_shifts),
    )
    for position, (draw, shifts) in enumerate(panels, start=1):
        # 1 row x 2 columns grid; `position` selects the active panel
        plt.subplot(1, 2, position)
        # show=False: the panels are displayed together, not one by one
        draw(model_history, epoch_shifts=shifts, show=False)

    plt.subplots_adjust(wspace=wspace)

    fig = plt.gcf()
    if show:
        plt.show()
    return fig

### Continuing the Training

In [None]:
# real epochs: 11-20
# model_1 has already been trained for 10 epochs above; this call trains the
# same model object further. The returned history describes only this run,
# hence the "real epochs" note (epoch shifts are applied when plotting later).
history_2 = model_1.fit(train_features,
                        train_L_onehot,
                        epochs=10, batch_size=32,
                        validation_data=(val_features, val_L_onehot),
                        verbose=2)

In [None]:
fig_loss_2 = plot_loss(history_2)
fig_acc_2 = plot_acc(history_2)

In [None]:
fig_loss_2.savefig("figs/18apps/model_1H_trn2_plot_loss.png")
fig_acc_2.savefig("figs/18apps/model_1H_trn2_plot_acc.png")

In [None]:
# real epochs: 21-30
history_3 = model_1.fit(train_features,
                        train_L_onehot,
                        epochs=10, batch_size=32,
                        validation_data=(val_features, val_L_onehot),
                        verbose=2)

In [None]:
fig_loss_3 = plot_loss(history_3)
fig_acc_3 = plot_acc(history_3)

In [None]:
fig_loss_3.savefig("figs/18apps/model_1H_trn3_plot_loss.png")
fig_acc_3.savefig("figs/18apps/model_1H_trn3_plot_acc.png")

In [None]:
# real epochs: 31-60
history_4 = model_1.fit(train_features,
                        train_L_onehot,
                        epochs=30, batch_size=32,
                        validation_data=(val_features, val_L_onehot),
                        verbose=2)

In [None]:
plt.figure(figsize=(15, 4.8))
fig_loss_4 = plot_loss(history_4)

plt.figure(figsize=(15, 4.8))
fig_acc_4 = plot_acc(history_4)

In [None]:
fig_loss_4.savefig("figs/18apps/model_1H_trn4_plot_loss.png")
fig_acc_4.savefig("figs/18apps/model_1H_trn4_plot_acc.png")

In [None]:
# real epochs: 61-100
history_5 = model_1.fit(train_features,
                        train_L_onehot,
                        epochs=40, batch_size=32,
                        validation_data=(val_features, val_L_onehot),
                        verbose=2)

In [None]:
plt.figure(figsize=(15, 4.8))
fig_loss_5 = plot_loss(history_5)

plt.figure(figsize=(15, 4.8))
fig_acc_5 = plot_acc(history_5)

In [None]:
fig_loss_5.savefig("figs/18apps/model_1H_trn5_plot_loss.png")
fig_acc_5.savefig("figs/18apps/model_1H_trn5_plot_acc.png")

### Examining Results of Training

(Epochs 1 -- 100, or however long you took for your own experiment)

Combine the loss & accuracy figures in one panel plot:

In [None]:
fig_combined_1 = combine_plots(history_1, plot_loss2, plot_acc2,
                               acc_epoch_shifts=(0,1), loss_epoch_shifts=(0,1))

In [None]:
fig_combined_1.savefig("figs/18apps/model_1H_trn1_plot_combined.png")

In [None]:
fig_combined_2 = combine_plots(history_2, plot_loss2, plot_acc2,
                               acc_epoch_shifts=(10,11), loss_epoch_shifts=(10,11))

In [None]:
fig_combined_2.savefig("figs/18apps/model_1H_trn2_plot_combined.png")

In [None]:
fig_combined_3 = combine_plots(history_3, plot_loss2, plot_acc2,
                               acc_epoch_shifts=(20,21), loss_epoch_shifts=(20,21))

In [None]:
fig_combined_3.savefig("figs/18apps/model_1H_trn3_plot_combined.png")

In [None]:
fig_combined_4 = combine_plots(history_4, plot_loss2, plot_acc2,
                               acc_epoch_shifts=(30,31), loss_epoch_shifts=(30,31),
                               figsize=(15,5))

In [None]:
fig_combined_4.savefig("figs/18apps/model_1H_trn4_plot_combined.png")

In [None]:
fig_combined_5 = combine_plots(history_5, plot_loss2, plot_acc2,
                               acc_epoch_shifts=(60,61), loss_epoch_shifts=(60,61),
                               figsize=(15,5))

In [None]:
fig_combined_5.savefig("figs/18apps/model_1H_trn5_plot_combined.png")

# The Postscript

## Solution: Data Cleaning

```python
#RUNIT
"""Perform cleaning of a Sherlock "18-apps" dataset.
Columns with obviously bad or missing data are removed.
"""
# Missing data or bad data
del_features_bad = [
    'cminflt', # all-missing feature
    'guest_time', # all-flat feature
]
df2 = df.drop(del_features_bad, axis=1)

print("Cleaning:")
print("- dropped %d columns: %s" % (len(del_features_bad), del_features_bad))
```
Output:
```
Cleaning:
- dropped 2 columns: ['cminflt', 'guest_time']
```


```python
#RUNIT
print("- remaining missing data (per feature):")

isna_counts = df2.isna().sum()
print(isna_counts[isna_counts > 0])
print("- dropping the rest of missing data")

df2.dropna(inplace=True)

print("- remaining shape: %s" % (df2.shape,))
```
Output:
```
- remaining missing data (per feature):
CPU_USAGE      52
cutime         52
num_threads    52
priority       52
rss            52
state          52
stime          52
utime          52
vsize          52
dtype: int64
- dropping the rest of missing data
- remaining shape: (273077, 17)
```