# This notebook corresponds to the MNIST Day 11 Activity

In [None]:
#imports
import numpy as np
import statistics as stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import keras as ke
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

# Load the MNIST digit images and labels through the Keras dataset helper
from keras.datasets import mnist

(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

# Carve a train/validation split out of the loaded training set.
# train_size=0.15 and test_size=0.1 keep 15% of the samples for training
# and 10% for validation; random_state pins the shuffle.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    train_size=0.15,
    test_size=0.1,
    random_state=2,
)

### K-Fold Cross Validation
    * While our traditional approach of splitting the full dataset once into test/training data is fine,
    it is more robust to evaluate our model on test data pulled from different subsets of the entire dataset.
    * K-Fold CV splits a dataset into K sub-bins (folds). Each fold is used as the test data exactly once, while all other
    folds are used to train the model (find optimal parameters).
    * Model accuracy across the K held-out folds provides a more robust metric of overall model performance.

In [None]:

### Showing some exploratory data plots

In [None]:
#Digit Distributions
# Bar chart of how many training images exist for each digit label
plt.figure(figsize=(15,7))
# Pass the labels as x= explicitly: newer seaborn releases interpret the
# first positional argument as `data`, not as the variable to count.
g = sns.countplot(x=Y_train, palette="icefire")
plt.title("Number of digit classes")
plt.show()

In [None]:
# Preview one sample from the training images

# image array stored at index 0 of X_train
x_img_arr = X_train[0]

# draw it in grayscale without axis ticks
plt.imshow(x_img_arr, cmap='gray')
plt.axis("off")
# the title shows the label that belongs to this image
plt.title(f'{Y_train[0]}')
plt.show()

### As per the kaggle notebook, we must normalize+reshape+label encode data for easier model training

#### Similar to taking a z-score in stats, dividing by 255 normalizes our data and squishes pixel values into the range 0 --> 1
#### Pixel values in the x arrays can only range from 0 --> 255, so dividing the x arrays by 255 gives nicely scaled data

In [None]:
# Rescale pixel intensities by dividing both image sets by 255
# NOTE(review): X_val from the earlier split is not rescaled here — confirm
# whether it is meant to be used later.
X_train, X_test = (pixels / 255.0 for pixels in (X_train, X_test))

In [None]:
# Reshape the image arrays to the layout
#   (# of images, image width, image height, ColorDimension)
#
# Note:
# ColorDimension = the # of data components stored per pixel.
# RGB images have a ColorDimension of 3 (Red, Green, Blue per pixel).
# Grayscale images have a ColorDimension of 1 (a single intensity per pixel),
# which is why the trailing axis below has length 1.
X_train = np.reshape(X_train, (X_train.shape[0], 28, 28, 1))
X_test = np.reshape(X_test, (X_test.shape[0], 28, 28, 1))

#### label encoding = converting non-numeric data labels (such as tall/short) to numbers (such as 1/0)
#### label encoding is critical because it allows our model to calculate loss during Supervised Learning and perform GD
#### in this case, y labels will be encoded w.r.t. the number they represent
#### ex: if the y output number is 7, we will label encode such that the model receives a y output vector:

[[0],[0],[0],[0],[0],[0],[0],[1],[0],[0]]

#### For each feed forward, we will compare the ith neuron's prediction vs. the ith position in our label encoded y array
#### This makes it very easy to directly calculate the log loss associated with each neuron in the output layer and backpropagate


In [None]:
# One-hot encode the train and test labels into vectors with 10 classes
Y_train, Y_test = (
    to_categorical(digit_labels, num_classes=10)
    for digit_labels in (Y_train, Y_test)
)

#### Data has now been standardized, reshaped, & label encoded. Now we can import CNN models for fitting


In [None]:
#making model imports from keras

from sklearn.metrics import confusion_matrix
import itertools

from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.optimizers import RMSprop,Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau

#### Our CNN will have 2 Conv layers and 2 Pool layers. We will then flatten the data & feed it into a Fully Connected NN with 2 layers (1 hidden, 1 output)
#### We will apply dropout 3 times during forward propagation (after each conv/pool stage and once inside the fully connected network) to combat overfitting

##### conv --> pool --> [dropout regularization] --> conv --> pool --> [dropout regularization] --> Flatten Data --> Fully Connected ANN

In [None]:
#creating a function to define a CNN model

#I will be messing with:
    #the dropout rate (convo layer)
    #the dropout rate (fully-connected layer)
    #Activation function used by the Convo/FC layers

def gen_model(convo_dropout_rate,fc_nn_dropout_rate,hidden_layer_activiation_func):
    """Build and compile the CNN used for digit classification.

    Architecture: conv -> pool -> dropout -> conv -> pool -> dropout ->
    flatten -> dense(256) -> dropout -> dense(10, softmax).

    Args:
        convo_dropout_rate: dropout rate applied after each conv/pool stage.
        fc_nn_dropout_rate: dropout rate applied after the 256-unit dense layer.
        hidden_layer_activiation_func: Keras activation name used by both conv
            layers and the hidden dense layer (e.g. 'relu', 'tanh', 'sigmoid').
            'logistic' is accepted as an alias for 'sigmoid'.

    Returns:
        A compiled Sequential model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    # Keras calls the logistic function 'sigmoid'; translate the alias so
    # callers that pass 'logistic' do not hit an unknown-activation error
    if hidden_layer_activiation_func == 'logistic':
        activation = 'sigmoid'
    else:
        activation = hidden_layer_activiation_func

    model = Sequential()
    #conv layer 1
    model.add(Conv2D(filters = 8, kernel_size = (5,5),padding = 'same',
                     activation = activation, input_shape = (28,28,1)))
    #pooling layer 1
    model.add(MaxPool2D(pool_size=(2,2)))
    #first dropout regularization
    model.add(Dropout(convo_dropout_rate))
    #conv layer 2
    model.add(Conv2D(filters = 16, kernel_size = (3,3),padding = 'same',
                     activation = activation))
    #pooling layer 2
    model.add(MaxPool2D(pool_size=(2,2), strides=(2,2)))
    #second dropout regularization
    model.add(Dropout(convo_dropout_rate))

    #flattening model tensors before feeding into FC NN
    #(28x28 input, two 'same' convs and two 2x2 pools -> 7*7*16 = 784 values)
    model.add(Flatten())

    #hidden fully-connected layer; its width (256) is a design choice and is
    #independent of the flattened input length
    model.add(Dense(256, activation = activation))
    #third dropout regularization, controlled by the FC dropout hyperparameter
    model.add(Dropout(fc_nn_dropout_rate))
    #output layer w/ 10 neurons to perform the digit classification
    model.add(Dense(10, activation = "softmax")) #softmax b/c multinomial classification [0-->9]

    #defining the GD optimizer for our model
    #(`learning_rate` replaces the deprecated `lr` keyword)
    optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

    #defining our multinomial cost function
    model.compile(optimizer = optimizer , loss='categorical_crossentropy', metrics=["accuracy"])

    return model

In [None]:
# Extra guard against overfitting: feed the model randomly perturbed images
augmentation_settings = {
    # centering / std-normalization / ZCA whitening are all switched off
    "featurewise_center": False,
    "samplewise_center": False,
    "featurewise_std_normalization": False,
    "samplewise_std_normalization": False,
    "zca_whitening": False,
    # small random geometric perturbations
    "rotation_range": 5,        # rotation range of 5 degrees
    "zoom_range": 0.1,          # zoom range of 10%
    "width_shift_range": 0.1,   # horizontal shift range of 10%
    "height_shift_range": 0.1,  # vertical shift range of 10%
    # flipping is switched off in both directions
    "horizontal_flip": False,
    "vertical_flip": False,
}
datagen = ImageDataGenerator(**augmentation_settings)

# NOTE(review): fit() is documented as only required for the featurewise/ZCA
# options, which are all off here — kept to match the original flow.
datagen.fit(X_train)

In [None]:
#this method trains the passed model in more robust manner through K-Fold CV w/ 5 Folds
# fun tweak parameter = # of epochs we train for
# to save computing power, all models will be trained w/ a constant batch size (200)

from sklearn.model_selection import KFold

def k_fold_train(epoch_num,convo_dropout_rate,fc_nn_dropout_rate,hidden_layer_activiation_func,
                 *, batch_size=200, n_splits=5):
    """Train a fresh CNN on every K-Fold split of the training data.

    Uses the module-level X_train / Y_train arrays, the `datagen` augmenter
    and `gen_model` to build one model per fold.

    Args:
        epoch_num: number of epochs each fold's model is trained for.
        convo_dropout_rate: forwarded to gen_model.
        fc_nn_dropout_rate: forwarded to gen_model.
        hidden_layer_activiation_func: forwarded to gen_model.
        batch_size: augmented batch size used for training (default 200).
        n_splits: number of folds (default 5).

    Returns:
        (scores, histories): the held-out accuracy of each fold and the
        object returned by model.fit for each fold, in fold order.
    """
    #init list of scores and model fit logs during sub-training
    scores, histories = [], []

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

    for train_ix, test_ix in kfold.split(X_train):
        #splitting our macro train data into this fold's train / held-out parts
        xtrain, xtest = X_train[train_ix], X_train[test_ix]
        ytrain, ytest = Y_train[train_ix], Y_train[test_ix]
        #a new model per fold so no weights carry over between folds
        model = gen_model(convo_dropout_rate,fc_nn_dropout_rate,hidden_layer_activiation_func)
        #at least one step, so a fold smaller than batch_size still trains
        steps_per_epoch = max(1, xtrain.shape[0] // batch_size)
        history  = model.fit(datagen.flow(xtrain,ytrain, batch_size=batch_size),
                              epochs = epoch_num, validation_data = (xtest,ytest),
                              steps_per_epoch=steps_per_epoch)
        #pulling model accuracy on the held-out fold
        _, acc = model.evaluate(xtest, ytest, verbose=0)
        print(f'Model accuracy during current subfold: {round(acc*100,4)}%')
        #storing accuracies/histories for model metadata analysis
        scores.append(acc)
        histories.append(history)

    return scores, histories

In [None]:
def summarize_performance(scores):
    """Print mean / std / count of the fold accuracies and draw a box plot.

    Args:
        scores: sequence of per-fold accuracy values; must not be empty
            (statistics.mean raises StatisticsError on empty input).
    """
    # the sample standard deviation needs at least two values
    spread = stats.stdev(scores) if len(scores) > 1 else 0.0
    # print summary
    print(f'Accuracy Metrics: mean= {round(stats.mean(scores),4)} | std = {round(spread,4)} | n={len(scores)}')
    # box and whisker plots of results (pyplot is imported as `plt` in this file)
    plt.boxplot(scores)
    plt.show()

In [None]:
def summarize_diagnostics(histories):
    """Overlay the learning curves of every fold on a two-row figure.

    The top panel shows the 'loss' / 'val_loss' series and the bottom panel
    the 'accuracy' / 'val_accuracy' series read from each entry's `.history`
    mapping. Train curves are blue, held-out curves are orange.

    Args:
        histories: sequence of objects exposing a `.history` mapping with
            the four keys named above.
    """
    # (subplot row, panel title, train key, held-out key)
    panels = (
        (1, 'Cross Entropy Loss', 'loss', 'val_loss'),
        (2, 'Classification Accuracy', 'accuracy', 'val_accuracy'),
    )
    for fold_run in histories:
        logged = fold_run.history
        for row, heading, train_key, held_out_key in panels:
            plt.subplot(2, 1, row)
            plt.title(heading)
            plt.plot(logged[train_key], color='blue', label='train')
            plt.plot(logged[held_out_key], color='orange', label='test')
    plt.show()

In [None]:
#init arrays of various values for our hyperparameters of interest

# A dropout rate of 1 would drop every activation, so the grids stop below it
convo_dropout_rates  = [0,0.25,0.5,0.75]
fc_nn_dropout_rate = [0,0.25,0.5,0.75]
# Keras names the logistic activation 'sigmoid'
hidden_layer_activiation_func = ['sigmoid','relu','tanh']

TOP_K = 3  # how many of the best configurations to keep and plot

# (mean_score, hyperparameters, histories) of the best runs so far, best first.
# A list is used instead of a dict keyed by score, so two configurations with
# the same mean score cannot overwrite each other.
top_models = []

#grid search: train every hyperparameter combination and keep the best TOP_K
for convo_drop_rate, fc_nn_dop_rate, act_func in itertools.product(
        convo_dropout_rates, fc_nn_dropout_rate, hidden_layer_activiation_func):
    scores,histories =  k_fold_train(epoch_num=5,convo_dropout_rate=convo_drop_rate,fc_nn_dropout_rate=fc_nn_dop_rate,
                                     hidden_layer_activiation_func=act_func)
    mean_score = stats.mean(scores)
    params = {'convo_dropout_rate': convo_drop_rate,
              'fc_nn_dropout_rate': fc_nn_dop_rate,
              'hidden_layer_activiation_func': act_func}
    top_models.append((mean_score, params, histories))
    # the sort is stable, so on a tie the earlier configuration stays ahead
    top_models.sort(key=lambda entry: entry[0], reverse=True)
    del top_models[TOP_K:]

# kept under their original names for any later cells that read them
mean_scores_log = [entry[0] for entry in top_models] #log of best mean scores
histories_dict = {entry[0]: entry[2] for entry in top_models} #meanscore,history obj

#plotting the metrics of the best models:
for mean_score, params, histories in top_models:
    print(f'This model had a mean accuracy score of {mean_score}')
    print(f'Hyperparameters: {params}')
    summarize_diagnostics(histories)