In [1]:
# Enable IPython's autoreload extension in mode 1, which reloads only the
# modules registered with %aimport before each cell runs. This lets edits to
# the local helpers/loaders/plotters modules take effect without a restart.
%load_ext autoreload
%autoreload 1
%aimport helpers, loaders, plotters

In [2]:
# Point joblib's temp folder at /tmp.
# This helps with joblib "No space left on device" errors.
%env JOBLIB_TEMP_FOLDER=/tmp

env: JOBLIB_TEMP_FOLDER=/tmp


In [3]:
from pprint import pprint
from math import ceil, floor
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from helpers import balanced_accuracy_scorer, balanced_accuracy, load_best_search, load_best_learning, save_learning_curve, save_search_result, scikit_cv_result_to_df
from loaders import load_adult, load_mnist
from plotters import plot_means_w_stds, gen_and_plot_learning_curve, plot_learning_curve, gen_and_plot_validation_curve, plot_validation_curve

### Load Data

In [4]:
# Experiment configuration. `loader_func` fetches the data; `dataset` and
# `learner_type` are the labels passed to the save/load helpers further down.
loader_func = load_mnist
learner_type = 'ANN'
dataset = 'MNIST'

In [5]:
# Load the dataset through the configured loader with its preprocessing step
# enabled. Later cells expect the returned frame to contain a 'target' column.
df = loader_func(preprocess=True)

Target column encoded with the following classes [ 4.  9.]
Dataset shape (13782, 785)
Value composition:
1    0.504861
0    0.495139
Name: target, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  filtered_df['target'] = le.fit_transform(filtered_df['target'])


### Split data into training and testing

In [6]:
# NOTE: train_test_split is already imported in the imports cell at the top of
# the notebook, so it is not re-imported here.

# Features are every column except the label column 'target'.
X = df.drop('target', axis=1)
y = df['target']

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=1
)

### Scale Data

In [7]:
# The input is sparse (mostly zeros), so min-max scaling is used.
# The scaler is fitted on the training split only, and the same fitted
# transform is then applied to both splits so no test data leaks into it.
scaler = MinMaxScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Neural Nets

In [12]:
# Build a KerasClassifier for use with scikit-learn
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import SGD
from keras.wrappers.scikit_learn import KerasClassifier


# Number of input columns; read by build_keras_clf as a module-level value.
n_features = X_train.shape[1]


def build_keras_clf(hidden_layer_sizes=(10,10), learning_rate_init=0.01, momentum=0.8):
    """Build and compile a Keras model for use with scikit's GridSearch.

    The model is a stack of fully connected layers:

    * a first ReLU ``Dense`` layer with ``n_features`` units that receives the
      ``n_features`` inputs (``n_features`` is the module-level value above),
    * one ReLU ``Dense`` layer per entry of ``hidden_layer_sizes``,
    * a single sigmoid output unit.

    It is compiled with SGD (Nesterov momentum enabled), mean squared error
    loss and the ``accuracy`` metric.

    Parameters
    ----------
    hidden_layer_sizes : int, tuple of int or list of int
        Width of each additional hidden layer, in order. A bare number is
        treated as a single layer. Lists are accepted as well as tuples.
    learning_rate_init : float
        Learning rate handed to the SGD optimizer.
    momentum : float
        Momentum handed to the SGD optimizer.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.

    Raises
    ------
    AssertionError
        If any entry of ``hidden_layer_sizes`` is not greater than zero.
    """
    # Normalise to a tuple of widths. Lists must be treated as sequences too:
    # wrapping a list into a 1-tuple would make the size check below compare
    # a list against 0 and fail with a TypeError.
    if isinstance(hidden_layer_sizes, (list, tuple)):
        hidden_layer_sizes = tuple(hidden_layer_sizes)
    else:
        hidden_layer_sizes = (hidden_layer_sizes,)

    model = Sequential()

    # NOTE(review): this is a trainable Dense layer with n_features units, so
    # every network gets this layer in addition to hidden_layer_sizes --
    # confirm that is intended rather than a plain input declaration.
    model.add(Dense(units=n_features, input_shape=(n_features,), activation='relu'))

    for layer_size in hidden_layer_sizes:
        assert layer_size > 0, 'hidden layer sizes must be positive, got %r' % (layer_size,)
        model.add(Dense(units=layer_size, activation='relu'))

    # Add output layer
    model.add(Dense(units=1, activation='sigmoid'))

    sgd = SGD(lr=learning_rate_init, momentum=momentum, nesterov=True)
    # NOTE(review): MSE on a sigmoid output is kept as in the original
    # experiment; binary cross-entropy is the more usual choice -- confirm.
    model.compile(loss='mean_squared_error', optimizer=sgd, metrics=["accuracy"])

    return model

# scikit-learn compatible wrapper around the builder; verbose=0 silences
# Keras' per-epoch logging.
clf = KerasClassifier(build_fn=build_keras_clf, verbose=0)

### GridSearch for model complexity curves

In [9]:
# Candidate architectures: two hidden layers of equal width per network.
layer_sizes = [(width, width) for width in (10, 50, 100, 200)]

# Hyper-parameter grid handed to the grid search below:
# 4 * 4 * 3 * 5 * 3 = 720 combinations.
param_grid = dict(
    hidden_layer_sizes=layer_sizes,
    learning_rate_init=[0.01, 0.05, 0.1, 0.2],
    momentum=[0.7, 0.8, 0.9],
    epochs=[1, 10, 100, 200, 500],
    batch_size=[32, 64, 128],
)

In [15]:
# Settings for the exhaustive search: accuracy scoring, 4-fold
# cross-validation, train scores kept alongside validation scores, detailed
# progress logging, and all available cores.
search_options = dict(
    scoring='accuracy',
    return_train_score=True,
    cv=4,
    verbose=10,
    n_jobs=-1,
)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, **search_options)

In [16]:
# Run the search on the scaled training split. The recorded log reports
# 720 candidates x 4 folds = 2880 fits; the recorded run was stopped with a
# KeyboardInterrupt before it completed.
grid_search.fit(X_train_scaled, y_train)

Fitting 4 folds for each of 720 candidates, totalling 2880 fits
[CV] batch_size=32, epochs=1, hidden_layer_sizes=(10, 10), learning_rate_init=0.01, momentum=0.7 
[CV] batch_size=32, epochs=1, hidden_layer_sizes=(10, 10), learning_rate_init=0.01, momentum=0.7 
[CV] batch_size=32, epochs=1, hidden_layer_sizes=(10, 10), learning_rate_init=0.01, momentum=0.7 
[CV] batch_size=32, epochs=1, hidden_layer_sizes=(10, 10), learning_rate_init=0.01, momentum=0.7 


KeyboardInterrupt: 

In [None]:
# Pass the search object to the project helper together with the dataset and
# learner labels (presumably persists the results -- see helpers.save_search_result).
save_search_result(grid_search, dataset, learner_type)

### Model Complexity Analysis

In [None]:
# Take the results straight from the search object fitted in this session.
res = grid_search.cv_results_
# To reuse results from an earlier run instead, swap in the line below.
# res = load_best_search(dataset, learner_type)
res_df = scikit_cv_result_to_df(res)

# Show the ten rows with the best (lowest) test-score rank.
ranked_results = res_df.sort_values(by='rank_test_score')
ranked_results.head(10)

In [None]:
# small_network = res_df.xs((10,10), level='hidden_layer_sizes')
# large_network = res_df.xs((100,100), level='hidden_layer_sizes')

In [None]:
# # Plot combined validation curve
# plot_means_w_stds(
#     means=(small_network['mean_train_score'], large_network['mean_train_score'], small_network['mean_test_score'], large_network['mean_test_score']),
#     stds=(small_network['std_train_score'], large_network['std_train_score'], small_network['std_test_score'], large_network['std_test_score']),
#     xrange=param_grid['epochs'],
#     xlabel='Epochs',
#     ylabel='Accuracy',
#     series_labels=('Train: small network', 'Train: large network', 'CV: small network', 'CV: large network'),
#     title='MNIST - ANN validation curves',
#     legend_kwargs={'loc': 'best'}
# )

### Timing Analysis

In [17]:
# means = [small_network['mean_fit_time'], large_network['mean_fit_time']]
# stds = [small_network['std_fit_time'], large_network['std_fit_time']]
# labels = ['Small network', 'Large network']

# # Calculate per-sample mean
# n_samples = X_train_scaled.shape[0] * 3/4 # three-quarters of dataset was fit for 4-fold validation
# # convert to milliseconds
# means_per_samp = [mean / n_samples * 1000 for mean in means]
# stds_per_samp = [std / n_samples * 1000 for std in stds]

# plot_means_w_stds(
#     means_per_samp,
#     stds_per_samp,
#     param_grid['epochs'],
#     series_labels=labels,
#     title='MNIST - ANN train time complexity',
#     legend_kwargs={'loc': 'best'},
#     ylabel='Fit time per sample (ms)',
#     xlabel='Epochs',
#     linestyles=['-']*2,
# #     fig_kwargs={'figsize': (10,6)},
# )

## Learning Curves for best learner

In [None]:
# Position of the candidate with the highest mean cross-validated test score.
best_idx = np.argmax(res['mean_test_score'])
best_params = res['params'][best_idx]

# Fresh, unfitted classifier configured with the winning hyper-parameters.
clf = KerasClassifier(build_fn=build_keras_clf, **best_params)
clf.get_params()

In [None]:
# Generate and plot 4-fold learning curves for the best classifier, then
# unpack the returned series so they can be saved afterwards.
learning_curve_result = gen_and_plot_learning_curve(
    clf,
    X_train_scaled,
    y_train,
    ylim=None,
    cv=4,
    title='MNIST - ANN learning curves',
)
train_sizes, train_mean, train_std, test_mean, test_std = learning_curve_result

In [None]:
# Pass the learning-curve series to the project helper together with the
# dataset and learner labels (presumably persists them -- see helpers.save_learning_curve).
save_learning_curve(dataset, learner_type, train_sizes, train_mean, train_std, test_mean, test_std)

# Test Set Performance

In [None]:
# Rebuild the classifier from the best hyper-parameters. build_fn has to be
# passed explicitly: without it the wrapper has no model-building function to
# receive hidden_layer_sizes / learning_rate_init / momentum, so construction
# fails before any training happens.
best_clf = KerasClassifier(build_fn=build_keras_clf, **best_params)

# Train on the full scaled training split, then score once on the held-out
# test split, which has not been used for fitting or model selection.
best_clf.fit(X_train_scaled, y_train)
y_test_pred = best_clf.predict(X_test_scaled)
test_score = accuracy_score(y_test, y_test_pred)
print('%.4f' % test_score)