# <center>IEE 520: Fall 2019</center>

# <center>Neural Networks (9/19/19)</center>

## <center>Klim Drobnyh (klim.drobnyh@asu.edu)</center>

In [None]:
# For compatibility with Python 2
from __future__ import print_function

# To suppress convergence warnings
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# To load datasets
from sklearn import datasets

# To import the classifier (Neural Networks)
import sklearn.neural_network as NN

# To scale the data
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# To measure accuracy
from sklearn import metrics

# To split data to train and test
from sklearn.model_selection import train_test_split

# To perform parameter search
from sklearn.model_selection import GridSearchCV

# To support plots
import matplotlib.pyplot as plt

# To display all the plots inline
%matplotlib inline

In [None]:
# Set the default figure size used by all subsequent plots
plt.rcParams["figure.figsize"] = (10, 5)

## <center>1. Breast cancer dataset (classification)</center>

### <center>Load the dataset</center>

In [None]:
# Load the feature matrix and the target vector directly.
# `return_X_y` is passed by keyword: recent scikit-learn releases no longer
# accept it as a positional argument.
X, y = datasets.load_breast_cancer(return_X_y=True)

In [None]:
# Shape of the feature matrix
X.shape

The dataset consists of 569 instances, 357 benign and 212 malignant. 30 features were collected. This is a classification problem with 2 target classes.

### <center>Split to train and test</center>

Here, the data will be split into training and test sets. Only 10% of the data will be used for testing.

In [None]:
# Hold out 10% of the samples as a test set; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=520)

### <center>Train a simple neural network</center>

Now, let's train a simple neural network with default parameters, except that it has a single hidden layer of just 10 neurons.

In [None]:
# Fit a network with one hidden layer of 10 units, then predict the held-out samples
model = NN.MLPClassifier(random_state=520, hidden_layer_sizes=(10,)).fit(X_train, y_train)
y_hat_test = model.predict(X_test)

Scores (accuracies):

In [None]:
# Evaluate on both splits first, then report
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print('Train score:', train_accuracy)
print('Test score:', test_accuracy)

Confusion matrix:

In [None]:
# pandas_ml provides the ConfusionMatrix helper used below and must be installed first:
# conda install -c conda-forge pandas_ml
# NOTE(review): pandas_ml appears to be unmaintained -- confirm that it still imports
# against the installed pandas / scikit-learn versions.

# Uncomment the next line to install the missing package in a Google Colab environment
# !pip install pandas_ml
from pandas_ml import ConfusionMatrix

In [None]:
# Build the confusion matrix from the true and predicted test labels,
# print its summary statistics and plot it with the seaborn backend
cm = ConfusionMatrix(y_test, y_hat_test)
cm.print_stats()
cm.plot(backend='seaborn', annot=True, fmt='g')
plt.show()

#### Conclusion: the model is simple and underfits the data

## <center>Random state</center>

In [None]:
# Same architecture, random_state=520
model = NN.MLPClassifier(random_state=520, hidden_layer_sizes=(10,)).fit(X_train, y_train)
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print('Train score:', train_accuracy)
print('Test score:', test_accuracy)

In [None]:
# Same architecture, random_state=521
model = NN.MLPClassifier(random_state=521, hidden_layer_sizes=(10,)).fit(X_train, y_train)
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print('Train score:', train_accuracy)
print('Test score:', test_accuracy)

In [None]:
# Same architecture, random_state=522
model = NN.MLPClassifier(random_state=522, hidden_layer_sizes=(10,)).fit(X_train, y_train)
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print('Train score:', train_accuracy)
print('Test score:', test_accuracy)

We can see here that the results may change depending only on the weight initialization.

### <center>Scale the data</center>

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Now, let's train exactly the same classifier, but using the scaled data

In [None]:
# Identical classifier settings as before, now trained on the standardized features
model = NN.MLPClassifier(random_state=520, hidden_layer_sizes=(10,)).fit(X_train_scaled, y_train)
y_hat_test = model.predict(X_test_scaled)

Scores:

In [None]:
# Evaluate on the standardized splits first, then report
train_accuracy = model.score(X_train_scaled, y_train)
test_accuracy = model.score(X_test_scaled, y_test)
print('Train score:', train_accuracy)
print('Test score:', test_accuracy)

Confusion matrix:

In [None]:
# Confusion matrix for the classifier trained on scaled data:
# print its summary statistics and plot it with the seaborn backend
cm = ConfusionMatrix(y_test, y_hat_test)
cm.print_stats()
cm.plot(backend='seaborn', annot=True, fmt='g')
plt.show()

## <center>2. Diabetes dataset (regression)</center>

### <center>Load the dataset</center>

In [None]:
# Load the feature matrix and the target vector directly.
# `return_X_y` is passed by keyword: recent scikit-learn releases no longer
# accept it as a positional argument.
X, y = datasets.load_diabetes(return_X_y=True)

In [None]:
# Shape of the feature matrix
X.shape

The dataset consists of 442 instances. 10 features were collected. This is a regression problem, the target is a quantitative measure of disease progression one year after baseline.

Here, the data will be split into training and test sets. Only 10% of the data will be used for testing.

In [None]:
# Hold out 10% of the samples as a test set; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=520)

### <center>Scale the data</center>

In [None]:
# Standardize the features using statistics learned from the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Rescale the target variable into the (0.1, 0.9) range (fitted on the training
# targets), intended for use with the logistic activation function.
# The scaler works on 2-D input, so each target vector is reshaped into a
# single column and flattened back to 1-D afterwards.
scaler = MinMaxScaler(feature_range=(0.1, 0.9))
y_train = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
y_test = scaler.transform(y_test.reshape(-1, 1)).ravel()

### <center>Train a simple neural network</center>

Now, let's train a simple neural network with default parameters.

In [None]:
# Regressor with default settings (only the random seed is fixed)
model = NN.MLPRegressor(random_state=520).fit(X_train, y_train)
y_hat_train = model.predict(X_train)
y_hat_test = model.predict(X_test)

Scores (coefficients of determination, which may also be negative):

In [None]:
# Evaluate on both splits first, then report
train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)
print('Train score:', train_r2)
print('Test score:', test_r2)

Visualize the results (predicted vs actual):

In [None]:
def predicted_vs_actual(predicted, actual):
    """Show a plot of actual values against predicted values.

    Every sample is drawn with the 'ro' marker style, using the predicted
    value as the x coordinate and the actual value as the y coordinate.
    The figure is displayed immediately via ``plt.show()``.
    """
    axes = plt.gca()
    axes.plot(predicted, actual, 'ro')
    axes.set_xlabel('Predicted')
    axes.set_ylabel('Actual')
    axes.set_title('Predicted vs Actual')
    plt.show()


# Alias under which the plotting helper is called in the rest of the notebook
results_plot = predicted_vs_actual

In [None]:
# Plot the training-set predictions first, then the test-set predictions
results_plot(y_hat_train, y_train)
results_plot(y_hat_test, y_test)

### <center>Train more complex neural networks</center>

In [None]:
# Four hidden layers of 40 ReLU units each, with a large iteration budget
# and a very small tolerance
model = NN.MLPRegressor(
    hidden_layer_sizes=(40, 40, 40, 40),
    activation='relu',
    random_state=520,
    max_iter=10000,
    tol=1e-10,
).fit(X_train, y_train)

y_hat_train = model.predict(X_train)
y_hat_test = model.predict(X_test)

train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)
print('Train score:', train_r2)
print('Test score:', test_r2)

results_plot(y_hat_train, y_train)
results_plot(y_hat_test, y_test)

What problem do we have? Overfitting!

In [None]:
# Based on the following:
# Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

import numpy as np
from sklearn.model_selection import validation_curve

def plot_validation_curve(estimator, title, X, y, ylim=None, cv=None,
                          n_jobs=1, iterations=None):
    """Plot training and cross-validation scores against ``max_iter``.

    The scores are computed with ``validation_curve`` by varying the
    ``max_iter`` parameter of ``estimator`` over ``iterations``. The mean
    score is drawn as a line and one standard deviation around it as a
    shaded band (red for training, green for cross-validation).

    Parameters
    ----------
    estimator : object
        Estimator whose ``max_iter`` parameter is varied.
    title : str
        Title of the figure.
    X, y : array-like
        Data passed on to ``validation_curve``.
    ylim : tuple, optional
        ``(ymin, ymax)`` limits of the score axis.
    cv : optional
        Cross-validation setting passed on to ``validation_curve``.
    n_jobs : int, optional
        Number of parallel jobs passed on to ``validation_curve``.
    iterations : sequence of int
        Values of ``max_iter`` to evaluate. Required despite the ``None``
        default, which is kept for backward compatibility of the signature.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so the caller can call ``.show()``.

    Raises
    ------
    ValueError
        If ``iterations`` is not given.
    """
    if iterations is None:
        raise ValueError("iterations must be a sequence of max_iter values")

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Number of iterations")
    plt.ylabel("Score")

    # Use the estimator passed in (not a global variable), name the parameter
    # arguments explicitly and honour the requested parallelism
    train_scores, test_scores = validation_curve(estimator, X, y,
                                                 param_name="max_iter",
                                                 param_range=iterations,
                                                 cv=cv,
                                                 n_jobs=n_jobs)

    # One row per max_iter value, one column per CV fold: aggregate over folds
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(iterations, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(iterations, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(iterations, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(iterations, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

Let's plot a validation curve:

In [None]:
# Same four-layer ReLU architecture; max_iter is swept from 5 to 149 in steps of 3
model = NN.MLPRegressor(
    hidden_layer_sizes=(40,) * 4,
    activation='relu',
    random_state=520,
    tol=1e-10,
)
iteration_grid = list(np.arange(5, 150, 3))
plot_validation_curve(
    model,
    "Validation Curve (Neural Network, MLP)",
    X_train,
    y_train,
    ylim=(0.0, 1.01),
    cv=5,
    iterations=iteration_grid,
)
plt.show()

Let's train with the optimal number of iterations:

In [None]:
# Cap training at 23 iterations (the count this notebook treats as optimal);
# verbose=True prints the training progress
model = NN.MLPRegressor(
    hidden_layer_sizes=(40, 40, 40, 40),
    activation='relu',
    random_state=520,
    max_iter=23,
    tol=1e-10,
    verbose=True,
).fit(X_train, y_train)

y_hat_train = model.predict(X_train)
y_hat_test = model.predict(X_test)

train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)
print('Train score:', train_r2)
print('Test score:', test_r2)

results_plot(y_hat_train, y_train)
results_plot(y_hat_test, y_test)

Let's use cross-validation to find optimal parameters:

In [None]:
# Hidden layer sizes
# hidden_layer_sizes : tuple, default (100,)

# Activation function
# activation : {‘identity’, ‘logistic’, ‘tanh’, ‘relu’}, default ‘relu’

# Optimization solver
# solver : {‘lbfgs’, ‘sgd’, ‘adam’}, default ‘adam’

# L2 penalty parameter
# alpha : float, optional, default 0.0001

# Batch size
# batch_size : int, optional, default ‘auto’

# Learning rate (for sgd solver only)
# learning_rate : {‘constant’, ‘invscaling’, ‘adaptive’}, default ‘constant’

# Learning rate init (for sgd or adam only)
# learning_rate_init : double, optional, default 0.001

# Exponent for inverse scaling learning rate (for sgd with invscaling only)
# power_t : double, optional, default 0.5

# Maximum number of iterations
# max_iter : int, optional, default 200

# Shuffle (for sgd or adam only)
# shuffle : bool, optional, default True

# Random state
# random_state : int, RandomState instance or None, optional, default None

# Tolerance for the optimization
# tol : float, optional, default 1e-4

# Verbose: print progress
# verbose : bool, optional, default False

# Warm start: do we want to reuse the solution of the previous call?
# warm_start : bool, optional, default False

# Momentum (for sgd only)
# momentum : float, default 0.9

# Nesterov's momentum (for sgd only, when momentum > 0)
# nesterovs_momentum : boolean, default True

# Early stopping
# early_stopping : bool, default False

# Validation set for early stopping (only when early_stopping is True)
# validation_fraction : float, optional, default 0.1

# Beta 1 (for adam only). 
# Exponential decay rate for estimates of first moment vector in adam, should be in [0, 1).
# beta_1 : float, optional, default 0.9

# Beta 2 (for adam only).
# Exponential decay rate for estimates of second moment vector in adam, should be in [0, 1).
# beta_2 : float, optional, default 0.999

# Epsilon (for adam only). Value for numerical stability
# epsilon : float, optional, default 1e-8

# Maximum number of epochs to not meet tol improvement (for sgd or adam only)
# n_iter_no_change : int, optional, default 10

# 5-fold grid search over the L2 penalty (alpha) and the initial learning rate
# for the four-layer ReLU network with early stopping enabled
base_regressor = NN.MLPRegressor(
    hidden_layer_sizes=(40, 40, 40, 40),
    activation='relu',
    random_state=520,
    max_iter=10000,
    tol=1e-10,
    early_stopping=True,
)
search_space = {
    "alpha": [0.001, 0.1, 1, 10],
    "learning_rate_init": [0.001, 0.01, 0.1, 1],
}
NN_CV = GridSearchCV(base_regressor, cv=5, param_grid=search_space)
NN_CV.fit(X_train, y_train)

print('The parameters found by CV search:')
print(NN_CV.best_params_)

# The fitted search object is used directly as the model for prediction and scoring
model = NN_CV
y_hat_train = model.predict(X_train)
y_hat_test = model.predict(X_test)

train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)
print('Train score:', train_r2)
print('Test score:', test_r2)

results_plot(y_hat_train, y_train)
results_plot(y_hat_test, y_test)

Now, let's try logistic activation function:

In [None]:
# 5-fold grid search over the L2 penalty (alpha) and the initial learning rate
# for the four-layer network with the logistic activation and early stopping
base_regressor = NN.MLPRegressor(
    hidden_layer_sizes=(40, 40, 40, 40),
    activation='logistic',
    random_state=520,
    max_iter=10000,
    early_stopping=True,
)
search_space = {
    "alpha": [0.0001, 0.001, 0.1, 1],
    "learning_rate_init": [0.001, 0.01, 0.1, 1],
}
NN_CV = GridSearchCV(base_regressor, cv=5, param_grid=search_space)
NN_CV.fit(X_train, y_train)

print('The parameters found by CV search:')
print(NN_CV.best_params_)

# The fitted search object is used directly as the model for prediction and scoring
model = NN_CV
y_hat_train = model.predict(X_train)
y_hat_test = model.predict(X_test)

train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)
print('Train score:', train_r2)
print('Test score:', test_r2)

results_plot(y_hat_train, y_train)
results_plot(y_hat_test, y_test)