## MLP Cross Validation for Hyperparameter TUning

In [1]:
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import dok_matrix, coo_matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
#load data
# f1='dsga-1003-extreme-classification/data/train_features_sparse.pickle'
# f2='dsga-1003-extreme-classification/data/train_labels_sparse.pickle'
f1='data/train_features_sparse.pickle'
f2='data/train_labels_sparse.pickle'
train_features = pickle.load(open(f1, 'rb'))
train_labels = pickle.load(open(f2, 'rb'))

train_features = train_features.toarray()
train_labels = train_labels.toarray()*1

### Multi-Layer Perceptron Classifier
This model is a multi-layer Perceptron-based neural network. The model optimizes the log-loss function using stochastic gradient descent or a quasi-Newton method called limited memory BFGS, and the model is trained iteratively (at each step the partial derivatives of the loss function are recomputed to update the parameters). Using a logistic activation function allows us to get independent scores for each label; combining these scores with a cutoff lets us assign multiple labels to an input.

In [4]:
#80/20 train/val split
X, y = train_features, train_labels
X = StandardScaler().fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=2020)

In [5]:
#Model using default configuration
clf = MLPClassifier()
clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [6]:
print('Accuracy on training data: {:.4f}'.format(clf.score(X_train, y_train)))
print('Accuracy on validation data: {:.4f}'.format(clf.score(X_val, y_val)))

Accuracy on training data: 0.0000
Accuracy on validation data: 0.0000


---

In [7]:
#K-fold cross validation split
X, y = train_features, train_labels
X = StandardScaler().fit_transform(X)
kf = KFold(n_splits=5, random_state=2020, shuffle=True)

In [None]:
params = {
    'alpha': [1e-7, 1e-5, 1e-3, 0.01, 0.1, 1, 5],
    'activation': ['logistic'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'learning_rate_init': [0.0001, 0.001, 0.01]
}
res = []
for a in params['alpha']:
    for act in params['activation']:
        for s in params['solver']:
            for lr in params['learning_rate_init']:
                clf = MLPClassifier(alpha=a, activation=act, solver=s, learning_rate_init=lr)
                scores = []
                for k, (train, val) in enumerate(kf.split(X, y)):
                    clf.fit(X[train], y[train])
                    preds = clf.predict_proba(X[val])
                    score = label_ranking_average_precision_score(y[val], preds)
                    scores.append(score)
                LRAP = np.mean(scores)
                res.append([a,act,s,lr,LRAP])

In [13]:
results = pd.DataFrame(res, columns=['alpha', 'activation', 'solver','learning_rate_init','LRAP score'])
results

Unnamed: 0,alpha,activation,solver,learning_rate_init,LRAP score
0,1e-07,logistic,lbfgs,0.0001,0.388539
1,1e-07,logistic,lbfgs,0.001,0.388837
2,1e-07,logistic,lbfgs,0.01,0.384564
3,1e-07,logistic,sgd,0.0001,0.071043
4,1e-07,logistic,sgd,0.001,0.191409
5,1e-07,logistic,sgd,0.01,0.429566
6,1e-07,logistic,adam,0.0001,0.256408
7,1e-07,logistic,adam,0.001,0.511895
8,1e-07,logistic,adam,0.01,0.404816
9,1e-05,logistic,lbfgs,0.0001,0.387626


In [16]:
results.sort_values('LRAP score',ascending=False)

Unnamed: 0,alpha,activation,solver,learning_rate_init,LRAP score
22,0.01,logistic,adam,0.001,0.51422
16,1e-05,logistic,adam,0.001,0.512833
20,0.001,logistic,adam,0.001,0.512389
7,1e-07,logistic,adam,0.001,0.511895
24,0.1,logistic,adam,0.001,0.499702
25,0.1,logistic,adam,0.01,0.48697
23,0.01,logistic,adam,0.01,0.462161
14,1e-05,logistic,sgd,0.01,0.431127
5,1e-07,logistic,sgd,0.01,0.429566
17,1e-05,logistic,adam,0.01,0.406926


The best configuration found used alpha=0.01, logistic activation function, solver='adam', and learning rate of 0.001. The LRAP score attained was 0.514220.

---