## Extreme Multilabel Classification Model Evaluation: Multilayer Perceptron, Classifier Chains

In [12]:
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import dok_matrix, coo_matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [13]:
import warnings
warnings.filterwarnings("ignore")

In [14]:
#load data
# f1='dsga-1003-extreme-classification/data/train_features_sparse.pickle'
# f2='dsga-1003-extreme-classification/data/train_labels_sparse.pickle'
f1='data/train_features_sparse.pickle'
f2='data/train_labels_sparse.pickle'
train_features = pickle.load(open(f1, 'rb'))
train_labels = pickle.load(open(f2, 'rb'))

train_features = train_features.toarray()
train_labels = train_labels.toarray()*1

### Multi-Layer Perceptron Classifier
This model is a multi-layer Perceptron-based neural network. The model optimizes the log-loss function using stochastic gradient descent or a quasi-Newton method called limited memory BFGS, and the model is trained iteratively (at each step the partial derivatives of the loss function are recomputed to update the parameters). Using a logistic activation function allows us to get independent scores for each label; combining these scores with a cutoff lets us assign multiple labels to an input.

In [4]:
#80/20 train/val split
X, y = train_features, train_labels
X = StandardScaler().fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=2020)

In [5]:
#Model using default configuration
clf = MLPClassifier()
clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [6]:
print('Accuracy on training data: {:.4f}'.format(clf.score(X_train, y_train)))
print('Accuracy on validation data: {:.4f}'.format(clf.score(X_val, y_val)))

Accuracy on training data: 0.8220
Accuracy on validation data: 0.0225


---

In [7]:
#K-fold cross validation split
X, y = train_features, train_labels
X = StandardScaler().fit_transform(X)
kf = KFold(n_splits=5, random_state=2020, shuffle=True)

In [8]:
# Tweaks to configuration:
# Using 5-fold cross validation
# Using a logistic activation function -- will allow us to get independent scores for each label
# Using 'adam' for weight optimization - per sklearn doc this SGD-based optimizer works well for large datasets 
# Increasing 'max-iter' (# of epochs) from default of 200 to 300
clf = MLPClassifier(activation='logistic', solver='adam', max_iter=200)
for k, (train, val) in enumerate(kf.split(X, y)):
    clf.fit(X[train], y[train])
    preds = clf.predict_proba(X[val])
    score = label_ranking_average_precision_score(y[val], preds)
    print("[fold {0}] LRAP score: {1:.5f}".format(k,score))

[fold 0] LRAP score: 0.51200
[fold 1] LRAP score: 0.51635
[fold 2] LRAP score: 0.51630
[fold 3] LRAP score: 0.51175
[fold 4] LRAP score: 0.50617


In [9]:
# # Grid search -- GridSearchCV doesn't work with LRAP :(
# params = {
#     'alpha': [1e-7, 1e-5, 1e-3, 0.01, 0.1, 1, 5],
#     'activation': ['logistic'],
#     'solver': ['lbfgs', 'sgd', 'adam'],
#     'learning_rate_init': [0.0001, 0.001, 0.01]
# }

# clf = MLPClassifier()
# search = GridSearchCV(clf, params, cv=5, scoring='label_ranking_average_precision_score')
# search.fit(X,y)

# ValueError: 'label_ranking_average_precision_score' is not a valid scoring value.

In [10]:
params = {
    'alpha': [1e-7, 1e-5, 1e-3, 0.01, 0.1, 1, 5],
    'activation': ['logistic'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'learning_rate_init': [0.0001, 0.001, 0.01]
}
res = []
for a in params['alpha']:
    for act in params['activation']:
        for s in params['solver']:
            for lr in params['learning_rate_init']:
                clf = MLPClassifier(alpha=a, activation=act, solver=s, learning_rate_init=lr)
                scores = []
                for k, (train, val) in enumerate(kf.split(X, y)):
                    clf.fit(X[train], y[train])
                    preds = clf.predict_proba(X[val])
                    score = label_ranking_average_precision_score(y[val], preds)
                    scores.append(score)
                LRAP = np.mean(scores)
                res.append([a,act,s,lr,LRAP])

KeyboardInterrupt: 

In [11]:
res

[[1e-07, 'logistic', 'lbfgs', 0.0001, 0.38853850563939674],
 [1e-07, 'logistic', 'lbfgs', 0.001, 0.3888370467396299],
 [1e-07, 'logistic', 'lbfgs', 0.01, 0.3845644126310316],
 [1e-07, 'logistic', 'sgd', 0.0001, 0.0710433161019608],
 [1e-07, 'logistic', 'sgd', 0.001, 0.1914086508914954],
 [1e-07, 'logistic', 'sgd', 0.01, 0.42956595519014373],
 [1e-07, 'logistic', 'adam', 0.0001, 0.2564076052341975],
 [1e-07, 'logistic', 'adam', 0.001, 0.5118946836227639],
 [1e-07, 'logistic', 'adam', 0.01, 0.40481569235954834],
 [1e-05, 'logistic', 'lbfgs', 0.0001, 0.38762618653352077],
 [1e-05, 'logistic', 'lbfgs', 0.001, 0.38540683195536396],
 [1e-05, 'logistic', 'lbfgs', 0.01, 0.3875600246271057],
 [1e-05, 'logistic', 'sgd', 0.0001, 0.07097464719495938],
 [1e-05, 'logistic', 'sgd', 0.001, 0.19065994696829042],
 [1e-05, 'logistic', 'sgd', 0.01, 0.4311268391928934],
 [1e-05, 'logistic', 'adam', 0.0001, 0.25661302809290504],
 [1e-05, 'logistic', 'adam', 0.001, 0.5128328375295367],
 [1e-05, 'logistic', '

In [None]:
results = pd.DataFrame(res, columns=['alpha', 'activation', 'solver','learning_rate_init','LRAP score'])
results

---

### Classifier chains
Classifier chains are commonly used in multilabel setting though they may prove too computationally expensive for the number of labels in our problem. They build a sequence of classifiers where dowstream models take the output of upstream models as input.

In [15]:
X, y = train_features, train_labels
X = StandardScaler().fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=2020)

In [None]:
# Fit an independent logistic regression model for each class using the
# OneVsRestClassifier wrapper.
base_lr = LogisticRegression()
ovr = OneVsRestClassifier(base_lr)
ovr.fit(X_train, y_train)
preds = ovr.predict(X_val)
# ovr_jaccard_score = jaccard_score(Y_val, preds, average='samples')
ovr_LRAP_score = label_ranking_average_precision_score(Y_val, preds)
print('Independent OneVsRest Jaccard score: {}'.format(ovr_jaccard_score))
print('Independent OneVsRest LRAP score: {}'.format(ovr_LRAP_score))

In [None]:
# Fit an ensemble of logistic regression classifier chains and take the
# take the average prediction of all the chains.
chains = [ClassifierChain(base_lr, order='random', random_state=i)
          for i in range(10)]
for chain in chains:
    chain.fit(X_train, Y_train)
preds = np.array([chain.predict(X_val) for chain in
                          chains])

In [None]:
# chain_jaccard_scores = [jaccard_score(Y_val, pred >= .5, average='samples') for pred in preds]
chain_LRAP_scores = [label_ranking_average_precision_score(Y_val, pred) for pred in preds]

pred_ensemble = Y_pred_chains.mean(axis=0)
# ensemble_jaccard_score = jaccard_score(Y_val, pred_ensemble >= .5, average='samples')
ensemble_LRAP_score = LRAP_score(Y_val, pred_ensemble)

# scores_J = [ovr_jaccard_score] + chain_jaccard_scores
# scores_J.append(ensemble_jaccard_score)

scores_LRAP = [ovr_LRAP_score] + chain_LRAP_scores
scores_LRAP.append(ensemble_LRAP_score)
model_names = ('Independent','Chain 1','Chain 2','Chain 3','Chain 4','Chain 5',
               'Chain 6','Chain 7','Chain 8','Chain 9','Chain 10','Ensemble')
x_pos = np.arange(len(model_names))


In [None]:
# Plot Jaccard scores for the independent model, each of the chains, and the ensemble
fig, ax = plt.subplots(figsize=(7, 4))
ax.grid(True)
ax.set_title('Classifier Chain Ensemble Performance Comparison')
ax.set_xticks(x_pos)
ax.set_xticklabels(model_names, rotation='vertical')
ax.set_ylabel('Jaccard Similarity Score')
ax.set_ylim([min(scores_J) * .9, max(scores_J) * 1.1])
colors = ['r'] + ['b'] * len(chain_jaccard_scores) + ['g']
ax.bar(x_pos, scores_J, alpha=0.5, color=colors)
plt.tight_layout()
plt.show()

In [None]:
# Plot LRAP scores for the independent model, each of the chains, and the ensemble
fig, ax = plt.subplots(figsize=(7, 4))
ax.grid(True)
ax.set_title('Classifier Chain Ensemble Performance Comparison')
ax.set_xticks(x_pos)
ax.set_xticklabels(model_names, rotation='vertical')
ax.set_ylabel('LRAP Score')
ax.set_ylim([min(scores_LRAP) * .9, max(scores_LRAP) * 1.1])
colors = ['r'] + ['b'] * len(chain_LRAP_scores) + ['g']
ax.bar(x_pos, scores_LRAP, alpha=0.5, color=colors)
plt.tight_layout()
plt.show()