In [1]:
import autograd.numpy as ag_np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.neural_network as NN
import sklearn.model_selection
import sklearn.metrics
import scipy.stats
from AbstractBaseCollabFilterSGD import AbstractBaseCollabFilterSGD
from train_valid_test_loader import load_train_valid_test_datasets

from CollabFilterOneVectorPerItem import CollabFilterOneVectorPerItem

In [2]:
# Load the three rating splits together with the user and item counts,
# then read the per-user side information table from disk.
(train_tuple, valid_tuple, test_tuple,
 n_users, n_items) = load_train_valid_test_datasets()
user_info = pd.read_csv('./data_movie_lens_100k/user_info.csv')

In [3]:
# Pool the train, validation and test splits field by field (fields 0, 1, 2)
# into a single 3-tuple of concatenated arrays.
all_data = tuple(
    ag_np.concatenate(
        (train_tuple[field], valid_tuple[field], test_tuple[field]))
    for field in range(3))

In [4]:
# Build the collaborative-filtering model from its hyperparameters, then
# initialize its parameters from the user count, item count and pooled data.
cf_hyperparams = dict(
    n_epochs=156,
    batch_size=508,
    step_size=0.8108726428612864,
    alpha=0.061177501974355154,
    n_factors=170,
)
model = CollabFilterOneVectorPerItem(**cf_hyperparams)
model.init_parameter_dict(n_users, n_items, all_data)

In [5]:
# Fit the model with SGD.
# all_data concatenates the train, validation AND test tuples, so the
# valid_tuple passed as the second argument is also part of the data being
# fit on.
# NOTE(review): the role of fit()'s second argument is defined in the project
# class (presumably validation tracking) -- confirm; if so, those validation
# numbers are not a held-out estimate here.
model.fit(all_data, valid_tuple)

In [6]:
# Learned factor matrix from the fitted model. Rows are selected with the
# first field of each split (presumably user ids -- TODO confirm against the
# loader), giving one feature row and one label per example.
U = model.param_dict['U']
is_male = user_info['is_male']
splits = (train_tuple, valid_tuple, test_tuple)

# Features: the row of U addressed by each example's first field.
x_tr_NF, x_va_MF, x_te_LF = (U[split[0]] for split in splits)

# Labels: the 'is_male' entry addressed by the same field.
# NOTE(review): if the same id occurs in more than one split, identical
# feature rows (with identical labels) appear in both training and test data,
# which would inflate test scores -- confirm whether a per-user split is
# intended.
y_tr_N, y_va_M, y_te_L = (is_male[split[0]] for split in splits)

In [None]:
# Baseline classifier: one hidden layer of 32 units, trained with L-BFGS.
# random_state is pinned so the weight initialization -- and therefore the
# baseline score -- is reproducible across kernel restarts. The randomized
# search further down sets its own random_state values on clones of this
# estimator, so this seed only affects the direct baseline fit.
mlp = NN.MLPClassifier(
    hidden_layer_sizes=[32],
    solver='lbfgs',
    max_iter=1000,
    random_state=13)

In [None]:
# Train the baseline MLP on the training-split features and labels.
mlp.fit(X=x_tr_NF, y=y_tr_N)

In [None]:
# Score the baseline MLP on the test-split examples using balanced accuracy,
# then show the first ten predictions next to the first ten true labels.
yhat_te_L = mlp.predict(X=x_te_LF)
BA = sklearn.metrics.balanced_accuracy_score(y_true=y_te_L, y_pred=yhat_te_L)
print(BA)
print(yhat_te_L[:10])
print(y_te_L[:10])

In [None]:
# Name of the scoring metric handed to the randomized search via `scoring=`.
my_scoring_metric_name = 'balanced_accuracy'

In [9]:
# Stack training rows first and validation rows second, so that positions in
# the combined arrays line up with the predefined-split indicators.
x_search_NFpMF = ag_np.vstack((x_tr_NF, x_va_MF))
y_search_NpM = ag_np.hstack((y_tr_N, y_va_M))

In [None]:
# Fold indicator per stacked example, in the same order as the search arrays.
n_train, n_valid = y_tr_N.size, y_va_M.size
valid_indicators_NpM = ag_np.hstack([
    -ag_np.ones(n_train),  # -1: never place this example in any test split
    ag_np.zeros(n_valid),  #  0: place this example in the first (index 0) test split
    ])

In [None]:
# Build a PredefinedSplit splitter from the per-example fold indicators.
my_splitter = sklearn.model_selection.PredefinedSplit(
    test_fold=valid_indicators_NpM)

In [None]:
# Verify the splitter produces exactly one split and that it is the intended one.
# The count is asserted explicitly: a splitter yielding zero splits would
# otherwise skip the loop body and pass this check vacuously.
assert my_splitter.get_n_splits() == 1, "expected exactly one predefined split"
for tr_idx, te_idx in my_splitter.split(x_search_NFpMF, y_search_NpM):
    # The held-out fold must be exactly the validation rows ...
    assert ag_np.allclose(x_search_NFpMF[te_idx], x_va_MF)
    assert ag_np.allclose(y_search_NpM[te_idx], y_va_M)
    # ... and the fitting fold must be exactly the training rows.
    assert ag_np.allclose(x_search_NFpMF[tr_idx], x_tr_NF)
    assert ag_np.allclose(y_search_NpM[tr_idx], y_tr_N)

In [None]:
# Search space for the randomized search: each entry is either a frozen scipy
# distribution to sample from or an explicit list of candidate values.
my_parameter_distributions_by_name = {
    # integer drawn from [10, 150)
    'hidden_layer_sizes': scipy.stats.randint(low=10, high=150),
    # float drawn uniformly from [0.0, 1.0]
    'alpha': scipy.stats.uniform(loc=0.0, scale=1.0),
    # two candidate seeds for parameter initialization
    'random_state': [13, 169],
    # integer drawn from [20, 1000)
    'max_iter': scipy.stats.randint(low=20, high=1000),
}

In [None]:
# Number of hyperparameter settings the randomized search samples (its `n_iter`).
n_trials_rand_search = 16

In [None]:
# Configure the randomized hyperparameter search over the baseline MLP,
# evaluated on the single predefined train/validation split.
search_options = dict(
    scoring=my_scoring_metric_name,
    cv=my_splitter,
    n_iter=n_trials_rand_search,
    random_state=13,  # fixed seed: the same settings are sampled on every run
    verbose=4,
)
my_rand_searcher = sklearn.model_selection.RandomizedSearchCV(
    estimator=mlp,
    param_distributions=my_parameter_distributions_by_name,
    **search_options)

In [None]:
# Run the randomized search on the stacked train+validation arrays.
my_rand_searcher.fit(X=x_search_NFpMF, y=y_search_NpM)

In [None]:
# Tabulate the search results and report the table's shape and column names.
cv_results = my_rand_searcher.cv_results_
rsearch_results_df = pd.DataFrame(cv_results).copy()
shape_text = str(rsearch_results_df.shape)
print("Dataframe has shape: %s" % shape_text)

print("Dataframe has columns:")
for column_name in rsearch_results_df.columns:
    print("-- %s" % column_name)

In [None]:
# Show the sampled hyperparameters next to the validation score and rank.
param_keys = [
    'param_hidden_layer_sizes',
    'param_alpha',
    'param_random_state',
    'param_max_iter',
]
score_keys = ['split0_test_score', 'rank_test_score']
rsearch_results_df[param_keys + score_keys]

In [7]:
# Tuned classifier with hard-coded hyperparameters, so this cell does not
# need the search cell to have been run in the current kernel.
# NOTE(review): these values presumably come from an earlier run's
# my_rand_searcher.best_params_ -- confirm they still match the search.
best_hyperparams = dict(
    alpha=0.7777024105738202,
    hidden_layer_sizes=[84],
    solver='lbfgs',
    max_iter=548,
    random_state=13,
)
bestr_mlp = NN.MLPClassifier(**best_hyperparams)
print(bestr_mlp)

MLPClassifier(alpha=0.7777024105738202, hidden_layer_sizes=[84], max_iter=548,
              random_state=13, solver='lbfgs')


In [10]:
# Fit the tuned MLP on the stacked train+validation arrays.
# The recorded output shows L-BFGS stopping at the max_iter limit.
bestr_mlp.fit(X=x_search_NFpMF, y=y_search_NpM)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(alpha=0.7777024105738202, hidden_layer_sizes=[84], max_iter=548,
              random_state=13, solver='lbfgs')

In [11]:
# Balanced accuracy of the tuned MLP on the test-split examples.
yhat_te_L = bestr_mlp.predict(X=x_te_LF)
BA = sklearn.metrics.balanced_accuracy_score(y_true=y_te_L, y_pred=yhat_te_L)
print(BA)

1.0


In [12]:
# Confusion matrix of true labels against predictions on the test split.
CM = sklearn.metrics.confusion_matrix(y_true=y_te_L, y_pred=yhat_te_L)
print(CM)

[[2558    0]
 [   0 7442]]


In [13]:
# Number of positive ('is_male') labels among the test examples.
# Uses the vectorized .sum() rather than the builtin sum(), which would loop
# over every element in Python.
print(y_te_L.sum())

7442


In [None]:
# Refit the SAME estimator object on the test-split examples.
# NOTE(review): this replaces the train+validation fit held by bestr_mlp, so
# any cell that calls bestr_mlp.predict after this one uses the test-fitted
# model. Re-running the earlier test-evaluation cell afterwards would score
# the model on the data it was just fit on. Confirm this reversed experiment
# is intended; a separate estimator would avoid the hidden state.
bestr_mlp.fit(x_te_LF, y_te_L)

In [None]:
# Balanced accuracy of the current bestr_mlp on the stacked train+validation
# examples.
yhat_search_NpM = bestr_mlp.predict(X=x_search_NFpMF)
BA = sklearn.metrics.balanced_accuracy_score(
    y_true=y_search_NpM, y_pred=yhat_search_NpM)
print(BA)