In [1]:
import autograd.numpy as ag_np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.neural_network as NN
import sklearn.model_selection
import scipy.stats
from AbstractBaseCollabFilterSGD import AbstractBaseCollabFilterSGD
from train_valid_test_loader import load_train_valid_test_datasets

from CollabFilterOneVectorPerItem import CollabFilterOneVectorPerItem

In [2]:
# Load the dataset
train_tuple, valid_tuple, test_tuple, n_users, n_items = \
    load_train_valid_test_datasets()
user_info = pd.read_csv('./data_movie_lens_100k/user_info.csv')

In [3]:
# Create the model and initialize its parameters
# to have right scale as the dataset (right num users and items)
model = CollabFilterOneVectorPerItem(
    n_epochs=156, batch_size=508, step_size=0.8108726428612864,
    alpha=0.061177501974355154, n_factors=170)
model.init_parameter_dict(n_users, n_items, train_tuple)

In [4]:
# Fit the model with SGD
model.fit(train_tuple, valid_tuple)

In [20]:
pDict = model.param_dict
yhat_model_tr = model.predict(train_tuple[0], train_tuple[1], pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])
yhat_model_va = model.predict(valid_tuple[0], valid_tuple[1], pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])
yhat_model_te = model.predict(test_tuple[0], test_tuple[1], pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])

In [21]:
y_tr_N = train_tuple[2] - yhat_model_tr
y_va_M = valid_tuple[2] - yhat_model_va
y_te_L = test_tuple[2] - yhat_model_te

In [22]:
pDict = model.param_dict
x_tr_NF = ag_np.empty((len(train_tuple[0]), 346))
for i in range(len(train_tuple[0])):
    user_id = train_tuple[0][i]
    item_id = train_tuple[1][i]
    x_tr_NF[i] = ag_np.concatenate((pDict['U'][user_id], pDict['V'][item_id], [yhat_model_tr[i]],
                                    pDict['mu'], [pDict['b_per_user'][user_id]], [pDict['c_per_item'][item_id]],
                                    [user_info['age'][train_tuple[0][i]]], [user_info['is_male'][train_tuple[0][i]]]))

pDict = model.param_dict
x_va_MF = ag_np.empty((len(valid_tuple[0]), 346))
for i in range(len(valid_tuple[0])):
    user_id = valid_tuple[0][i]
    item_id = valid_tuple[1][i]
    x_va_MF[i] = ag_np.concatenate((pDict['U'][user_id], pDict['V'][item_id], [yhat_model_va[i]],
                                    pDict['mu'], [pDict['b_per_user'][user_id]], [pDict['c_per_item'][item_id]],
                                    [user_info['age'][valid_tuple[0][i]]], [user_info['is_male'][valid_tuple[0][i]]]))

pDict = model.param_dict
x_te_LF = ag_np.empty((len(test_tuple[0]), 346))
for i in range(len(test_tuple[0])):
    user_id = test_tuple[0][i]
    item_id = test_tuple[1][i]
    x_te_LF[i] = ag_np.concatenate((pDict['U'][user_id], pDict['V'][item_id], [yhat_model_te[i]],
                                    pDict['mu'], [pDict['b_per_user'][user_id]], [pDict['c_per_item'][item_id]],
                                    [user_info['age'][test_tuple[0][i]]], [user_info['is_male'][test_tuple[0][i]]]))

In [23]:
mlp = NN.MLPRegressor(
    hidden_layer_sizes=[32],
    solver='lbfgs',
    max_iter=1000)

In [24]:
mlp.fit(x_va_MF, y_va_M)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPRegressor(hidden_layer_sizes=[32], max_iter=1000, solver='lbfgs')

In [25]:
yhat = mlp.predict(x_te_LF) + yhat_model_te

In [26]:
mae_model = ag_np.mean(ag_np.absolute(yhat_model_te - test_tuple[2]))
print(mae_model)

0.713054268774339


In [27]:
mae = ag_np.mean(ag_np.absolute(yhat - test_tuple[2]))
print(mae)

0.7193858889676837


In [28]:
my_scoring_metric_name = 'neg_mean_absolute_error'

In [29]:
xall_N2 = ag_np.vstack([x_va_MF, x_te_LF])
yall_N = ag_np.hstack([y_va_M, y_te_L])

In [30]:
valid_indicators_N = ag_np.hstack([
    -1 * ag_np.ones(y_va_M.size), # -1 means never include this example in any test split
    0  * ag_np.ones(y_te_L.size), #  0 means include in the first test split (we count starting at 0 in python)
    ])

In [31]:
# Create splitter object using Predefined Split

my_splitter = sklearn.model_selection.PredefinedSplit(valid_indicators_N)

In [32]:
# Verify the splitter only produces one split and it is the intended one
for tr_idx, te_idx in my_splitter.split(xall_N2, yall_N):
    assert ag_np.allclose(xall_N2[te_idx], x_te_LF)
    assert ag_np.allclose(yall_N[te_idx], y_te_L)

In [33]:
my_parameter_distributions_by_name = dict(
    hidden_layer_sizes=scipy.stats.randint(10, 150),
    alpha=scipy.stats.uniform(0.0, 1.0),
    random_state=[  # try two possible seeds to initialize parameters
        13, 169,
        ],
    max_iter=scipy.stats.randint(20, 1000)
    )

In [34]:
n_trials_rand_search = 16

In [35]:
my_rand_searcher = sklearn.model_selection.RandomizedSearchCV(
    mlp,
    my_parameter_distributions_by_name,
    scoring=my_scoring_metric_name,
    cv=my_splitter,
    n_iter=n_trials_rand_search,
    random_state=13, # same seed means same results everytime we repeat this code
    verbose=4,
    )

In [36]:
my_rand_searcher.fit(xall_N2, yall_N)

Fitting 1 folds for each of 16 candidates, totalling 16 fits
[CV] alpha=0.7777024105738202, hidden_layer_sizes=84, max_iter=548, random_state=13 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   53.8s remaining:    0.0s


[CV]  alpha=0.7777024105738202, hidden_layer_sizes=84, max_iter=548, random_state=13, score=-0.719, total=  53.8s
[CV] alpha=0.8929826912712245, hidden_layer_sizes=136, max_iter=982, random_state=13 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.4min remaining:    0.0s


[CV]  alpha=0.8929826912712245, hidden_layer_sizes=136, max_iter=982, random_state=13, score=-0.720, total= 2.5min
[CV] alpha=0.6416133447590692, hidden_layer_sizes=84, max_iter=861, random_state=169 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.8min remaining:    0.0s


[CV]  alpha=0.6416133447590692, hidden_layer_sizes=84, max_iter=861, random_state=169, score=-0.720, total= 1.4min
[CV] alpha=0.2984494708891794, hidden_layer_sizes=149, max_iter=490, random_state=169 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.2984494708891794, hidden_layer_sizes=149, max_iter=490, random_state=169, score=-0.719, total= 1.4min
[CV] alpha=0.44626619112700183, hidden_layer_sizes=34, max_iter=432, random_state=169 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.44626619112700183, hidden_layer_sizes=34, max_iter=432, random_state=169, score=-0.716, total=  22.9s
[CV] alpha=0.6144541007156207, hidden_layer_sizes=68, max_iter=554, random_state=13 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.6144541007156207, hidden_layer_sizes=68, max_iter=554, random_state=13, score=-0.718, total=  52.0s
[CV] alpha=0.21789900913168891, hidden_layer_sizes=143, max_iter=140, random_state=169 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.21789900913168891, hidden_layer_sizes=143, max_iter=140, random_state=169, score=-0.717, total=  24.3s
[CV] alpha=0.6244325270137528, hidden_layer_sizes=59, max_iter=556, random_state=169 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.6244325270137528, hidden_layer_sizes=59, max_iter=556, random_state=169, score=-0.717, total=  43.1s
[CV] alpha=0.05283655977782398, hidden_layer_sizes=81, max_iter=923, random_state=169 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.05283655977782398, hidden_layer_sizes=81, max_iter=923, random_state=169, score=-0.722, total= 1.5min
[CV] alpha=0.8128411710026234, hidden_layer_sizes=16, max_iter=275, random_state=169 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.8128411710026234, hidden_layer_sizes=16, max_iter=275, random_state=169, score=-0.716, total=   8.2s
[CV] alpha=0.5092622000835182, hidden_layer_sizes=96, max_iter=186, random_state=169 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.5092622000835182, hidden_layer_sizes=96, max_iter=186, random_state=169, score=-0.717, total=  20.5s
[CV] alpha=0.6681904420257511, hidden_layer_sizes=148, max_iter=812, random_state=13 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.6681904420257511, hidden_layer_sizes=148, max_iter=812, random_state=13, score=-0.721, total= 2.3min
[CV] alpha=0.7122326779115835, hidden_layer_sizes=71, max_iter=210, random_state=13 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.7122326779115835, hidden_layer_sizes=71, max_iter=210, random_state=13, score=-0.718, total=  20.6s
[CV] alpha=0.6181132100183178, hidden_layer_sizes=15, max_iter=276, random_state=169 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.6181132100183178, hidden_layer_sizes=15, max_iter=276, random_state=169, score=-0.717, total=   9.8s
[CV] alpha=0.2444757021979903, hidden_layer_sizes=51, max_iter=264, random_state=169 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.2444757021979903, hidden_layer_sizes=51, max_iter=264, random_state=169, score=-0.717, total=  21.9s
[CV] alpha=0.37933329148306616, hidden_layer_sizes=69, max_iter=348, random_state=169 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed: 14.1min finished


[CV]  alpha=0.37933329148306616, hidden_layer_sizes=69, max_iter=348, random_state=169, score=-0.717, total=  29.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
                   estimator=MLPRegressor(hidden_layer_sizes=[32],
                                          max_iter=1000, solver='lbfgs'),
                   n_iter=16,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa0f53d3fd0>,
                                        'hidden_layer_sizes': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa0c9667e20>,
                                        'max_iter': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa0f53c6610>,
                                        'random_state': [13, 169]},
                   random_state=13, scoring='neg_mean_absolute_error',
                   verbose=4)

In [37]:
rsearch_results_df = pd.DataFrame(my_rand_searcher.cv_results_).copy()
print("Dataframe has shape: %s" % (str(rsearch_results_df.shape)))

print("Dataframe has columns:")
for c in rsearch_results_df.columns:
    print("-- %s" % c)

Dataframe has shape: (16, 13)
Dataframe has columns:
-- mean_fit_time
-- std_fit_time
-- mean_score_time
-- std_score_time
-- param_alpha
-- param_hidden_layer_sizes
-- param_max_iter
-- param_random_state
-- params
-- split0_test_score
-- mean_test_score
-- std_test_score
-- rank_test_score


In [39]:
param_keys = ['param_hidden_layer_sizes', 'param_alpha', 'param_random_state', 'param_max_iter']
rsearch_results_df[param_keys + ['split0_test_score', 'rank_test_score']]

Unnamed: 0,param_hidden_layer_sizes,param_alpha,param_random_state,param_max_iter,split0_test_score,rank_test_score
0,84,0.777702,13,548,-0.718615,11
1,136,0.892983,13,982,-0.720323,14
2,84,0.641613,169,861,-0.720062,13
3,149,0.298449,169,490,-0.71868,12
4,34,0.446266,169,432,-0.716421,2
5,68,0.614454,13,554,-0.717714,10
6,143,0.217899,169,140,-0.71684,5
7,59,0.624433,169,556,-0.716906,7
8,81,0.0528366,169,923,-0.721518,16
9,16,0.812841,169,275,-0.715605,1


In [40]:
bestr_mlp = mlp.set_params(**my_rand_searcher.best_params_)
print(bestr_mlp)

MLPRegressor(alpha=0.8128411710026234, hidden_layer_sizes=16, max_iter=275,
             random_state=169, solver='lbfgs')


In [41]:
all_data = (ag_np.concatenate((valid_tuple[0], test_tuple[0])),
            ag_np.concatenate((valid_tuple[1], test_tuple[1])),
            ag_np.concatenate((valid_tuple[2], test_tuple[2])))
# Fit the model with SGD
#model.fit(all_data, valid_tuple)

In [42]:
yhat_model_final = model.predict(all_data[0], all_data[1], pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])
y_final_N = all_data[2] - yhat_model_final

In [43]:
pDict = model.param_dict
x_final_NF = ag_np.empty((len(all_data[0]), 346))
for i in range(len(all_data[0])):
    user_id = all_data[0][i]
    item_id = all_data[1][i]
    x_final_NF[i] = ag_np.concatenate((pDict['U'][user_id], pDict['V'][item_id], [yhat_model_final[i]],
                                    pDict['mu'], [pDict['b_per_user'][user_id]], [pDict['c_per_item'][item_id]],
                                    [user_info['age'][all_data[0][i]]], [user_info['is_male'][all_data[0][i]]]))


In [44]:
bestr_mlp = mlp.set_params(**my_rand_searcher.best_params_)
bestr_mlp.fit(x_final_NF, y_final_N)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPRegressor(alpha=0.8128411710026234, hidden_layer_sizes=16, max_iter=275,
             random_state=169, solver='lbfgs')

In [45]:
import pandas as pd
select_movies_df = pd.read_csv("./data_movie_lens_100k/ratings_masked_leaderboard_set.csv")
leaderboard_user_id = select_movies_df["user_id"]
leaderboard_item_id = select_movies_df["item_id"]

yhat_model_preds = model.predict(leaderboard_user_id, leaderboard_item_id, pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])

x_preds_NF = ag_np.empty((len(leaderboard_user_id), 346))
for i in range(len(leaderboard_user_id)):
    user_id = leaderboard_user_id[i]
    item_id = leaderboard_item_id[i]
    x_preds_NF[i] = ag_np.concatenate((pDict['U'][user_id], pDict['V'][item_id], [yhat_model_preds[i]],
                                    pDict['mu'], [pDict['b_per_user'][user_id]], [pDict['c_per_item'][item_id]],
                                    [user_info['age'][leaderboard_user_id[i]]], [user_info['is_male'][leaderboard_user_id[i]]]))

yhat = bestr_mlp.predict(x_preds_NF) + yhat_model_preds

print(yhat)
yhat_df = pd.DataFrame(yhat)
print(yhat_df)
yhat_df.to_csv(r'predicted_ratings_leaderboard.txt', header=None, index=None)

[3.82678153 4.25898307 3.71281202 ... 4.28046232 3.397163   2.91977935]
             0
0     3.826782
1     4.258983
2     3.712812
3     3.563962
4     4.536685
...        ...
9995  3.754703
9996  4.277174
9997  4.280462
9998  3.397163
9999  2.919779

[10000 rows x 1 columns]
