In [1]:
import autograd.numpy as ag_np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.neural_network as NN
import sklearn.model_selection
import scipy.stats
from AbstractBaseCollabFilterSGD import AbstractBaseCollabFilterSGD
from train_valid_test_loader import load_train_valid_test_datasets

from CollabFilterOneVectorPerItem import CollabFilterOneVectorPerItem

In [2]:
# Load the dataset
train_tuple, valid_tuple, test_tuple, n_users, n_items = \
    load_train_valid_test_datasets()
user_info = pd.read_csv('./data_movie_lens_100k/user_info.csv')

In [3]:
# Create the model and initialize its parameters
# to have right scale as the dataset (right num users and items)
model = CollabFilterOneVectorPerItem(
    n_epochs=10, batch_size=1000, step_size=0.9,
    alpha=0, n_factors=50)
model.init_parameter_dict(n_users, n_items, train_tuple)

In [4]:
# Fit the model with SGD
model.fit(train_tuple, valid_tuple)

epoch       0.000 | loss_total     1.50100 | train_MAE     1.00262 | valid_MAE     1.00801 | grad_wrt_mu     0.93400 | grad_wrt_b_per_user     0.00164 | grad_wrt_c_per_item     0.00095 | grad_wrt_U     0.00000 | grad_wrt_V     0.00000
epoch       0.014 | loss_total     1.33303 | train_MAE     0.90895 | valid_MAE     0.90623 | grad_wrt_mu     0.66030 | grad_wrt_b_per_user     0.00148 | grad_wrt_c_per_item     0.00084 | grad_wrt_U     0.00000 | grad_wrt_V     0.00000
epoch       0.029 | loss_total     1.28150 | train_MAE     0.97313 | valid_MAE     0.97625 | grad_wrt_mu     0.52353 | grad_wrt_b_per_user     0.00150 | grad_wrt_c_per_item     0.00091 | grad_wrt_U     0.00000 | grad_wrt_V     0.00000
epoch       0.043 | loss_total     1.30380 | train_MAE     0.92011 | valid_MAE     0.91874 | grad_wrt_mu     0.26083 | grad_wrt_b_per_user     0.00142 | grad_wrt_c_per_item     0.00085 | grad_wrt_U     0.00000 | grad_wrt_V     0.00000
epoch       0.129 | loss_total     1.11861 | train_MAE     0

In [5]:
pDict = model.param_dict
yhat_model_tr = model.predict(train_tuple[0], train_tuple[1], pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])
yhat_model_va = model.predict(valid_tuple[0], valid_tuple[1], pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])

In [6]:
y_tr_N = train_tuple[2] - yhat_model_tr
y_va_M = valid_tuple[2] - yhat_model_va

In [7]:
pDict = model.param_dict
x_tr_NF = ag_np.empty((len(train_tuple[0]), 106))
for i in range(len(train_tuple[0])):
    user_id = train_tuple[0][i]
    item_id = train_tuple[1][i]
    x_tr_NF[i] = ag_np.concatenate((pDict['U'][user_id], pDict['V'][item_id], [yhat_model_tr[i]],
                                    pDict['mu'], [pDict['b_per_user'][user_id]], [pDict['c_per_item'][item_id]],
                                    [user_info['age'][train_tuple[0][i]]], [user_info['is_male'][train_tuple[0][i]]]))

pDict = model.param_dict
x_va_MF = ag_np.empty((len(valid_tuple[0]), 106))
for i in range(len(valid_tuple[0])):
    user_id = valid_tuple[0][i]
    item_id = valid_tuple[1][i]
    x_va_MF[i] = ag_np.concatenate((pDict['U'][user_id], pDict['V'][item_id], [yhat_model_va[i]],
                                    pDict['mu'], [pDict['b_per_user'][user_id]], [pDict['c_per_item'][item_id]],
                                    [user_info['age'][valid_tuple[0][i]]], [user_info['is_male'][valid_tuple[0][i]]]))

In [8]:
mlp = NN.MLPRegressor(
    hidden_layer_sizes=[32],
    solver='lbfgs',
    max_iter=1000)

In [9]:
mlp.fit(x_tr_NF, y_tr_N)

MLPRegressor(hidden_layer_sizes=[32], max_iter=1000, solver='lbfgs')

In [10]:
yhat = mlp.predict(x_va_MF) + yhat_model_va

In [11]:
mae_model = ag_np.mean(ag_np.absolute(yhat_model_va - valid_tuple[2]))
print(mae_model)

0.7780449277653884


In [12]:
mae = ag_np.mean(ag_np.absolute(yhat - valid_tuple[2]))
print(mae)

0.7632915219476523


In [13]:
my_scoring_metric_name = 'neg_mean_absolute_error'

In [14]:
xall_N2 = ag_np.vstack([x_tr_NF, x_va_MF])
yall_N = ag_np.hstack([y_tr_N, y_va_M])

In [15]:
valid_indicators_N = ag_np.hstack([
    -1 * ag_np.ones(y_tr_N.size), # -1 means never include this example in any test split
    0  * ag_np.ones(y_va_M.size), #  0 means include in the first test split (we count starting at 0 in python)
    ])

In [16]:
# Create splitter object using Predefined Split

my_splitter = sklearn.model_selection.PredefinedSplit(valid_indicators_N)

In [17]:
# Verify the splitter only produces one split and it is the intended one
for tr_idx, te_idx in my_splitter.split(xall_N2, yall_N):
    assert ag_np.allclose(xall_N2[te_idx], x_va_MF)
    assert ag_np.allclose(yall_N[te_idx], y_va_M)

In [26]:
my_parameter_distributions_by_name = dict(
    hidden_layer_sizes=scipy.stats.randint(10, 100),
    alpha=scipy.stats.uniform(0.0, 1.0),
    random_state=[  # try two possible seeds to initialize parameters
        13, 169,
        ],
    max_iter=scipy.stats.randint(20, 500)
    )

In [27]:
n_trials_rand_search = 16

In [28]:
my_rand_searcher = sklearn.model_selection.RandomizedSearchCV(
    mlp,
    my_parameter_distributions_by_name,
    scoring=my_scoring_metric_name,
    cv=my_splitter,
    n_iter=n_trials_rand_search,
    random_state=13, # same seed means same results everytime we repeat this code
    verbose=4,
    )

In [29]:
my_rand_searcher.fit(xall_N2, yall_N)

Fitting 1 folds for each of 16 candidates, totalling 16 fits
[CV] alpha=0.7777024105738202, hidden_layer_sizes=84, max_iter=36, random_state=13 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   18.2s remaining:    0.0s


[CV]  alpha=0.7777024105738202, hidden_layer_sizes=84, max_iter=36, random_state=13, score=-0.764, total=  18.2s
[CV] alpha=0.8929826912712245, hidden_layer_sizes=35, max_iter=256, random_state=13 
[CV]  alpha=0.8929826912712245, hidden_layer_sizes=35, max_iter=256, random_state=13, score=-0.763, total=   9.4s
[CV] alpha=0.4534492474173122, hidden_layer_sizes=76, max_iter=450, random_state=13 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   27.6s remaining:    0.0s
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.8min remaining:    0.0s


[CV]  alpha=0.4534492474173122, hidden_layer_sizes=76, max_iter=450, random_state=13, score=-0.759, total= 3.4min
[CV] alpha=0.38804298432184925, hidden_layer_sizes=84, max_iter=349, random_state=169 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.38804298432184925, hidden_layer_sizes=84, max_iter=349, random_state=169, score=-0.763, total= 3.2min
[CV] alpha=0.2984494708891794, hidden_layer_sizes=42, max_iter=423, random_state=169 
[CV]  alpha=0.2984494708891794, hidden_layer_sizes=42, max_iter=423, random_state=169, score=-0.763, total=   9.0s
[CV] alpha=0.47159228006641585, hidden_layer_sizes=96, max_iter=95, random_state=169 
[CV]  alpha=0.47159228006641585, hidden_layer_sizes=96, max_iter=95, random_state=169, score=-0.763, total=  31.5s
[CV] alpha=0.5305754270224363, hidden_layer_sizes=38, max_iter=31, random_state=13 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.5305754270224363, hidden_layer_sizes=38, max_iter=31, random_state=13, score=-0.763, total=   7.3s
[CV] alpha=0.35833378270496974, hidden_layer_sizes=32, max_iter=310, random_state=13 
[CV]  alpha=0.35833378270496974, hidden_layer_sizes=32, max_iter=310, random_state=13, score=-0.763, total=   9.9s
[CV] alpha=0.9391065487220239, hidden_layer_sizes=15, max_iter=140, random_state=169 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.9391065487220239, hidden_layer_sizes=15, max_iter=140, random_state=169, score=-0.762, total=  22.9s
[CV] alpha=0.6244325270137528, hidden_layer_sizes=95, max_iter=325, random_state=13 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.6244325270137528, hidden_layer_sizes=95, max_iter=325, random_state=13, score=-0.760, total= 3.9min
[CV] alpha=0.8738134432795387, hidden_layer_sizes=55, max_iter=347, random_state=169 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.8738134432795387, hidden_layer_sizes=55, max_iter=347, random_state=169, score=-0.761, total= 2.7min
[CV] alpha=0.5258438573244301, hidden_layer_sizes=16, max_iter=275, random_state=169 
[CV]  alpha=0.5258438573244301, hidden_layer_sizes=16, max_iter=275, random_state=169, score=-0.763, total=   2.6s
[CV] alpha=0.5092622000835182, hidden_layer_sizes=96, max_iter=186, random_state=169 
[CV]  alpha=0.5092622000835182, hidden_layer_sizes=96, max_iter=186, random_state=169, score=-0.763, total=  26.4s
[CV] alpha=0.6681904420257511, hidden_layer_sizes=20, max_iter=300, random_state=13 
[CV]  alpha=0.6681904420257511, hidden_layer_sizes=20, max_iter=300, random_state=13, score=-0.763, total=   4.5s
[CV] alpha=0.7122326779115835, hidden_layer_sizes=71, max_iter=210, random_state=13 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV]  alpha=0.7122326779115835, hidden_layer_sizes=71, max_iter=210, random_state=13, score=-0.761, total= 1.1min
[CV] alpha=0.6181132100183178, hidden_layer_sizes=15, max_iter=276, random_state=169 
[CV]  alpha=0.6181132100183178, hidden_layer_sizes=15, max_iter=276, random_state=169, score=-0.763, total=   6.4s


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed: 16.7min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
                   estimator=MLPRegressor(hidden_layer_sizes=[32],
                                          max_iter=1000, solver='lbfgs'),
                   n_iter=16,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f972d995f70>,
                                        'hidden_layer_sizes': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f972d995a90>,
                                        'max_iter': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f972d995d90>,
                                        'random_state': [13, 169]},
                   random_state=13, scoring='neg_mean_absolute_error',
                   verbose=4)

In [30]:
rsearch_results_df = pd.DataFrame(my_rand_searcher.cv_results_).copy()
print("Dataframe has shape: %s" % (str(rsearch_results_df.shape)))

print("Dataframe has columns:")
for c in rsearch_results_df.columns:
    print("-- %s" % c)

Dataframe has shape: (16, 13)
Dataframe has columns:
-- mean_fit_time
-- std_fit_time
-- mean_score_time
-- std_score_time
-- param_alpha
-- param_hidden_layer_sizes
-- param_max_iter
-- param_random_state
-- params
-- split0_test_score
-- mean_test_score
-- std_test_score
-- rank_test_score


In [31]:
param_keys = ['param_hidden_layer_sizes', 'param_alpha', 'param_random_state']
rsearch_results_df[param_keys + ['split0_test_score', 'rank_test_score']]

Unnamed: 0,param_hidden_layer_sizes,param_alpha,param_random_state,split0_test_score,rank_test_score
0,84,0.777702,13,-0.763776,16
1,35,0.892983,13,-0.763308,10
2,76,0.453449,13,-0.759324,1
3,84,0.388043,169,-0.763393,15
4,42,0.298449,169,-0.763298,8
5,96,0.471592,169,-0.763335,14
6,38,0.530575,13,-0.76326,6
7,32,0.358334,13,-0.763305,9
8,15,0.939107,169,-0.762337,5
9,95,0.624433,13,-0.760466,2


In [32]:
bestr_mlp = mlp.set_params(**my_rand_searcher.best_params_)
print(bestr_mlp)

MLPRegressor(alpha=0.4534492474173122, hidden_layer_sizes=76, max_iter=450,
             random_state=13, solver='lbfgs')


In [33]:
bestr_mlp.fit(x_tr_NF, y_tr_N)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPRegressor(alpha=0.4534492474173122, hidden_layer_sizes=76, max_iter=450,
             random_state=13, solver='lbfgs')

In [34]:
yhat = bestr_mlp.predict(x_va_MF) + yhat_model_va
mae = ag_np.mean(ag_np.absolute(yhat - valid_tuple[2]))
print(mae)

0.7593244441537232


In [35]:
yhat_model_va = model.predict(valid_tuple[0], valid_tuple[1], pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])
mae_model = ag_np.mean(ag_np.absolute(yhat_model_va - valid_tuple[2]))
print(mae_model)

0.7780449277653884


In [None]:
all_data = (ag_np.concatenate((train_tuple[0], valid_tuple[0], test_tuple[0])),
            ag_np.concatenate((train_tuple[1], valid_tuple[1], test_tuple[1])),
            ag_np.concatenate((train_tuple[2], valid_tuple[2], test_tuple[2])))
# Fit the model with SGD
model.fit(all_data, valid_tuple)

In [None]:
yhat_model_final = model.predict(all_data[0], all_data[1], pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])
y_final_N = all_data[2] - yhat_model_final

In [None]:
pDict = model.param_dict
x_final_NF = ag_np.empty((len(all_data[0]), 106))
for i in range(len(all_data[0])):
    user_id = all_data[0][i]
    item_id = all_data[1][i]
    x_final_NF[i] = ag_np.concatenate((pDict['U'][user_id], pDict['V'][item_id], [yhat_model_final[i]],
                                    pDict['mu'], [pDict['b_per_user'][user_id]], [pDict['c_per_item'][item_id]],
                                    [user_info['age'][all_data[0][i]]], [user_info['is_male'][all_data[0][i]]]))


In [None]:
bestr_mlp = mlp.set_params(**my_rand_searcher.best_params_)
bestr_mlp.fit(x_final_NF, y_final_N)

In [None]:
import pandas as pd
select_movies_df = pd.read_csv("./data_movie_lens_100k/ratings_masked_leaderboard_set.csv")
leaderboard_user_id = select_movies_df["user_id"]
leaderboard_item_id = select_movies_df["item_id"]

yhat_model_preds = model.predict(leaderboard_user_id, leaderboard_item_id, pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])

x_preds_NF = ag_np.empty((len(leaderboard_user_id), 106))
for i in range(len(leaderboard_user_id)):
    user_id = leaderboard_user_id[i]
    item_id = leaderboard_item_id[i]
    x_preds_NF[i] = ag_np.concatenate((pDict['U'][user_id], pDict['V'][item_id], [yhat_model_preds[i]],
                                    pDict['mu'], [pDict['b_per_user'][user_id]], [pDict['c_per_item'][item_id]],
                                    [user_info['age'][leaderboard_user_id[i]]], [user_info['is_male'][leaderboard_user_id[i]]]))

yhat = bestr_mlp.predict(x_preds_NF) + yhat_model_preds

print(yhat)
yhat_df = pd.DataFrame(yhat)
print(yhat_df)
yhat_df.to_csv(r'predicted_ratings_leaderboard.txt', header=None, index=None)