In [1]:
import autograd.numpy as ag_np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.neural_network as NN
import sklearn.model_selection
import scipy.stats
from AbstractBaseCollabFilterSGD import AbstractBaseCollabFilterSGD
from train_valid_test_loader import load_train_valid_test_datasets

from CollabFilterOneVectorPerItem import CollabFilterOneVectorPerItem

In [2]:
# Read the train / validation / test rating splits, together with the
# user and item counts reported by the loader.
(train_tuple, valid_tuple, test_tuple,
 n_users, n_items) = load_train_valid_test_datasets()
# Per-user side information table (its 'age' and 'is_male' columns are used as features later).
user_info = pd.read_csv('./data_movie_lens_100k/user_info.csv')

In [3]:
# Build the collaborative-filtering model, then size its parameters
# for this dataset (number of users and number of items).
model = CollabFilterOneVectorPerItem(
    n_epochs=10,
    batch_size=1000,
    step_size=0.9,
    alpha=0,
    n_factors=50,
)
model.init_parameter_dict(n_users, n_items, train_tuple)

In [4]:
# Fit the model with SGD
# (train_tuple is the fitting data; valid_tuple is passed as the second argument,
# presumably to report the valid_MAE progress metric -- confirm in the base class)
model.fit(train_tuple, valid_tuple)

epoch       0.000 | loss_total     0.97700 | train_MAE     1.00262 | valid_MAE     1.00801 | grad_wrt_mu     0.33400 | grad_wrt_b_per_user     0.00070 | grad_wrt_c_per_item     0.00041 | grad_wrt_U     0.00000 | grad_wrt_V     0.00000
epoch       0.014 | loss_total     0.94994 | train_MAE     0.96915 | valid_MAE     0.97165 | grad_wrt_mu     0.09800 | grad_wrt_b_per_user     0.00065 | grad_wrt_c_per_item     0.00040 | grad_wrt_U     0.00000 | grad_wrt_V     0.00000
epoch       0.029 | loss_total     0.94163 | train_MAE     0.95895 | valid_MAE     0.96060 | grad_wrt_mu     0.07800 | grad_wrt_b_per_user     0.00070 | grad_wrt_c_per_item     0.00041 | grad_wrt_U     0.00000 | grad_wrt_V     0.00000
epoch       0.043 | loss_total     0.97028 | train_MAE     0.95074 | valid_MAE     0.95173 | grad_wrt_mu     0.14200 | grad_wrt_b_per_user     0.00066 | grad_wrt_c_per_item     0.00041 | grad_wrt_U     0.00000 | grad_wrt_V     0.00000
epoch       0.129 | loss_total     0.85179 | train_MAE     0

In [5]:
def build_mlp_features(user_id_N, item_id_N, yhat_N, param_dict, user_info_df):
    """Assemble the per-rating feature matrix that is fed to the MLP.

    Columns, in order (same layout as the former row-by-row loop):
    the user's factor vector U[user], the item's factor vector V[item],
    the collaborative filter's own prediction, the global mean mu,
    the user bias, the item bias, the user's age and the user's is_male flag.

    The number of columns is derived from the parameter shapes instead of
    being hard-coded, so changing n_factors does not break this code.

    Args:
        user_id_N: 1-D integer array of user ids, one per example.
        item_id_N: 1-D integer array of item ids, aligned with user_id_N.
        yhat_N: 1-D array of collaborative-filter predictions for those pairs.
        param_dict: dict with keys 'mu', 'b_per_user', 'c_per_item', 'U', 'V'.
        user_info_df: DataFrame with 'age' and 'is_male' columns; rows are
            looked up by user id as index label.

    Returns:
        2-D array with one row per example.
    """
    user_id_N = ag_np.asarray(user_id_N)
    item_id_N = ag_np.asarray(item_id_N)
    n_examples = len(user_id_N)

    def as_column(values):
        # Turn a length-N vector into an (N, 1) column so it can be hstack-ed.
        return ag_np.reshape(values, (-1, 1))

    # mu is a 1-D array in the parameter dict; broadcast it to one row per example.
    mu_row = ag_np.reshape(param_dict['mu'], (1, -1))
    return ag_np.hstack([
        param_dict['U'][user_id_N],
        param_dict['V'][item_id_N],
        as_column(yhat_N),
        ag_np.ones((n_examples, 1)) * mu_row,
        as_column(param_dict['b_per_user'][user_id_N]),
        as_column(param_dict['c_per_item'][item_id_N]),
        as_column(user_info_df['age'].loc[user_id_N].values),
        as_column(user_info_df['is_male'].loc[user_id_N].values),
    ])


# Current collaborative-filter parameters, shared by both splits below.
pDict = model.param_dict

# Training split: features and target ratings.
yhat = model.predict(train_tuple[0], train_tuple[1], pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])
x_tr_NF = build_mlp_features(train_tuple[0], train_tuple[1], yhat, pDict, user_info)
y_tr_N = train_tuple[2]

# Validation split: same feature layout, built from the same parameters.
yhat = model.predict(valid_tuple[0], valid_tuple[1], pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])
x_va_MF = build_mlp_features(valid_tuple[0], valid_tuple[1], yhat, pDict, user_info)
y_va_M = valid_tuple[2]

# Sanity checks: one feature row per rating, identical layout for both splits.
assert x_tr_NF.shape[0] == len(y_tr_N)
assert x_va_MF.shape[0] == len(y_va_M)
assert x_tr_NF.shape[1] == x_va_MF.shape[1]

In [6]:
# Baseline MLP classifier: a single hidden layer of 32 units,
# optimized with L-BFGS for at most 1000 iterations.
mlp = NN.MLPClassifier(
    solver='lbfgs',
    max_iter=1000,
    hidden_layer_sizes=[32],
)

In [7]:
# Train the baseline MLP on the training-split features and ratings.
mlp.fit(x_tr_NF, y_tr_N)

MLPClassifier(hidden_layer_sizes=[32], max_iter=1000, solver='lbfgs')

In [8]:
# Baseline MLP predictions for every validation example.
yhat = mlp.predict(x_va_MF)

In [9]:
# Mean absolute error of the baseline MLP on the validation ratings.
abs_err_M = ag_np.absolute(yhat - valid_tuple[2])
mae = ag_np.mean(abs_err_M)
print(mae)

0.7501000800640513


In [10]:
# Candidate hyperparameter values for an exhaustive grid search over the MLP.
my_parameter_grid_by_name = {
    'hidden_layer_sizes': [4, 16, 64],
    'alpha': [0.0, 0.0001, 0.01, 1.00],
    # try two possible seeds to initialize parameters
    'random_state': [101, 202],
}

In [11]:
# Scorer name handed to the hyperparameter search: negated mean absolute error,
# so a larger score (closer to zero) ranks better.
my_scoring_metric_name = 'neg_mean_absolute_error'

In [12]:
# Put training rows first and validation rows second, so a single array
# (plus per-row fold labels) can be handed to the search object.
xall_N2 = ag_np.concatenate([x_tr_NF, x_va_MF], axis=0)
yall_N = ag_np.concatenate([y_tr_N, y_va_M])

In [13]:
# Per-row fold labels, in the same order as the stacked train+valid arrays.
valid_indicators_N = ag_np.hstack([
    ag_np.full(y_tr_N.size, -1.0),  # -1 means never include this example in any test split
    ag_np.zeros(y_va_M.size),       #  0 means include in the first test split (we count starting at 0 in python)
    ])

In [14]:
# Create splitter object using Predefined Split:
# rows labelled -1 are never placed in a test fold, rows labelled 0 form test fold 0.

my_splitter = sklearn.model_selection.PredefinedSplit(valid_indicators_N)

In [15]:
# Verify the splitter produces exactly one split and that it is the intended one:
# the training rows on the train side and the validation rows on the test side.
# The split count is checked explicitly, because a loop over zero splits
# would otherwise pass without testing anything.
splits = list(my_splitter.split(xall_N2, yall_N))
assert len(splits) == 1, "expected exactly one predefined split, got %d" % len(splits)
for tr_idx, te_idx in splits:
    assert ag_np.allclose(xall_N2[tr_idx], x_tr_NF)
    assert ag_np.allclose(yall_N[tr_idx], y_tr_N)
    assert ag_np.allclose(xall_N2[te_idx], x_va_MF)
    assert ag_np.allclose(yall_N[te_idx], y_va_M)

In [16]:
## Create a custom searcher object with all our settings in place.
#
#grid_searcher = sklearn.model_selection.GridSearchCV(
#    mlp,
#    my_parameter_grid_by_name,
#    scoring=my_scoring_metric_name,
#    cv=my_splitter,
#    refit=False)

In [17]:
#grid_searcher.fit(xall_N2, yall_N)

In [18]:
#gsearch_results_df = pd.DataFrame(grid_searcher.cv_results_).copy()
#print("Dataframe has shape: %s" % (str(gsearch_results_df.shape)))
#n_trials_grid_search = gsearch_results_df.shape[0]
#
#print("Dataframe has columns:")
#for c in gsearch_results_df.columns:
#    print("-- %s" % c)

In [19]:
#param_keys = ['param_hidden_layer_sizes', 'param_alpha', 'param_random_state']
#
# Rearrange row order so it is easy to skim
#gsearch_results_df.sort_values(param_keys, inplace=True)

In [20]:
#gsearch_results_df[param_keys + ['split0_test_score', 'rank_test_score']]

In [21]:
# Sampling distributions for the randomized hyperparameter search.
my_parameter_distributions_by_name = {
    # integer hidden-layer width; scipy's randint excludes the upper bound (2..69)
    'hidden_layer_sizes': scipy.stats.randint(2, 70),
    # L2 penalty strength drawn uniformly from [0, 1] (loc=0.0, scale=1.0)
    'alpha': scipy.stats.uniform(0.0, 1.0),
    # try two possible seeds to initialize parameters
    'random_state': [13, 169],
}

In [22]:
# Number of hyperparameter configurations the randomized search will sample
# (passed to the searcher as n_iter).
n_trials_rand_search = 4

In [23]:
# Randomized hyperparameter search over the MLP, scored on the single
# predefined train/validation split.
my_rand_searcher = sklearn.model_selection.RandomizedSearchCV(
    estimator=mlp,
    param_distributions=my_parameter_distributions_by_name,
    n_iter=n_trials_rand_search,
    scoring=my_scoring_metric_name,
    cv=my_splitter,
    random_state=13,  # same seed means same results everytime we repeat this code
    verbose=4,
)

In [24]:
# Run the randomized search on the stacked train+valid data; the predefined
# splitter decides which rows are used for fitting and which for scoring.
my_rand_searcher.fit(xall_N2, yall_N)

Fitting 1 folds for each of 4 candidates, totalling 4 fits
[CV] alpha=0.7777024105738202, hidden_layer_sizes=18, random_state=13 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.5min remaining:    0.0s


[CV]  alpha=0.7777024105738202, hidden_layer_sizes=18, random_state=13, score=-0.751, total= 3.5min
[CV] alpha=0.8929826912712245, hidden_layer_sizes=27, random_state=13 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  7.3min remaining:    0.0s


[CV]  alpha=0.8929826912712245, hidden_layer_sizes=27, random_state=13, score=-0.750, total= 3.8min
[CV] alpha=0.7585840035486909, hidden_layer_sizes=28, random_state=13 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 11.0min remaining:    0.0s


[CV]  alpha=0.7585840035486909, hidden_layer_sizes=28, random_state=13, score=-0.750, total= 3.7min
[CV] alpha=0.6073433442080506, hidden_layer_sizes=48, random_state=13 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 16.5min finished


[CV]  alpha=0.6073433442080506, hidden_layer_sizes=48, random_state=13, score=-0.748, total= 5.5min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
                   estimator=MLPClassifier(hidden_layer_sizes=[32],
                                           max_iter=1000, solver='lbfgs'),
                   n_iter=4,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f68d55d26d0>,
                                        'hidden_layer_sizes': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f68d55d3c40>,
                                        'random_state': [13, 169]},
                   random_state=13, scoring='neg_mean_absolute_error',
                   verbose=4)

In [25]:
# Tabulate the search results: one row per sampled configuration.
rsearch_results_df = pd.DataFrame(my_rand_searcher.cv_results_).copy()
results_shape = rsearch_results_df.shape
print("Dataframe has shape: %s" % (str(results_shape)))

# List the available result columns, one per line.
print("Dataframe has columns:")
for column_name in rsearch_results_df.columns:
    print("-- %s" % column_name)

Dataframe has shape: (4, 12)
Dataframe has columns:
-- mean_fit_time
-- std_fit_time
-- mean_score_time
-- std_score_time
-- param_alpha
-- param_hidden_layer_sizes
-- param_random_state
-- params
-- split0_test_score
-- mean_test_score
-- std_test_score
-- rank_test_score


In [26]:
# Show only the sampled hyperparameters next to their validation score and rank.
param_keys = ['param_hidden_layer_sizes', 'param_alpha', 'param_random_state']
score_keys = ['split0_test_score', 'rank_test_score']
rsearch_results_df[param_keys + score_keys]

Unnamed: 0,param_hidden_layer_sizes,param_alpha,param_random_state,split0_test_score,rank_test_score
0,18,0.777702,13,-0.750701,4
1,27,0.892983,13,-0.7497,2
2,28,0.758584,13,-0.75,3
3,48,0.607343,13,-0.748399,1


In [27]:
# Apply the best hyperparameters found by the randomized search.
# Note: set_params is called on `mlp` itself and its return value is bound to bestr_mlp.
best_params = my_rand_searcher.best_params_
bestr_mlp = mlp.set_params(**best_params)
print(bestr_mlp)

MLPClassifier(alpha=0.6073433442080506, hidden_layer_sizes=48, max_iter=1000,
              random_state=13, solver='lbfgs')


In [28]:
# Refit the tuned MLP on the training split only, so the validation MAE
# computed next is measured on rows the model was not fitted on.
bestr_mlp.fit(x_tr_NF, y_tr_N)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(alpha=0.6073433442080506, hidden_layer_sizes=48, max_iter=1000,
              random_state=13, solver='lbfgs')

In [29]:
# Validation mean absolute error of the tuned MLP.
yhat = bestr_mlp.predict(x_va_MF)
abs_err_M = ag_np.absolute(yhat - y_va_M)
mae = ag_np.mean(abs_err_M)
print(mae)

0.7483987189751802


In [30]:
# Reference point: validation MAE of the collaborative-filtering model alone.
cf_params = (pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])
yhat_model_va = model.predict(valid_tuple[0], valid_tuple[1], *cf_params)
abs_err_model_va = ag_np.absolute(yhat_model_va - valid_tuple[2])
mae_model = ag_np.mean(abs_err_model_va)
print(mae_model)

0.7924751255557589


In [31]:
# Merge the train, validation and test splits field by field
# (position 0, 1 and 2 of each tuple) into one combined dataset.
all_data = tuple(
    ag_np.concatenate((train_tuple[k], valid_tuple[k], test_tuple[k]))
    for k in range(3))
# Fit the model with SGD on everything. The validation rows are now part of
# the fitting data, so the reported valid_MAE is no longer a held-out estimate.
model.fit(all_data, valid_tuple)

epoch       0.000 | loss_total     0.77816 | train_MAE     0.78087 | valid_MAE     0.79248 | grad_wrt_mu     0.03600 | grad_wrt_b_per_user     0.00064 | grad_wrt_c_per_item     0.00039 | grad_wrt_U     0.00000 | grad_wrt_V     0.00000
epoch       0.011 | loss_total     0.75376 | train_MAE     0.78065 | valid_MAE     0.79166 | grad_wrt_mu     0.00400 | grad_wrt_b_per_user     0.00062 | grad_wrt_c_per_item     0.00038 | grad_wrt_U     0.00000 | grad_wrt_V     0.00000
epoch       0.022 | loss_total     0.78273 | train_MAE     0.78052 | valid_MAE     0.79159 | grad_wrt_mu     0.04200 | grad_wrt_b_per_user     0.00062 | grad_wrt_c_per_item     0.00038 | grad_wrt_U     0.00000 | grad_wrt_V     0.00000
epoch       0.033 | loss_total     0.74704 | train_MAE     0.78186 | valid_MAE     0.79218 | grad_wrt_mu     0.07400 | grad_wrt_b_per_user     0.00061 | grad_wrt_c_per_item     0.00038 | grad_wrt_U     0.00000 | grad_wrt_V     0.00000
epoch       0.133 | loss_total     0.80718 | train_MAE     0

In [34]:
# Build the MLP feature matrix for every rating in all_data.
# Columns, in order: U[user], V[item], collaborative-filter prediction, mu,
# user bias, item bias, user age, user is_male.
# The matrix is assembled with array operations, so its width follows the
# parameter shapes instead of a hard-coded 106 (which only fits n_factors=50).
pDict = model.param_dict
all_user_ids = ag_np.asarray(all_data[0])
all_item_ids = ag_np.asarray(all_data[1])
yhat = model.predict(all_data[0], all_data[1], pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])
x_final_NF = ag_np.hstack([
    pDict['U'][all_user_ids],
    pDict['V'][all_item_ids],
    ag_np.reshape(yhat, (-1, 1)),
    # mu is a 1-D array in the parameter dict; repeat it on every row
    ag_np.ones((len(all_user_ids), 1)) * ag_np.reshape(pDict['mu'], (1, -1)),
    ag_np.reshape(pDict['b_per_user'][all_user_ids], (-1, 1)),
    ag_np.reshape(pDict['c_per_item'][all_item_ids], (-1, 1)),
    # user_info rows are looked up by user id as index label
    ag_np.reshape(user_info['age'].loc[all_user_ids].values, (-1, 1)),
    ag_np.reshape(user_info['is_male'].loc[all_user_ids].values, (-1, 1)),
])
y_final_N = all_data[2]

# One feature row per rating.
assert x_final_NF.shape[0] == len(y_final_N)



In [35]:
# Re-apply the best hyperparameters and train the final MLP on the
# features built from all available ratings.
best_params = my_rand_searcher.best_params_
bestr_mlp = mlp.set_params(**best_params)
bestr_mlp.fit(x_final_NF, y_final_N)

MLPClassifier(alpha=0.6073433442080506, hidden_layer_sizes=48, max_iter=1000,
              random_state=13, solver='lbfgs')

In [36]:
# Load the (user, item) pairs whose ratings must be predicted for the leaderboard.
select_movies_df = pd.read_csv("./data_movie_lens_100k/ratings_masked_leaderboard_set.csv")
leaderboard_user_id = select_movies_df["user_id"]
leaderboard_item_id = select_movies_df["item_id"]
lb_user_ids = leaderboard_user_id.values
lb_item_ids = leaderboard_item_id.values

# Collaborative-filter predictions for the leaderboard pairs themselves.
# This feature was previously read from a stale `yhat` left over from an
# earlier cell, i.e. from predictions made for unrelated (user, item) pairs.
pDict = model.param_dict
yhat_leaderboard = model.predict(lb_user_ids, lb_item_ids, pDict['mu'], pDict['b_per_user'], pDict['c_per_item'], pDict['U'], pDict['V'])

# Same column layout as the features the MLP was trained on:
# U[user], V[item], collaborative-filter prediction, mu, user bias, item bias,
# user age, user is_male. The width follows the parameter shapes.
x_preds_NF = ag_np.hstack([
    pDict['U'][lb_user_ids],
    pDict['V'][lb_item_ids],
    ag_np.reshape(yhat_leaderboard, (-1, 1)),
    # mu is a 1-D array in the parameter dict; repeat it on every row
    ag_np.ones((len(lb_user_ids), 1)) * ag_np.reshape(pDict['mu'], (1, -1)),
    ag_np.reshape(pDict['b_per_user'][lb_user_ids], (-1, 1)),
    ag_np.reshape(pDict['c_per_item'][lb_item_ids], (-1, 1)),
    # user_info rows are looked up by user id as index label
    ag_np.reshape(user_info['age'].loc[lb_user_ids].values, (-1, 1)),
    ag_np.reshape(user_info['is_male'].loc[lb_user_ids].values, (-1, 1)),
])
assert x_preds_NF.shape[0] == len(select_movies_df)

# Final predictions from the tuned MLP, written one rating per line
# (no header, no index column).
yhat = bestr_mlp.predict(x_preds_NF)

print(yhat)
yhat_df = pd.DataFrame(yhat)
print(yhat_df)
yhat_df.to_csv(r'predicted_ratings_leaderboard.txt', header=None, index=None)

[4 4 3 ... 4 4 3]
      0
0     4
1     4
2     3
3     4
4     4
...  ..
9995  4
9996  4
9997  4
9998  4
9999  3

[10000 rows x 1 columns]
