In [3]:
from matrix_factorization import BaselineModel, KernelMF, train_update_test_split

import pandas as pd
from sklearn.metrics import root_mean_squared_error

In [4]:
# MovieLens ratings, downloaded from https://grouplens.org/datasets/movielens/
#
# This notebook was adapted from the tab-separated, header-less ml-100k file
# ("../data/ml-100k/u.data") to the ml-latest-small release, whose ratings.csv
# is comma-separated and ships with its own header row, so no `names=`,
# `sep=` or `usecols=` arguments are passed to the reader.

# Column names of ml-latest-small/ratings.csv (kept for reference; the reader
# below takes the names from the file's header row).
cols = ["userId", "movieId", "rating", "timestamp"]

movie_data = pd.read_csv("ml-latest-small/ratings.csv", engine="python")

In [5]:
# Inspect the raw ratings table read from ml-latest-small/ratings.csv.
movie_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [6]:
# Separate the id columns (model inputs) from the rating column (target);
# the timestamp column is not carried forward.
X = movie_data.loc[:, ["userId", "movieId"]]
y = movie_data.loc[:, "rating"]

In [7]:
# The matrix_factorization package (from PyPI) expects the id columns to be
# called "user_id" and "item_id", so translate the MovieLens names in one go.
X = X.rename(columns={"userId": "user_id", "movieId": "item_id"})

In [8]:
# Re-attach the rating column next to the renamed id columns (column-wise
# concatenation aligned on the shared row index).
movie_data_final = pd.concat([X, y], axis="columns")

In [9]:
# Inspect the frame that is handed to the split: user_id, item_id, rating.
movie_data_final

Unnamed: 0,user_id,item_id,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [10]:
# Prepare data for online learning: split the ratings into an initial
# training set, an update training set, and an update test set.
# frac_new_users=0.2 is passed straight to the library helper.
# NOTE(review): no random seed is set in the visible notebook; the shuffled
# row indices in the outputs below suggest the split is randomized, so the
# exact numbers will likely differ between runs — confirm and seed if needed.
(
    X_train_initial,
    y_train_initial,
    X_train_update,
    y_train_update,
    X_test_update,
    y_test_update,
) = train_update_test_split(movie_data_final, frac_new_users=0.2)
# movie_data_final (with the renamed user_id / item_id columns) is passed
# instead of the raw movie_data frame, which still has userId / movieId.

In [11]:
# Inspect the (user_id, item_id) pairs used for the initial fit.
X_train_initial

Unnamed: 0,user_id,item_id
100587,610,98061
10998,68,4447
54082,356,48342
30291,212,1207
11742,71,609
...,...,...
78852,489,4054
7258,50,32657
56555,376,541
76593,480,3499


In [12]:
# Inspect the ratings that go with X_train_initial in the initial fit.
y_train_initial

100587    3.5
10998     4.0
54082     5.0
30291     3.5
11742     3.0
         ... 
78852     1.0
7258      3.5
56555     5.0
76593     3.0
38233     3.0
Name: rating, Length: 83152, dtype: float64

In [13]:
# Inspect the (user_id, item_id) pairs later passed to update_users.
X_train_update

Unnamed: 0,user_id,item_id
8741,60,1203
37905,257,494
84602,543,101765
32068,220,318
71945,462,90866
...,...,...
5177,33,2501
3794,23,541
94010,599,5254
92918,599,780


In [14]:
# Inspect the ratings that go with X_train_update in the update step.
y_train_update

8741     3.0
37905    3.0
84602    5.0
32068    5.0
71945    2.5
        ... 
5177     2.0
3794     5.0
94010    3.5
92918    3.5
47889    4.0
Name: rating, Length: 8842, dtype: float64

In [15]:
# Inspect the (user_id, item_id) pairs the updated model is asked to predict.
X_test_update

Unnamed: 0,user_id,item_id
52411,341,435
93874,599,4483
27228,184,172705
96109,603,32
1083,9,1270
...,...,...
77914,483,98585
89920,584,173
94660,599,58559
52835,346,6530


In [16]:
# Inspect the true ratings that the test RMSE is computed against.
y_test_update

52411    3.0
93874    1.5
27228    5.0
96109    3.0
1083     5.0
        ... 
77914    3.5
89920    4.0
94660    3.5
52835    4.5
66411    2.0
Name: rating, Length: 8842, dtype: float64

In [37]:
# Initial training: configure the kernel matrix-factorization model
# (100 latent factors, 20 epochs, per-epoch progress output).
matrix_fact = KernelMF(
    n_factors=100,
    n_epochs=20,
    lr=0.001,
    reg=0.005,
    verbose=1,
)

In [38]:
# Fit the model on the initial training split; with verbose=1 the train RMSE
# is printed after every epoch.
matrix_fact.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 0.9908606370496144
Epoch  2 / 20  -  train_rmse: 0.9646942928673473
Epoch  3 / 20  -  train_rmse: 0.9475215723317651
Epoch  4 / 20  -  train_rmse: 0.9347379740060606
Epoch  5 / 20  -  train_rmse: 0.9245831530853402
Epoch  6 / 20  -  train_rmse: 0.9161169131252159
Epoch  7 / 20  -  train_rmse: 0.9088380625216923
Epoch  8 / 20  -  train_rmse: 0.9024188985470719
Epoch  9 / 20  -  train_rmse: 0.8966448066714161
Epoch  10 / 20  -  train_rmse: 0.8913744883475714
Epoch  11 / 20  -  train_rmse: 0.8864912540866319
Epoch  12 / 20  -  train_rmse: 0.8819469627110944
Epoch  13 / 20  -  train_rmse: 0.8776326422393878
Epoch  14 / 20  -  train_rmse: 0.8735537448740682
Epoch  15 / 20  -  train_rmse: 0.8696555256936447
Epoch  16 / 20  -  train_rmse: 0.8658988366715559
Epoch  17 / 20  -  train_rmse: 0.8622792436938885
Epoch  18 / 20  -  train_rmse: 0.8587753471811304
Epoch  19 / 20  -  train_rmse: 0.855345443975685
Epoch  20 / 20  -  train_rmse: 0.8519978232247656


In [39]:
# Update model with new users: continue from the fitted model using the
# update split, with the same epoch count and learning rate as the initial fit.
matrix_fact.update_users(
    X_train_update,
    y_train_update,
    n_epochs=20,
    lr=0.001,
    verbose=1,
)

Epoch  1 / 20  -  train_rmse: 0.9267355640101002
Epoch  2 / 20  -  train_rmse: 0.9123923306301637
Epoch  3 / 20  -  train_rmse: 0.9025503445198184
Epoch  4 / 20  -  train_rmse: 0.895419740139639
Epoch  5 / 20  -  train_rmse: 0.8899452499725353
Epoch  6 / 20  -  train_rmse: 0.8855857642522892
Epoch  7 / 20  -  train_rmse: 0.8819900903033405
Epoch  8 / 20  -  train_rmse: 0.8789411826858466
Epoch  9 / 20  -  train_rmse: 0.876276141774009
Epoch  10 / 20  -  train_rmse: 0.8739129900059123
Epoch  11 / 20  -  train_rmse: 0.8717885569268361
Epoch  12 / 20  -  train_rmse: 0.869846509286
Epoch  13 / 20  -  train_rmse: 0.8680491501305712
Epoch  14 / 20  -  train_rmse: 0.8663712437805832
Epoch  15 / 20  -  train_rmse: 0.8647974753816823
Epoch  16 / 20  -  train_rmse: 0.8633053585915009
Epoch  17 / 20  -  train_rmse: 0.8618855831472051
Epoch  18 / 20  -  train_rmse: 0.8605236237954392
Epoch  19 / 20  -  train_rmse: 0.8592170938663664
Epoch  20 / 20  -  train_rmse: 0.8579561266211527


In [40]:
# Evaluate the updated model on the held-out update-test ratings.
pred = matrix_fact.predict(X_test_update)

# Use root_mean_squared_error (imported at the top of the notebook) rather
# than mean_squared_error(..., squared=False): mean_squared_error is never
# imported here, so a fresh kernel raises NameError, and the `squared`
# argument has been removed from recent scikit-learn releases.
rmse = root_mean_squared_error(y_test_update, pred)
print(f"\nTest RMSE: {rmse:.4f}")

  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)



Test RMSE: 0.8786




In [43]:
# Score the predictions computed in the previous cell (`pred` is reused, not
# recomputed) against the held-out ratings.
rmse = root_mean_squared_error(y_true=y_test_update, y_pred=pred)
print(f"\nTest RMSE: {rmse:.4f}")


Test RMSE: 0.8786


In [45]:
# Get recommendations for a single user. The items this user has in the
# initial training split are passed as `items_known`.
user = 200
items_known = X_train_initial.loc[X_train_initial["user_id"] == user, "item_id"]
matrix_fact.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
79,200,50,4.617956
171,200,1193,4.486053
357,200,608,4.472589
423,200,1136,4.468778
178,200,527,4.457607
818,200,1221,4.453756
105,200,593,4.42847
133,200,912,4.418559
110,200,750,4.384666
95,200,1265,4.369119


# Hyperparameter search

In [None]:
# Candidate settings for the hyperparameter search (this cell has not been
# executed yet).
learning_rate = 0.0002
regularization_param = 0.02
latent_features = [2,4,8,16,100]  # candidate values for n_factors, presumably — TODO confirm
n_epochs_steps = [20,100,5000]  # candidate values for n_epochs, presumably — TODO confirm
# NOTE(review): none of the four values above are passed to the constructor
# below; it repeats the hard-coded settings from the initial-training cell.
# NOTE(review): this rebinds `matrix_fact`, so the fitted/updated model from
# the earlier cells is replaced by a new, unfitted one — confirm intent.
matrix_fact = KernelMF(n_epochs=20, n_factors=100, verbose=1, lr=0.001, reg=0.005)
