In [5]:
!pip install scrapbook
!pip install recommenders

Collecting recommenders
  Downloading recommenders-1.1.0-py3-none-manylinux1_x86_64.whl (335 kB)
[K     |████████████████████████████████| 335 kB 7.9 MB/s 
Collecting lightfm<2,>=1.15
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 61.8 MB/s 
Collecting pandera[strategies]>=0.6.5
  Downloading pandera-0.9.0-py3-none-any.whl (197 kB)
[K     |████████████████████████████████| 197 kB 47.7 MB/s 
[?25hCollecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Collecting nltk<4,>=3.4
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 39.1 MB/s 
[?25hCollecting memory-profiler<1,>=0.54.0
  Downloading memory_profiler-0.60.0.tar.gz (38 kB)
Collecting transformers<5,>=2.5.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 28.0 MB/s 
Collecting category-encoders<2,>=1.3.0
  Downloading category_encoders-1.3.0-py2.py3-none-any.wh

In [33]:
# set the environment path to find Recommenders
from tempfile import TemporaryDirectory
import sys
import os
import pandas as pd
import numpy as np
import scrapbook as sb
import torch, fastai
from fastai.collab import collab_learner, CollabDataBunch, load_learner

In [34]:
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.models.fastai.fastai_utils import cartesian_product, score
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.evaluation.python_evaluation import rmse, mae, rsquared, exp_var

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Fast AI version: {}".format(fastai.__version__))
print("Torch version: {}".format(torch.__version__))
print("Cuda Available: {}".format(torch.cuda.is_available()))
print("CuDNN Enabled: {}".format(torch.backends.cudnn.enabled))

System version: 3.7.13 (default, Mar 16 2022, 17:37:17) 
[GCC 7.5.0]
Pandas version: 1.3.5
Fast AI version: 1.0.61
Torch version: 1.10.0+cu111
Cuda Available: False
CuDNN Enabled: True


In [35]:
# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

COL_USER = "UserId"
COL_ITEM = "MovieId"
COL_RATING = "Rating"
COL_PREDICTION = "Rating"
COL_TIMESTAMP = "Timestamp"

# Model parameters
N_FACTORS = 40
EPOCHS = 5

In [43]:
ratings_df = pd.read_csv('/content/ratings.csv', sep=",", names=[COL_USER, COL_ITEM, COL_RATING, COL_TIMESTAMP], header=None)

ratings_df = ratings_df.iloc[1:]

In [44]:
#ratings_df[UserId] = ratings_df[UserId].astype('str')
#ratings_df[MovieId] = ratings_df[MovieId].astype('str')

ratings_df.head()

Unnamed: 0,UserId,MovieId,Rating,Timestamp
1,1,1,4,964982703
2,1,3,4,964981247
3,1,6,4,964982224
4,1,47,5,964983815
5,1,50,5,964982931


In [48]:
train_valid_df, test_df = python_stratified_split(
    ratings_df, filter_by="item", min_rating=10, ratio=0.8,
    col_user=COL_USER, col_item=COL_ITEM
)

print(test_df)

      UserId MovieId Rating   Timestamp
56795    378       1    4.5  1445347576
82208    522       1      3  1253344674
89244    579       1      4   958881146
25805    178       1      4  1164354911
22685    156       1      4  1106854640
...      ...     ...    ...         ...
79394    492     996      3   863976753
5306      36     996    2.5  1100803714
28124    195     996      1   974706168
26236    182     999    3.5  1075764968
48300    313     999      4  1030557936

[16175 rows x 4 columns]


In [49]:
test_df = test_df[test_df.UserId.isin(train_valid_df.UserId)]

In [50]:
with Timer() as preprocess_time:
    data = CollabDataBunch.from_df(train_valid_df, 
                                   user_name=COL_USER, 
                                   item_name=COL_ITEM, 
                                   rating_name=COL_RATING, 
                                   valid_pct=0)

In [51]:
data.show_batch()

UserId,MovieId,target
380,68954,5.0
135,1377,2.0
610,102445,3.0
414,3704,3.0
603,4226,5.0


In [52]:
learn = collab_learner(data, n_factors=N_FACTORS, y_range=[0,5.5], wd=1e-1)
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(611, 40)
  (i_weight): Embedding(2270, 40)
  (u_bias): Embedding(611, 1)
  (i_bias): Embedding(2270, 1)
)

In [53]:
with Timer() as train_time:
    learn.fit_one_cycle(EPOCHS, max_lr=5e-3)

print("Took {} seconds for training.".format(train_time))

epoch,train_loss,valid_loss,time
0,0.788003,#na#,00:09
1,0.705961,#na#,00:08
2,0.603193,#na#,00:08
3,0.504573,#na#,00:08
4,0.381772,#na#,00:09


Took 44.6071 seconds for training.


In [54]:
total_users, total_items = learn.data.train_ds.x.classes.values()
total_items = total_items[1:]
total_users = total_users[1:]

In [56]:
test_users = test_df[COL_USER].unique()
test_users = np.intersect1d(test_users, total_users)

In [57]:
users_items = cartesian_product(np.array(test_users),np.array(total_items))
users_items = pd.DataFrame(users_items, columns=[COL_USER,COL_ITEM])

In [58]:
training_removed = pd.merge(users_items, train_valid_df.astype(str), on=[COL_USER, COL_ITEM], how='left')
training_removed = training_removed[training_removed[COL_RATING].isna()][[COL_USER, COL_ITEM]]

In [59]:
with Timer() as test_time:
    top_k_scores = score(learn, 
                         test_df=training_removed,
                         user_col=COL_USER, 
                         item_col=COL_ITEM, 
                         prediction_col=COL_PREDICTION)

print("Took {} seconds for {} predictions.".format(test_time, len(training_removed)))

Took 3.7920 seconds for 1310347 predictions.


In [60]:
eval_map = map_at_k(test_df, top_k_scores, col_user=COL_USER, col_item=COL_ITEM, 
                    col_rating=COL_RATING, col_prediction=COL_PREDICTION, 
                    relevancy_method="top_k", k=TOP_K)

eval_ndcg = ndcg_at_k(test_df, top_k_scores, col_user=COL_USER, col_item=COL_ITEM, 
                      col_rating=COL_RATING, col_prediction=COL_PREDICTION, 
                      relevancy_method="top_k", k=TOP_K)

eval_precision = precision_at_k(test_df, top_k_scores, col_user=COL_USER, col_item=COL_ITEM, 
                                col_rating=COL_RATING, col_prediction=COL_PREDICTION, 
                                relevancy_method="top_k", k=TOP_K)

eval_recall = recall_at_k(test_df, top_k_scores, col_user=COL_USER, col_item=COL_ITEM, 
                          col_rating=COL_RATING, col_prediction=COL_PREDICTION, 
                          relevancy_method="top_k", k=TOP_K)

In [61]:
print("Model:\t" + learn.__class__.__name__,
      "Top K:\t%d" % TOP_K,
      "MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

Model:	CollabLearner
Top K:	10
MAP:	0.022092
NDCG:	0.109710
Precision@K:	0.093564
Recall@K:	0.053384


In [62]:
scores = score(learn, 
               test_df=test_df.copy(), 
               user_col=COL_USER, 
               item_col=COL_ITEM, 
               prediction_col=COL_PREDICTION)

In [63]:
eval_r2 = rsquared(test_df, scores, col_user=COL_USER, col_item=COL_ITEM, col_rating=COL_RATING, col_prediction=COL_PREDICTION)
eval_rmse = rmse(test_df, scores, col_user=COL_USER, col_item=COL_ITEM, col_rating=COL_RATING, col_prediction=COL_PREDICTION)
eval_mae = mae(test_df, scores, col_user=COL_USER, col_item=COL_ITEM, col_rating=COL_RATING, col_prediction=COL_PREDICTION)
eval_exp_var = exp_var(test_df, scores, col_user=COL_USER, col_item=COL_ITEM, col_rating=COL_RATING, col_prediction=COL_PREDICTION)

print("Model:\t" + learn.__class__.__name__,
      "RMSE:\t%f" % eval_rmse,
      "MAE:\t%f" % eval_mae,
      "Explained variance:\t%f" % eval_exp_var,
      "R squared:\t%f" % eval_r2, sep='\n')

Model:	CollabLearner
RMSE:	0.810238
MAE:	0.624997
Explained variance:	0.372604
R squared:	0.370915
