In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
pip install papermill

Collecting papermill
  Downloading https://files.pythonhosted.org/packages/2f/9b/13bc32699675dbb5fa12bc8f046c3a57a4b4f43eb5fe1f1e52034f23bb7f/papermill-2.2.2-py3-none-any.whl
Collecting tenacity
  Downloading https://files.pythonhosted.org/packages/4e/e4/bcaf6978c0811fbb480acc9bd6e024b53390a61d153fa0be4f20a6c80d94/tenacity-6.3.1-py2.py3-none-any.whl
Collecting black
[?25l  Downloading https://files.pythonhosted.org/packages/dc/7b/5a6bbe89de849f28d7c109f5ea87b65afa5124ad615f3419e71beb29dc96/black-20.8b1.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 8.8MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting ansiwrap
  Downloading https://files.pythonhosted.org/packages/03/50/43e775a63e0d632d9be3b3fa1c9b2cbaf3b7870d203655710a3426f47c26/ansiwrap-0.8.4-py2.py3-none-any.whl
Collecting appdirs
  Downloading https://files.pythonhosted.org/packages/

In [6]:
%cd /content/gdrive/MyDrive/ADM_Final_Project

/content/gdrive/MyDrive/ADM_Final_Project


In [7]:
! git clone https://github.com/Microsoft/Recommenders
import os
os.chdir('/content/gdrive/My Drive/ADM_Final_Project/Recommenders')

fatal: destination path 'Recommenders' already exists and is not an empty directory.


In [8]:
import sys
sys.path.append("../../")
import time
import os
import matplotlib.pyplot as plt
import itertools
import pandas as pd
import numpy as np
import papermill as pm
import torch, fastai
import datetime as dt
from fastai.collab import EmbeddingDotBias, collab_learner, CollabDataBunch, load_learner

from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.recommender.fastai.fastai_utils import cartesian_product, score
from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from reco_utils.evaluation.python_evaluation import rmse, mae, rsquared, exp_var

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Fast AI version: {}".format(fastai.__version__))
print("Torch version: {}".format(torch.__version__))
print("Cuda Available: {}".format(torch.cuda.is_available()))
print("CuDNN Enabled: {}".format(torch.backends.cudnn.enabled))

System version: 3.6.9 (default, Oct  8 2020, 12:12:24) 
[GCC 8.4.0]
Pandas version: 1.1.5
Fast AI version: 1.0.61
Torch version: 1.7.0+cu101
Cuda Available: False
CuDNN Enabled: True


In [10]:
fields = ["Customer_ID","ProductID","Rating","Order_Date"]

data = pd.read_csv('/content/gdrive/MyDrive/ADM_Final_Project/reg_lost.csv', skipinitialspace=True, usecols=fields)
# Convert to 32-bit in order to reduce memory consumption 
print(data["Rating"].isna().count()) 
#df.fillna(0)
data.loc[:, 'Rating'] = data['Rating'].astype(np.int32) 
data.columns = ["Order_Date","Customer_ID","ProductID","Review"]
data.head()


21699


Unnamed: 0,Order_Date,Customer_ID,ProductID,Review
0,2014-01-01,8,1,4
1,2014-01-01,9,1,4
2,2014-01-01,12,1,4
3,2014-01-01,20,1,4
4,2014-01-01,24,1,4


In [11]:
df_fastai=data[['Customer_ID','ProductID','Review','Order_Date']]

In [12]:
data['Order_Date'] = pd.to_datetime(data['Order_Date'])
data['Timestamp'] =(data['Order_Date'] - dt.datetime(1970,1,1)).dt.total_seconds()

In [13]:
USER, ITEM, RATING, TIMESTAMP, PREDICTION = 'Customer_ID', 'ProductID', 'Review', 'Order_Date', 'Prediction'

In [14]:
# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
#SNACKS_DATA_SIZE = '100k'

# Model parameters
N_FACTORS = 40
EPOCHS = 5

In [15]:
# Split the dataset
train_valid_df, test_df = python_stratified_split(
    df_fastai, 
    ratio=0.70, 
    min_rating=1, 
    filter_by="user", 
    col_user=USER, 
    col_item=ITEM
)

In [16]:
#train_valid_df=df_fastai
#test_df = df_fastai

In [17]:
len(train_valid_df)

15241

In [18]:
train_valid_df["Customer_ID"].value_counts(normalize=True)
len(test_df)

6458

In [19]:
train_valid_df["Customer_ID"].value_counts(normalize=True)

2975    0.005315
2354    0.003806
845     0.003674
3055    0.003609
1012    0.003477
          ...   
3986    0.000066
1987    0.000066
2027    0.000066
4       0.000066
2049    0.000066
Name: Customer_ID, Length: 2832, dtype: float64

In [20]:
# fix random seeds to make sure our runs are reproducible
np.random.seed(101)
torch.manual_seed(101)
torch.cuda.manual_seed_all(101)

In [21]:
start_time = time.time()

data = CollabDataBunch.from_df(train_valid_df, user_name=USER, item_name=ITEM, rating_name=RATING, valid_pct=0)

preprocess_time = time.time() - start_time

In [22]:
data.show_batch()

Customer_ID,ProductID,target
3179,110,5.0
1012,20,3.0
2566,381,4.0
53,233,5.0
949,188,4.0


In [23]:
learn = collab_learner(data, n_factors=N_FACTORS, y_range=[0,5.5], wd=1e-1)
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(2833, 40)
  (i_weight): Embedding(371, 40)
  (u_bias): Embedding(2833, 1)
  (i_bias): Embedding(371, 1)
)

In [24]:
start_time = time.time()

learn.fit_one_cycle(EPOCHS, max_lr=5e-3)

train_time = time.time() - start_time + preprocess_time
print("Took {} seconds for training.".format(train_time))

epoch,train_loss,valid_loss,time
0,1.505921,#na#,00:02
1,0.928712,#na#,00:02
2,0.644228,#na#,00:01
3,0.406435,#na#,00:02
4,0.304513,#na#,00:01


Took 10.365350008010864 seconds for training.


In [25]:
learn.export('superstore_model.pkl')

In [26]:
learner = load_learner(path=".", file = 'superstore_model.pkl')

In [27]:
total_users, total_items = learner.data.train_ds.x.classes.values()
total_items = total_items[1:]
total_users = total_users[1:]

In [28]:
print(total_users,total_items)
print(len(total_items))
print(len(total_users))

['1' '2' '4' '5' ... '4002' '4006' '4009' '4010'] ['1' '2' '3' '4' ... '436' '437' '438' '440']
370
2832


In [29]:
test_users = test_df[USER].unique()
test_users = np.intersect1d(test_users, total_users)
print(len(test_users))

2351


In [30]:
users_items = cartesian_product(np.array(test_users),np.array(total_items))
users_items = pd.DataFrame(users_items, columns=[USER,ITEM])

In [31]:

users_items.shape

(869870, 2)

In [32]:
training_removed = pd.merge(users_items, train_valid_df.astype(str), on=[USER, ITEM], how='left')
training_removed = training_removed[training_removed[RATING].isna()][[USER, ITEM]]

In [33]:

training_removed

Unnamed: 0,Customer_ID,ProductID
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5
...,...,...
869865,999,435
869866,999,436
869867,999,437
869868,999,438


In [34]:
start_time = time.time()

top_k_scores = score(learner, 
                     test_df=training_removed,
                     user_col=USER, 
                     item_col=ITEM, 
                     prediction_col=PREDICTION)

test_time = time.time() - start_time
print("Took {} seconds for {} predictions.".format(test_time, len(training_removed)))

Took 1.4022915363311768 seconds for 855110 predictions.


In [35]:
top_k_scores.head(10)

Unnamed: 0,Customer_ID,ProductID,Prediction
269,1,318,4.565425
48,1,50,4.518311
250,1,296,4.425762
98,1,111,4.35028
31,1,32,4.308034
209,1,246,4.273799
53,1,58,4.248708
28,1,29,4.246809
223,1,260,4.238332
108,1,123,4.224183


In [36]:
print(test_df.dtypes)
test_df["ProductID"] = test_df["ProductID"].astype(str).astype(object)
test_df["Customer_ID"] = test_df["Customer_ID"].astype(str).astype(object)

Customer_ID     int64
ProductID       int64
Review          int32
Order_Date     object
dtype: object


In [37]:
top_k_scores.dtypes


Customer_ID     object
ProductID       object
Prediction     float32
dtype: object

In [38]:
eval_map = map_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, 
                    col_rating=RATING, col_prediction=PREDICTION, 
                    relevancy_method="top_k", k=TOP_K)

In [39]:
eval_ndcg = ndcg_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, 
                      col_rating=RATING, col_prediction=PREDICTION, 
                      relevancy_method="top_k", k=TOP_K)

In [40]:
eval_precision = precision_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, 
                                col_rating=RATING, col_prediction=PREDICTION, 
                                relevancy_method="top_k", k=TOP_K)

In [41]:
eval_recall = recall_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, 
                          col_rating=RATING, col_prediction=PREDICTION, 
                          relevancy_method="top_k", k=TOP_K)

In [42]:
eval_r2 = rsquared(test_df.copy(), top_k_scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)
eval_rmse = rmse(test_df, top_k_scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)
eval_mae = mae(test_df, top_k_scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)
eval_exp_var = exp_var(test_df, top_k_scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)

print("Model:\t" + learn.__class__.__name__,
      "RMSE:\t%f" % eval_rmse,
      "MAE:\t%f" % eval_mae,
      "Explained variance:\t%f" % eval_exp_var,
      "R squared:\t%f" % eval_r2, sep='\n')

Model:	CollabLearner
RMSE:	0.998612
MAE:	0.771425
Explained variance:	0.231547
R squared:	0.229498
