https://nbviewer.jupyter.org/github/microsoft/recommenders/blob/master/notebooks/00_quick_start/sar_movielens.ipynb

In [125]:
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_columns', 25)

df = pd.read_csv(os.path.join("..","..","data","TA_User_Reviws_Korea_all_new_df.csv"))
df.head()

Unnamed: 0,userID,rating,location.locationId,location.name,location.placeType,location.reviewSummary.count,location.reviewSummary.rating,location.reviewSummary.locationId,tripInfo.stayDate,userProfile.hometown.location.name
0,0,4,306130,Lotte Hotel Seoul,ACCOMMODATION,4033.0,4.5,306130.0,2016-02-29,Jakarta
1,0,2,6352819,VIP TRAVEL,ATTRACTION,276.0,4.5,6352819.0,2016-02-29,Jakarta
2,1,5,9033360,Haagen Dazs,EATERY,10.0,4.5,9033360.0,2015-08-31,Seoul
3,1,5,9017499,Gongcha,EATERY,8.0,4.0,9017499.0,2015-11-30,Seoul
4,1,5,4076062,Mr. Pizza Terminal,EATERY,7.0,4.0,4076062.0,2015-06-30,Seoul


In [126]:
COLUMNS = ['userID', 'location.locationId', 'rating', 'stayDate']
df = df.loc[:,COLUMNS]
df.head()

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'

In [127]:
# ! pip install papermill

In [128]:
# set the environment path to find Recommenders
import sys
sys.path.append("../../")

import itertools
import logging
import os

import numpy as np
import pandas as pd
import papermill as pm

from reco_utils.dataset.python_splitters import python_random_split
from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from reco_utils.recommender.sar.sar_singlenode import SARSingleNode

In [129]:
# top k items to recommend
TOP_K = 10

In [130]:
# Convert the float precision to 32-bit in order to reduce memory consumption 
df.loc[:, 'rating'] = df['rating'].astype(np.float64)
df.head()

Unnamed: 0,userID,rating,location.locationId,location.name,location.placeType,location.reviewSummary.count,location.reviewSummary.rating,location.reviewSummary.locationId,tripInfo.stayDate,userProfile.hometown.location.name
0,0,4.0,306130,Lotte Hotel Seoul,ACCOMMODATION,4033.0,4.5,306130.0,2016-02-29,Jakarta
1,0,2.0,6352819,VIP TRAVEL,ATTRACTION,276.0,4.5,6352819.0,2016-02-29,Jakarta
2,1,5.0,9033360,Haagen Dazs,EATERY,10.0,4.5,9033360.0,2015-08-31,Seoul
3,1,5.0,9017499,Gongcha,EATERY,8.0,4.0,9017499.0,2015-11-30,Seoul
4,1,5.0,4076062,Mr. Pizza Terminal,EATERY,7.0,4.0,4076062.0,2015-06-30,Seoul


In [131]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28561 entries, 0 to 28560
Data columns (total 10 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   userID                              28561 non-null  int64  
 1   rating                              28561 non-null  float64
 2   location.locationId                 28561 non-null  int64  
 3   location.name                       28561 non-null  object 
 4   location.placeType                  28561 non-null  object 
 5   location.reviewSummary.count        28561 non-null  float64
 6   location.reviewSummary.rating       28561 non-null  float64
 7   location.reviewSummary.locationId   28561 non-null  float64
 8   tripInfo.stayDate                   28561 non-null  object 
 9   userProfile.hometown.location.name  28561 non-null  object 
dtypes: float64(4), int64(2), object(4)
memory usage: 2.2+ MB


In [132]:
# import datetime

# def convert_date(str_date):
#   return datetime.datetime.strptime(str_date, "%Y-%m-%d")

# df['stayDate'] = df['stayDate'].apply(convert_date)
# print(df.info())

In [134]:
def clean(date):
  return date.replace('-','')

df['stayDate'] = df['tripInfo.stayDate'].apply(clean)
df.head()

Unnamed: 0,userID,rating,location.locationId,location.name,location.placeType,location.reviewSummary.count,location.reviewSummary.rating,location.reviewSummary.locationId,tripInfo.stayDate,userProfile.hometown.location.name,stayDate
0,0,4.0,306130,Lotte Hotel Seoul,ACCOMMODATION,4033.0,4.5,306130.0,2016-02-29,Jakarta,20160229
1,0,2.0,6352819,VIP TRAVEL,ATTRACTION,276.0,4.5,6352819.0,2016-02-29,Jakarta,20160229
2,1,5.0,9033360,Haagen Dazs,EATERY,10.0,4.5,9033360.0,2015-08-31,Seoul,20150831
3,1,5.0,9017499,Gongcha,EATERY,8.0,4.0,9017499.0,2015-11-30,Seoul,20151130
4,1,5.0,4076062,Mr. Pizza Terminal,EATERY,7.0,4.0,4076062.0,2015-06-30,Seoul,20150630


In [135]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28561 entries, 0 to 28560
Data columns (total 11 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   userID                              28561 non-null  int64  
 1   rating                              28561 non-null  float64
 2   location.locationId                 28561 non-null  int64  
 3   location.name                       28561 non-null  object 
 4   location.placeType                  28561 non-null  object 
 5   location.reviewSummary.count        28561 non-null  float64
 6   location.reviewSummary.rating       28561 non-null  float64
 7   location.reviewSummary.locationId   28561 non-null  float64
 8   tripInfo.stayDate                   28561 non-null  object 
 9   userProfile.hometown.location.name  28561 non-null  object 
 10  stayDate                            28561 non-null  object 
dtypes: float64(4), int64(2), object(5)
memory

In [136]:
df['stayDate'] = df['stayDate'].astype('int')

In [137]:
df.head()

Unnamed: 0,userID,rating,location.locationId,location.name,location.placeType,location.reviewSummary.count,location.reviewSummary.rating,location.reviewSummary.locationId,tripInfo.stayDate,userProfile.hometown.location.name,stayDate
0,0,4.0,306130,Lotte Hotel Seoul,ACCOMMODATION,4033.0,4.5,306130.0,2016-02-29,Jakarta,20160229
1,0,2.0,6352819,VIP TRAVEL,ATTRACTION,276.0,4.5,6352819.0,2016-02-29,Jakarta,20160229
2,1,5.0,9033360,Haagen Dazs,EATERY,10.0,4.5,9033360.0,2015-08-31,Seoul,20150831
3,1,5.0,9017499,Gongcha,EATERY,8.0,4.0,9017499.0,2015-11-30,Seoul,20151130
4,1,5.0,4076062,Mr. Pizza Terminal,EATERY,7.0,4.0,4076062.0,2015-06-30,Seoul,20150630


In [138]:
train, test = python_random_split(df, 0.75)

In [139]:
header = {
    "col_user": "userID",
    "col_item": "location.locationId",
    "col_rating": "rating",
    "col_timestamp": "stayDate",
    "col_prediction": "Prediction",
}

In [140]:
# set log level to INFO
logging.basicConfig(level=logging.DEBUG, 
                    format='%(asctime)s %(levelname)-8s %(message)s')

model = SARSingleNode(
    similarity_type="jaccard", 
    time_decay_coefficient=30, 
    time_now=None, 
    timedecay_formula=True, 
    **header
)

In [141]:
model.fit(train)

2020-05-28 15:42:38,183 INFO     Collecting user affinity matrix
2020-05-28 15:42:38,186 INFO     Calculating time-decayed affinities
2020-05-28 15:42:38,204 INFO     Creating index columns
2020-05-28 15:42:38,246 INFO     Building user affinity sparse matrix
2020-05-28 15:42:38,249 INFO     Calculating item co-occurrence
2020-05-28 15:42:38,413 INFO     Calculating item similarity
2020-05-28 15:42:38,413 INFO     Using jaccard based similarity
2020-05-28 15:42:42,861 INFO     Done training


In [142]:
top_k = model.recommend_k_items(test, remove_seen=True)

ValueError: SAR cannot score users that are not in the training set

In [143]:
top_k.shape

NameError: name 'top_k' is not defined

In [None]:
top_k_with_titles = (top_k.join(data[['location.locationId', 'location.name']].drop_duplicates().set_index('location.locationId'), 
                                on='location.locationId', 
                                how='inner').sort_values(by=['userId', 'Prediction'], ascending=False))
display(top_k_with_titles.head(10))

## Evaluate the results  
항목 유사도 행렬 S와 사용자 선호도 행렬 A가 행렬 곱으로생성된 추천 점수는 movielens 데이터 세트의 원래 명시적 등급과 동일한 축을 가져야 합니다.  
즉, SAR 알고리즘은 사용자 - 항목 쌍에 대한 명시적 등급을 “예측하는 것”이 아니라 관련되는 항목을 사용자에게 “추천하는 작업”을 의미합니다.  
RMSE와 같은 평가 지표보다 precision@k, recall@k 등과 같은 순위측정 기준은 SAR 알고리즘을 평가하는 데 더 적합합니다.  
다음은reco_utils에 제공된 평가 함수를 사용하여 SAR 모델을 평가하는 방법을 보여줍니다.

In [None]:
# all ranking metrics have the same arguments
args = [test, top_k]
kwargs = dict(col_user='userID', 
              col_item='location.locationId', 
              col_rating='rating', 
              col_prediction='Prediction', 
              relevancy_method='top_k', 
              k=TOP_K)

eval_map = map_at_k(*args, **kwargs)
eval_ndcg = ndcg_at_k(*args, **kwargs)
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)

In [None]:
to_k.head()


In [None]:
print({model.model_str},{TOP_K},{eval_map},{eval_ndcg},{eval_precision},{eval_recall}, sep='\n')