# SAR 
* cooccurrence : 성능 그나마 아주 아주 아주 쪼끔 나옴 
* jaccard, lift : 성능 0...ㅎ

## 1. Load Data

In [4]:
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_columns', 25)

df = pd.read_csv("TA_User_Reviews_Korea_all_new_df.csv")
df.head()

Unnamed: 0,userID,rating,location.locationId,location.name,location.placeType,location.reviewSummary.count,location.reviewSummary.rating,location.reviewSummary.locationId,tripInfo.stayDate,userProfile.hometown.location.name
0,0,4,306130,Lotte Hotel Seoul,ACCOMMODATION,4033.0,4.5,306130.0,2016-02-29,Jakarta
1,0,2,6352819,VIP TRAVEL,ATTRACTION,276.0,4.5,6352819.0,2016-02-29,Jakarta
2,1,5,9033360,Haagen Dazs,EATERY,10.0,4.5,9033360.0,2015-08-31,Seoul
3,1,5,9017499,Gongcha,EATERY,8.0,4.0,9017499.0,2015-11-30,Seoul
4,1,5,4076062,Mr. Pizza Terminal,EATERY,7.0,4.0,4076062.0,2015-06-30,Seoul


In [5]:
COLUMNS = ['userID', 'location.locationId', 'location.name', 'location.placeType' ,'rating', 'tripInfo.stayDate']
df = df.loc[:,COLUMNS]
df.head()

Unnamed: 0,userID,location.locationId,location.name,location.placeType,rating,tripInfo.stayDate
0,0,306130,Lotte Hotel Seoul,ACCOMMODATION,4,2016-02-29
1,0,6352819,VIP TRAVEL,ATTRACTION,2,2016-02-29
2,1,9033360,Haagen Dazs,EATERY,5,2015-08-31
3,1,9017499,Gongcha,EATERY,5,2015-11-30
4,1,4076062,Mr. Pizza Terminal,EATERY,5,2015-06-30


## 2. git
* https://github.com/microsoft/recommenders/tree/master/reco_utils/recommender/sar 

In [None]:
pip install git+https://github.com/microsoft/recommenders 

In [6]:
import sys
sys.path.append("../../")

import itertools
import logging
import os

import papermill as pm

from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from reco_utils.recommender.sar.sar_singlenode import SARSingleNode

In [7]:
import datetime
import time 

## 3. Data Preprocessing 

In [8]:
# top k items to recommend
TOP_K = 10
DATA_SIZE = df.shape[0]

In [9]:
df["tripInfo.stayDate"] = df.apply(lambda x : time.mktime(datetime.datetime.strptime(x["tripInfo.stayDate"], "%Y-%m-%d").timetuple()), axis=1)

In [10]:
df.head()

Unnamed: 0,userID,location.locationId,location.name,location.placeType,rating,tripInfo.stayDate
0,0,306130,Lotte Hotel Seoul,ACCOMMODATION,4,1456672000.0
1,0,6352819,VIP TRAVEL,ATTRACTION,2,1456672000.0
2,1,9033360,Haagen Dazs,EATERY,5,1440947000.0
3,1,9017499,Gongcha,EATERY,5,1448809000.0
4,1,4076062,Mr. Pizza Terminal,EATERY,5,1435590000.0


In [11]:
df2 = df.copy()

In [12]:
header = {
    "col_user": "userID",
    "col_item": "location.locationId",
    "col_rating": "rating",
    "col_timestamp": "tripInfo.stayDate",
}

In [13]:
train, test = python_stratified_split(df, ratio=0.75, col_user=header["col_user"], col_item=header["col_item"], seed=42)

In [14]:
print(train.shape, test.shape)

(22167, 6) (6394, 6)


## 4. Modeling 

In [15]:
model = SARSingleNode(
    similarity_type="cooccurrence", # jaccard, lift, cooccurrence 
    time_decay_coefficient=100, # 100일 
    time_now=None, 
    timedecay_formula=True, 
    **header
)

In [16]:
model.fit(train) # 1초 걸림 

In [17]:
top_k = model.recommend_k_items(test, remove_seen=True)

In [18]:
display(top_k.head(20))

Unnamed: 0,userID,location.locationId,prediction
0,1,1169465,0.001202
1,1,553546,0.001172
2,1,324888,0.001117
3,1,2194168,0.001058
4,1,301815,0.000873
5,1,301253,0.000768
6,1,3477158,0.000735
7,1,592506,0.000723
8,1,306139,0.000718
9,1,1379963,0.000648


In [19]:
top_k.loc[top_k["prediction"]!=0] 

Unnamed: 0,userID,location.locationId,prediction
0,1,1169465,0.001202
1,1,553546,0.001172
2,1,324888,0.001117
3,1,2194168,0.001058
4,1,301815,0.000873
...,...,...,...
17975,4552,308007,0.013322
17976,4552,1643534,0.013322
17977,4552,17678712,0.013322
17978,4552,3687319,0.013322


## 5. Evaluate the Result

In [21]:
# all ranking metrics have the same arguments
args = [test, top_k]
kwargs = dict(col_user='userID', 
              col_item='location.locationId', 
              col_rating='rating', 
              col_prediction='prediction', 
              relevancy_method='top_k', 
              k=TOP_K)

eval_map = map_at_k(*args, **kwargs)
eval_ndcg = ndcg_at_k(*args, **kwargs)
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)

In [22]:
print(f"Model:",
      f"Top K:\t\t {TOP_K}",
      f"MAP:\t\t {eval_map:f}",
      f"NDCG:\t\t {eval_ndcg:f}",
      f"Precision@K:\t {eval_precision:f}",
      f"Recall@K:\t {eval_recall:f}", sep='\n')

Model:
Top K:		 10
MAP:		 0.101020
NDCG:		 0.146850
Precision@K:	 0.044605
Recall@K:	 0.211364


### user별로 살펴보기 

In [20]:
df2.head()

Unnamed: 0,userID,location.locationId,location.name,location.placeType,rating,tripInfo.stayDate
0,0,306130,Lotte Hotel Seoul,ACCOMMODATION,4,1456672000.0
1,0,6352819,VIP TRAVEL,ATTRACTION,2,1456672000.0
2,1,9033360,Haagen Dazs,EATERY,5,1440947000.0
3,1,9017499,Gongcha,EATERY,5,1448809000.0
4,1,4076062,Mr. Pizza Terminal,EATERY,5,1435590000.0


In [25]:
def usercheck(i): 
    useri = top_k.loc[top_k["userID"]==i]

    userj = useri.join(df2[['location.locationId', 'location.name', 'location.placeType']].drop_duplicates().set_index('location.locationId'),
                            on='location.locationId', how='left')
    
    return userj 

In [38]:
df2.loc[df2["userID"]==1]

Unnamed: 0,userID,location.locationId,location.name,location.placeType,rating,tripInfo.stayDate
2,1,9033360,Haagen Dazs,EATERY,5,1440947000.0
3,1,9017499,Gongcha,EATERY,5,1448809000.0
4,1,4076062,Mr. Pizza Terminal,EATERY,5,1435590000.0
5,1,9418352,Mos Burger,EATERY,5,1448809000.0
6,1,4075465,Amoje Sinsegye Gangnam,EATERY,5,1448809000.0
7,1,4075829,Hollys Coffee Sinsegye Gangnam,EATERY,4,1446217000.0
8,1,9417118,Mongsyusyu,EATERY,5,1451488000.0
9,1,8991046,Hanuri,EATERY,5,1446217000.0
10,1,9404826,Menmusya,EATERY,5,1443539000.0
11,1,4050184,Nolbu Budaejjigae Central City,EATERY,4,1432998000.0


In [26]:
usercheck(1)

Unnamed: 0,userID,location.locationId,prediction,location.name,location.placeType
0,1,1169465,0.001202,N Seoul Tower,ATTRACTION
1,1,553546,0.001172,Myeongdong Shopping Street,ATTRACTION
2,1,324888,0.001117,Gyeongbokgung Palace,ATTRACTION
3,1,2194168,0.001058,Seoul Metro,ATTRACTION
4,1,301815,0.000873,The Westin Chosun Seoul,ACCOMMODATION
5,1,301253,0.000768,The Shilla Seoul,ACCOMMODATION
6,1,3477158,0.000735,Conrad Seoul,ACCOMMODATION
7,1,592506,0.000723,Insadong,ATTRACTION
8,1,306139,0.000718,"THE PLAZA Seoul, Autograph Collection",ACCOMMODATION
9,1,1379963,0.000648,Bukchon Hanok Village,ATTRACTION


In [39]:
df2.loc[df2["userID"]==2]

Unnamed: 0,userID,location.locationId,location.name,location.placeType,rating,tripInfo.stayDate
36,2,8587847,Four Seasons Hotel Seoul,ACCOMMODATION,5,1488208000.0
37,2,1169465,N Seoul Tower,ATTRACTION,4,1488208000.0
38,2,553546,Myeongdong Shopping Street,ATTRACTION,4,1488208000.0
39,2,324888,Gyeongbokgung Palace,ATTRACTION,5,1488208000.0
40,2,2194168,Seoul Metro,ATTRACTION,4,1488208000.0
41,2,2571660,Itaewon,ATTRACTION,4,1488208000.0
42,2,9452203,Shilla Stay Gwanghwamun,ACCOMMODATION,5,1488208000.0


In [27]:
usercheck(2)

Unnamed: 0,userID,location.locationId,prediction,location.name,location.placeType
10,2,324888,0.236662,Gyeongbokgung Palace,ATTRACTION
11,2,1379963,0.155741,Bukchon Hanok Village,ATTRACTION
12,2,592506,0.150169,Insadong,ATTRACTION
13,2,320359,0.123903,Changdeokgung Palace,ATTRACTION
14,2,1046419,0.111963,Cheonggyecheon Stream,ATTRACTION
15,2,324907,0.094453,Namdaemun Market,ATTRACTION
16,2,554582,0.090473,Namsan Park,ATTRACTION
17,2,6671988,0.087554,Dongdaemun Design Plaza (DDP),ATTRACTION
18,2,554537,0.078799,The War Memorial of Korea,ATTRACTION
19,2,301815,0.069513,The Westin Chosun Seoul,ACCOMMODATION


In [40]:
# 호텔에 대한 평점이 있어야 모델 적용 가능할 것 같음 
df2.loc[df2["userID"]==3]

Unnamed: 0,userID,location.locationId,location.name,location.placeType,rating,tripInfo.stayDate
43,3,8587847,Four Seasons Hotel Seoul,ACCOMMODATION,5,1496156000.0


In [28]:
# 보안책 뭐야 뭐야 
# ??? sparse 한 문제가 생기면 -> autoencoder 적용하고 해야할 것 같은데 ... 
# 3~5 값으로 다 채워줌 ㅜㅜ 
# 1->4.1??? : loss 해봐야 할 것 같은데 - mse 값 자체는 작게 나온다 
usercheck(3)

Unnamed: 0,userID,location.locationId,prediction,location.name,location.placeType


In [41]:
df2.loc[df2["userID"]==4]

Unnamed: 0,userID,location.locationId,location.name,location.placeType,rating,tripInfo.stayDate
44,4,10692374,Hotel28 Myeongdong,ACCOMMODATION,5,1556550000.0
45,4,306118,Grand InterContinental Seoul Parnas,ACCOMMODATION,5,1556550000.0
46,4,299154,InterContinental Seoul COEX,ACCOMMODATION,5,1556550000.0


In [29]:
usercheck(4)

Unnamed: 0,userID,location.locationId,prediction,location.name,location.placeType
20,4,299154,3.509729,InterContinental Seoul COEX,ACCOMMODATION
21,4,301815,3.190663,The Westin Chosun Seoul,ACCOMMODATION
22,4,301253,3.190663,The Shilla Seoul,ACCOMMODATION
23,4,3477158,2.871597,Conrad Seoul,ACCOMMODATION
24,4,2194168,2.55253,Seoul Metro,ATTRACTION
25,4,320359,1.914398,Changdeokgung Palace,ATTRACTION
26,4,5113510,1.914398,JW Marriott Dongdaemun Square Seoul,ACCOMMODATION
27,4,325043,1.914398,National Museum of Korea,ATTRACTION
28,4,306130,1.914398,Lotte Hotel Seoul,ACCOMMODATION
29,4,306139,1.914398,"THE PLAZA Seoul, Autograph Collection",ACCOMMODATION


In [42]:
df2.loc[df2["userID"]==5]

Unnamed: 0,userID,location.locationId,location.name,location.placeType,rating,tripInfo.stayDate
47,5,306145,Royal Hotel Seoul,ACCOMMODATION,3,1514646000.0
48,5,10692374,Hotel28 Myeongdong,ACCOMMODATION,5,1498748000.0
49,5,10514149,Amanti Hotel Seoul,ACCOMMODATION,3,1514646000.0
50,5,554534,Everland,ATTRACTION,3,1498748000.0
51,5,2642194,Q Hotel,ACCOMMODATION,3,1467212000.0
52,5,7685101,Floral Hotel Namsancity Seoul Myeongdong,ACCOMMODATION,5,1467212000.0
53,5,6671995,One Mount Water Park,ATTRACTION,1,1459350000.0
54,5,1483296,Ilsan Lake Park,ATTRACTION,5,1459350000.0
55,5,506121,New Oriental Hotel,ACCOMMODATION,5,1459350000.0
56,5,2194168,Seoul Metro,ATTRACTION,1,1459350000.0


In [30]:
usercheck(5)

Unnamed: 0,userID,location.locationId,prediction,location.name,location.placeType
30,5,553546,0.037239,Myeongdong Shopping Street,ATTRACTION
31,5,324888,0.019563,Gyeongbokgung Palace,ATTRACTION
32,5,8692662,0.016733,Crown Park Hotel Seoul,ACCOMMODATION
33,5,1169465,0.016577,N Seoul Tower,ATTRACTION
34,5,1046419,0.014717,Cheonggyecheon Stream,ATTRACTION
35,5,1552278,0.014024,Kwangjang Market,ATTRACTION
36,5,12287729,0.013276,G2 Hotel,ACCOMMODATION
37,5,301253,0.013238,The Shilla Seoul,ACCOMMODATION
38,5,299154,0.013222,InterContinental Seoul COEX,ACCOMMODATION
39,5,306139,0.013028,"THE PLAZA Seoul, Autograph Collection",ACCOMMODATION


In [81]:
# EATERY 많이 평가한 user13 (47개 평가)
df2.loc[df2["userID"]==13]

Unnamed: 0,userID,location.locationId,location.name,location.placeType,rating,tripInfo.stayDate
76,13,2355154,Hourglass Park,ATTRACTION,3,1.527692e+09
77,13,13399849,Skybay Hotel Gyeongpo,ACCOMMODATION,4,1.527692e+09
78,13,9133515,Chodang Grandma Silky Tofu,EATERY,4,1.527692e+09
79,13,3938069,Gyeongpo Beach,ATTRACTION,4,1.527692e+09
80,13,1966431,Ojukheon,ATTRACTION,4,1.527692e+09
...,...,...,...,...,...,...
143,13,3164604,Dolsan Park,ATTRACTION,4,1.432998e+09
144,13,4031882,Hwangso Sikdang,EATERY,4,1.432998e+09
145,13,3164603,Yi Sun Shin Square,ATTRACTION,3,1.432998e+09
146,13,2653511,Elysian Gangchon,ACCOMMODATION,4,1.414681e+09


In [82]:
usercheck(13)

Unnamed: 0,userID,location.locationId,prediction,location.name,location.placeType
60,13,553546,0.146499,Myeongdong Shopping Street,ATTRACTION
61,13,5541269,0.126048,Sokcho Jungang Market,ATTRACTION
62,13,301253,0.125464,The Shilla Seoul,ACCOMMODATION
63,13,1174982,0.113172,Tosokchon Samgyetang,EATERY
64,13,554537,0.10497,The War Memorial of Korea,ATTRACTION
65,13,320359,0.104967,Changdeokgung Palace,ATTRACTION
66,13,609340,0.104929,Myeong-dong Cathedral,ATTRACTION
67,13,4031198,0.104928,Pyeongyang Myeonok,EATERY
68,13,6656879,0.104916,Samcheongdong-gil Road,ATTRACTION
69,13,306105,0.10491,Imperial Palace Seoul,ACCOMMODATION


### + Lift / Jaccard : 성능 0 ~ 

In [39]:
model = SARSingleNode(
    similarity_type="lift", # jaccard, lift, cooccurrence 
    time_decay_coefficient=100, # 100일 
    time_now=None, 
    timedecay_formula=True, 
    **header
)

In [40]:
model.fit(train) # cooccurrence 보다는 쪼끔 더 걸림 

In [41]:
top_k = model.recommend_k_items(test, remove_seen=True)

In [42]:
display(top_k.head(20))

Unnamed: 0,userID,location.locationId,prediction
0,1,9314172,0.0
1,1,9070095,0.0
2,1,7748061,0.0
3,1,8990236,0.0
4,1,9003589,0.0
5,1,9028032,0.0
6,1,6876823,0.0
7,1,3938967,0.0
8,1,8634423,0.0
9,1,12951821,0.0


In [43]:
top_k.loc[top_k["prediction"] != 0]

Unnamed: 0,userID,location.locationId,prediction


In [44]:
# all ranking metrics have the same arguments
args = [test, top_k]
kwargs = dict(col_user='userID', 
              col_item='location.locationId', 
              col_rating='rating', 
              col_prediction='prediction', 
              relevancy_method='top_k', 
              k=TOP_K)

eval_map = map_at_k(*args, **kwargs)
eval_ndcg = ndcg_at_k(*args, **kwargs)
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)

In [45]:
print(f"Model:",
      f"Top K:\t\t {TOP_K}",
      f"MAP:\t\t {eval_map:f}",
      f"NDCG:\t\t {eval_ndcg:f}",
      f"Precision@K:\t {eval_precision:f}",
      f"Recall@K:\t {eval_recall:f}", sep='\n')

Model:
Top K:		 10
MAP:		 0.000012
NDCG:		 0.000245
Precision@K:	 0.000111
Recall@K:	 0.000012
