In [1]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-08 18:12:36--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2022-01-08 18:12:37 (19.9 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2022-01-08 18:12:37--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2022-01-08 18:12:38 (15.1 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [3]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

In [None]:
ratings.head()

In [4]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

In [5]:
len(ratings_trainings)

370752

### 過濾重複的資料

In [6]:
training_data = (
     ratings_trainings
     .sort_values("DATE", ascending=False)
     .groupby(['reviewerID', 'asin']).head(1)
)

In [7]:
len(training_data)

361012

###再過濾
####1.只出現一次評分的商品紀錄：表示只有一個人評分過一次這項商品，沒有更多紀錄可以看該商品跟其他商品的關係
####2.只評分過一項商品的使用者：因為只評分過一個商品，沒有更多紀錄可以看該商品跟其他商品的關係

In [8]:
rm1=training_data.groupby('asin').asin.count().reset_index(name='count').sort_values(['count'], ascending=False)
rm1_asin=rm1.loc[(rm1['count'] == 1)]
rm1_asin


Unnamed: 0,asin,count
586,B0006U2QGU,1
31378,B01FYMCF24,1
32002,B01GR29XFW,1
4027,B0034KZX0K,1
24926,B017NMTUHG,1
...,...,...
6337,B007FRNWPG,1
19547,B00UUQ5SQ0,1
6336,B007FOHZ4S,1
19552,B00UV7TVLG,1


In [9]:
### 濾掉只出現一次評分的商品
training_data_asin = training_data.set_index('asin')
rm1_asin_asin = rm1_asin.set_index('asin')
tmp1_training_data=training_data_asin.drop(rm1_asin_asin.index)
tmp1_training_data.reset_index(inplace=True)
tmp1_training_data


Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE
0,B01DUU06FK,A3LPM9597RHTCH,4.0,1535673600,2018-08-31
1,B015ZOWER2,A7REX3VF8C3A4,5.0,1535673600,2018-08-31
2,B01DJI7796,A3593GO4AYVVDO,5.0,1535673600,2018-08-31
3,B01CW24JXC,A4GTDV68XZFGD,5.0,1535673600,2018-08-31
4,B01CW8SDLU,A353ECRVPE944M,1.0,1535673600,2018-08-31
...,...,...,...,...,...
347488,B000050FDP,A3RVIXD86WUT1E,5.0,973987200,2000-11-12
347489,B000050B62,A1U7T7UCCV3SBN,5.0,972777600,2000-10-29
347490,0061073717,A2XMFX1BR0IJFJ,5.0,959990400,2000-06-03
347491,0061073717,ATKPYXA8XFKGJ,5.0,957571200,2000-05-06


In [10]:
### 只評分過一項商品的使用者
rm2=tmp1_training_data.groupby('reviewerID').reviewerID.count().reset_index(name='count').sort_values(['count'], ascending=False)
rm2_reviewerID=rm2.loc[(rm2['count'] == 1)]
rm2_reviewerID

Unnamed: 0,reviewerID,count
217205,A3MTCNGFZOYS3S,1
216821,A3MN9A0QZGYP9C,1
168845,A31O4BN0WVHMK6,1
212945,A3KY0EWXCM2KLR,1
213694,A3LADPTMCSPKKJ,1
...,...,...
108812,A2BGS9SWSW3SJ5,1
108811,A2BGQWYT14DBG2,1
108810,A2BGPURA1XUZUB,1
108809,A2BGO1WGNG9F9C,1


In [11]:
### 濾掉只評分過一項商品的使用者
training_data_reviewerID = tmp1_training_data.set_index('reviewerID')
rm2_reviewerID_reviewerID = rm2_reviewerID.set_index('reviewerID')
tmp2_training_data=training_data_reviewerID.drop(rm2_reviewerID_reviewerID.index)
tmp2_training_data.reset_index(inplace=True)
tmp2_training_data

Unnamed: 0,reviewerID,asin,overall,unixReviewTime,DATE
0,A2IW9URACR5LUD,B01DKQAXC0,4.0,1535673600,2018-08-31
1,A27C3J238GH88L,B01FIPS9WW,5.0,1535673600,2018-08-31
2,A3J0LOCVOIEDNE,B01AS25Y0U,5.0,1535500800,2018-08-29
3,A1UI2UN2YZB6MG,B01ES87TOY,3.0,1535414400,2018-08-28
4,A24J0AXBSPG5FC,B010SV3L3I,5.0,1535328000,2018-08-27
...,...,...,...,...,...
64225,A1HZVOJGCKYB0U,B000050B65,5.0,1023840000,2002-06-12
64226,A1HZVOJGCKYB0U,B000050B63,5.0,1023840000,2002-06-12
64227,A231WM2Z2JL0U3,B000050B65,4.0,1022976000,2002-06-02
64228,A2KEO12W2C0FUB,B000050FDR,1.0,1007683200,2001-12-07


In [12]:
training_data = tmp2_training_data
len(training_data)

64230

### 安裝 surprise

In [13]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 4.5 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1619414 sha256=df5780884fcbddb1fa332cdd4ecd774c11e39c944be513befb7ec10406dc1bdc
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [14]:
import time
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise import KNNBasic

### 將資料讀取成 Surprise 所需的格式

In [15]:
reader = Reader(rating_scale=(0, 5))
training_data = training_data[['reviewerID', 'asin', 'overall']]
data = Dataset.load_from_df(training_data, reader=reader)

### 設定所使用的演算法及參數

In [16]:
sim_options = {
     'name': 'cosine',
     'user_based': False  # compute similarities between items
}
algo = KNNBasic
algo_impl = algo(sim_options=sim_options)
trainset = data.build_full_trainset()
algo_impl.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fc2045ed4d0>

### 獲取推薦結果

In [17]:
def recommender(users=[], k=10):
  recommendation = {}
  for user in users:
     items_user_rated = set(training_data.loc[training_data['reviewerID'] == user]['asin'].to_list())
     recommend_item_list = []
     recommend_item_set = set()
     for item in items_user_rated:
         iid = algo_impl.trainset.to_inner_iid(item)
         recommend_items_iid = algo_impl.get_neighbors(iid, k)
         for sim_item_iid in recommend_items_iid:
             item_raw_id = algo_impl.trainset.to_raw_iid(sim_item_iid)
             if item_raw_id not in items_user_rated and item_raw_id not in recommend_item_set:
                 recommend_item_list.append(item_raw_id)
                 recommend_item_set.add(item_raw_id)
 
         if len(recommend_item_list) >= k:
             recommend_item_list = recommend_item_list[:k]
             break
     recommendation[user] = recommend_item_list
  return recommendation

In [18]:
ratings_by_user = recommender(users)

In [19]:
ratings_by_user

{'A100XQFWKQ30O2': [],
 'A103T1QOGFCSEH': [],
 'A106UKKSJ2KXPF': [],
 'A10A7GV4D5A11V': [],
 'A1119JJ37ZLB8R': [],
 'A113UOOLBSZN52': [],
 'A12M4U7WK4ALCR': [],
 'A12T8YTW6VWT7S': [],
 'A1364JXGKB46MM': [],
 'A137DALOQFKBTI': [],
 'A13FEZ3WV7S2EY': [],
 'A13IV4I1B0RXMG': [],
 'A13JU88JAHN72I': [],
 'A13K55R6VH1OOD': [],
 'A13P7VFU075A': [],
 'A13SWYE4QLB6NG': [],
 'A13ZTQ0Q4ATA41': [],
 'A142EDN04OD62U': [],
 'A142I22FIC8MZK': [],
 'A14834QTII5TLT': [],
 'A14A447VPACTBC': [],
 'A14AP6MN5XO6LB': [],
 'A14CLF25IX25US': [],
 'A14LYXC3HTBAHI': [],
 'A14VUW4KZ34EOE': [],
 'A14Y32P26G9YL': [],
 'A157T25PBS7MX4': [],
 'A15HZDSERD85C8': [],
 'A15JJ8J1FGADIX': [],
 'A15ZCL70JXXH89': [],
 'A1617KN2IAWZ6J': [],
 'A16E0O88262HKA': [],
 'A16NSZ58PTVIYF': [],
 'A16UGDXRTDLJG5': [],
 'A16X9HR3UFQQXY': [],
 'A16Y7V1CZCWKFV': [],
 'A174YOBOSW9WDN': [],
 'A1786SKRAJXH86': [],
 'A17K2BUZ20WD2': [],
 'A17LYRFV645L0V': [],
 'A18LNGVXDZBTUR': [],
 'A19503XX7GU6J2': ['B00NS8YAHU',
  'B019Z9L2SK',
  'B015DZXQ

In [20]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.001694915254237288