In [1]:
### 基礎建設
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [2]:
### 載入資料
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-09 15:45:20--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2022-01-09 15:45:20 (20.0 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2022-01-09 15:45:21--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2022-01-09 15:45:21 (15.2 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [3]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

In [4]:
metadata.shape

(32892, 19)

In [5]:
ratings.shape

(371345, 5)

In [6]:
### 提醒，請使用相同類別（beauty）的資料進行實作。
### 處理 rank 中的排行跟分類
metadata['rank_rank'] = metadata['rank'].str.split(" ").str.get(0).str.replace(',','').to_frame()
metadata['rank_category'] = metadata['rank'].str.split("in ").str.get(1).to_frame()

### 濾出 beauty 類別
metadata_beauty = metadata.loc[(metadata['rank_category']=='Beauty & Personal Care (')]

### 過濾重複的資料
metadata_beauty = (
     metadata_beauty
     .sort_values('asin', ascending=False)
     .groupby(['asin']).head(1)
)

### 過濾重複的資料
ratings = (
     ratings
     .sort_values("DATE", ascending=False)
     .groupby(['reviewerID', 'asin']).head(1)
)

metadata_beauty = metadata_beauty.set_index('asin')
ratings = ratings.set_index('asin')

ratings_beauty = pd.merge(ratings,metadata_beauty['rank_category'],how='inner', on=['asin'])
ratings_beauty.reset_index(inplace=True)
metadata_beauty.reset_index(inplace=True)


In [7]:
metadata_beauty.shape

(31541, 21)

In [8]:
ratings_beauty.shape

(333853, 6)

In [9]:
### 資料切分
ratings = ratings_beauty
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

In [10]:
import pandas as pd
from itertools import combinations
from collections import defaultdict

## U49_User_based_collaborative_filtering

In [11]:
### 將傳入的資料（dataframe）轉成 user_to_items dictionary，方便後續的運算。
user_to_items = defaultdict(dict)
for _, row in ratings_trainings.iterrows():
    row = dict(row)
    user = row['reviewerID']
    item = row['asin']
    rating = float(row['overall'])
    user_to_items[user][item] = rating

### 過濾掉出現次數小於 n 的使用者。（因為他們較少出現，所以過濾掉他們對結果的精確度影響並不大）。
user_to_items2=user_to_items
remove_obscure_user = True
user_rating_threshold = 3
all_users = list(user_to_items2.keys())
for user in all_users:
    ratings = user_to_items2[user]
    if remove_obscure_user and len(ratings) < user_rating_threshold:
        del user_to_items[user]   

### 使用轉置這個技巧來優化計算效率，為了優化計算的效率，我們第一步先準備資料，我們把 user_to_items dictionary 轉換成 item_to_users。
item_to_users = defaultdict(dict)
for user, items in user_to_items2.items():
    for item, rating in items.items():
        item_to_users[item][user] = rating

### 在得到 item_to_users 之後，我們就可以把 xy, xx, yy 填入 pre_user_similarity matrix，首先我們會走過每一個 item，
### 得到每一個 item 有哪一些 user 對它評分，之後產生所有 user 的倆倆組合，計算 xy, xx, yy 之後填入 
init_sim = lambda: [0 for _ in range(3)]
factory = lambda: defaultdict(init_sim)
pre_user_similarity = defaultdict(factory)
for item, user_ratings in item_to_users.items():
    if len(user_ratings) > 1:
        for user1, user2 in combinations(user_ratings.keys(), 2):
            xy = user_ratings[user1] * user_ratings[user2]
            xx = user_ratings[user1] ** 2
            yy = user_ratings[user2] ** 2
            pre_user_similarity[user1][user2][0] += xy
            pre_user_similarity[user1][user2][1] += xx
            pre_user_similarity[user1][user2][2] += yy
            pre_user_similarity[user2][user1][0] += xy
            pre_user_similarity[user2][user1][1] += xx
            pre_user_similarity[user2][user1][2] += yy

### 計算 user similarity matrix：做完計算之後將結果依大小順序加入到 list 中即可    
user_similarity = {}
for src_user in pre_user_similarity:
    user_similarity_order = []
    for dst_user, val in pre_user_similarity[src_user].items():
        xy = val[0]
        xx = val[1]
        yy = val[2]
        div = ((xx*yy) ** 0.5)
        if div == 0:
            continue
        similarity = xy / div
        if similarity < 0:
            continue
        for i, s in enumerate(user_similarity_order):
            target_similarity = s[1]
            if target_similarity < similarity:
                user_similarity_order.insert(i, (dst_user, similarity))
                break
        else:
            user_similarity_order.append((dst_user, similarity))
    user_similarity[src_user] = user_similarity_order

### 獲取最終的推薦結果
### 第一個迴圈去掃描過全部的 user，根據傳入的 user 去找跟他們最相似的其他 users ，
### 並從相似 users （第二個迴圈）有評分的商品（第三個迴圈）中依評分高低得到推薦結果（這邊有優化的空間：可以設定低於多少評分的就不推薦）。
### 要注意的是，需避免推薦到 target user 已經評分過的商品，直到所要求的數量就可以停止推薦了。             

def recommender_User_based(users=[], k=10):
  recommendation = {}
  for user in users:
      if user in user_similarity:
          sim_users = user_similarity[user]
          recommended_items = []
          recommended_items_set = set()
          user_have_rated = set(user_to_items[user])
          stop_recommend = False
          for sim_user, _ in sim_users:
            items_from_sim_user = sorted(list(user_to_items[sim_user].items()), key=lambda item: item[1])
            for item, _ in items_from_sim_user:
                if item not in user_have_rated and item not in recommended_items_set:
                  recommended_items.append(item)
                  recommended_items_set.add(item)
                if len(recommended_items) >= k:
                  stop_recommend = True
                  break
                if stop_recommend:
                  break
            recommendation[user] = recommended_items
      else:
            recommendation[user] = []
  return recommendation

In [12]:
ratings_by_user_User_based = recommender_User_based(users)

## U50_Item_based_collaborative_filtering

In [13]:
### 將傳入的資料（dataframe）轉成 item_to_users dictionary，方便後續的運算。
import pandas as pd
from itertools import combinations
from collections import defaultdict
item_to_users = defaultdict(dict)
for _, row in ratings_trainings.iterrows():
    row = dict(row)
    user = row['reviewerID']
    item = row['asin']
    rating = float(row['overall'])
    item_to_users[item][user] = rating

### 為了優化計算的效率，我們第一步先準備資料，我們把 item_to_users dictionary 轉置（transpose）成 user_to_items。
user_to_items = defaultdict(dict)
for item, rating_users in item_to_users.items():
    for user, rating in rating_users.items():
        user_to_items[user][item] = rating

### 在得到 user_to_items 之後，我們就可以把 xy, xx, yy 填入 pre_item_similarity matrix，
### 首先我們會走過每一個 user，得到每一個 user 有對哪一些 item 評分，之後產生所有 item 的倆倆組合，計算 xy, xx, yy 之後填入。            
init_sim = lambda: [0, 0, 0]
factory = lambda: defaultdict(init_sim)
pre_item_similarity = defaultdict(factory)
for user, items in user_to_items.items():
    if len(items) > 1:
        for i1, i2 in combinations(items.keys(), 2):
            xy = items[i1] * items[i2]
            xx = items[i1] ** 2
            yy = items[i2] ** 2
            pre_item_similarity[i1][i2][0] += xy
            pre_item_similarity[i1][i2][1] += xx
            pre_item_similarity[i1][i2][2] += yy
 
            pre_item_similarity[i2][i1][0] += xy
            pre_item_similarity[i2][i1][1] += xx
            pre_item_similarity[i2][i1][2] += yy

### 計算 item similarity matrix：做完計算之後將結果依大小順序加入到 list 中即可
item_similarity = {}
for src_item in pre_item_similarity:
    item_similarity_order = []
    for dst_item, val in pre_item_similarity[src_item].items():
        xy = val[0]
        xx = val[1]
        yy = val[2]
        div = ((xx*yy) ** 0.5)
        if div == 0:
          continue
        similarity = xy / div
        if similarity < 0:
          continue
        for i, s in enumerate(item_similarity_order):
          target_similarity = s[1]
          if target_similarity < similarity:
            item_similarity_order.insert(i, (dst_item, similarity))
            break
        else:
          item_similarity_order.append((dst_item, similarity))
    item_similarity[src_item] = item_similarity_order

### 獲取最終的推薦結果
### 根據傳入的 users ，先找出每個 user 有評分過的商品之後，依據相似程度找出與該商品相似的商品。
### 要注意的一點是，要避免推薦到 target user 已經評分過的商品，直到所要求的數量就可以停止推薦了。 
def recommender_Item_based(users=[], k=10):
 recommendation = {}
 for user in users:
     items = []
     items_set = set()
     stop = False
     user_has_rated = set(user_to_items[user])
     for item in user_has_rated:
         if item in item_similarity:
             for sim_item, _ in item_similarity[item]:
                 # skip the item user has rated
                 if sim_item not in user_has_rated and sim_item not in items_set:
                     items.append(sim_item)
                     items_set.add(sim_item)
                 if len(items) >= k:
                     stop = True
                     break
             if stop:
                 break
     recommendation[user] = items
 return recommendation


In [14]:
ratings_by_user_Item_based = recommender_Item_based(users)

## U51_利用套件surprise實作collaborative_filtering方法

In [15]:
training_data = ratings_trainings

### 過濾資料以免RAM不足
### 1.只出現一次評分的商品紀錄：表示只有一個人評分過一次這項商品，沒有更多紀錄可以看該商品跟其他商品的關係
### 2.只評分過一項商品的使用者：因為只評分過一個商品，沒有更多紀錄可以看該商品跟其他商品的關係

### 只出現一次評分的商品紀錄
rm1=training_data.groupby('asin').asin.count().reset_index(name='count').sort_values(['count'], ascending=False)
rm1_asin=rm1.loc[(rm1['count'] == 1)]

### 濾掉只出現一次評分的商品
training_data_asin = training_data.set_index('asin')
rm1_asin_asin = rm1_asin.set_index('asin')
tmp1_training_data=training_data_asin.drop(rm1_asin_asin.index)
tmp1_training_data.reset_index(inplace=True)

### 只評分過一項商品的使用者
rm2=tmp1_training_data.groupby('reviewerID').reviewerID.count().reset_index(name='count').sort_values(['count'], ascending=False)
rm2_reviewerID=rm2.loc[(rm2['count'] == 1)]

### 濾掉只評分過一項商品的使用者
training_data_reviewerID = tmp1_training_data.set_index('reviewerID')
rm2_reviewerID_reviewerID = rm2_reviewerID.set_index('reviewerID')
tmp2_training_data=training_data_reviewerID.drop(rm2_reviewerID_reviewerID.index)
tmp2_training_data.reset_index(inplace=True)

training_data = tmp2_training_data


### 安裝 surprise

In [16]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 5.2 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1619423 sha256=644b51e833a2558e01f497d06fbf21b4252c63c2adfa3232a306135f81413e13
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [17]:
import time
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise import KNNBasic

In [18]:
### 將資料讀取成 Surprise 所需的格式
reader = Reader(rating_scale=(0, 5))
training_data = training_data[['reviewerID', 'asin', 'overall']]
data = Dataset.load_from_df(training_data, reader=reader)

### 設定所使用的演算法及參數
sim_options = {
     'name': 'cosine',
     'user_based': False  # compute similarities between items
}
algo = KNNBasic
algo_impl = algo(sim_options=sim_options)
trainset = data.build_full_trainset()
algo_impl.fit(trainset)

### 獲取推薦結果
def recommender_surprise(users=[], k=10):
  recommendation = {}
  for user in users:
     items_user_rated = set(training_data.loc[training_data['reviewerID'] == user]['asin'].to_list())
     recommend_item_list = []
     recommend_item_set = set()
     for item in items_user_rated:
         iid = algo_impl.trainset.to_inner_iid(item)
         recommend_items_iid = algo_impl.get_neighbors(iid, k)
         for sim_item_iid in recommend_items_iid:
             item_raw_id = algo_impl.trainset.to_raw_iid(sim_item_iid)
             if item_raw_id not in items_user_rated and item_raw_id not in recommend_item_set:
                 recommend_item_list.append(item_raw_id)
                 recommend_item_set.add(item_raw_id)
 
         if len(recommend_item_list) >= k:
             recommend_item_list = recommend_item_list[:k]
             break
     recommendation[user] = recommend_item_list
  return recommendation


Computing the cosine similarity matrix...
Done computing similarity matrix.


In [19]:
ratings_by_user_surprise = recommender_surprise(users)

## 評估結果

In [20]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

In [21]:
evaluate(ratings_testings_by_user, ratings_by_user_User_based)

0.0

In [22]:
evaluate(ratings_testings_by_user, ratings_by_user_Item_based)

0.003424657534246575

In [23]:
evaluate(ratings_testings_by_user, ratings_by_user_surprise)

0.0017123287671232876