In [1]:
import pandas as pd
import gzip, json

def parse(path):
    """Yield one decoded JSON value per line of a gzip-compressed file.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip file containing one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` for each line, in file order.
    """
    # The context manager closes the handle once the generator is exhausted
    # (or closed early); previously the file was opened and never closed.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in the order they are yielded, and that
    number becomes the row label of the resulting frame.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

In [2]:
# Download the All_Beauty ratings CSV and the matching product-metadata archive.
# wget saves into the current working directory, while the loading cell reads
# from /content/ -- presumably this notebook targets Colab, where the two
# coincide; verify the paths when running elsewhere.
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-08 14:20:41--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2022-01-08 14:20:42 (18.0 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2022-01-08 14:20:42--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2022-01-08 14:20:43 (14.1 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [3]:
# Product metadata: one row per JSON record in the gzip archive.
metadata = getDF('/content/meta_All_Beauty.json.gz')

# The ratings CSV is read with header=None, so column names are supplied here.
ratings = pd.read_csv(
    '/content/All_Beauty.csv',
    header=None,
    names=['asin', 'reviewerID', 'overall', 'unixReviewTime'],
)

# Interpret the integer timestamps as seconds since the Unix epoch.
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

In [4]:
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE
0,143026860,A1V6B6TNIC10QE,1.0,1424304000,2015-02-19
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800,2014-12-18
2,143026860,A1572GUYS7DGSR,4.0,1407628800,2014-08-10
3,143026860,A1PSGLFK1NSVO,5.0,1362960000,2013-03-11
4,143026860,A6IKXKZMTKGSC,5.0,1324771200,2011-12-25


In [5]:
# Chronological split: reviews dated before 2018-09-01 are training data;
# reviews dated 2018-09-01 through 2018-09-30 form the test set.
# NOTE(review): the upper bound compares against 2018-09-30 00:00:00, so a
# review stamped later that day would be excluded -- the sample rows shown by
# ratings.head() carry date-only timestamps; confirm that holds throughout.
ratings_trainings = ratings.loc[ratings['DATE'] < '2018-09-01']
ratings_testings = ratings.loc[
    (ratings['DATE'] >= '2018-09-01') & (ratings['DATE'] <= '2018-09-30')
]

# Map each test-period reviewer to the list of items (asin) they rated.
ratings_testings_by_user = {
    reviewer: list(asins)
    for reviewer, asins in ratings_testings.groupby('reviewerID')['asin']
}
users = list(ratings_testings_by_user)

### 將傳入的資料（dataframe）轉成 user_to_items dictionary，方便後續的運算。

In [6]:
import pandas as pd
from itertools import combinations
from collections import defaultdict

# Nested mapping built from the training rows: reviewerID -> {asin: rating}.
# Ratings are stored as floats; if the same (reviewer, item) pair occurs more
# than once, the later row overwrites the earlier one.
#
# The three columns are zipped directly instead of using DataFrame.iterrows(),
# which constructs a Series (and here also a dict) for every single row.
user_to_items = defaultdict(dict)
for user, item, rating in zip(
    ratings_trainings['reviewerID'],
    ratings_trainings['asin'],
    ratings_trainings['overall'],
):
    user_to_items[user][item] = float(rating)

In [7]:
len(user_to_items)

323489

### 過濾掉評分次數小於 n 的使用者（因為他們較少出現，所以過濾掉他們對結果的精確度影響並不大）。

In [8]:
# Keep only users who have at least `user_rating_threshold` training ratings
# (set `remove_obscure_user = False` to keep everyone).
#
# The filtered mapping is built as a NEW dict. Previously `user_to_items2` was
# merely a second name for `user_to_items`, so the deletions silently mutated
# both, and the loop variable `ratings` overwrote the `ratings` DataFrame
# created in the loading cell. A comprehension leaks no loop variables.
remove_obscure_user = True
user_rating_threshold = 3
all_users = list(user_to_items.keys())
user_to_items2 = {
    user_id: rated_items
    for user_id, rated_items in user_to_items.items()
    if not (remove_obscure_user and len(rated_items) < user_rating_threshold)
}

In [9]:
len(user_to_items2)

4793

### 使用轉置這個技巧來優化計算效率：第一步先準備資料，把 user_to_items dictionary 轉換成 item_to_users。

In [10]:
# Invert the filtered user -> {item: rating} mapping into
# item -> {user: rating}, so all raters of an item can be looked up directly.
item_to_users = defaultdict(dict)
for reviewer, rated_items in user_to_items2.items():
    for asin, score in rated_items.items():
        item_to_users[asin][reviewer] = score

In [None]:
item_to_users

### 在得到 item_to_users 之後，我們就可以把 xy, xx, yy 填入 pre_user_similarity matrix。首先我們會走過每一個 item，得到每一個 item 有哪一些 user 對它評分，之後產生所有 user 的兩兩組合，計算 xy, xx, yy 之後填入 pre_user_similarity。

In [11]:
# pre_user_similarity[src][dst] accumulates, over every item that both users
# rated, the three sums needed for a cosine similarity:
#   [0] sum of rating_src * rating_dst
#   [1] sum of rating_src ** 2
#   [2] sum of rating_dst ** 2
init_sim = lambda: [0 for _ in range(3)]
factory = lambda: defaultdict(init_sim)
pre_user_similarity = defaultdict(factory)
for item, user_ratings in item_to_users.items():
    # combinations() yields nothing when fewer than two users rated the item,
    # so no explicit length guard is required.
    for user1, user2 in combinations(user_ratings.keys(), 2):
        rating1 = user_ratings[user1]
        rating2 = user_ratings[user2]
        xy = rating1 * rating2
        xx = rating1 ** 2
        yy = rating2 ** 2

        forward = pre_user_similarity[user1][user2]
        forward[0] += xy
        forward[1] += xx
        forward[2] += yy

        # Mirror entry: user2 is the source here, so the squared sums swap
        # places. (Previously xx/yy were stored unswapped; the similarity was
        # unaffected because only their product is used, but the slots did not
        # mean the same thing in both directions.)
        backward = pre_user_similarity[user2][user1]
        backward[0] += xy
        backward[1] += yy
        backward[2] += xx

In [None]:
pre_user_similarity

### 計算 user similarity matrix：做完計算之後將結果依大小順序加入到 list 中即可

In [12]:
# user_similarity[src] is a list of (dst_user, similarity) tuples ordered from
# most to least similar, where similarity = xy / sqrt(xx * yy) computed from
# the sums accumulated in pre_user_similarity.
user_similarity = {}
for src_user, neighbours in pre_user_similarity.items():
    user_similarity_order = []
    for dst_user, (xy, xx, yy) in neighbours.items():
        div = (xx * yy) ** 0.5
        if div == 0:
            # Undefined similarity (zero norm): skip this pair.
            continue
        similarity = xy / div
        if similarity < 0:
            continue
        user_similarity_order.append((dst_user, similarity))
    # One stable descending sort replaces the former element-by-element
    # insertion (quadratic per user). list.sort keeps equal-similarity entries
    # in encounter order even with reverse=True, which is the order the
    # insertion logic produced.
    user_similarity_order.sort(key=lambda pair: pair[1], reverse=True)
    user_similarity[src_user] = user_similarity_order

In [None]:
user_similarity

### 獲取最終的推薦結果

In [13]:
def recommender(users=[], k=10):
    """Recommend up to ``k`` unseen items per user from their most similar users.

    Reads two module-level mappings:

    * ``user_similarity``: user -> list of ``(similar_user, similarity)``
      tuples, walked in the stored order.
    * ``user_to_items``: user -> ``{item: rating}``.

    For each requested user, neighbours are visited in order and each
    neighbour's items are considered from highest to lowest rating. Items the
    user has already rated, or that were already picked, are skipped.

    Parameters
    ----------
    users : iterable
        User ids to produce recommendations for. The default list is never
        mutated.
    k : int
        Maximum number of items recommended per user.

    Returns
    -------
    dict
        user -> list of at most ``k`` item ids. Users absent from
        ``user_similarity`` (or with no neighbours) map to an empty list.
    """
    recommendation = {}
    for user in users:
        if user not in user_similarity:
            recommendation[user] = []
            continue

        user_have_rated = set(user_to_items.get(user, {}))
        recommended_items = []
        recommended_items_set = set()

        for sim_user, _ in user_similarity[user]:
            # Stop visiting neighbours once the quota is filled. The original
            # stop check sat inside the inner loop after a `break`, so it was
            # unreachable and every further neighbour could add one more item.
            if len(recommended_items) >= k:
                break
            # Highest-rated items first (the sort used to be ascending, which
            # put each neighbour's lowest-rated items at the front).
            items_from_sim_user = sorted(
                user_to_items.get(sim_user, {}).items(),
                key=lambda item: item[1],
                reverse=True,
            )
            for item, _ in items_from_sim_user:
                if len(recommended_items) >= k:
                    break
                if item in user_have_rated or item in recommended_items_set:
                    continue
                recommended_items.append(item)
                recommended_items_set.add(item)

        # Assigned after the neighbour loop so a user with an empty neighbour
        # list still gets an (empty) entry.
        recommendation[user] = recommended_items
    return recommendation

In [14]:
ratings_by_user = recommender(users)
ratings_by_user


{'A100XQFWKQ30O2': [],
 'A103T1QOGFCSEH': [],
 'A106UKKSJ2KXPF': [],
 'A10A7GV4D5A11V': [],
 'A1119JJ37ZLB8R': [],
 'A113UOOLBSZN52': [],
 'A12M4U7WK4ALCR': [],
 'A12T8YTW6VWT7S': [],
 'A1364JXGKB46MM': [],
 'A137DALOQFKBTI': [],
 'A13FEZ3WV7S2EY': [],
 'A13IV4I1B0RXMG': [],
 'A13JU88JAHN72I': [],
 'A13K55R6VH1OOD': [],
 'A13P7VFU075A': [],
 'A13SWYE4QLB6NG': [],
 'A13ZTQ0Q4ATA41': [],
 'A142EDN04OD62U': [],
 'A142I22FIC8MZK': [],
 'A14834QTII5TLT': [],
 'A14A447VPACTBC': [],
 'A14AP6MN5XO6LB': [],
 'A14CLF25IX25US': [],
 'A14LYXC3HTBAHI': [],
 'A14VUW4KZ34EOE': [],
 'A14Y32P26G9YL': [],
 'A157T25PBS7MX4': [],
 'A15HZDSERD85C8': [],
 'A15JJ8J1FGADIX': [],
 'A15ZCL70JXXH89': [],
 'A1617KN2IAWZ6J': [],
 'A16E0O88262HKA': [],
 'A16NSZ58PTVIYF': [],
 'A16UGDXRTDLJG5': [],
 'A16X9HR3UFQQXY': [],
 'A16Y7V1CZCWKFV': [],
 'A174YOBOSW9WDN': [],
 'A1786SKRAJXH86': [],
 'A17K2BUZ20WD2': [],
 'A17LYRFV645L0V': [],
 'A18LNGVXDZBTUR': [],
 'A19503XX7GU6J2': [],
 'A19HVHRZDYFEOP': [],
 'A19JM38B861BO

In [15]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    Score recommendations as the fraction of test ratings that were recommended.

    * ratings_testings_by_user: dict mapping user -> list of items the user
      actually rated in the test period
    * ratings_by_user: dict mapping user -> list of recommended items produced
      from the training data
    * method: str, currently unused
    * returns score: float, (number of recommended items that appear in the
      user's test items, summed over users) / (total number of test ratings);
      0.0 when there are no test ratings

    The denominator is derived from ``ratings_testings_by_user`` itself rather
    than from the module-level ``ratings_testings`` frame, so the function
    depends only on its arguments. It matches the row count of that frame when
    the dict was built by grouping all of its rows per reviewer.
    '''
    total = 0
    n_test_ratings = 0
    for user, actual_items in ratings_testings_by_user.items():
        n_test_ratings += len(actual_items)
        recommended = ratings_by_user.get(user, ())
        total += len(set(recommended) & set(actual_items))

    if n_test_ratings == 0:
        # Nothing to evaluate against; avoid dividing by zero.
        return 0.0
    score = total / n_test_ratings
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.0