In [1]:
import pandas as pd
import gzip, json

def parse(path):
    """Yield one decoded JSON value per line of a gzip-compressed file.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip file whose every line is a JSON document.

    Yields
    ------
    object
        The result of ``json.loads`` for each line, in file order.

    Notes
    -----
    The file is opened inside a ``with`` block so the handle is released
    when the generator is exhausted, closed, or garbage-collected, instead
    of being left open.
    """
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)

def getDF(path):
    """Collect every record yielded by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in the order they are read, and that
    number becomes the row index; each record supplies the values of one row.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

In [2]:
# Download the All_Beauty ratings CSV and the gzipped All_Beauty product metadata.
# NOTE(review): the loading cell reads both files from /content/, so this
# presumably runs with /content as the working directory (e.g. Colab) - confirm.
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-08 18:10:00--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2022-01-08 18:10:01 (19.1 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2022-01-08 18:10:01--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2022-01-08 18:10:02 (15.0 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [3]:
# Ratings file has no header row, so column names are supplied explicitly.
ratings = pd.read_csv(
    '/content/All_Beauty.csv',
    header=None,
    names=['asin', 'reviewerID', 'overall', 'unixReviewTime'],
)
# Interpret the review timestamp as seconds since the Unix epoch.
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

# Product metadata: one JSON record per line of the gzip file.
metadata = getDF('/content/meta_All_Beauty.json.gz')

In [None]:
ratings.head()

In [4]:
# Time-based split: everything before 2018-09-01 is training data; ratings
# dated from 2018-09-01 up to and including 2018-09-30 form the test set.
ratings_trainings = ratings.loc[ratings['DATE'] < '2018-09-01']
ratings_testings = ratings.loc[
    (ratings['DATE'] >= '2018-09-01') & (ratings['DATE'] <= '2018-09-30')
]

# {reviewerID: [asin, ...]} for the test period, keyed in sorted reviewerID order.
ratings_testings_by_user = (
    ratings_testings
    .groupby('reviewerID')['asin']
    .agg(list)
    .to_dict()
)

# Users that need recommendations = everyone who appears in the test period.
users = list(ratings_testings_by_user)

### 將傳入的資料（dataframe）轉成 item_to_users dictionary，方便後續的運算。

In [5]:
import pandas as pd
from itertools import combinations
from collections import defaultdict
# Build {item: {user: rating}} from the training ratings.
# The three needed columns are walked in lockstep instead of using
# DataFrame.iterrows(), which materialises a Series for every row; the
# visiting order (and therefore dict insertion order) is unchanged.
item_to_users = defaultdict(dict)
for user, item, rating in zip(
    ratings_trainings['reviewerID'],
    ratings_trainings['asin'],
    ratings_trainings['overall'],
):
    # If the same user rated the same item more than once, the later row wins.
    item_to_users[item][user] = float(rating)

In [None]:
item_to_users

### 為了優化計算的效率，我們第一步先準備資料，我們把 item_to_users dictionary 轉置（transpose）成 user_to_items。

In [6]:
# Transpose {item: {user: rating}} into {user: {item: rating}}.
# Because items are visited in item_to_users order, every user's inner dict
# lists its items in that same relative order.
user_to_items = defaultdict(dict)
for asin, raters in item_to_users.items():
    for reviewer, score in raters.items():
        user_to_items[reviewer][asin] = score

In [None]:
user_to_items

### 在得到 user_to_items 之後，我們就可以把 xy, xx, yy 填入 pre_item_similarity matrix。首先我們會走過每一個 user，得到每一個 user 對哪些 item 評過分，之後產生這些 item 的所有兩兩組合，計算 xy, xx, yy 之後填入。

In [7]:
   # pre_item_similarity[a][b] accumulates, over every user who rated both
   # a and b, the triple [sum(r_a * r_b), sum(r_a ** 2), sum(r_b ** 2)],
   # i.e. the ingredients of the cosine similarity between a and b.
   init_sim = lambda: [0, 0, 0]
   factory = lambda: defaultdict(init_sim)
   pre_item_similarity = defaultdict(factory)
   for user, items in user_to_items.items():
       # combinations() yields nothing for users with fewer than two items,
       # so no explicit length check is needed.
       for i1, i2 in combinations(items.keys(), 2):
           xy = items[i1] * items[i2]
           xx = items[i1] ** 2
           yy = items[i2] ** 2

           # Entry seen from i1: slot 1 belongs to i1, slot 2 to i2.
           pre_item_similarity[i1][i2][0] += xy
           pre_item_similarity[i1][i2][1] += xx
           pre_item_similarity[i1][i2][2] += yy

           # Mirrored entry seen from i2: the squared terms swap slots so
           # slot 1 always belongs to the outer key, whatever order the
           # pair was enumerated in.
           pre_item_similarity[i2][i1][0] += xy
           pre_item_similarity[i2][i1][1] += yy
           pre_item_similarity[i2][i1][2] += xx

In [None]:
pre_item_similarity

### 計算 item similarity matrix：做完計算之後將結果依大小順序加入到 list 中即可

In [8]:
# item_similarity[src] is a list of (dst, cosine_similarity) tuples sorted
# from most to least similar.
item_similarity = {}
for src_item, neighbours in pre_item_similarity.items():
    item_similarity_order = []
    for dst_item, (xy, xx, yy) in neighbours.items():
        div = (xx * yy) ** 0.5
        if div == 0:
            # Undefined cosine (a zero norm): skip the pair.
            continue
        similarity = xy / div
        if similarity < 0:
            continue
        item_similarity_order.append((dst_item, similarity))
    # A single stable sort replaces the per-element linear insertion; with
    # reverse=True ties keep their original relative order, matching the
    # previous "insert before the first strictly smaller entry" behaviour.
    item_similarity_order.sort(key=lambda pair: pair[1], reverse=True)
    item_similarity[src_item] = item_similarity_order

In [None]:
item_similarity

### 獲取最終的推薦結果
### 根據傳入的 users，先找出每個 user 評分過的商品，再依據相似程度找出與這些商品相似的商品。要注意的一點是，要避免推薦到 target user 已經評分過的商品；當推薦數量達到所要求的數量時就可以停止。

In [9]:
def recommender(users=[], k=10):
    """Recommend up to ``k`` items per user from item-to-item similarity.

    For each user, the items they rated (looked up in the module-level
    ``user_to_items``) are visited in stored order. For each of them the
    neighbours listed in the module-level ``item_similarity`` are taken in
    list order, skipping items the user already rated and items already
    picked, until ``k`` items have been collected.

    Parameters
    ----------
    users : iterable
        User ids to produce recommendations for. The default list is never
        mutated.
    k : int
        Maximum number of recommendations per user; ``k <= 0`` gives an
        empty list.

    Returns
    -------
    dict
        ``{user: [item, ...]}`` with at most ``k`` items per user; users
        with no usable history map to ``[]``.
    """
    recommendation = {}
    for user in users:
        # .get() keeps the lookup read-only: indexing a defaultdict would
        # insert an empty entry for every user without training history.
        rated = user_to_items.get(user, {})
        picked = []
        picked_set = set()
        # Iterate the dict itself (insertion order) rather than a set, so
        # the result does not depend on string hash randomisation.
        for item in rated:
            if len(picked) >= k:
                break
            for sim_item, _ in item_similarity.get(item, ()):
                if len(picked) >= k:
                    break
                # Skip items the user has rated or that were already chosen.
                if sim_item in rated or sim_item in picked_set:
                    continue
                picked.append(sim_item)
                picked_set.add(sim_item)
        recommendation[user] = picked
    return recommendation

In [10]:
ratings_by_user = recommender(users)

In [11]:
ratings_by_user

{'A100XQFWKQ30O2': [],
 'A103T1QOGFCSEH': [],
 'A106UKKSJ2KXPF': [],
 'A10A7GV4D5A11V': [],
 'A1119JJ37ZLB8R': [],
 'A113UOOLBSZN52': [],
 'A12M4U7WK4ALCR': [],
 'A12T8YTW6VWT7S': [],
 'A1364JXGKB46MM': [],
 'A137DALOQFKBTI': [],
 'A13FEZ3WV7S2EY': [],
 'A13IV4I1B0RXMG': [],
 'A13JU88JAHN72I': ['B00BWIT33Y'],
 'A13K55R6VH1OOD': [],
 'A13P7VFU075A': [],
 'A13SWYE4QLB6NG': [],
 'A13ZTQ0Q4ATA41': [],
 'A142EDN04OD62U': [],
 'A142I22FIC8MZK': [],
 'A14834QTII5TLT': [],
 'A14A447VPACTBC': [],
 'A14AP6MN5XO6LB': [],
 'A14CLF25IX25US': [],
 'A14LYXC3HTBAHI': [],
 'A14VUW4KZ34EOE': [],
 'A14Y32P26G9YL': [],
 'A157T25PBS7MX4': [],
 'A15HZDSERD85C8': [],
 'A15JJ8J1FGADIX': [],
 'A15ZCL70JXXH89': [],
 'A1617KN2IAWZ6J': [],
 'A16E0O88262HKA': [],
 'A16NSZ58PTVIYF': [],
 'A16UGDXRTDLJG5': [],
 'A16X9HR3UFQQXY': [],
 'A16Y7V1CZCWKFV': ['B000EE9XYG',
  'B007SA3AUW',
  'B000FVHRXC',
  'B001U4RGOS',
  'B01CRWLLX8',
  'B004E4GHRS',
  'B006IB5T4W',
  'B00B0SA1SC',
  'B00JYGWWIO',
  'B00L88S190'],
 'A174Y

In [12]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    Score recommendations against held-out interactions.

    * ratings_testings_by_user: dict mapping each user to the list of items
      they actually rated in the test period (data from 2018-09-01 onward)
    * ratings_by_user: dict mapping each user to the list of items
      recommended from the training data
    * method: str, currently unused
    * score: float, the number of distinct recommended items that appear in
      the same user's held-out list, summed over users, divided by the total
      number of held-out entries; 0.0 when there are no held-out entries

    The denominator is computed from ``ratings_testings_by_user`` itself, so
    the function does not depend on any module-level DataFrame.
    '''
    total = 0
    held_out_count = 0
    for user, held_out in ratings_testings_by_user.items():
        held_out_count += len(held_out)
        if user in ratings_by_user:
            total += len(set(ratings_by_user[user]) & set(held_out))

    if held_out_count == 0:
        # Nothing to evaluate against; avoid dividing by zero.
        return 0.0
    score = total / held_out_count
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.001694915254237288