# Sample Code

## 基礎建設

In [1]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2021-12-26 15:37:23--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2021-12-26 15:37:23 (24.2 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2021-12-26 15:37:24--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2021-12-26 15:37:24 (18.4 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [3]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [None]:
metadata.head()

In [None]:
ratings.head()

## 資料整理

In [4]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [5]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

In [None]:
ratings_trainings

In [None]:
ratings_testings

## 產生推薦 random_based

In [6]:
def recommender_random_based(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    '''
    random-based
    '''
    ratings_trainings = training_data
    recommendations = {user: ratings_trainings['asin'].sample(n=k).tolist() for user in users}
    return recommendations
ratings_by_user_random = recommender_random_based(ratings_trainings, users)
ratings_by_user_random

{'A100XQFWKQ30O2': ['B0027BG0YK',
  'B0017TZD7S',
  'B000GLRREU',
  'B01FLJROJ4',
  'B00YO55LJC',
  'B00VF344X0',
  'B000FOI48G',
  'B019FOXB70',
  'B001FB5H9C',
  'B00ILPQICU'],
 'A103T1QOGFCSEH': ['B006WYJM8Y',
  'B00HPZ43A6',
  'B00BMVV3MK',
  'B01AX2X5W4',
  'B01GOSELHE',
  'B008U1Q4DI',
  'B00XMV8OO8',
  'B00V96WXDQ',
  'B00NNLKABW',
  'B001BROT1M'],
 'A106UKKSJ2KXPF': ['B00TR3C8XS',
  'B00005JS5C',
  'B00SYHKYQ6',
  'B00RNKER4U',
  'B00K3KFP4I',
  'B00PYFR9PK',
  'B00157OBRU',
  'B001O707QU',
  'B000WYJTZG',
  'B00Z5VI9MK'],
 'A10A7GV4D5A11V': ['B004A7NXG2',
  'B00S4ADEJ2',
  'B01H3ZQ2NI',
  'B000FEF1V4',
  'B01CV597NM',
  'B01GR1L5R2',
  'B00XLE44Y0',
  'B001AJ6YS2',
  'B000Q9AKVE',
  'B011JG1K3E'],
 'A1119JJ37ZLB8R': ['B00EFBHNLY',
  'B01CT1W6U4',
  'B01DDQXRWU',
  'B00BQGYBO8',
  'B004E3ION4',
  'B001E5E9RK',
  'B00I2OR6YE',
  'B000GLRREU',
  'B011JG1K3E',
  'B00FZERN3I'],
 'A113UOOLBSZN52': ['B001LNODUS',
  'B00IPVCCVQ',
  'B00EHN4TH6',
  'B000FI4S1E',
  'B01C3K03VK',
  'B000

## 產生推薦 rule_based

In [None]:
# 沒有購買紀錄的人推薦優質熱門商品
# 1.平均分數>=4
# 2.評分次數>=100
# 3.取上述商品的rank推薦

# 有高評分購買紀錄的人 推薦買過相同商品的人 買過的其他商品
# 1.先濾出該ID評分>=4的商品
# 2.找出買過相同商品的人買過的其他商品
# 3.且平均分數>=4
# 4.取上述商品的rank推薦



In [7]:
# 產生新的 metadata 表
df_asin = metadata['asin']
df_brand = metadata['brand']
df_title = metadata['title']
df_price = metadata['price']
df_rank = metadata['rank'].str.split(" ").str.get(0).str.replace(',','')
df_rank2 = pd.to_numeric(df_rank, errors='coerce')
df_description = metadata['description']
metadata_tmp = pd.concat([df_asin,df_brand,df_title,df_price,df_rank2,df_description],axis='columns')
metadata_new = metadata_tmp.set_index('asin')
metadata_new['overall_mean'] = ratings.groupby(by=['asin']).mean()['overall']
metadata_new['reviews_count'] = ratings.groupby(by=['asin']).size()
metadata_new


Unnamed: 0_level_0,brand,title,price,rank,description,overall_mean,reviews_count
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6546546450,idea village,Loud 'N Clear&trade; Personal Sound Amplifier,,2938573.0,[Loud 'N Clear Personal Sound Amplifier allows...,2.5,2.0
7178680776,,No7 Lift &amp; Luminate Triple Action Serum 50...,$44.99,872854.0,[No7 Lift & Luminate Triple Action Serum 50ml ...,3.0,1.0
7250468162,No7,No7 Stay Perfect Foundation Cool Vanilla by No7,$28.76,956696.0,[No7 Stay Perfect Foundation now stays perfect...,5.0,1.0
7367905066,,Wella Koleston Perfect Hair Colour 44/44 Mediu...,,1870258.0,[],5.0,1.0
7414204790,Pirmal Healthcare,Lacto Calamine Skin Balance Oil control 120 ml...,$12.15,67701.0,[Lacto Calamine Skin Balance Daily Nourishing ...,4.4,15.0
...,...,...,...,...,...,...,...
B01HIWLLUK,,"Barielle Pro Textured Grip Cuticle Nipper, Purple",$9.95,2145325.0,[],5.0,1.0
B01HJ1K3YK,Salon Perfect,(Buy 3 Get 1 Free) Salon Perfect Eye Makeup Co...,,1639713.0,[],1.0,1.0
B01HJ84SGM,,NOW D-Mannose 500 mg - 120 Veg Capsules (Pack ...,$55.63,207410.0,[],5.0,5.0
B01HJASD20,GBSTORE,12 White Feather Shuttlecocks Birdies Badminto...,$12.99,965673.0,[Brand new and high quality<br> Enables fast v...,1.0,1.0


In [8]:
# 優質熱門商品
popular_products = metadata_new.loc[(metadata_new['overall_mean'] >= 4) & (metadata_new['reviews_count'] >= 100) ].sort_values(by=['rank'])
popular_products.reset_index(inplace=True)
popular_products
fix_recommend_top10=popular_products['asin'].unique()[0:10]

In [9]:
# 過濾出評分表中分數 >= 4 的四星評分表
all_ratings_trainings_4 = ratings_trainings.loc[(ratings_trainings['overall']>=4)]
all_ratings_trainings_4_asin = all_ratings_trainings_4.set_index('asin')
all_ratings_trainings_4_reviewerID = all_ratings_trainings_4.set_index('reviewerID')


In [None]:
# 某使用者A購買過且分數 >= 4 的商品
'''
user_ratings_trainings_4 = ratings_trainings.loc[(ratings_trainings['overall']>=4) & (ratings_trainings['reviewerID']=='A2GJX2KCUSR0EI')]
column_asin = ["asin"]
user_buy_item = pd.DataFrame(user_ratings_trainings_4['asin'].unique(),columns=column_asin)
user_buy_item_asin = user_buy_item.set_index('asin')
user_buy_item_asin
'''


In [None]:
# 承上，交集四星評分表
'''
buy_the_same_items_logs=all_ratings_trainings_4_asin.join(user_buy_item_asin, how='right')
buy_the_same_items_logs
'''


In [None]:
# 與某使用者A買過相同商品的人
'''
column_reviewerID = ["reviewerID"]
buy_the_same_items_users = pd.DataFrame(buy_the_same_items_logs['reviewerID'].unique(),columns=column_reviewerID)
buy_the_same_items_users_reviewerID = buy_the_same_items_users.set_index('reviewerID')
buy_the_same_items_users_reviewerID
'''

In [None]:
# 承上，與四星評分表交集
'''
buy_others_items_logs = all_ratings_trainings_4_reviewerID.join(buy_the_same_items_users_reviewerID, how='right')
buy_others_items_logs
'''

In [None]:
# 找出買過相同商品的人也買過的商品清單
'''
column_asin = ["asin"]
buy_others_items_list = pd.DataFrame(buy_others_items_logs['asin'].unique(),columns=column_asin)
buy_others_items_list_asin = buy_others_items_list.set_index('asin')
buy_others_items_list_asin
'''

In [None]:
# 承上的商品清單交集 metadata
'''
recommend_items = metadata_new.join(buy_others_items_list_asin, how='right')
recommend_items
'''

In [None]:
# 找出平均分數大於等於4分且評分次數大於等於100次 並用 rank 排序
'''
recommend_items_details = recommend_items.loc[(recommend_items['overall_mean'] >= 4) & (recommend_items['reviews_count'] >= 10)].sort_values(by=['rank'])
recommend_items_details.reset_index(inplace=True)
recommend_items_details
recommend_items_details['asin'].unique()[0:10]
'''



In [10]:
def rule(user,k):
  # 某使用者A購買過且分數 >= 4 的商品
  user_ratings_trainings_4 = ratings_trainings.loc[(ratings_trainings['overall']>=4) & (ratings_trainings['reviewerID']==user)]
  if user_ratings_trainings_4.empty:
    recommend_topk=fix_recommend_top10
  else:
    column_asin = ["asin"]
    user_buy_item = pd.DataFrame(user_ratings_trainings_4['asin'].unique(),columns=column_asin)
    user_buy_item_asin = user_buy_item.set_index('asin')

    # 承上，交集四星評分表
    buy_the_same_items_logs=all_ratings_trainings_4_asin.join(user_buy_item_asin, how='right')
    
    # 與某使用者A買過相同商品的人
    column_reviewerID = ["reviewerID"]
    buy_the_same_items_users = pd.DataFrame(buy_the_same_items_logs['reviewerID'].unique(),columns=column_reviewerID)
    buy_the_same_items_users_reviewerID = buy_the_same_items_users.set_index('reviewerID')

    # 承上，與四星評分表交集
    buy_others_items_logs = all_ratings_trainings_4_reviewerID.join(buy_the_same_items_users_reviewerID, how='right')
    buy_others_items_logs

    # 找出買過相同商品的人也買過的商品清單
    column_asin = ["asin"]
    buy_others_items_list = pd.DataFrame(buy_others_items_logs['asin'].unique(),columns=column_asin)
    buy_others_items_list_asin = buy_others_items_list.set_index('asin')

    # 承上的商品清單交集 metadata
    recommend_items = metadata_new.join(buy_others_items_list_asin, how='right')
    
    # 找出平均分數大於等於4分且評分次數大於等於10次 並用 rank 排序
    recommend_items_details = recommend_items.loc[(recommend_items['overall_mean'] >= 4)& (recommend_items['reviews_count'] >= 100)].sort_values(by=['rank'])
    recommend_items_details.reset_index(inplace=True)
    recommend_items_details
    recommend_topk = recommend_items_details['asin'].unique()[0:k]
    if len(recommend_topk)<k:
      recommend_topk = fix_recommend_top10


  return recommend_topk



In [11]:
def recommender_rule_based(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}

    recommendations = {user: rule(user,k).tolist() for user in users}
    return recommendations


ratings_by_user_rule = recommender_rule_based(ratings_trainings, users)
ratings_by_user_rule

{'A100XQFWKQ30O2': ['B006IB5T4W',
  'B001QY8QXM',
  'B00005JS5C',
  'B002OITLG0',
  'B01DJI7796',
  'B000050FDY',
  'B01C39X6TW',
  'B013XKHA4M',
  'B00X4DKZKU',
  'B000A2LRBY'],
 'A103T1QOGFCSEH': ['B006IB5T4W',
  'B001QY8QXM',
  'B00005JS5C',
  'B002OITLG0',
  'B01DJI7796',
  'B000050FDY',
  'B01C39X6TW',
  'B013XKHA4M',
  'B00X4DKZKU',
  'B000A2LRBY'],
 'A106UKKSJ2KXPF': ['B006IB5T4W',
  'B001QY8QXM',
  'B00005JS5C',
  'B002OITLG0',
  'B01DJI7796',
  'B000050FDY',
  'B01C39X6TW',
  'B013XKHA4M',
  'B00X4DKZKU',
  'B000A2LRBY'],
 'A10A7GV4D5A11V': ['B006IB5T4W',
  'B001QY8QXM',
  'B00005JS5C',
  'B002OITLG0',
  'B01DJI7796',
  'B000050FDY',
  'B01C39X6TW',
  'B013XKHA4M',
  'B00X4DKZKU',
  'B000A2LRBY'],
 'A1119JJ37ZLB8R': ['B006IB5T4W',
  'B001QY8QXM',
  'B00005JS5C',
  'B002OITLG0',
  'B01DJI7796',
  'B000050FDY',
  'B01C39X6TW',
  'B013XKHA4M',
  'B00X4DKZKU',
  'B000A2LRBY'],
 'A113UOOLBSZN52': ['B006IB5T4W',
  'B001QY8QXM',
  'B00005JS5C',
  'B002OITLG0',
  'B01DJI7796',
  'B000

## 結果評估

In [12]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score



In [13]:
evaluate(ratings_testings_by_user, ratings_by_user_random)

0.00847457627118644

In [14]:
evaluate(ratings_testings_by_user, ratings_by_user_rule)

0.011864406779661017