# Sample Code

Hi~

## 基礎建設

In [None]:
import pandas as pd
import gzip, json

def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: location of a gzip file containing one JSON document per line.

    Yields:
        The object produced by ``json.loads`` for each line, in file order.
    """
    # The context manager closes the file once the generator is exhausted
    # or discarded, instead of leaving the handle open.
    with gzip.open(path, 'rb') as handle:
        for line in handle:
            yield json.loads(line)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Each record becomes one row, labelled by its 0-based position in the
    file.
    """
    # Number the records in the order they are read; those numbers become
    # the row index because of orient='index'.
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

## 載入資料

In [None]:
# Fetch the All_Beauty ratings CSV and the gzipped product-metadata JSON
# that the following cells load.
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

In [None]:
# Product metadata: one row per JSON record in the gzipped file.
metadata = getDF('/content/meta_All_Beauty.json.gz')
# Ratings: the CSV has no header row, so the column names are supplied here.
ratings = pd.read_csv(
    '/content/All_Beauty.csv',
    header=None,
    names=['asin', 'reviewerID', 'overall', 'unixReviewTime'],
)

In [None]:
# Preview the first rows of the product metadata table.
metadata.head()

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

## 資料整理

In [None]:
# Convert the review timestamp (interpreted as epoch seconds) into a
# datetime column so ratings can be filtered by calendar date.
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [None]:
# Training set: every rating made before 2018-09-01.
ratings_trainings = ratings[ratings['DATE'] < '2018-09-01']

# Test set: ratings made during September 2018. The upper bound is the
# half-open `< '2018-10-01'`: `<= '2018-09-30'` compares against
# 2018-09-30 00:00:00 and would silently drop any rating time-stamped later
# on that day.
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') &
    (ratings['DATE'] < '2018-10-01')
]

# Ground truth for evaluation: {reviewerID: [asin, asin, ...]} holding the
# products each user rated during the test period.
ratings_testings_by_user = (
    ratings_testings.groupby('reviewerID')['asin'].agg(list).to_dict()
)

# The users to generate recommendations for: everyone seen in the test set.
users = list(ratings_testings_by_user)

## 產生推薦 random_based

In [None]:
def recommender_random_based(training_data, users=None, k=3):
    """Recommend randomly drawn products to every requested user.

    Products are drawn from the rows of ``training_data['asin']``, so a
    product that occurs in more rows is more likely to be picked, and the
    same product id can appear more than once in a user's list.

    Args:
        training_data: DataFrame of training ratings (data before
            2018-09-01); must contain an ``asin`` column.
        users: iterable of the users to recommend to. ``None`` (the default)
            means no users.
        k: number of products to recommend per user. When fewer than ``k``
            rows are available, every available row is returned instead.

    Returns:
        dict of the form ``{user: [asin, asin, ...], ...}``.
    """
    if users is None:
        users = []
    candidates = training_data['asin']
    # Series.sample draws without replacement and raises ValueError when
    # asked for more rows than exist, so cap the draw size.
    n_draw = min(k, len(candidates))
    return {user: candidates.sample(n=n_draw).tolist() for user in users}
# Generate random recommendations for every test-period user and show them.
ratings_by_user = recommender_random_based(ratings_trainings, users)
ratings_by_user

## 產生推薦 rule_based

In [None]:
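# Rule A: recommend popular products, rated >= 4, from categories the user has bought
# 1. Keep the products this user rated >= 4
# 2. Look up the categories of those products in the product list
# 3. Recommend products from those categories, ordered by rank
#
# Rule B: recommend products, rated >= 4 on average, that were bought by users who bought the same products
# 1. Keep the products this user rated >= 4
# 2. Find the other products bought by the users who bought those same products
# 3. Keep the ones whose average rating is >= 4
# 4. Recommend the remaining products, ordered by rank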

In [None]:
# Build a slimmed-down product table indexed by asin.
df_asin = metadata['asin']
df_brand = metadata['brand']
df_title = metadata['title']
df_price = metadata['price']
# Keep the first space-separated token of the rank text and strip commas
# (presumably values look like "1,234 in <category>" -- confirm against the data).
df_rank = metadata['rank'].str.split(" ").str.get(0).str.replace(',', '', regex=False)
# Anything that does not parse as a number becomes NaN.
df_rank2 = pd.to_numeric(df_rank, errors='coerce')
df_description = metadata['description']
metadata_tmp = pd.concat([df_asin,df_brand,df_title,df_price,df_rank2,df_description],axis='columns')
metadata_new = metadata_tmp.set_index('asin')
# Per-product rating statistics, aligned to the asin index on assignment.
# * 'overall' is selected before averaging: averaging the whole grouped
#   frame also tries to average the string reviewerID column, which raises
#   TypeError on pandas >= 2.0.
# * Only training ratings are used, so ratings from the test period cannot
#   influence which products get recommended.
metadata_new['overall_mean'] = ratings_trainings.groupby('asin')['overall'].mean()
metadata_new['reviews_count'] = ratings_trainings.groupby('asin').size()
metadata_new


In [None]:
# Keep only the training ratings with a score of 4 or higher (the
# "four-star" table), then build one copy indexed by product and one
# indexed by reviewer for the index-based joins in the following cells.
all_ratings_trainings_4 = ratings_trainings.loc[(ratings_trainings['overall']>=4)]
all_ratings_trainings_4_asin = all_ratings_trainings_4.set_index('asin')
all_ratings_trainings_4_reviewerID = all_ratings_trainings_4.set_index('reviewerID')


In [None]:
# Worked example for one hard-coded user ("user A"): the distinct products
# this user rated 4 or higher in the training data, as an asin-indexed frame.
user_ratings_trainings_4 = ratings_trainings.loc[(ratings_trainings['overall']>=4) & (ratings_trainings['reviewerID']=='A2GJX2KCUSR0EI')]
column_asin = ["asin"]
user_buy_item = pd.DataFrame(user_ratings_trainings_4['asin'].unique(),columns=column_asin)
user_buy_item_asin = user_buy_item.set_index('asin')
user_buy_item_asin



In [None]:
# Continuing from above: right-join the four-star table (indexed by asin)
# onto user A's products, giving the four-star rating rows for those products.
buy_the_same_items_logs=all_ratings_trainings_4_asin.join(user_buy_item_asin, how='right')
buy_the_same_items_logs



In [None]:
# The distinct reviewers appearing in those rows, i.e. the users who rated
# the same products as user A, as a reviewerID-indexed frame.
column_reviewerID = ["reviewerID"]
buy_the_same_items_users = pd.DataFrame(buy_the_same_items_logs['reviewerID'].unique(),columns=column_reviewerID)
buy_the_same_items_users_reviewerID = buy_the_same_items_users.set_index('reviewerID')
buy_the_same_items_users_reviewerID


In [None]:
# Continuing from above: right-join the four-star table (indexed by
# reviewerID) onto those users, giving all of their four-star rating rows.
buy_others_items_logs = all_ratings_trainings_4_reviewerID.join(buy_the_same_items_users_reviewerID, how='right')
buy_others_items_logs


In [None]:
# The distinct products found in those rows: everything the users who share
# a product with user A rated four stars or higher, as an asin-indexed frame.
column_asin = ["asin"]
buy_others_items_list = pd.DataFrame(buy_others_items_logs['asin'].unique(),columns=column_asin)
buy_others_items_list_asin = buy_others_items_list.set_index('asin')
buy_others_items_list_asin

In [None]:
# Right-join the product table onto that candidate list so each candidate
# carries its metadata columns.
recommend_items = metadata_new.join(buy_others_items_list_asin, how='right')
recommend_items

In [None]:
# Keep candidates with an average score of at least 4 and at least 10
# ratings, sort them by rank (ascending), and show the first 10 distinct asins.
recommend_items_details = recommend_items.loc[(recommend_items['overall_mean'] >= 4) & (recommend_items['reviews_count'] >= 10)].sort_values(by=['rank'])
# Turn the asin index back into an ordinary column.
recommend_items_details.reset_index(inplace=True)
recommend_items_details
recommend_items_details['asin'].unique()[0:10]




In [None]:
def rule(user, k):
    """Return up to `k` product ids recommended to `user`.

    Steps:
      1. Collect the products `user` rated 4 or higher.
      2. Find the users who gave those same products 4 or higher.
      3. Collect every product those users rated 4 or higher.
      4. Keep the products with ``overall_mean >= 4`` and
         ``reviews_count >= 10``, ordered by ``rank`` ascending.

    Reads the module-level tables ``ratings_trainings``,
    ``all_ratings_trainings_4_asin``, ``all_ratings_trainings_4_reviewerID``
    and ``metadata_new``.

    Returns:
        Array of the first `k` distinct asins after sorting.
    """
    # Step 1: distinct products this user rated 4+.
    liked_mask = (ratings_trainings['overall'] >= 4) & (ratings_trainings['reviewerID'] == user)
    liked_asins = ratings_trainings.loc[liked_mask]['asin'].unique()
    liked_index = pd.DataFrame(liked_asins, columns=["asin"]).set_index('asin')

    # Step 2: four-star rating rows for those products, then their reviewers.
    same_item_logs = all_ratings_trainings_4_asin.join(liked_index, how='right')
    peer_ids = same_item_logs['reviewerID'].unique()
    peer_index = pd.DataFrame(peer_ids, columns=["reviewerID"]).set_index('reviewerID')

    # Step 3: every product those reviewers rated four stars or higher.
    peer_logs = all_ratings_trainings_4_reviewerID.join(peer_index, how='right')
    candidate_asins = peer_logs['asin'].unique()
    candidate_index = pd.DataFrame(candidate_asins, columns=["asin"]).set_index('asin')

    # Step 4: attach metadata, apply the quality thresholds, order by rank.
    candidates = metadata_new.join(candidate_index, how='right')
    well_rated = candidates['overall_mean'] >= 4
    well_reviewed = candidates['reviews_count'] >= 10
    ranked = (
        candidates.loc[well_rated & well_reviewed]
        .sort_values(by=['rank'])
        .reset_index()
    )
    return ranked['asin'].unique()[0:k]



In [None]:
# Display the training ratings.
ratings_trainings

In [None]:
# Display the test ratings.
ratings_testings

In [None]:
def recommender_rule_based(training_data, users=None, k=10):
    """Recommend up to `k` products per user with the `rule` heuristic.

    Args:
        training_data: DataFrame of training ratings (data before
            2018-09-01). Kept for signature parity with the other
            recommenders; it is not forwarded, because ``rule`` takes only
            ``(user, k)``.
        users: iterable of the users to recommend to. ``None`` (the default)
            means no users.
        k: maximum number of products to recommend per user.

    Returns:
        dict of the form ``{user: [asin, asin, ...], ...}``.
    """
    if users is None:
        users = []
    # `rule` requires k; calling it with the user alone raises TypeError.
    return {user: rule(user, k).tolist() for user in users}


# Generate rule-based recommendations for every test-period user and show
# them; this overwrites the earlier contents of ratings_by_user.
ratings_by_user = recommender_rule_based(ratings_trainings, users)
ratings_by_user

## 結果評估

In [None]:
def evaluate(ratings_testings_by_user=None, ratings_by_user=None, method=None):
    """Score recommendations against the products users actually bought.

    The score is the number of recommended products that appear in the same
    user's test purchases, divided by the total number of test purchases.

    Args:
        ratings_testings_by_user: dict ``{user: [asin, ...]}`` of the
            products really purchased (data from 2018-09-01 onwards).
        ratings_by_user: dict ``{user: [asin, ...]}`` of the products
            recommended from the training data.
        method: accepted but currently unused.

    Returns:
        float score; ``0.0`` when there are no test purchases.
    """
    ratings_testings_by_user = ratings_testings_by_user or {}
    ratings_by_user = ratings_by_user or {}

    # Distinct recommended products that the same user bought in the test set.
    hits = sum(
        len(set(ratings_by_user[user]) & set(purchased))
        for user, purchased in ratings_testings_by_user.items()
        if user in ratings_by_user
    )
    # Count the purchases from the argument itself rather than from a
    # module-level table, so the score depends only on what was passed in.
    n_purchases = sum(len(purchased) for purchased in ratings_testings_by_user.values())
    if n_purchases == 0:
        return 0.0
    return hits / n_purchases

# Score the recommendations currently held in ratings_by_user.
evaluate(ratings_testings_by_user, ratings_by_user)