# Sample Code

## 基礎建設

In [21]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [22]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2021-12-26 02:47:42--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.1’


2021-12-26 02:47:43 (20.1 MB/s) - ‘All_Beauty.csv.1’ saved [15499476/15499476]

--2021-12-26 02:47:43--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.1’


2021-12-26 02:47:44 (14.0 MB/s) - ‘meta_All_Beauty.json.gz.1’ saved [10329961/10329961]



In [23]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [24]:
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [25]:
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


## 資料整理

In [26]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [27]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

## 產生推薦

In [28]:
import numpy as np
for_merge = metadata.loc[:, ['also_buy', 'brand', 'rank', 'price', 'asin']]
for_merge['rank'] = for_merge['rank'].str.replace(r'[^0-9]', '')
for_merge['rank'] = for_merge['rank'].astype(int, errors='ignore')
for_merge['price'] = for_merge['price'].str.replace(r'$', '')
for_merge['price'] = for_merge['price'].astype(float, errors='ignore')
for_merge.loc[for_merge.brand == '', 'brand'] = np.NaN
for_merge.loc[for_merge.price == '', 'price'] = np.NaN
for_merge

Unnamed: 0,also_buy,brand,rank,price,asin
0,[],idea village,2938573,,6546546450
1,"[B01E7LCSL6, B008X5RVME]",,872854,44.99,7178680776
2,[],No7,956696,28.76,7250468162
3,[B0041PBXX8],,1870258,,7367905066
4,[],Pirmal Healthcare,67701,12.15,7414204790
...,...,...,...,...,...
32887,[],,2145325,9.95,B01HIWLLUK
32888,[],Salon Perfect,1639713,,B01HJ1K3YK
32889,"[B01KON9B4S, B079X3YFXS, B00M79OYS6, B000JN4CR...",,207410,55.63,B01HJ84SGM
32890,[],GBSTORE,965673,12.99,B01HJASD20


In [29]:
merged = ratings.merge(right=for_merge, on='asin', how='left')
merged

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE,also_buy,brand,rank,price
0,0143026860,A1V6B6TNIC10QE,1.0,1424304000,2015-02-19,,,,
1,0143026860,A2F5GHSXFQ0W6J,4.0,1418860800,2014-12-18,,,,
2,0143026860,A1572GUYS7DGSR,4.0,1407628800,2014-08-10,,,,
3,0143026860,A1PSGLFK1NSVO,5.0,1362960000,2013-03-11,,,,
4,0143026860,A6IKXKZMTKGSC,5.0,1324771200,2011-12-25,,,,
...,...,...,...,...,...,...,...,...,...
387649,B01HJEGTYK,A202DCI7TV1022,1.0,1500508800,2017-07-20,"[B01HJEGQ6Q, B01L66EYCG, B00H7YZ7HI, B00CHKKWZ...",,478123,
387650,B01HJEGTYK,A3FSOR5IJOFIBE,5.0,1489622400,2017-03-16,"[B01HJEGQ6Q, B01L66EYCG, B00H7YZ7HI, B00CHKKWZ...",,478123,
387651,B01HJEGTYK,A1B5DK6CTP2P24,5.0,1488326400,2017-03-01,"[B01HJEGQ6Q, B01L66EYCG, B00H7YZ7HI, B00CHKKWZ...",,478123,
387652,B01HJEGTYK,A23OUYS5IRMJS9,2.0,1487635200,2017-02-21,"[B01HJEGQ6Q, B01L66EYCG, B00H7YZ7HI, B00CHKKWZ...",,478123,


In [18]:
ratings_testings.reviewerID.isin(ratings_trainings.reviewerID).value_counts()

False    551
True      39
Name: reviewerID, dtype: int64

In [12]:
merged.groupby('reviewerID')['brand'].value_counts()

reviewerID            brand             
A0010876CNE3ILIM9HV0  illunt                1
A001170867ZBE9FORRQL                        1
                      Wazor                 1
A0028738FYF1SKPPC7B1  Indian Earth          1
A0045356A23634W7RI4M  GENERIC               1
                                           ..
AZZXKG9AGRVGU                               1
AZZYW4YOE1B6E         Gillette              1
AZZZ5UJWUVCYZ         OC Nails              1
                      ONWON                 1
                      SWAROSVSKI CRYSTAL    1
Name: brand, Length: 340336, dtype: int64

In [17]:
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    '''
    Your Code
    '''
    return recommendations


ratings_by_user = recommender(ratings_trainings, users)
ratings_by_user

{}

In [47]:
# 想法一：推薦評分最高的品牌隨機商品(如果很少評分數 但平均分很高 可以推薦)
# merged.groupby('overall')['brand'].count()
brand_recom = merged.groupby('brand')['overall'].agg(np.mean).to_frame().reset_index().sort_values('overall', ascending=False)
brand_recom.loc[brand_recom.overall == 5]

Unnamed: 0,brand,overall
5507,Pukka,5.0
5927,Sally hansen,5.0
2259,Enzo Milano,5.0
4531,MouthPlus,5.0
4530,Mouth Kote,5.0
...,...,...
5041,Organic Traditions,5.0
5754,Rosette,5.0
5755,Rosie Jane,5.0
4892,Nurture My Body,5.0


In [None]:
merged.groupby('overall')['brand']

## 結果評估

In [16]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.003389830508474576