# Sample Code

## 基礎建設

In [1]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-02 17:19:54--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.1’


2022-01-02 17:19:56 (10.1 MB/s) - ‘All_Beauty.csv.1’ saved [15499476/15499476]

--2022-01-02 17:19:56--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.1’


2022-01-02 17:19:58 (6.92 MB/s) - ‘meta_All_Beauty.json.gz.1’ saved [10329961/10329961]



In [3]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [4]:
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


In [5]:
metadata.shape

(32892, 19)

# 資料整理

In [6]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

# 資料切分

In [7]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

# 產生需要的欄位

In [8]:
# 產生舊客的 metadata 表

# 處理 rank 中的排行跟分類
metadata['rank_rank'] = metadata['rank'].str.split(" ").str.get(0).str.replace(',','').to_frame()
metadata['rank_category'] = metadata['rank'].str.split("in ").str.get(1).to_frame()


# 將 details 轉為字串
metadata_discover = metadata['details']
tmp = metadata['details']
for i in range(len(metadata_discover)):
  tmp.iloc[i]=json.dumps(metadata_discover.iloc[i]).lower()
metadata['details_str']=tmp


# 抽取需要的欄位
df_asin = metadata['asin'].to_frame()
df_title = metadata['title'].str.lower().to_frame()
df_description = metadata['description'].str[0].str.lower().to_frame()
df_brand = metadata['brand'].str.lower().to_frame()
df_rank_category = metadata['rank_category'].str.lower().to_frame()
df_details_str = metadata['details_str'].str.lower().to_frame()


# 產生新的 metadata 表
metadata_tmp = pd.concat([df_asin,df_title,df_description,df_brand,df_rank_category,df_details_str],axis='columns')
#metadata_tmp2 = metadata_tmp.set_index('asin')


# 移除重複的資料
metadata_tmp3=metadata_tmp.drop_duplicates(subset=None, keep='first', inplace=False)
metadata_tmp3.reset_index(inplace=True)




In [9]:
metadata_tmp3['text'] = metadata_tmp3['title'].str.cat(metadata_tmp3['rank_category'],sep=" ")
metadata_for_old=metadata_tmp3
metadata_for_old

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,index,asin,title,description,brand,rank_category,details_str,text
0,0,6546546450,loud 'n clear&trade; personal sound amplifier,loud 'n clear personal sound amplifier allows ...,idea village,beauty & personal care (,"{""asin: "": ""6546546450""}",loud 'n clear&trade; personal sound amplifier ...
1,1,7178680776,no7 lift &amp; luminate triple action serum 50...,no7 lift & luminate triple action serum 50ml b...,,beauty & personal care (,"{""shipping weight:"": ""0.3 ounces ("", ""asin: "":...",no7 lift &amp; luminate triple action serum 50...
2,2,7250468162,no7 stay perfect foundation cool vanilla by no7,no7 stay perfect foundation now stays perfect ...,no7,beauty & personal care (,"{""shipping weight:"": ""3.5 ounces ("", ""asin: "":...",no7 stay perfect foundation cool vanilla by no...
3,3,7367905066,wella koleston perfect hair colour 44/44 mediu...,,,beauty & personal care (,"{""\n item weight: \n "": ""1.76 ounces"", ""...",wella koleston perfect hair colour 44/44 mediu...
4,4,7414204790,lacto calamine skin balance oil control 120 ml...,lacto calamine skin balance daily nourishing l...,pirmal healthcare,beauty & personal care (,"{""shipping weight:"": ""12 ounces ("", ""asin: "": ...",lacto calamine skin balance oil control 120 ml...
...,...,...,...,...,...,...,...,...
32483,32887,B01HIWLLUK,"barielle pro textured grip cuticle nipper, purple",,,beauty & personal care (,"{""asin: "": ""b01hiwlluk""}","barielle pro textured grip cuticle nipper, pur..."
32484,32888,B01HJ1K3YK,(buy 3 get 1 free) salon perfect eye makeup co...,,salon perfect,beauty & personal care (,"{""asin: "": ""b01hj1k3yk"", ""upc:"": ""671635851871""}",(buy 3 get 1 free) salon perfect eye makeup co...
32485,32889,B01HJ84SGM,now d-mannose 500 mg - 120 veg capsules (pack ...,,,beauty & personal care (,"{""shipping weight:"": ""1 pounds ("", ""asin: "": ""...",now d-mannose 500 mg - 120 veg capsules (pack ...
32486,32890,B01HJASD20,12 white feather shuttlecocks birdies badminto...,brand new and high quality<br> enables fast vo...,gbstore,beauty & personal care (,"{""shipping weight:"": ""4.8 ounces ("", ""asin: "":...",12 white feather shuttlecocks birdies badminto...


In [10]:
# 產生新客的 metadata 表

df_asin = metadata['asin']
df_brand = metadata['brand']
df_title = metadata['title']
df_price = metadata['price']
df_rank = metadata['rank'].str.split(" ").str.get(0).str.replace(',','')
df_rank2 = pd.to_numeric(df_rank, errors='coerce')
df_description = metadata['description']
df_imageURL = metadata['imageURL'].str[0].str.lower().to_frame()
metadata_tmp = pd.concat([df_asin,df_brand,df_title,df_price,df_rank2,df_description,df_imageURL],axis='columns')
metadata_new = metadata_tmp.set_index('asin')
metadata_new['overall_mean'] = ratings.groupby(by=['asin']).mean()['overall']
metadata_new['reviews_count'] = ratings.groupby(by=['asin']).size()


# 抽取商品評分最後日期
ratings_asin=ratings_trainings['asin']
ratings_date=ratings_trainings['DATE']
ratings_tmp = pd.concat([ratings_asin,ratings_date],axis='columns')
ratings_max_dt=ratings_tmp.groupby('asin').DATE.max().to_frame()
ratings_max_dt


metadata_new=pd.merge(metadata_new,ratings_max_dt,how='left', on=['asin'])
metadata_for_new=metadata_new
metadata_for_new

Unnamed: 0_level_0,brand,title,price,rank,description,imageURL,overall_mean,reviews_count,DATE
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
6546546450,idea village,Loud 'N Clear&trade; Personal Sound Amplifier,,2938573.0,[Loud 'N Clear Personal Sound Amplifier allows...,,2.5,2.0,2013-08-13
7178680776,,No7 Lift &amp; Luminate Triple Action Serum 50...,$44.99,872854.0,[No7 Lift & Luminate Triple Action Serum 50ml ...,,3.0,1.0,2017-10-06
7250468162,No7,No7 Stay Perfect Foundation Cool Vanilla by No7,$28.76,956696.0,[No7 Stay Perfect Foundation now stays perfect...,,5.0,1.0,2017-08-26
7367905066,,Wella Koleston Perfect Hair Colour 44/44 Mediu...,,1870258.0,[],https://images-na.ssl-images-amazon.com/images...,5.0,1.0,2017-04-12
7414204790,Pirmal Healthcare,Lacto Calamine Skin Balance Oil control 120 ml...,$12.15,67701.0,[Lacto Calamine Skin Balance Daily Nourishing ...,https://images-na.ssl-images-amazon.com/images...,4.4,15.0,2018-03-02
...,...,...,...,...,...,...,...,...,...
B01HIWLLUK,,"Barielle Pro Textured Grip Cuticle Nipper, Purple",$9.95,2145325.0,[],,5.0,1.0,2016-09-24
B01HJ1K3YK,Salon Perfect,(Buy 3 Get 1 Free) Salon Perfect Eye Makeup Co...,,1639713.0,[],,1.0,1.0,2017-04-21
B01HJ84SGM,,NOW D-Mannose 500 mg - 120 Veg Capsules (Pack ...,$55.63,207410.0,[],,5.0,5.0,2018-03-23
B01HJASD20,GBSTORE,12 White Feather Shuttlecocks Birdies Badminto...,$12.99,965673.0,[Brand new and high quality<br> Enables fast v...,https://images-na.ssl-images-amazon.com/images...,1.0,1.0,2016-08-31


# 資料探索過程

In [11]:
## description	also_buy feature also_view	imageURL	imageURLHighRes
metadata_discover=metadata['imageURLHighRes'].str[0].str.lower().to_frame()
aa=metadata_discover.groupby('imageURLHighRes').imageURLHighRes.count().reset_index(name='count').sort_values(['count'], ascending=False)
aa
#aa.iloc[4898].imageURLHighRes
#aa.loc[(aa['count'] == 1)]
#bb=metadata_discover.isnull().sum()
#bb

Unnamed: 0,imageURLHighRes,count
5575,https://images-na.ssl-images-amazon.com/images...,38
3618,https://images-na.ssl-images-amazon.com/images...,29
14872,https://images-na.ssl-images-amazon.com/images...,25
11803,https://images-na.ssl-images-amazon.com/images...,19
5473,https://images-na.ssl-images-amazon.com/images...,16
...,...,...
5331,https://images-na.ssl-images-amazon.com/images...,1
5332,https://images-na.ssl-images-amazon.com/images...,1
5333,https://images-na.ssl-images-amazon.com/images...,1
5334,https://images-na.ssl-images-amazon.com/images...,1


In [12]:
##category	tech1	fit	title	tech2	brand	details	main_cat	similar_item	asin	date price rank0 rank1 details_str
metadata_discover=metadata['asin'].str.lower().to_frame()
aa=metadata_discover.groupby('asin').asin.count().reset_index(name='count').sort_values(['count'], ascending=False)
aa
#aa.iloc[2].price
#aa.iloc[3529]
#aa.loc[(aa['count'] == 1)]

Unnamed: 0,asin,count
327,b00027c9au,2
292,b00021di10,2
290,b00021cwyo,2
289,b00021cqta,2
288,b00021cpnc,2
...,...,...
11092,b00h41ddck,1
11091,b00h412x2g,1
11090,b00h412qc8,1
11089,b00h3vqvqg,1


In [13]:
## asin	reviewerID
ratings_discover=ratings['asin'].str.lower().to_frame()
aa=ratings_discover.groupby('asin').asin.count().reset_index(name='count').sort_values(['count'], ascending=False)
aa
#aa.iloc[2].price
#aa.iloc[3529]
#aa.loc[(aa['count'] == 1)]


Unnamed: 0,asin,count
939,b000foi48g,8672
1059,b000glrreu,8341
59,1620213982,4792
3301,b001qy8qxm,4544
29555,b01dkqaxc0,4246
...,...,...
10564,b00g70etsq,1
22343,b0131jvk8c,1
10558,b00g6exfg0,1
10556,b00g6214la,1


In [14]:
ratings_trainings.count()

asin              370752
reviewerID        370752
overall           370752
unixReviewTime    370752
DATE              370752
dtype: int64

In [15]:
ratings_testings.count()

asin              590
reviewerID        590
overall           590
unixReviewTime    590
DATE              590
dtype: int64

In [16]:
### asin reviewerID 查看交集狀況
column_asin = ["asin"]
ratings_trainings_item = pd.DataFrame(ratings_trainings['asin'].unique(),columns=column_asin)
ratings_testings_item = pd.DataFrame(ratings_testings['asin'].unique(),columns=column_asin)

ratings_trainings_item = ratings_trainings_item.set_index('asin')
ratings_testings_item = ratings_testings_item.set_index('asin')

#ratings_trainings_item.join(ratings_testings_item, how='inner')
pd.merge(ratings_trainings_item,ratings_testings_item,how='inner', on=['asin'])


B013XKHA4M
B015L947B2
B015P54DHU
B015PFP72A
B015SG1BSU
...
B01HENTQMI
B01HFLLNXE
B01HFT13NG
B01HFWIKH0
B01HHWQX8G


# 新客熱門商品

In [17]:
# 優質熱門商品

popular_products = metadata_for_new.loc[(metadata_for_new['overall_mean'] >= 4) & (metadata_for_new['reviews_count'] >= 100) ].sort_values(by=['imageURL','DATE','reviews_count','overall_mean',], ascending=False)
popular_products.reset_index(inplace=True)
popular_products.head(10)
fix_recommend_top10=popular_products['asin'].unique()[0:10].tolist()
fix_recommend_top10

['B000G8LWZI',
 'B000FTYALG',
 'B01C39X6TW',
 'B000WN2T2M',
 'B001V9FPHM',
 'B00HY7S61W',
 'B018SFIGXG',
 'B018SFOZYA',
 'B00021DJ32',
 'B001G8LKBS']

## 舊客產生推薦

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 計算商品用標題所表示的 tfidf 矩陣
df = metadata.drop_duplicates('title')
tf = TfidfVectorizer(analyzer='word')
tfidf_matrix = tf.fit_transform(df['title'])

# 計算商品間的相似程度
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(tfidf_matrix)
mapping = pd.Series(df.index,index = df['title'])

# 每個商品回傳 k 個最相近的商品
def recommend_item(item_input, k=10):
    try:
        item_index = mapping[item_input]
        similarity_score = list(enumerate(similarity_matrix[item_index]))
        similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
        similarity_score = similarity_score[:k]
        item_indices = [i[0] for i in similarity_score]
        return (df['asin'].iloc[item_indices].tolist())
    except:
        return []

# 利用使用者購買過的商品產生推薦
def recommend_items(items, k):
    
    res = []
    for d in items:
        res.extend(recommend_item(d, k))
    return res


In [19]:
def recommend_rule_and_content(items, k, user):
  
  user_in_trainings = ratings_trainings.loc[(ratings_trainings['reviewerID']==user)]
  
  if user_in_trainings.empty:
    recommend_topk=fix_recommend_top10
  else:
    recommend_topk=recommend_items(items, k)
  return recommend_topk



  

In [20]:
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    '''
    content-based
    '''
    ratings_trainings = training_data
    recommendations = {user: recommend_rule_and_content(metadata_for_old[metadata_for_old['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['text'].tolist(), k, user) for user in users}
    return recommendations

ratings_by_user = recommender(ratings_trainings, users)
ratings_by_user

{'A100XQFWKQ30O2': ['B000G8LWZI',
  'B000FTYALG',
  'B01C39X6TW',
  'B000WN2T2M',
  'B001V9FPHM',
  'B00HY7S61W',
  'B018SFIGXG',
  'B018SFOZYA',
  'B00021DJ32',
  'B001G8LKBS'],
 'A103T1QOGFCSEH': ['B000G8LWZI',
  'B000FTYALG',
  'B01C39X6TW',
  'B000WN2T2M',
  'B001V9FPHM',
  'B00HY7S61W',
  'B018SFIGXG',
  'B018SFOZYA',
  'B00021DJ32',
  'B001G8LKBS'],
 'A106UKKSJ2KXPF': ['B000G8LWZI',
  'B000FTYALG',
  'B01C39X6TW',
  'B000WN2T2M',
  'B001V9FPHM',
  'B00HY7S61W',
  'B018SFIGXG',
  'B018SFOZYA',
  'B00021DJ32',
  'B001G8LKBS'],
 'A10A7GV4D5A11V': ['B000G8LWZI',
  'B000FTYALG',
  'B01C39X6TW',
  'B000WN2T2M',
  'B001V9FPHM',
  'B00HY7S61W',
  'B018SFIGXG',
  'B018SFOZYA',
  'B00021DJ32',
  'B001G8LKBS'],
 'A1119JJ37ZLB8R': ['B000G8LWZI',
  'B000FTYALG',
  'B01C39X6TW',
  'B000WN2T2M',
  'B001V9FPHM',
  'B00HY7S61W',
  'B018SFIGXG',
  'B018SFOZYA',
  'B00021DJ32',
  'B001G8LKBS'],
 'A113UOOLBSZN52': ['B000G8LWZI',
  'B000FTYALG',
  'B01C39X6TW',
  'B000WN2T2M',
  'B001V9FPHM',
  'B00H

## 結果評估

In [25]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.00847457627118644