## 基礎建設

In [2]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [48]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import re
import string

## 載入資料

In [4]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-02 15:10:56--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.2’


2022-01-02 15:10:59 (8.84 MB/s) - ‘All_Beauty.csv.2’ saved [15499476/15499476]

--2022-01-02 15:10:59--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.2’


2022-01-02 15:11:00 (6.57 MB/s) - ‘meta_All_Beauty.json.gz.2’ saved [10329961/10329961]



In [109]:
#商品的基本資訊 metadata,購買商品的紀錄 ratings
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

## 資料整理

In [110]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

In [111]:
# drop duplicates rows
metadata_clean = metadata.loc[metadata.astype(str).drop_duplicates().index]

In [112]:
# 商品資訊- select useful columns
metadata_clean = metadata_clean[['asin', 'brand', 'title', 'rank','description']]

In [113]:
metadata_clean['rank'] = metadata_clean['rank'].str.replace('&amp;', '&')
metadata_clean['rank'].fillna('0', inplace = True)
# 商品資訊-買什麼種類
metadata_clean['categories'] = [re.search('in (.*) \(', r).group(1) if r != '0' else None for r in metadata_clean['rank']]

# 取出字串前段數字,取代掉千分位
metadata_clean['rank'] = metadata_clean['rank'].str.split(expand = True)[0].str.replace(',', '')
# 排名，轉換型別
metadata_clean['rank'] = pd.to_numeric(metadata_clean['rank'])

In [114]:
metadata_clean['title'] = metadata_clean['title'].str.replace('&amp;', '&')

In [115]:
#asin mean score
#asin_rating = ratings.groupby('asin')[['overall']].mean().reset_index().rename(columns={"asin": "asin", "overall": "rating_mean"})

In [116]:
#metadata_clean = pd.merge(metadata_clean,asin_rating , on="asin", how="left")

In [117]:
metadata_clean['categories'].unique()

array(['Beauty & Personal Care', None, 'Grocery & Gourmet Food',
       'Health & Household', 'Sports & Outdoors', 'Toys & Games', 'Baby',
       'Clothing, Shoes & Jewelry', 'Home & Kitchen', 'Automotive',
       'Tools & Home Improvement'], dtype=object)

In [118]:
metadata_clean['description'] = metadata_clean['description'].str.join(',')
metadata_clean['desc'] = metadata_clean['categories']  +' '+ metadata_clean["title"] +' '+ metadata_clean["description"]

In [119]:
metadata_clean['desc'] = metadata_clean['desc'].str.lower()

In [120]:
metadata_clean.head(10)

Unnamed: 0,asin,brand,title,rank,description,categories,desc
0,6546546450,idea village,Loud 'N Clear&trade; Personal Sound Amplifier,2938573,Loud 'N Clear Personal Sound Amplifier allows ...,Beauty & Personal Care,beauty & personal care loud 'n clear&trade; pe...
1,7178680776,,No7 Lift & Luminate Triple Action Serum 50ml b...,872854,No7 Lift & Luminate Triple Action Serum 50ml b...,Beauty & Personal Care,beauty & personal care no7 lift & luminate tri...
2,7250468162,No7,No7 Stay Perfect Foundation Cool Vanilla by No7,956696,No7 Stay Perfect Foundation now stays perfect ...,Beauty & Personal Care,beauty & personal care no7 stay perfect founda...
3,7367905066,,Wella Koleston Perfect Hair Colour 44/44 Mediu...,1870258,,Beauty & Personal Care,beauty & personal care wella koleston perfect ...
4,7414204790,Pirmal Healthcare,Lacto Calamine Skin Balance Oil control 120 ml...,67701,Lacto Calamine Skin Balance Daily Nourishing L...,Beauty & Personal Care,beauty & personal care lacto calamine skin bal...
5,7535842801,Mary Kay,Mary Kay Satin Hands Hand Cream Travel MINI Si...,699317,Mary Kay Satin Hands Peach Hand Cream Travel S...,Beauty & Personal Care,beauty & personal care mary kay satin hands ha...
6,7538626107,,Unique Custom Cast Iron Liner Shader Tattoo Ma...,3206658,,Beauty & Personal Care,beauty & personal care unique custom cast iron...
7,8279996397,Chango,Imagen Bendita Por Su Santidad Our Lady of Cha...,1741858,"According to the legend, in 1613, two brothers...",Beauty & Personal Care,beauty & personal care imagen bendita por su s...
8,8637910351,Srpska knjizevna zadruga,Mali princ,2136432,Novi prevod proslavljene knjige Zadruga objavl...,Beauty & Personal Care,beauty & personal care mali princ novi prevod ...
9,8867326759,Sunatoria,Moleskine Payne's Grey Card Wallet (Moleskine ...,1260339,Moleskine's Card Wallet in Payne's Grey is the...,Beauty & Personal Care,beauty & personal care moleskine payne's grey ...


In [121]:
metadata_clean = metadata_clean[['asin','title','rank','description','desc']]

## 資料切分

In [122]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01') 
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]

In [123]:
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

# Content-Based recommendation

一、description

產生推薦
*   「描述」- tfidf矩陣
*   cosine_similarity，回傳k個最相近的物品 

In [124]:
metadata_clean['description']

0        Loud 'N Clear Personal Sound Amplifier allows ...
1        No7 Lift & Luminate Triple Action Serum 50ml b...
2        No7 Stay Perfect Foundation now stays perfect ...
3                                                         
4        Lacto Calamine Skin Balance Daily Nourishing L...
                               ...                        
32887                                                     
32888                                                     
32889                                                     
32890    Brand new and high quality<br> Enables fast vo...
32891                                                     
Name: description, Length: 32488, dtype: object

In [125]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 計算商品用標題所表示的 tfidf 矩陣
df = metadata_clean.drop_duplicates('description')
tf = TfidfVectorizer(analyzer='word', stop_words='english')

In [126]:
tfidf_matrix = tf.fit_transform(df['description'])

In [129]:
# 計算商品間的相似程度
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(tfidf_matrix)
mapping = pd.Series(df.index,index = df['description'])

In [130]:
# 每個商品回傳 k 個最相近的商品
def recommend_item(item_input, k=2):
    try:
        item_index = mapping[item_input]
        similarity_score = list(enumerate(similarity_matrix[item_index]))
        similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
        similarity_score = similarity_score[:k]
        item_indices = [i[0] for i in similarity_score]
        return (df['asin'].iloc[item_indices].tolist())
    except:
        return []

# 利用使用者購買過的商品產生推薦
def recommend_items(items, k):
    res = []
    for d in items:
        res.extend(recommend_item(d, k))
    return res

In [None]:
#for user in users:
  #print(ratings_trainings[ratings_trainings['reviewerID'] == user])
  #print(metadata[metadata['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['title'].tolist())
  #print(recommend_items(metadata[metadata['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['title'].tolist(), 2))

In [134]:
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    '''
    content-based
    '''
    ratings_trainings = training_data
    # 近三個月作為沒有購買記錄使用者的推薦商品
    ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01') & (ratings['DATE'] >= '2018-06-01')
    ]

    purchased_user_list = ratings_trainings.reviewerID.tolist()
    top_ten_rcom = ratings_trainings.groupby('asin').count().sort_values('overall',ascending=False).head(k).index.tolist()

    for user in users:
      if user in purchased_user_list:
        recommendations[user] = recommend_items(metadata[metadata['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['description'].tolist(), k)
      else:
        recommendations[user] = top_ten_rcom
    
    return recommendations

ratings_by_user = recommender(ratings_trainings, users)
#ratings_by_user

結果評估

In [135]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.13389830508474576