<a href="https://colab.research.google.com/github/ianfanggis/ac-data-course/blob/main/content_based_filter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 基礎建設

In [13]:
import numpy as np
import re

import plotly.express as px
import matplotlib.pyplot as plt

In [14]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [15]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-02 11:41:34--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.1’


2022-01-02 11:41:35 (20.0 MB/s) - ‘All_Beauty.csv.1’ saved [15499476/15499476]

--2022-01-02 11:41:35--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.1’


2022-01-02 11:41:36 (15.1 MB/s) - ‘meta_All_Beauty.json.gz.1’ saved [10329961/10329961]



#### metadata資訊
* asin - ID of the product, e.g. 0000031852 ← 商品 ID
* title - name of the product
* feature - bullet-point format features of the product
* description - description of the product
* price - price in US dollars (at time of crawl)
* imageURL - url of the product image
* imageURL - url of the high resolution product image
* related - related products (also bought, also viewed, bought together, buy after viewing)
* salesRank - sales rank information
* brand - brand name
* categories - list of categories the product belongs to
* tech1 - the first technical detail table of the product
* tech2 - the second technical detail table of the product
* similar - similar product table



In [16]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [17]:
metadata.head(2)

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]


In [18]:
ratings.head(2)

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800


#### 對metadata處理

In [19]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32892 entries, 0 to 32891
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   category         32892 non-null  object
 1   tech1            32892 non-null  object
 2   description      32892 non-null  object
 3   fit              32892 non-null  object
 4   title            32892 non-null  object
 5   also_buy         32892 non-null  object
 6   tech2            32892 non-null  object
 7   brand            32892 non-null  object
 8   feature          32892 non-null  object
 9   rank             32892 non-null  object
 10  also_view        32892 non-null  object
 11  details          32892 non-null  object
 12  main_cat         32892 non-null  object
 13  similar_item     32892 non-null  object
 14  date             32892 non-null  object
 15  price            32892 non-null  object
 16  asin             32892 non-null  object
 17  imageURL         32892 non-null

In [20]:
# metadata[metadata.applymap(str).eq('[]').T.any()]

In [21]:
'''
1. 空值用nan替代
2. 移除重複
'''
metadata = metadata.mask(metadata.applymap(str).eq('[]')).replace('', np.nan)
display(metadata.info())
display(metadata.shape)

metadata = metadata.astype(str).drop_duplicates().reset_index(drop = True)
display(metadata.head(2))
display(metadata.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32892 entries, 0 to 32891
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   category         0 non-null      float64
 1   tech1            10 non-null     object 
 2   description      15119 non-null  object 
 3   fit              0 non-null      float64
 4   title            32891 non-null  object 
 5   also_buy         6597 non-null   object 
 6   tech2            0 non-null      float64
 7   brand            17219 non-null  object 
 8   feature          269 non-null    object 
 9   rank             32515 non-null  object 
 10  also_view        8132 non-null   object 
 11  details          32892 non-null  object 
 12  main_cat         32892 non-null  object 
 13  similar_item     1304 non-null   object 
 14  date             19 non-null     object 
 15  price            11459 non-null  object 
 16  asin             32892 non-null  object 
 17  imageURL    

None

(32892, 19)

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,,,"[""Loud 'N Clear Personal Sound Amplifier allow...",,Loud 'N Clear&trade; Personal Sound Amplifier,,,idea village,,"2,938,573 in Beauty & Personal Care (",,{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,,
1,,,['No7 Lift & Luminate Triple Action Serum 50ml...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"['B01E7LCSL6', 'B008X5RVME']",,,,"872,854 in Beauty & Personal Care (",,"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,,


(32488, 19)

In [22]:
# metadata.columns

In [23]:
'''
保留所需使用以及與文字相關的欄位
'''
metadata_f = metadata.loc[:,['asin', 'rank', 'brand', 'price', 'also_view', 'also_buy','similar_item','description','title']]
# metadata = metadata.loc[:,['asin', 'rank', 'brand','description','title']]

In [24]:
'''
保留title, description, rank 中非為nan的值
'''
metadata_f = metadata_f.query('~(title == "nan" | description == "nan" | rank == "nan")')

In [25]:
display(metadata_f.head(2))
display(metadata_f.shape)

Unnamed: 0,asin,rank,brand,price,also_view,also_buy,similar_item,description,title
0,6546546450,"2,938,573 in Beauty & Personal Care (",idea village,,,,,"[""Loud 'N Clear Personal Sound Amplifier allow...",Loud 'N Clear&trade; Personal Sound Amplifier
1,7178680776,"872,854 in Beauty & Personal Care (",,$44.99,,"['B01E7LCSL6', 'B008X5RVME']","class=""a-bordered a-horizontal-stripes a-spa...",['No7 Lift & Luminate Triple Action Serum 50ml...,No7 Lift &amp; Luminate Triple Action Serum 50...


(14683, 9)

In [26]:
'''
1. 處理price
'''
metadata_f['price'] = metadata_f['price'].str.split('$',1,expand = True)[1]

In [27]:
'''
2. 處理rank
'''
metadata_f[['rank_no','rank']] = metadata_f['rank'].str.split(' ', n=1, expand=True).rename(columns={0:'rank_no', 1:'rank'})

In [28]:
'''
3. 處理description
'''
metadata_f['description'] = metadata_f['description'].apply(lambda x: ' '.join(x))

In [29]:
'''
合併 rank, description, title
規化、轉小寫
?<! =>
'''
metadata_f['text'] = metadata_f['rank'] + metadata_f['description']+ metadata_f['title']
# display(metadata_f['text'][15])
regex = re.compile(r"(?<!\d)[\W](?!\d)")
# display(metadata_f['text'].apply(lambda x: re.sub(regex, '', str(x)))[15])
metadata_f['text'] = metadata_f['text'].apply(lambda x: re.sub(regex, '', str(x))).str.lower()

#### 對ratings處理 (資料切分)

In [30]:
# 十位數轉時間指標
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'],unit = 's')

In [31]:
'''
提供使用者購買商品的紀錄
'''
ratings.head(2)

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE
0,143026860,A1V6B6TNIC10QE,1.0,1424304000,2015-02-19
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800,2014-12-18


In [32]:
# px.bar(ratings, x ='DATE' )

In [33]:
'''
4-5顆星的佔了大多數，所以作為條件
'''
# px.bar(ratings, x ='overall' )

'\n4-5顆星的佔了大多數，所以作為條件\n'

In [34]:
'''
取一個月內的資料作為tranings
'''
ratings_trainings = ratings.query('DATE < "2018-09-01" & overall >=4 & DATE >= "2018-08-01" ')
ratings_testings = ratings.query('DATE <= "2018-09-30" & DATE >= "2018-09-01" ')
print('ratings_trainings:', ratings_trainings.shape)
print('ratings_testings:', ratings_testings.shape)
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
display(ratings_testings_by_user[1])
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())
display(users[:5])

ratings_trainings: (1197, 5)
ratings_testings: (590, 5)


{'asin': ['B01DKQAXC0'], 'reviewerID': 'A103T1QOGFCSEH'}

['A100XQFWKQ30O2',
 'A103T1QOGFCSEH',
 'A106UKKSJ2KXPF',
 'A10A7GV4D5A11V',
 'A1119JJ37ZLB8R']

In [35]:
'''
1. 根據 asin(商品 ID)計算 overall(用戶對商品的評分)
2. 計算每個asin有幾則評論數量
3. 合併
'''
ratings_trainings_ov_mean = ratings_trainings.groupby( by = 'asin').agg({'overall': 'mean'}).reset_index()
ratings_trainings_ov_count = ratings_trainings.groupby( by = 'asin').agg({'overall': 'count'}).reset_index()

ratings_trainings_merge = pd.merge(ratings_trainings_ov_mean, ratings_trainings_ov_count, on = 'asin',suffixes = ('_mean','_count'))
ratings_trainings_merge.head(2)

Unnamed: 0,asin,overall_mean,overall_count
0,B000MAJD4W,5.0,1
1,B0047NFF4C,5.0,1


In [36]:
# 計算平均分
ratings_training_3 = ratings_trainings.groupby('asin', as_index = False)['overall'].mean()

# 計算評論數
rating_comment = pd.DataFrame(ratings_trainings.asin.value_counts())
rating_comment.reset_index(inplace=True)
rating_comment.columns = ['asin', 'count']

# merge
rating_comment = rating_comment.merge(ratings_training_3, on='asin')
rating_comment.head()

Unnamed: 0,asin,count,overall
0,B01DKQAXC0,51,4.862745
1,B00W259T7G,37,4.810811
2,B01DLR9IDI,26,5.0
3,B013XKHA4M,25,4.92
4,B0195R1FT8,20,4.8


#### 計算tvalue
- 這裡參考同學David Chang使用NLTK

In [37]:
metadata_f = metadata_f.merge(rating_comment, on = 'asin')
metadata_f.head(2)

Unnamed: 0,asin,rank,brand,price,also_view,also_buy,similar_item,description,title,rank_no,text,count,overall
0,B000MAJD4W,in Beauty & Personal Care (,Paul Brown Hawaii,,"['B000N2Q4XC', 'B000ULNFDS']","['B000N2Q4XC', 'B002BX9T8I', 'B07BYJG3CW', 'B0...",,[ ' I n f u s e d w i t h k u k u i n u ...,Paul Brown Hawaii Gelatine Goo Firm Holding Ge...,479011,inbeautypersonalcareinfusedwithkukuinutlipidst...,1,5.0
1,B0047NFF4C,in Beauty & Personal Care (,,15.0,"['B01N6MSWUE', 'B005IA463E', 'B0762DMDRV', 'B0...",,,[ ' W H Y W E L O V E I T W i n t h ...,Benefit Cosmetics The Porefessional Pores Away...,862335,inbeautypersonalcarewhyweloveitwinthefightagai...,1,5.0


In [None]:
# !pip install --user nltk

In [38]:
'''
NLTK 全名是Natural Language Tool Kit， 是一套基於Python 的自然語言處理工具箱
stopwords => 去除停用詞
WordNetLemmatizer => 詞形還原就是去掉單詞的詞綴，提取單詞的主幹部分
punkt =>「punkt」包含了許多預訓練好的分詞模型

'''
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [39]:
'''
https://vimsky.com/zh-tw/examples/detail/python-method-nltk.word_tokenize.html
lemmatize =>詞型還原
設定稍後取用 English 的停用詞語料庫
'''
# set lemmatizer
lemmatizer = WordNetLemmatizer()

# remove stopword
def remove_stopword(clean_text):
  tokens = nltk.word_tokenize(clean_text)
  result = [lemmatizer.lemmatize(voca) for voca in tokens if not voca in stopwords.words('english')]
  return ' '.join(result)

In [40]:
metadata_f['text'] = metadata_f['text'].apply(lambda x :remove_stopword(x))
display(metadata_f['text'].head(2))

0    inbeautypersonalcareinfusedwithkukuinutlipidst...
1    inbeautypersonalcarewhyweloveitwinthefightagai...
Name: text, dtype: object

In [41]:
'''
TDIDF，計算商品用標題所表示的 tfidf 矩陣
'''
from sklearn.feature_extraction.text import TfidfVectorizer
metadata_f = metadata_f.drop_duplicates('text')
tf = TfidfVectorizer()
tfidf_matrix = tf.fit_transform(metadata_f['text'])

In [42]:
'''
計算任兩筆商品之間的cosine_similarity
'''

from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(tfidf_matrix)
mapping = pd.Series(metadata_f.index,index = metadata_f['asin'])

In [43]:
mapping

asin
B000MAJD4W      0
B0047NFF4C      1
B00AN382P4      2
B00BPM41MA      3
B00GCRTTHA      4
             ... 
B01HC6G4D6    428
B01HCPNYR6    429
B01HD23OJG    430
B01HEPD32K    431
B01HI1YKIG    432
Length: 432, dtype: int64

### 產生推薦

In [44]:
# 每個商品回傳 k 個最相近的商品
def recommend_item(item_input, k=2):
  try:
    item_index = mapping[item_input]
    similarity_score = list(enumerate(similarity_matrix[item_index]))
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    similarity_score = similarity_score[:k]
    item_indices = [i[0] for i in similarity_score]
    return (metadata_f['asin'].iloc[item_indices].tolist())
  except:
    return []

# 利用使用者購買過的商品產生推薦
def recommend_items(items, k):
  res = []
  for d in items:
    res.extend(recommend_item(d, k))
  return res

In [55]:
def recommender(training_data, users=[], k=10):
  '''
  * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
  * users: [] 需要被推薦的使用者
  * k: int 每個使用者需要推薦的商品數
  * recommendations: dict
    {
        使用者一： [推薦商品一, 推薦商品二, ...],
        使用者二： [...], ...
    }
  '''
  recommendations = {}
  ratings_trainings = training_data
  for user in users:

    # content based
    recom_list = recommend_items(
        metadata_f[metadata_f['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['asin'].tolist(), k)
    if recom_list:
      recommendations[user] = recom_list
    else:
      # base-ruled，用count + overall排序
      rating_comment['count_overall'] = rating_comment['count'] + rating_comment['overall']
      recommendations[user] = rating_comment.sort_values(by = 'count_overall', ascending= False).asin[:k].tolist()
      

  return recommendations

In [56]:
# rating_comment['count_overall'] = rating_comment['count'] + rating_comment['overall']
# rating_comment.sort_values(by = 'count_overall', ascending= False)

### 結果評估

In [46]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
  '''
  * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
  * ratings_by_user: dict 利用訓練資料學習的推薦商品
  * method: str
  * score: float
  '''
  total = 0
  for d in ratings_testings_by_user:
      if d in ratings_by_user:
          total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

  score = total / len(ratings_testings)
  return score

In [59]:
for k in [10, 20, 30]:
  ratings_by_user = recommender(ratings_trainings, users, k=k)
  print('前{k}個推薦：{recall}'.format(k=k, recall = evaluate(ratings_testings_by_user, ratings_by_user)))


前10個推薦：0.1423728813559322
前20個推薦：0.21016949152542372
前30個推薦：0.25254237288135595
