<a href="https://colab.research.google.com/github/ianfanggis/ac-data-course/blob/main/w3_Collaborative_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install surprise



In [2]:
from surprise import Reader, Dataset, KNNBasic
import time

In [3]:
import numpy as np
import re

import plotly.express as px
import matplotlib.pyplot as plt

from itertools import combinations
from collections import defaultdict

In [4]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [5]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-09 11:01:58--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.1’


2022-01-09 11:01:59 (24.6 MB/s) - ‘All_Beauty.csv.1’ saved [15499476/15499476]

--2022-01-09 11:01:59--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.1’


2022-01-09 11:02:00 (18.6 MB/s) - ‘meta_All_Beauty.json.gz.1’ saved [10329961/10329961]



#### metadata資訊
* asin - ID of the product, e.g. 0000031852 ← 商品 ID
* title - name of the product
* feature - bullet-point format features of the product
* description - description of the product
* price - price in US dollars (at time of crawl)
* imageURL - url of the product image
* imageURL - url of the high resolution product image
* related - related products (also bought, also viewed, bought together, buy after viewing)
* salesRank - sales rank information
* brand - brand name
* categories - list of categories the product belongs to
* tech1 - the first technical detail table of the product
* tech2 - the second technical detail table of the product
* similar - similar product table



In [6]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [7]:
metadata.head(2)

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]


In [8]:
ratings.head(2)

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800


#### 對metadata處理

In [9]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32892 entries, 0 to 32891
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   category         32892 non-null  object
 1   tech1            32892 non-null  object
 2   description      32892 non-null  object
 3   fit              32892 non-null  object
 4   title            32892 non-null  object
 5   also_buy         32892 non-null  object
 6   tech2            32892 non-null  object
 7   brand            32892 non-null  object
 8   feature          32892 non-null  object
 9   rank             32892 non-null  object
 10  also_view        32892 non-null  object
 11  details          32892 non-null  object
 12  main_cat         32892 non-null  object
 13  similar_item     32892 non-null  object
 14  date             32892 non-null  object
 15  price            32892 non-null  object
 16  asin             32892 non-null  object
 17  imageURL         32892 non-null

In [10]:
# metadata[metadata.applymap(str).eq('[]').T.any()]

In [11]:
'''
1. 空值用nan替代
2. 移除重複
'''
metadata = metadata.mask(metadata.applymap(str).eq('[]')).replace('', np.nan)
display(metadata.info())
display(metadata.shape)

metadata = metadata.astype(str).drop_duplicates().reset_index(drop = True)
display(metadata.head(2))
display(metadata.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32892 entries, 0 to 32891
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   category         0 non-null      float64
 1   tech1            10 non-null     object 
 2   description      15119 non-null  object 
 3   fit              0 non-null      float64
 4   title            32891 non-null  object 
 5   also_buy         6597 non-null   object 
 6   tech2            0 non-null      float64
 7   brand            17219 non-null  object 
 8   feature          269 non-null    object 
 9   rank             32515 non-null  object 
 10  also_view        8132 non-null   object 
 11  details          32892 non-null  object 
 12  main_cat         32892 non-null  object 
 13  similar_item     1304 non-null   object 
 14  date             19 non-null     object 
 15  price            11459 non-null  object 
 16  asin             32892 non-null  object 
 17  imageURL    

None

(32892, 19)

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,,,"[""Loud 'N Clear Personal Sound Amplifier allow...",,Loud 'N Clear&trade; Personal Sound Amplifier,,,idea village,,"2,938,573 in Beauty & Personal Care (",,{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,,
1,,,['No7 Lift & Luminate Triple Action Serum 50ml...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"['B01E7LCSL6', 'B008X5RVME']",,,,"872,854 in Beauty & Personal Care (",,"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,,


(32488, 19)

#### 對ratings處理 (資料切分)

In [12]:
# 十位數轉時間指標
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'],unit = 's')

In [13]:
'''
提供使用者購買商品的紀錄
'''
ratings.head(2)

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE
0,143026860,A1V6B6TNIC10QE,1.0,1424304000,2015-02-19
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800,2014-12-18


In [14]:
'''
這次與上次不同，不特別只取一個月內的資料
'''
ratings_trainings = ratings.query('DATE < "2018-09-01" ')
ratings_testings = ratings.query('DATE <= "2018-09-30" & DATE >= "2018-09-01" ')
print('ratings_trainings:', ratings_trainings.shape)
print('ratings_testings:', ratings_testings.shape)
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
display(ratings_testings_by_user[1])
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())
display(users[:5])

ratings_trainings: (370752, 5)
ratings_testings: (590, 5)


{'asin': ['B01DKQAXC0'], 'reviewerID': 'A103T1QOGFCSEH'}

['A100XQFWKQ30O2',
 'A103T1QOGFCSEH',
 'A106UKKSJ2KXPF',
 'A10A7GV4D5A11V',
 'A1119JJ37ZLB8R']

### 推薦-User-based collaborative filtering

In [15]:
class User_based:

  # 移除重複的使用者，並只保留每位使用者最新的評論
  def __init__(self, training_data):
    self.training_data = training_data.sort_values('DATE', ascending = False).groupby(by = ['asin','reviewerID']).head(1)
  
  # 
  def user_recommender(self, users=[], k=10):


    user_to_items = defaultdict(dict)
    for _, row in self.training_data.iterrows():
      row = dict(row)
      user = row['reviewerID']
      item = row['asin']
      rating = float(row['overall'])

      user_to_items[user][item] = rating
    
    print("total users before filtering: ", len(user_to_items))

    # remove obscure user to decrease data size
    # filtering params

    # filter掉n<3的使用者

    remove_obscure_user = True
    user_rating_threshold = 3

    all_users = list(user_to_items.keys())
    for user in all_users:
      ratings = user_to_items[user]
      if remove_obscure_user and len(ratings) < user_rating_threshold:
        del user_to_items[user]
    print("total users  after filtering: ", len(user_to_items))

    # generate item to user mapping dict
    # {
    #   'item': {
    #       'user': ratings...
    #   }...
    # }

    item_to_users = defaultdict(dict)
    for user, items in user_to_items.items():
      for item, rating in items.items():
        item_to_users[item][user] = rating

    # prepare data of computing user similarity
    init_sim = lambda: [0 for _ in range(3)]
    factory = lambda: defaultdict(init_sim)
    pre_user_similarity = defaultdict(factory)
    n = len(item_to_users)
    index = 0

    for item, user_ratings in item_to_users.items():
      if len(user_ratings) > 1:
        # print(f"item: {item} have been rated by {len(user_ratings)} users progress: {index}/{n}")
        for user1, user2 in combinations(user_ratings.keys(), 2):
          xy = user_ratings[user1] * user_ratings[user2]
          xx = user_ratings[user1] ** 2
          yy = user_ratings[user2] ** 2
          pre_user_similarity[user1][user2][0] += xy
          pre_user_similarity[user1][user2][1] += xx
          pre_user_similarity[user1][user2][2] += yy

          pre_user_similarity[user2][user1][0] += xy
          pre_user_similarity[user2][user1][1] += xx
          pre_user_similarity[user2][user1][2] += yy
      index += 1

    # 建立user_similarity，並用於找出「相似行為使用者」買過的產品，且「被推薦者」還沒購買的
    user_similarity = {}
    for src_user in pre_user_similarity:
      user_similarity_order = []
      for dst_user, val in pre_user_similarity[src_user].items():
        xy = val[0]
        xx = val[1]
        yy = val[2]
        div = ((xx*yy) ** 0.5)
        if div == 0:
          continue
        similarity = xy / div

        if similarity < 0:
          continue

        for i, s in enumerate(user_similarity_order):
          target_similarity = s[1]
          if target_similarity < similarity:
            user_similarity_order.insert(i, (dst_user, similarity))
            break
        else:
          user_similarity_order.append((dst_user, similarity))
      user_similarity[src_user] = user_similarity_order
    
    # 建立推薦系統
    recommendation = {}
    for user in users:
      if user in user_similarity:
        sim_users = user_similarity[user]
        recommended_items = []
        recommended_items_set = set()
        user_have_rated = set(user_to_items[user])
        stop_recommend = False

        for sim_user, _ in sim_users:
          items_from_sim_user = sorted(list(user_to_items[sim_user].items()), key=lambda item: item[1])
          for item, _ in items_from_sim_user:
            if item not in user_have_rated and item not in recommended_items_set:
              recommended_items.append(item)
              recommended_items_set.add(item)

            # 如果推薦超過K個商品 
            if len(recommended_items) >= k:
              stop_recommend = True
              break
          # 如果停止推薦，迴圈也終止
          if stop_recommend:
            break
        # 產生推薦
        recommendation[user] = recommended_items
      # 如果沒有在 user_similarity中，回傳[]
      else:
        recommendation[user] = []
    return recommendation

  # evaluate
  def user_evaluate(self, ratings_testings_by_user={}, ratings_by_user={}, method=None):

    '''
    ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    ratings_by_user: dict 利用訓練資料學習的推薦商品
    method: str
    score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
      if d in ratings_by_user:
        total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))
    score = total / len(ratings_testings)
    return score

In [16]:
# ratings_trainings.sort_values('DATE', ascending = False).groupby(by = ['asin','reviewerID']).head(1)

In [17]:
user_based = User_based(ratings_trainings)
ratings_by_user = user_based.user_recommender(users)
print('user_based：{recall}'.format( recall = user_based.user_evaluate(ratings_testings_by_user, ratings_by_user)))

total users before filtering:  323489
total users  after filtering:  4793
user_based：0.0


推薦-Item-based collaborative filtering

In [18]:
class Item_based:

  # 移除重複的使用者，並只保留每位使用者最新的評論
  def __init__(self, training_data):
    self.training_data = training_data.sort_values('DATE', ascending = False).groupby(by = ['asin','reviewerID']).head(1)
  
  # 
  def item_recommender(self, users=[], k=10):


    item_to_users = defaultdict(dict)
    for _, row in self.training_data.iterrows():
      row = dict(row)
      user = row['reviewerID']
      item = row['asin']
      rating = float(row['overall'])

      item_to_users[item][user] = rating
    
    print("data converted")

    user_to_items = defaultdict(dict)
    for item, rating_users in item_to_users.items():
      for user, rating in rating_users.items():
        user_to_items[user][item] = rating
    print("data inverted")

    init_sim = lambda: [0, 0, 0]
    factory = lambda: defaultdict(init_sim)
    pre_item_similarity = defaultdict(factory)

    for user, items in user_to_items.items():
      if len(items) > 1:
        for i1, i2 in combinations(items.keys(), 2):
          xy = items[i1] * items[i2]
          xx = items[i1] ** 2
          yy = items[i2] ** 2
          pre_item_similarity[i1][i2][0] += xy
          pre_item_similarity[i1][i2][1] += xx
          pre_item_similarity[i1][i2][2] += yy

          pre_item_similarity[i2][i1][0] += xy
          pre_item_similarity[i2][i1][1] += xx
          pre_item_similarity[i2][i1][2] += yy
    print("sim data prepared")

    item_similarity = {}
    for src_item in pre_item_similarity:
      item_similarity_order = []
      for dst_item, val in pre_item_similarity[src_item].items():
        xy = val[0]
        xx = val[1]
        yy = val[2]
        div = ((xx*yy) ** 0.5)
        if div == 0:
          continue
        similarity = xy / div
        if similarity < 0:
          continue
        for i, s in enumerate(item_similarity_order):
          target_similarity = s[1]
          if target_similarity < similarity:
            item_similarity_order.insert(i, (dst_item, similarity))
            break
          else:
            item_similarity_order.append((dst_item, similarity))
        item_similarity[src_item] = item_similarity_order
    
    # 建立推薦系統
    recommendation = {}
    for user in users:
      items = []
      items_set = set()
      stop = False
      user_has_rated = set(user_to_items[user])
      for item in user_has_rated:
        if item in item_similarity:
          for sim_item, _ in item_similarity[item]:
            # skip the item user has rated
            if sim_item not in user_has_rated and sim_item not in items_set:
              items.append(sim_item)
              items_set.add(sim_item)
            if len(items) >= k:
              stop = True
              break
          if stop:
            break
      recommendation[user] = items
    return recommendation

  # evaluate
  def user_evaluate(self, ratings_testings_by_user={}, ratings_by_user={}, method=None):

    '''
    ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    ratings_by_user: dict 利用訓練資料學習的推薦商品
    method: str
    score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
      if d in ratings_by_user:
        total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))
    score = total / len(ratings_testings)
    return score

In [19]:
item_based = Item_based(ratings_trainings)
ratings_by_user = item_based.item_recommender(users)
print('item_based：{recall}'.format( recall = item_based.user_evaluate(ratings_testings_by_user, ratings_by_user)))

data converted
data inverted
sim data prepared
item_based：0.0


### 推薦-surprise collaborative filtering

In [24]:
class Surprise_based:

  # 移除重複的使用者，並只保留每位使用者最新的評論
  def __init__(self, training_data):
    self.training_data = training_data.sort_values('DATE', ascending = False).groupby(by = ['asin','reviewerID']).head(1)
  
  # 
  def surprise_recommender(self, users=[], k=10, user_based=False, algo=KNNBasic):

    reader = Reader(rating_scale=(0, 5))
    # 因為資料量過大造成記憶體不足，所以取至2017-09-01
    # self.data = self.training_data[['reviewerID', 'asin', 'overall']]
    training_data = self.training_data.query('DATE >= "2017-09-01"')[['reviewerID', 'asin', 'overall']]
    data = Dataset.load_from_df(training_data, reader=reader)
    
    #計算相似度 
    sim_options = {
        'name': 'cosine',
        'user_based': user_based
    }

    algo_impl = algo(sim_options = sim_options)
    trainset = data.build_full_trainset()
    algo_impl.fit(trainset)

    # 建立推薦系統
    recommendation = {}
    

    for user in users:
      items_user_rated = set(training_data.loc[training_data['reviewerID'] == user]['asin'].to_list())
      recommend_item_list = []
      recommend_item_set = set()
      for item in items_user_rated:
        iid = algo_impl.trainset.to_inner_iid(item)
        recommend_items_iid = algo_impl.get_neighbors(iid, k)
        for sim_item_iid in recommend_items_iid:
          item_raw_id = algo_impl.trainset.to_raw_iid(sim_item_iid)
          if item_raw_id not in items_user_rated and item_raw_id not in recommend_item_set:
            recommend_item_list.append(item_raw_id)
            recommend_item_set.add(item_raw_id)
        if len(recommend_item_list) >= k:
          recommend_item_list = recommend_item_list[:k]
          break

      recommendation[user] = recommend_item_list
    return recommendation

  # evaluate
  def user_evaluate(self, ratings_testings_by_user={}, ratings_by_user={}, method=None):

    '''
    ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    ratings_by_user: dict 利用訓練資料學習的推薦商品
    method: str
    score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
      if d in ratings_by_user:
        total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))
    score = total / len(ratings_testings)
    return score

In [25]:
surprise_based = Surprise_based(ratings_trainings)
ratings_by_user = surprise_based.surprise_recommender(users)
print('surprise_based：{recall}'.format( recall = surprise_based.user_evaluate(ratings_testings_by_user, ratings_by_user)))

Computing the cosine similarity matrix...
Done computing similarity matrix.
surprise_based：0.001694915254237288
