<a href="https://colab.research.google.com/github/limitlin840318/data-course-sample/blob/main/sample-cf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 基礎建設

In [11]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [12]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from os.path import exists
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

import matplotlib.pyplot as plt
import seaborn as sns
import re
import datetime
sns.set_style("whitegrid")

In [13]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(nltk.corpus.stopwords.words('english'))
def content_filter(text):
    # stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if (w.lower() not in stop_words) & (w.isalnum()) ]
    return content

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 載入資料

In [80]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-09 15:47:09--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.2’


2022-01-09 15:47:10 (18.9 MB/s) - ‘All_Beauty.csv.2’ saved [15499476/15499476]

--2022-01-09 15:47:10--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.2’


2022-01-09 15:47:11 (14.2 MB/s) - ‘meta_All_Beauty.json.gz.2’ saved [10329961/10329961]



In [81]:
#商品的基本資訊 metadata,購買商品的紀錄 ratings
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

## 資料整理

Step1 : Clean data

*   Convert time format

In [82]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

*   Dropout the duplicated rows
*   Parsing the `rank` column for generating `rank` and `categories`

In [83]:
# Drop duplicates rows
metadata_clean = metadata.loc[metadata.astype(str).drop_duplicates().index]

# 商品資訊- select useful columns
metadata_clean = metadata_clean[['asin', 'brand', 'title', 'rank','description']]

In [84]:
metadata_clean['rank'] = metadata_clean['rank'].str.replace('&amp;', '&')

# 買什麼種類商品
metadata_clean['categories'] = metadata_clean['rank'].str.split(' in ').str[1]
metadata_clean['categories'] = metadata_clean['categories'].str[:-2]

# 商品排名，轉換型別
metadata_clean['rank'] = metadata_clean['rank'].str.split(expand = True)[0].str.replace(',', '')
metadata_clean['rank'] = pd.to_numeric(metadata_clean['rank'])

In [85]:
metadata_clean['categories'].unique()

array(['Beauty & Personal Care', nan, 'Grocery & Gourmet Food',
       'Health & Household', 'Sports & Outdoors', 'Toys & Games', 'Baby',
       'Clothing, Shoes & Jewelry', 'Home & Kitchen', 'Automotive',
       'Tools & Home Improvement'], dtype=object)


*   僅使用（beauty）的類別資料



In [86]:
metadata_clean = metadata_clean[metadata_clean['categories']=='Beauty & Personal Care']

In [87]:
metadata_clean['title'] = metadata_clean['title'].str.replace('&amp;', '&')
metadata_clean['description'] = metadata_clean['description'].str[0].replace(np.nan, '')

*   Combine `description` and `title` to create new `desc`
*   Made `desc` string lowercase

In [88]:
metadata_clean['desc'] = metadata_clean["title"] +' '+ metadata_clean["description"]
metadata_clean['desc'] = metadata_clean['desc'].str.lower()

*   Create new attribute `asin_rating` : asin mean average rating



In [89]:
#asin mean score
asin_rating = ratings.groupby('asin')[['overall']].mean().reset_index().rename(columns={"asin": "asin", "overall": "rating_mean"})

In [90]:
metadata_clean = pd.merge(metadata_clean,asin_rating , on="asin", how="left")

In [91]:
metadata_clean.head(1)

Unnamed: 0,asin,brand,title,rank,description,categories,desc,rating_mean
0,6546546450,idea village,Loud 'N Clear&trade; Personal Sound Amplifier,2938573.0,Loud 'N Clear Personal Sound Amplifier allows ...,Beauty & Personal Care,loud 'n clear&trade; personal sound amplifier ...,2.5


In [92]:
metadata_clean = metadata_clean[['asin', 'brand', 'rank','categories','desc','rating_mean']]

# 資料切分

In [93]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
    ]

ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]

In [94]:
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

# EDA

*   整體而言，62%的商品分數為5分



In [95]:
ratings_count = ratings.groupby(['overall']).size().reset_index(name='counts')
# Calculating Percentage
ratings_count['percent'] = (ratings_count['counts'] / ratings_count['counts'].sum()) * 100
ratings_count.sort_values(by='percent')

Unnamed: 0,overall,counts,percent
1,2.0,20293,5.46473
2,3.0,29555,7.958906
0,1.0,39261,10.572648
3,4.0,52687,14.188154
4,5.0,229549,61.815562


*   訓練資料中，用戶購買次數分佈大都落在1次（佔90%)

In [96]:
ratings_trainings = ratings_trainings.drop_duplicates()

In [97]:
df = ratings_trainings.groupby(['reviewerID']).agg(count=('overall','count'))
df1 = df.groupby(['count']).size().reset_index(name='no_of_purchase')
df1['percent'] = (df1['no_of_purchase'] / df1['no_of_purchase'].sum()) * 100
df1.head(5)

Unnamed: 0,count,no_of_purchase,percent
0,1,293485,90.724878
1,2,25071,7.750186
2,3,3015,0.932026
3,4,1177,0.363845
4,5,503,0.155492


# CF-user-based recommendation
*   單純使用user-based，推薦分數0



產生推薦

In [98]:
import pandas as pd
from itertools import combinations
from collections import defaultdict

# header: user_id,item_id,rating,timestamp

def recommender(training_data, users=[], k=10):

    # loading data from dataframe
    # user_to_items dict:
    # {
    #   'user': {
    #       'item': ratings...
    #   }...
    # }
    user_to_items = defaultdict(dict)
    for _, row in training_data.iterrows():
        row = dict(row)
        user = row['reviewerID']
        item = row['asin']
        rating = float(row['overall'])

        user_to_items[user][item] = rating

    print("total users before filtering: ", len(user_to_items))

    # remove obscure user to decrease data size
    # filtering params
    remove_obscure_user = True
    user_rating_threshold = 3
    all_users = list(user_to_items.keys())
    for user in all_users:
        ratings = user_to_items[user]
        if remove_obscure_user and len(ratings) < user_rating_threshold:
            del user_to_items[user]

    print("total users  after filtering: ", len(user_to_items))

    # generate item to user mapping dict
    # {
    #   'item': {
    #       'user': ratings...
    #   }...
    # }
    item_to_users = defaultdict(dict)
    for user, items in user_to_items.items():
        for item, rating in items.items():
            item_to_users[item][user] = rating

    # prepare data of computing user similarity 
    init_sim = lambda: [0 for _ in range(3)]
    factory = lambda: defaultdict(init_sim)
    pre_user_similarity = defaultdict(factory)
    n = len(item_to_users)
    index = 0
    for item, user_ratings in item_to_users.items():
        if len(user_ratings) > 1:
            # print(f"item: {item} have been rated by {len(user_ratings)} users progress: {index}/{n}")
            for user1, user2 in combinations(user_ratings.keys(), 2):
                xy = user_ratings[user1] * user_ratings[user2]
                xx = user_ratings[user1] ** 2
                yy = user_ratings[user2] ** 2
                pre_user_similarity[user1][user2][0] += xy
                pre_user_similarity[user1][user2][1] += xx
                pre_user_similarity[user1][user2][2] += yy

                pre_user_similarity[user2][user1][0] += xy
                pre_user_similarity[user2][user1][1] += xx
                pre_user_similarity[user2][user1][2] += yy
        index += 1

    user_similarity = {}
    for src_user in pre_user_similarity:
        user_similarity_order = []
        for dst_user, val in pre_user_similarity[src_user].items():
            xy = val[0]
            xx = val[1]
            yy = val[2]
            div = ((xx*yy) ** 0.5)
            if div == 0:
                continue
            similarity = xy / div
            if similarity < 0:
                continue
            for i, s in enumerate(user_similarity_order):
                target_similarity = s[1]
                if target_similarity < similarity:
                    user_similarity_order.insert(i, (dst_user, similarity))
                    break
            else:
                user_similarity_order.append((dst_user, similarity))
        user_similarity[src_user] = user_similarity_order

    recommendation = {}
    recommendation_count = 0
    for user in users:
        if user in user_similarity:
            sim_users = user_similarity[user]
            recommended_items = []
            recommended_items_set = set()
            user_have_rated = set(user_to_items[user])
            stop_recommend = False
            for sim_user, _ in sim_users:
                items_from_sim_user = sorted(list(user_to_items[sim_user].items()), key=lambda item: item[1])
                for item, _ in items_from_sim_user:
                    if item not in user_have_rated and item not in recommended_items_set:
                        recommended_items.append(item)
                        recommended_items_set.add(item)
                    if len(recommended_items) >= k:  # 推薦超過k個商品
                        stop_recommend = True 
                        break # 停止推薦
                if stop_recommend: # 如果已經超過k個item所以停止推薦，那再上一層也中止
                    break
            recommendation[user] = recommended_items # 產生推薦
            if len(recommended_items)> 0:
              recommendation_count += 1
        else:
            recommendation[user] = [] # 如果沒有在 user_similarity 的user名單中，則產生[]
    print("total users to be recommended: ", recommendation_count)
    return recommendation

ratings_by_user_cf_userbased = recommender(ratings_trainings, users)
#ratings_by_user_cf_userbased

total users before filtering:  323489
total users  after filtering:  4793
total users to be recommended:  3


In [99]:
newDict = dict()
for key, value in ratings_by_user_cf_userbased.items():
    if len(value) != 0:
        newDict[key] = value

print(newDict)

{'A1SA3N793WT2LM': ['B0007QKY80', 'B0001TMDF0', 'B0021YV3C2', 'B01BZV6VVY', 'B015ZVFK1W', 'B019Z9JUC0', 'B019Z9L8PW', 'B00NPJVLYW', 'B00CR6X0IY', 'B00L5E74VS'], 'ACE1ZIOI3U6PY': ['B00PMRE5A2', 'B00QHX7NT0', 'B01E7UKR38', 'B005IZD7BG', 'B000067E30', 'B006WZ9ZHQ'], 'ASGIVOW34XNQS': ['1620213982', 'B000KNELAW', 'B01DJI7796', 'B00006IGL2', 'B0001WXTPA', 'B0002JHI1I', 'B00120VWTK', 'B01DEDYWD0', 'B00021DJ32', 'B003I5SDJO']}


結果評估

In [100]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user_cf_userbased)

0.0

產生推薦
*   將cf-user-based 結合之前作業rule-based
*   推薦分數為13.38



In [107]:
import pandas as pd
from itertools import combinations
from collections import defaultdict

# header: user_id,item_id,rating,timestamp

def recommender(training_data, users=[], k=10,user_rating_threshold=3):
    # 延續之前rule-based的內容
    ratings_trainings = training_data
    # 近三個月作為沒有購買記錄使用者的推薦商品
    ratings_trainings_3m = ratings_trainings[
    (ratings_trainings['DATE'] < '2018-09-01') & (ratings_trainings['DATE'] >= '2018-06-01')
    ]

    #purchased_user_list = ratings_trainings.reviewerID.tolist()
    top_ten = ratings_trainings_3m.groupby('asin').count().sort_values('overall',ascending=False).head(k).index.tolist()

    # loading data from dataframe
    # user_to_items dict:
    # {
    #   'user': {
    #       'item': ratings...
    #   }...
    # }
    user_to_items = defaultdict(dict)
    for _, row in training_data.iterrows():
        row = dict(row)
        user = row['reviewerID']
        item = row['asin']
        rating = float(row['overall'])

        user_to_items[user][item] = rating

    print("total users before filtering: ", len(user_to_items))

    # remove obscure user to decrease data size
    # filtering params
    remove_obscure_user = True
    user_rating_threshold = 3
    all_users = list(user_to_items.keys())
    for user in all_users:
        ratings = user_to_items[user]
        if remove_obscure_user and len(ratings) < user_rating_threshold:
            del user_to_items[user]

    print("total users  after filtering: ", len(user_to_items))

    # generate item to user mapping dict
    # {
    #   'item': {
    #       'user': ratings...
    #   }...
    # }
    item_to_users = defaultdict(dict)
    for user, items in user_to_items.items():
        for item, rating in items.items():
            item_to_users[item][user] = rating

    # prepare data of computing user similarity 
    init_sim = lambda: [0 for _ in range(3)]
    factory = lambda: defaultdict(init_sim)
    pre_user_similarity = defaultdict(factory)
    n = len(item_to_users)
    index = 0
    for item, user_ratings in item_to_users.items():
        if len(user_ratings) > 1:
            # print(f"item: {item} have been rated by {len(user_ratings)} users progress: {index}/{n}")
            for user1, user2 in combinations(user_ratings.keys(), 2):
                xy = user_ratings[user1] * user_ratings[user2]
                xx = user_ratings[user1] ** 2
                yy = user_ratings[user2] ** 2
                pre_user_similarity[user1][user2][0] += xy
                pre_user_similarity[user1][user2][1] += xx
                pre_user_similarity[user1][user2][2] += yy

                pre_user_similarity[user2][user1][0] += xy
                pre_user_similarity[user2][user1][1] += xx
                pre_user_similarity[user2][user1][2] += yy
        index += 1

    user_similarity = {}
    for src_user in pre_user_similarity:
        user_similarity_order = []
        for dst_user, val in pre_user_similarity[src_user].items():
            xy = val[0]
            xx = val[1]
            yy = val[2]
            div = ((xx*yy) ** 0.5)
            if div == 0:
                continue
            similarity = xy / div
            if similarity < 0:
                continue
            for i, s in enumerate(user_similarity_order):
                target_similarity = s[1]
                if target_similarity < similarity:
                    user_similarity_order.insert(i, (dst_user, similarity))
                    break
            else:
                user_similarity_order.append((dst_user, similarity))
        user_similarity[src_user] = user_similarity_order

    recommendation = {}
    recommendation_count = 0
    for user in users:
        if user in user_similarity:
            sim_users = user_similarity[user]
            recommended_items = []
            recommended_items_set = set()
            user_have_rated = set(user_to_items[user])
            stop_recommend = False
            for sim_user, _ in sim_users:
                items_from_sim_user = sorted(list(user_to_items[sim_user].items()), key=lambda item: item[1])
                for item, _ in items_from_sim_user:
                    if item not in user_have_rated and item not in recommended_items_set:
                        recommended_items.append(item)
                        recommended_items_set.add(item)
                    if len(recommended_items) >= k:  # 推薦超過k個商品
                        stop_recommend = True 
                        break # 停止推薦
                if stop_recommend: # 如果已經超過k個item所以停止推薦，那再上一層也中止
                    break
            recommendation[user] = recommended_items # 產生推薦
            if len(recommended_items)> 0:
              recommendation_count += 1
        else:
            recommendation[user] = top_ten # 如果沒有在 user_similarity 的user名單中，則給rule-based 
    print("total users to be recommended: ", recommendation_count)
    return recommendation

ratings_by_user_cf_userbased_2 = recommender(ratings_trainings, users)
#ratings_by_user_cf_userbased_2

total users before filtering:  323489
total users  after filtering:  4793
total users to be recommended:  3


結果評估

In [108]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user_cf_userbased_2)

0.13389830508474576

# CF-Item-Based recommendation
*   推薦分數0.16



產生推薦

In [114]:
import pandas as pd
from itertools import combinations
from collections import defaultdict

def recommender(training_data, users=[], k=10):

    # loading data from dataframe
    # item_to_users dict:
    # {
    #   'item': {
    #       'user': ratings...
    #   }...
    # }
    item_to_users = defaultdict(dict)
    for _, row in training_data.iterrows():
        row = dict(row)
        user = row['reviewerID']
        item = row['asin']
        rating = float(row['overall'])
        item_to_users[item][user] = rating

    print("data converted")

    user_to_items = defaultdict(dict)
    for item, rating_users in item_to_users.items():
        for user, rating in rating_users.items():
            user_to_items[user][item] = rating

    print("data inverted")

    init_sim = lambda: [0, 0, 0]
    factory = lambda: defaultdict(init_sim)
    pre_item_similarity = defaultdict(factory)
    for user, items in user_to_items.items():
        if len(items) > 1:
            for i1, i2 in combinations(items.keys(), 2):
                xy = items[i1] * items[i2]
                xx = items[i1] ** 2
                yy = items[i2] ** 2
                pre_item_similarity[i1][i2][0] += xy
                pre_item_similarity[i1][i2][1] += xx
                pre_item_similarity[i1][i2][2] += yy

                pre_item_similarity[i2][i1][0] += xy
                pre_item_similarity[i2][i1][1] += xx
                pre_item_similarity[i2][i1][2] += yy

    print("sim data prepared")

    item_similarity = {}
    for src_item in pre_item_similarity:
        item_similarity_order = []
        for dst_item, val in pre_item_similarity[src_item].items():
            xy = val[0]
            xx = val[1]
            yy = val[2]
            div = ((xx*yy) ** 0.5)
            if div == 0:
                continue
            similarity = xy / div
            if similarity < 0:
                continue
            for i, s in enumerate(item_similarity_order):
                target_similarity = s[1]
                if target_similarity < similarity:
                    item_similarity_order.insert(i, (dst_item, similarity))
                    break
            else:
                item_similarity_order.append((dst_item, similarity))
        item_similarity[src_item] = item_similarity_order

    #print(f"get {k} recommendation items for for user: {users}")

    recommendation = {}
    recommendation_count_b = 0
    for user in users:
        items = []
        items_set = set()
        stop = False
        user_has_rated = set(user_to_items[user])
        for item in user_has_rated:
            if item in item_similarity:
                for sim_item, _ in item_similarity[item]:
                    # skip the item user has rated
                    if sim_item not in user_has_rated and sim_item not in items_set:
                        items.append(sim_item)
                        items_set.add(sim_item)
                    if len(items) >= k:
                        stop = True
                        break
                if stop:
                    break
        recommendation[user] = items
        if len(items) > 0:
            recommendation_count_b += 1
    print("total users recommended by item-based:", recommendation_count_b)
    return recommendation    

ratings_by_user_cf_itembased = recommender(ratings_trainings, users)
#ratings_by_user_cf_itembased

data converted
data inverted
sim data prepared
total users recommended by item-based: 32


結果評估

In [115]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user_cf_itembased)

0.001694915254237288

產生推薦
*   將cf-item-based 結合之前作業rule-based
*   推薦分數13.22



In [117]:
import pandas as pd
from itertools import combinations
from collections import defaultdict

def recommender(training_data, users=[], k=10):
    # 延續之前rule-based的內容
    ratings_trainings = training_data
    # 近三個月作為沒有購買記錄使用者的推薦商品
    ratings_trainings_3m = ratings_trainings[
    (ratings_trainings['DATE'] < '2018-09-01') & (ratings_trainings['DATE'] >= '2018-06-01')
    ]

    #purchased_user_list = ratings_trainings.reviewerID.tolist()
    top_ten = ratings_trainings_3m.groupby('asin').count().sort_values('overall',ascending=False).head(k).index.tolist()

    # loading data from dataframe
    # item_to_users dict:
    # {
    #   'item': {
    #       'user': ratings...
    #   }...
    # }
    item_to_users = defaultdict(dict)
    for _, row in training_data.iterrows():
        row = dict(row)
        user = row['reviewerID']
        item = row['asin']
        rating = float(row['overall'])
        item_to_users[item][user] = rating

    print("data converted")

    user_to_items = defaultdict(dict)
    for item, rating_users in item_to_users.items():
        for user, rating in rating_users.items():
            user_to_items[user][item] = rating

    print("data inverted")

    init_sim = lambda: [0, 0, 0]
    factory = lambda: defaultdict(init_sim)
    pre_item_similarity = defaultdict(factory)
    for user, items in user_to_items.items():
        if len(items) > 1:
            for i1, i2 in combinations(items.keys(), 2):
                xy = items[i1] * items[i2]
                xx = items[i1] ** 2
                yy = items[i2] ** 2
                pre_item_similarity[i1][i2][0] += xy
                pre_item_similarity[i1][i2][1] += xx
                pre_item_similarity[i1][i2][2] += yy

                pre_item_similarity[i2][i1][0] += xy
                pre_item_similarity[i2][i1][1] += xx
                pre_item_similarity[i2][i1][2] += yy

    print("sim data prepared")

    item_similarity = {}
    for src_item in pre_item_similarity:
        item_similarity_order = []
        for dst_item, val in pre_item_similarity[src_item].items():
            xy = val[0]
            xx = val[1]
            yy = val[2]
            div = ((xx*yy) ** 0.5)
            if div == 0:
                continue
            similarity = xy / div
            if similarity < 0:
                continue
            for i, s in enumerate(item_similarity_order):
                target_similarity = s[1]
                if target_similarity < similarity:
                    item_similarity_order.insert(i, (dst_item, similarity))
                    break
            else:
                item_similarity_order.append((dst_item, similarity))
        item_similarity[src_item] = item_similarity_order

    #print(f"get {k} recommendation items for for user: {users}")

    recommendation = {}
    recommendation_count_b = 0
    for user in users:
        items = []
        items_set = set()
        stop = False
        user_has_rated = set(user_to_items[user])
        for item in user_has_rated:
            if item in item_similarity:
                for sim_item, _ in item_similarity[item]:
                    # skip the item user has rated
                    if sim_item not in user_has_rated and sim_item not in items_set:
                        items.append(sim_item)
                        items_set.add(sim_item)
                    if len(items) >= k:
                        stop = True
                        break
                if stop:
                    break
        
        if len(items)!=0:
            recommendation[user] = items
            recommendation_count_b += 1
        else:
            recommendation[user] = top_ten
    print("total users recommended by item-based:", recommendation_count_b)
    return recommendation    

ratings_by_user_cf_itembased_2 = recommender(ratings_trainings, users)
#ratings_by_user_cf_itembased

data converted
data inverted
sim data prepared
total users recommended by item-based: 32


結果評估

In [119]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user_cf_itembased_2)

0.13220338983050847

# CF-surprise recommendation


*   推薦分數0.16



In [121]:
! pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 19.4 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1619416 sha256=d7f6090f9b90541abc0484fbda505105686fd41ca557e2d24e090dafdcf5bb11
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [122]:
import time
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise import KNNBasic

產生推薦


*   因記憶體不足，故訓練資料僅採用近一年的訓練資料



In [123]:
#使用近一年訓練資料
ratings_trainings_1y = ratings[
    (ratings['DATE'] < '2018-09-01') & (ratings['DATE'] >= '2017-09-01')
    ]

*   surprise-item_based







In [127]:
def recommender(training_data, users=[], k=10, user_based=False, algo=KNNBasic):

    training_data = (
        training_data
        .sort_values("DATE", ascending=False)
        .groupby(['reviewerID', 'asin']).head(1)
    )

    reader = Reader(rating_scale=(0, 5))
    training_data = training_data[['reviewerID', 'asin', 'overall']]
    data = Dataset.load_from_df(training_data, reader=reader)

    sim_options = {
        'name': 'cosine',
        'user_based': user_based  # compute similarities between items
    }
    algo_impl = algo(sim_options=sim_options)
    trainset = data.build_full_trainset()
    algo_impl.fit(trainset)

    recommendation = {}
    recommendation_count_c = 0
    for user in users:
        items_user_rated = set(training_data.loc[training_data['reviewerID'] == user]['asin'].to_list())
        recommend_item_list = []
        recommend_item_set = set()
        for item in items_user_rated:
            iid = algo_impl.trainset.to_inner_iid(item)
            recommend_items_iid = algo_impl.get_neighbors(iid, k)
            for sim_item_iid in recommend_items_iid:
                item_raw_id = algo_impl.trainset.to_raw_iid(sim_item_iid)
                if item_raw_id not in items_user_rated and item_raw_id not in recommend_item_set:
                    recommend_item_list.append(item_raw_id)
                    recommend_item_set.add(item_raw_id)

            if len(recommend_item_list) >= k:
                recommend_item_list = recommend_item_list[:k]
                break
        recommendation[user] = recommend_item_list
        if len(recommend_item_list) > 0:
            recommendation_count_c += 1
    print("total users recommended(surprise):", recommendation_count_c)

    return recommendation

ratings_by_user_surprise_itembased = recommender(ratings_trainings_1y, users)

Computing the cosine similarity matrix...
Done computing similarity matrix.
total users recommended(surprise): 13


結果評估

In [125]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user_surprise_itembased)

0.001694915254237288

*   surprise-item_based + rule-based

In [128]:
def recommender(training_data, users=[], k=10, user_based=False, algo=KNNBasic):
    # 延續之前rule-based的內容
    ratings_trainings = training_data
    # 近三個月作為沒有購買記錄使用者的推薦商品
    ratings_trainings_3m = ratings_trainings[
    (ratings_trainings['DATE'] < '2018-09-01') & (ratings_trainings['DATE'] >= '2018-06-01')
    ]

    #purchased_user_list = ratings_trainings.reviewerID.tolist()
    top_ten = ratings_trainings_3m.groupby('asin').count().sort_values('overall',ascending=False).head(k).index.tolist()

    training_data = (
        training_data
        .sort_values("DATE", ascending=False)
        .groupby(['reviewerID', 'asin']).head(1)
    )

    reader = Reader(rating_scale=(0, 5))
    training_data = training_data[['reviewerID', 'asin', 'overall']]
    data = Dataset.load_from_df(training_data, reader=reader)

    sim_options = {
        'name': 'cosine',
        'user_based': user_based  # compute similarities between items
    }
    algo_impl = algo(sim_options=sim_options)
    trainset = data.build_full_trainset()
    algo_impl.fit(trainset)

    recommendation = {}
    recommendation_count_c = 0
    for user in users:
        items_user_rated = set(training_data.loc[training_data['reviewerID'] == user]['asin'].to_list())
        recommend_item_list = []
        recommend_item_set = set()
        for item in items_user_rated:
            iid = algo_impl.trainset.to_inner_iid(item)
            recommend_items_iid = algo_impl.get_neighbors(iid, k)
            for sim_item_iid in recommend_items_iid:
                item_raw_id = algo_impl.trainset.to_raw_iid(sim_item_iid)
                if item_raw_id not in items_user_rated and item_raw_id not in recommend_item_set:
                    recommend_item_list.append(item_raw_id)
                    recommend_item_set.add(item_raw_id)

            if len(recommend_item_list) >= k:
                recommend_item_list = recommend_item_list[:k]
                break
        
        if len(recommend_item_list) > 0:
            recommendation[user] = recommend_item_list
            recommendation_count_c += 1
        else:
            recommendation[user] = top_ten
           
    print("total users recommended(surprise):", recommendation_count_c)

    return recommendation

ratings_by_user_surprise_itembased_2 = recommender(ratings_trainings_1y, users)

Computing the cosine similarity matrix...
Done computing similarity matrix.
total users recommended(surprise): 13


結果評估

In [129]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user_surprise_itembased_2)

0.13389830508474576