## Collaborative  Filtering：Item-based Recommendation

In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
warnings.filterwarnings('ignore')

In [2]:
# Load data
df = pd.read_csv("RawData/df.csv")

# Only keep the orders that only contains 1 order
order_product = df.groupby('order_id').product_id.count().sort_values(ascending=False)
df = df[df.order_id.isin((order_product[order_product == 1]).index)]

# Select columns for analysis
item_profile = df[['customer_unique_id', 'product_id','review_score']]
item_profile.head()

Unnamed: 0,customer_unique_id,product_id,review_score
0,861eff4711a542e4b93843c6dd7febb0,a9516a079e37a9c9c36b9b78b10169e8,4.0
3,3c799d181c34d51f6d44bbbc563024db,a9516a079e37a9c9c36b9b78b10169e8,3.0
4,23397e992b09769faf5e66f9e171a241,a9516a079e37a9c9c36b9b78b10169e8,4.0
5,567ab47ca4deb92d46dbf54dce07d0a7,a9516a079e37a9c9c36b9b78b10169e8,4.0
6,f40ab89b622248b7ca125af4b486b887,a9516a079e37a9c9c36b9b78b10169e8,4.0


In [5]:
# Calculate average rating of different product
Ratings_mean = item_profile.groupby('product_id')['review_score'].mean().reset_index().rename(columns = {'review_score': 'mean_rating'})
item_profile = pd.merge(item_profile, Ratings_mean, how='inner', on=['product_id'])
item_profile.head()

Unnamed: 0,customer_unique_id,product_id,review_score,mean_rating_x,mean_rating_y,mean_rating
0,861eff4711a542e4b93843c6dd7febb0,a9516a079e37a9c9c36b9b78b10169e8,4.0,3.538462,3.538462,3.538462
1,3c799d181c34d51f6d44bbbc563024db,a9516a079e37a9c9c36b9b78b10169e8,3.0,3.538462,3.538462,3.538462
2,23397e992b09769faf5e66f9e171a241,a9516a079e37a9c9c36b9b78b10169e8,4.0,3.538462,3.538462,3.538462
3,567ab47ca4deb92d46dbf54dce07d0a7,a9516a079e37a9c9c36b9b78b10169e8,4.0,3.538462,3.538462,3.538462
4,f40ab89b622248b7ca125af4b486b887,a9516a079e37a9c9c36b9b78b10169e8,4.0,3.538462,3.538462,3.538462


In [6]:
print(len(item_profile.customer_unique_id.drop_duplicates().tolist()))
print(len(item_profile.product_id.drop_duplicates().tolist()))

80694
27571


In [7]:
# Create a pivot table
new_df=item_profile.head(10000)
pivot = pd.pivot_table(new_df, index='product_id', columns='customer_unique_id', values='review_score')
pivot.head()

customer_unique_id,0005ef4cd20d2893f0d9fbd94d3c0d97,0010a452c6d13139e50b57f19f52e04e,00115fc7123b5310cf6d3a3aa932699e,0019da6aa6bcb27cc32f1249bd12da05,0019e8c501c85848ac0966d45226fa1d,001a2bf0e46c684031af91fb2bce149d,001a34eb30ecb8e3aacb07c475ca4dd1,001deb796b28a3a128d6113857569aa4,001f3c4211216384d5fe59b041ce1461,002043098f10ba39a4600b6c52fbfe3c,...,ffbb866d7c0d272f9fe12de1b9ee9173,ffbb8dfaa0e54649d8690b85a3ef890d,ffd2aa973e106c7d7218a960320420bd,ffddf4e5baa1623f69d3c5e0d775e1af,ffde9f4d5007c6675904e26947ba4538,ffec10ad4229ba46818560e1c8b40a68,ffedff0547d809c90c05c2691c51f9b7,ffef0ffa736c7b3d9af741611089729b,fff2ae16b99c6f3c785f0e052f2a9cfb,fff96bc586f78b1f070da28c4977e810
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000b8f95fcb9e0096488278317764d19,,,,,,,,,,,...,,,,,,,,,,
00250175f79f584c14ab5cecd80553cd,,,,,,,,,,,...,,,,,,,,,,
002ec297b1b00fb9dde7ee6ac24b6771,,,,,,,,,,,...,,,,,,,,,,
004636c889c7c3dad6631f136b7fa082,,,,,,,,,,,...,,,,,,,,,,
007c63ae4b346920756b5adcad8095de,,,,,,,,,,,...,,,,,,,,,,


In [8]:
pivot.shape

(3056, 9893)

In [60]:
# Center the mean around 0 (centered cosine/pearson)
# pivot_norm = pivot.apply(lambda x: x - np.nanmean(x), axis=1)
# pivot_norm.head()

## Item Based CF

In [9]:
# Fill NaN with 0
pivot.fillna(0, inplace=True)
pivot.head()

customer_unique_id,0005ef4cd20d2893f0d9fbd94d3c0d97,0010a452c6d13139e50b57f19f52e04e,00115fc7123b5310cf6d3a3aa932699e,0019da6aa6bcb27cc32f1249bd12da05,0019e8c501c85848ac0966d45226fa1d,001a2bf0e46c684031af91fb2bce149d,001a34eb30ecb8e3aacb07c475ca4dd1,001deb796b28a3a128d6113857569aa4,001f3c4211216384d5fe59b041ce1461,002043098f10ba39a4600b6c52fbfe3c,...,ffbb866d7c0d272f9fe12de1b9ee9173,ffbb8dfaa0e54649d8690b85a3ef890d,ffd2aa973e106c7d7218a960320420bd,ffddf4e5baa1623f69d3c5e0d775e1af,ffde9f4d5007c6675904e26947ba4538,ffec10ad4229ba46818560e1c8b40a68,ffedff0547d809c90c05c2691c51f9b7,ffef0ffa736c7b3d9af741611089729b,fff2ae16b99c6f3c785f0e052f2a9cfb,fff96bc586f78b1f070da28c4977e810
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000b8f95fcb9e0096488278317764d19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00250175f79f584c14ab5cecd80553cd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
002ec297b1b00fb9dde7ee6ac24b6771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
004636c889c7c3dad6631f136b7fa082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
007c63ae4b346920756b5adcad8095de,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Calculate Similar Items

In [10]:
# Convert into dataframe
item_sim_df = pd.DataFrame(cosine_similarity(pivot, pivot), 
                           index=pivot.index, 
                           columns=pivot.index)
item_sim_df

product_id,000b8f95fcb9e0096488278317764d19,00250175f79f584c14ab5cecd80553cd,002ec297b1b00fb9dde7ee6ac24b6771,004636c889c7c3dad6631f136b7fa082,007c63ae4b346920756b5adcad8095de,008cff0e5792219fae03e570f980b330,0091d85023824de2d33252310c952307,00929aaa7751a77220db9caa1ae6d3ab,00ae7076313576f94d9107599d79a978,00b4155166f994ba9da3cf001eb80505,...,fee4f51fea27d262dd116b53ddf31ce6,ff25a6dd5b0db59bf1f8977c272270d0,ff37b24bee4345bc69d11fae0e6cd3da,ff3ec972ebd16eb98d55e8ea2fa255ee,ff5bdcc63751127e726cb69cff0cb5de,ff7fccf8513f360157f0660fe51d1d88,ff9fa77f938462abd16b53c0d934099f,ffaaddefb271481c66d4bd79844ecdae,ffc88104d219c1b767d566fd93653dd2,ffe0fc4e02c3559643ac063fa5cf9d07
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000b8f95fcb9e0096488278317764d19,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00250175f79f584c14ab5cecd80553cd,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
002ec297b1b00fb9dde7ee6ac24b6771,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
004636c889c7c3dad6631f136b7fa082,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
007c63ae4b346920756b5adcad8095de,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ff7fccf8513f360157f0660fe51d1d88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
ff9fa77f938462abd16b53c0d934099f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
ffaaddefb271481c66d4bd79844ecdae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
ffc88104d219c1b767d566fd93653dd2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
def get_similar_product(product_id):
    if product_id not in pivot.index:
        return None, None
    else:
        sim_product = item_sim_df.sort_values(by=product_id, ascending=False).index[1:]
        sim_score = item_sim_df.sort_values(by=product_id, ascending=False).loc[:, product_id].tolist()[1:]
        return sim_product, sim_score

In [12]:
product, score = get_similar_product("a9516a079e37a9c9c36b9b78b10169e8")
for x,y in zip(product[:10], score[:10]):
    print("{} with similarity of {}".format(x, y))

aa5d6a9bd4be367ae1a3a29c323c7c11 with similarity of 0.0
aa7f707ef77cdfa8a3059da401c11311 with similarity of 0.0
aa829330d8c41c82f96b3252043c6791 with similarity of 0.0
aa8627a375771ab01288705307ec4ae5 with similarity of 0.0
aa8dbe0ebad6906e9253479e1027185e with similarity of 0.0
aa8e397582b49721494b929b3216b41f with similarity of 0.0
aa8fce5e65ab73ab73101ea58d710905 with similarity of 0.0
aa968acdefd6d319d1f19bdc5c04670d with similarity of 0.0
aa96d291abe7cd0000c25e34bc8ed316 with similarity of 0.0
aaa80d9e907a758e151586557e332acb with similarity of 0.0


In [15]:
# Predict the rating of product x by user y
def predict_rating(customer_unique_id, product_id, max_neighbor=10):
    product, scores = get_similar_product(product_id)
    product_arr = np.array([x for x in product])
    sim_arr = np.array([x for x in scores])
    
    # Select only the product that has already rated by user x
    filtering = pivot[customer_unique_id].loc[product_arr] != 0
    
    # Calculate the predicted score
    s = np.dot(sim_arr[filtering][:max_neighbor], pivot[customer_unique_id].loc[product_arr[filtering][:max_neighbor]]) \
            / np.sum(sim_arr[filtering][:max_neighbor])
    
    return s

In [16]:
predict_rating("00115fc7123b5310cf6d3a3aa932699e", "a9516a079e37a9c9c36b9b78b10169e8")

nan

## Get recommendation

In [17]:
# Recommend top n_product for customer x
def get_recommendation(customer_unique_id, n_product=1):
    predicted_rating = np.array([])
    
    for _product in pivot.index:
        predicted_rating = np.append(predicted_rating, predict_rating(customer_unique_id, _product))
    
    # Don't recommend sth that the customer has already rated
    temp = pd.DataFrame({'predicted':predicted_rating, 'name':pivot.index})
    filtering = (pivot[customer_unique_id] == 0.0)
    temp = temp.loc[filtering.values].sort_values(by='predicted', ascending=False)

    # Recommend n_product product
    return product.loc[product_index.loc[temp.name[:n_product]]]

In [None]:
# Recommendation for a particular customer
get_recommendation("0010a452c6d13139e50b57f19f52e04e")

### Summary
Since most customers (70%+) only rate 1 product, it's hard to recommend similar products at leaset for the subset (i.e. first 10000 rows). Possible solutions:
- Use the whole dataset: same concern of the limited ratings
- Group the items based on the categories