## Collaborative Filtering: Using KNN & SVD

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from surprise import KNNWithMeans
from surprise import accuracy
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import SVD

from sklearn.decomposition import TruncatedSVD

In [2]:
# Load data
df = pd.read_csv('RawData/df.csv')

In [3]:
# Check number of different products in each order
order_product = df.groupby('order_id').product_id.count().sort_values(ascending=False)

# Only keep the orders that only contains 1 order
df = df[df.order_id.isin((order_product[order_product == 1]).index)]
df.head()

Unnamed: 0,customer_unique_id,order_id,order_status,order_item_id,product_id,product_category_name_english,review_score,review_comment_message,order_purchase_timestamp
0,861eff4711a542e4b93843c6dd7febb0,00e7ee1b050b8499577073aeb2a297a1,delivered,1.0,a9516a079e37a9c9c36b9b78b10169e8,office_furniture,4.0,,2017-05-16 15:05:35
3,3c799d181c34d51f6d44bbbc563024db,b1a5d5365d330d10485e0203d54ab9e8,delivered,1.0,a9516a079e37a9c9c36b9b78b10169e8,office_furniture,3.0,"Produto compatível com seu valor, muito bonito...",2017-05-07 20:11:26
4,23397e992b09769faf5e66f9e171a241,2e604b3614664aa66867856dba7e61b7,delivered,1.0,a9516a079e37a9c9c36b9b78b10169e8,office_furniture,4.0,Entregou antes do prazo,2018-02-03 19:45:40
5,567ab47ca4deb92d46dbf54dce07d0a7,574fe1739f65af76badd0999db300b4f,delivered,1.0,a9516a079e37a9c9c36b9b78b10169e8,office_furniture,4.0,,2017-03-23 15:10:17
6,f40ab89b622248b7ca125af4b486b887,e0b26f14d2bcc710bb02f77a4628763b,delivered,1.0,a9516a079e37a9c9c36b9b78b10169e8,office_furniture,4.0,Os encaixes para o encosto da cadeira estavam ...,2017-05-16 10:00:49


In [4]:
# Choose the columns for the analysis
item_profile = df[['customer_unique_id', 'product_id','review_score']]

# Read data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(item_profile,reader)

In [5]:
# Split the training and test set
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

### Collaborative filtering using KNN 

Use "user_based" True/False to switch between user-based or item-based collaborative filtering.

In [12]:
# Use cosine similarity
algo = KNNWithMeans(k=5, sim_options={'name': 'cosine', 'user_based': False})
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x12e69b198>

In [13]:
# Run the train model against the test set
test_pred = algo.test(testset)
test_pred

[Prediction(uid='689dd283adf3d5ecc00c4f454019e96c', iid='d285360f29ac7fd97640bf0baef03de0', r_ui=1.0, est=4.208389429258626, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'}),
 Prediction(uid='f7f1f7eb8e7acd908f4c4fe555ccb410', iid='bded2bc068525f00b659bd2e7268b4c1', r_ui=5.0, est=4.208389429258626, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'}),
 Prediction(uid='1ae128962347cc6c8295ebea5186c808', iid='065a46b0be3f31498ebdd3f4c668a6b6', r_ui=3.0, est=4.208389429258626, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'}),
 Prediction(uid='2dd0f0f812e746c1bc7c09ac0932756a', iid='2d4086091519b04cd687a38e33dc276c', r_ui=5.0, est=4.208389429258626, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'}),
 Prediction(uid='70e46594addc93a88cb7f41b2cf2a383', iid='ebe731afbcf530cfaa1e5260be512e9b', r_ui=1.0, est=4.208389429258626, details={'was_impossible': True, 'reason': 'User and/or item is unkown.

In [14]:
# Get RMSE
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

Item-based Model : Test Set
RMSE: 1.2573


1.2573291849070476

### Collaborative filtering using SVD

In [16]:
new_df=df.head(10000)
ratings_matrix = new_df.pivot_table(values='review_score', index='product_id', columns='customer_unique_id', fill_value=0)
ratings_matrix.head()

customer_unique_id,0005ef4cd20d2893f0d9fbd94d3c0d97,0010a452c6d13139e50b57f19f52e04e,00115fc7123b5310cf6d3a3aa932699e,0019da6aa6bcb27cc32f1249bd12da05,0019e8c501c85848ac0966d45226fa1d,001a2bf0e46c684031af91fb2bce149d,001a34eb30ecb8e3aacb07c475ca4dd1,001deb796b28a3a128d6113857569aa4,001f3c4211216384d5fe59b041ce1461,002043098f10ba39a4600b6c52fbfe3c,...,ffbb866d7c0d272f9fe12de1b9ee9173,ffbb8dfaa0e54649d8690b85a3ef890d,ffd2aa973e106c7d7218a960320420bd,ffddf4e5baa1623f69d3c5e0d775e1af,ffde9f4d5007c6675904e26947ba4538,ffec10ad4229ba46818560e1c8b40a68,ffedff0547d809c90c05c2691c51f9b7,ffef0ffa736c7b3d9af741611089729b,fff2ae16b99c6f3c785f0e052f2a9cfb,fff96bc586f78b1f070da28c4977e810
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000b8f95fcb9e0096488278317764d19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00250175f79f584c14ab5cecd80553cd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
002ec297b1b00fb9dde7ee6ac24b6771,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
004636c889c7c3dad6631f136b7fa082,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
007c63ae4b346920756b5adcad8095de,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
ratings_matrix.shape

(3056, 9893)

In [19]:
# Unique products in subset of data
X = ratings_matrix

# Decompose the Matrix
SVD = TruncatedSVD(n_components=10)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix

array([[-1.58061131e-09, -5.79886098e-09,  5.05639598e-08, ...,
         2.28955867e-07,  1.34092392e-07, -6.43052981e-07],
       [-6.95478028e-08, -2.81281977e-07,  4.00367534e-06, ...,
        -9.14425663e-06, -4.54757339e-06, -3.42754755e-05],
       [-2.95969797e-12,  7.38246854e-11, -4.79501672e-10, ...,
         2.54217200e-09,  7.31258535e-10,  7.31044325e-09],
       ...,
       [-1.20541915e-08, -1.36729480e-08, -4.41867086e-07, ...,
         2.26016710e-06, -5.04857518e-06,  8.70168740e-06],
       [-1.04183238e-16,  1.78249053e-14,  5.21776558e-13, ...,
         2.60400765e-12,  3.28921549e-12,  1.84074082e-11],
       [ 5.21131562e-10, -8.25804638e-09,  2.36214270e-08, ...,
         3.60898770e-08, -3.57477419e-07, -5.86492297e-08]])

In [20]:
decomposed_matrix.shape

(3056, 10)

In [28]:
# Correlation matrix
correlation_matrix = np.corrcoef(decomposed_matrix)
pd.DataFrame(correlation_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3046,3047,3048,3049,3050,3051,3052,3053,3054,3055
0,1.000000,0.816297,-0.717413,0.507656,0.762807,-0.882055,0.078901,0.074740,-0.919019,-0.700706,...,-0.898944,0.607483,-0.755381,-0.668911,0.451841,-0.819617,-0.865890,-0.775216,-0.851179,0.062609
1,0.816297,1.000000,-0.706818,0.542583,0.807395,-0.893176,-0.066914,0.243555,-0.733450,-0.578027,...,-0.712468,0.598261,-0.955021,-0.749300,0.682543,-0.884736,-0.853296,-0.666912,-0.801467,0.170348
2,-0.717413,-0.706818,1.000000,-0.787217,-0.919431,0.816543,0.183133,-0.328855,0.706217,0.955362,...,0.859189,-0.234814,0.723669,0.980136,-0.113830,0.920545,0.915714,0.851857,0.938396,-0.172380
3,0.507656,0.542583,-0.787217,1.000000,0.909148,-0.481563,-0.565378,0.788671,-0.497921,-0.794480,...,-0.637372,0.092630,-0.707244,-0.765359,0.023914,-0.722132,-0.588247,-0.879635,-0.677338,-0.165090
4,0.762807,0.807395,-0.919431,0.909148,1.000000,-0.791218,-0.317937,0.566884,-0.735141,-0.875743,...,-0.843575,0.314577,-0.887676,-0.915109,0.279121,-0.933998,-0.856206,-0.928066,-0.884765,-0.006761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3051,-0.819617,-0.884736,0.920545,-0.722132,-0.933998,0.937566,0.019049,-0.347927,0.843200,0.849704,...,0.884258,-0.308881,0.882168,0.918785,-0.436742,1.000000,0.961201,0.871599,0.910515,-0.021344
3052,-0.865890,-0.853296,0.915714,-0.588247,-0.856206,0.949876,-0.113431,-0.136566,0.864214,0.826330,...,0.922908,-0.431995,0.790409,0.908530,-0.347035,0.961201,1.000000,0.786825,0.962534,-0.163548
3053,-0.775216,-0.666912,0.851857,-0.879635,-0.928066,0.739199,0.198029,-0.577039,0.827477,0.892004,...,0.849858,-0.124606,0.742054,0.780096,-0.218353,0.871599,0.786825,1.000000,0.790745,0.280127
3054,-0.851179,-0.801467,0.938396,-0.677338,-0.884765,0.858339,0.058544,-0.187110,0.791345,0.844378,...,0.926220,-0.526457,0.768286,0.932650,-0.178930,0.910515,0.962534,0.790745,1.000000,-0.247840


In [29]:
correlation_matrix.shape

(3056, 3056)

In [30]:
# Index No. of product ID purchased by customer
i = "004636c889c7c3dad6631f136b7fa082"
product_names = list(X.index)
product_ID = product_names.index(i)
product_ID

3

Correlation for all items with the item purchased by this customer based on items rated by other customers people who bought the same product

In [31]:
correlation_product_ID = correlation_matrix[product_ID]
correlation_product_ID

array([ 0.50765646,  0.54258323, -0.78721655, ..., -0.87963529,
       -0.67733846, -0.16508971])

Top 10 products displayed by the recommendation system to the above customer based on the purchase history of other customers in the website.

In [32]:
# Recommend Top 10 highly correlated products (with 0.75+ correlation) in sequence
Recommend = list(X.index[correlation_product_ID > 0.75])

# Removes the item already bought by the customer
Recommend.remove(i) 
Recommend[0:10]

['007c63ae4b346920756b5adcad8095de',
 '00929aaa7751a77220db9caa1ae6d3ab',
 '026b77b15418f2a8114dfb0026739be4',
 '06484143f7046fc1c2ec000aba13e28a',
 '08279c494018541f71443c07d77560f8',
 '0950783476d3f6529a03b28fcac219d1',
 '0a1902453c06a6bb74a24a4fc60d1611',
 '0a32d8349e99e8cdb9a5dec7d3f94985',
 '0a41699cb57f40c605d3cfdddd61673b',
 '0a8b35c2fbc824f7e1015cf330e6cc3c']