## A collaborative filtering models for recommending products to customers using purchase data. 

In [65]:

%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.cross_validation import train_test_split

import sys
sys.path.append("..")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Data
* recommend_1.csv = consisting of a list of 1000 customer IDs to recommend as output
* trx_data.csv = consisting of user transactions


In [66]:
customers = pd.read_csv('./data/recommend_1.csv')
transactions = pd.read_csv('./data/trx_data.csv')

### Invstigate the data

In [67]:
print(customers.shape)
customers.head()

(1000, 1)


Unnamed: 0,customerId
0,1553
1,20400
2,19750
3,6334
4,27773


In [68]:
print(transactions.shape)
transactions.head()

(62483, 2)


Unnamed: 0,customerId,products
0,0,20
1,1,2|2|23|68|68|111|29|86|107|152
2,2,111|107|29|11|11|11|33|23
3,3,164|227
4,5,2|2


62483 transaction records

products columns contains '|' separated products id bought by the customer

In [69]:
transactions.isnull().any()  # NO NAN found

customerId    False
products      False
dtype: bool

### Data Preparation
Break down each list of items in the products column into rows and count the number of products bought by a user.

In [70]:
# split product column
transactions['products'] = transactions['products'].apply(lambda x: [int(i) for i in x.split('|')])
transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index()

Unnamed: 0,customerId,0,1,2,3,4,5,6,7,8,9
0,0,20.0,,,,,,,,,
1,1,2.0,2.0,23.0,68.0,68.0,111.0,29.0,86.0,107.0,152.0


In [71]:
# organize a given table into a dataframe with customerId, single productId, and purchase count
s=time.time()

data = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(), 
             id_vars=['customerId'],
             value_name='products') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['customerId', 'products']) \
    .agg({'products': 'count'}) \
    .rename(columns={'products': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'products': 'productId'})
data['productId'] = data['productId'].astype(np.int64)

print("Execution time:", round((time.time()-s)/60,2), "minutes")

Execution time: 0.29 minutes


In [72]:
data.head()

Unnamed: 0,customerId,productId,purchase_count
0,0,1,2
1,0,13,1
2,0,19,3
3,0,20,1
4,0,31,2


In [77]:
# create a dummy column for whether the customer bought the product
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy
data_dummy = create_data_dummy(data)

In [78]:
data_dummy.head()

Unnamed: 0,customerId,productId,purchase_count,purchase_dummy
0,0,1,2,1
1,0,13,1,1
2,0,19,3,1
3,0,20,1,1
4,0,31,2,1


In [79]:
# Normalize item values across users
df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')
df_matrix.head()

productId,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,2.0,,,,,,,,,...,,,,,,,,,,
1,,,6.0,,,,,,,,...,,,,1.0,,,1.0,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [80]:
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
print(df_matrix_norm.shape)
df_matrix_norm.head()

(24429, 300)


productId,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,0.1,,,,,,,,,...,,,,,,,,,,
1,,,0.166667,,,,,,,,...,,,,0.0,,,0.0,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [81]:
# create a table for input to the modeling  
d = df_matrix_norm.reset_index() 
d.index.names = ['scaled_purchase_freq'] 
data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()
print(data_norm.shape)
data_norm.head()

(133585, 3)


Unnamed: 0,customerId,productId,scaled_purchase_freq
9,9,0,0.133333
25,25,0,0.133333
32,33,0,0.133333
35,36,0,0.133333
43,44,0,0.133333


Now the purchase history is normalized, from 0-1 (with 1 being the most number of purchase for an item and 0 being 0 purchase count for that item).

### Split train and test set
Split the data into training and testing sets for evaluating predictive modeling, in this case a collaborative filtering model. Typically, we use a larger portion of the data for training and a smaller portion for testing.
We use 80:20 ratio for our train-test set size.

In [82]:
def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [83]:
train_data, test_data = split_data(data)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

### Define Models using Turicreate library

In [84]:
# constant variables to define field names include:
user_id = 'customerId'
item_id = 'productId'
users_to_recommend = list(customers[user_id])
n_rec = 10 # number of items to recommend
n_display = 30 # to display the first few rows in an output dataset

In [86]:
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

### Popularity Model as Baseline
The popularity model takes the most popular items for recommendation. These items are products with the highest number of sells across customers.


In [88]:
# i. Using Purchase Count
name = 'popularity'
target = 'purchase_count'
popularity = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
|    1553    |    132    | 3.2641509433962264 |  1   |
|    1553    |     37    |     3.06640625     |  2   |
|    1553    |     34    | 3.040650406504065  |  3   |
|    1553    |     0     | 2.9806201550387597 |  4   |
|    1553    |     27    | 2.8976377952755907 |  5   |
|    1553    |    248    | 2.880952380952381  |  6   |
|    1553    |     3     | 2.7604166666666665 |  7   |
|    1553    |    110    | 2.7011494252873565 |  8   |
|    1553    |     10    | 2.6553846153846155 |  9   |
|    1553    |    230    | 2.651851851851852  |  10  |
|   20400    |    132    | 3.2641509433962264 |  1   |
|   20400    |     37    |     3.06640625     |  2   |
|   20400    |     34    | 3.040650406504065  |  3   |
|   20400    |     0     | 2.9806201550387597 |  4   |
|   20400    |     27    | 2.8976377952755907 |  5   |
|   20400 

In [89]:
# ii. Using Purchase Dummy
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
|    1553    |     47    |  1.0  |  1   |
|    1553    |    126    |  1.0  |  2   |
|    1553    |    146    |  1.0  |  3   |
|    1553    |    122    |  1.0  |  4   |
|    1553    |     1     |  1.0  |  5   |
|    1553    |    172    |  1.0  |  6   |
|    1553    |     34    |  1.0  |  7   |
|    1553    |     29    |  1.0  |  8   |
|    1553    |     57    |  1.0  |  9   |
|    1553    |     37    |  1.0  |  10  |
|   20400    |     14    |  1.0  |  1   |
|   20400    |    126    |  1.0  |  2   |
|   20400    |    146    |  1.0  |  3   |
|   20400    |    122    |  1.0  |  4   |
|   20400    |     1     |  1.0  |  5   |
|   20400    |    172    |  1.0  |  6   |
|   20400    |     34    |  1.0  |  7   |
|   20400    |     29    |  1.0  |  8   |
|   20400    |     57    |  1.0  |  9   |
|   20400    |     37    |  1.0  |  10  |
|   19750    |     14    |  1.0  |

In [90]:
# iii. Using Scaled Purchase count
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+---------------------+------+
| customerId | productId |        score        | rank |
+------------+-----------+---------------------+------+
|    1553    |    226    |  0.7361111111111112 |  1   |
|    1553    |    247    |  0.3438320209973753 |  2   |
|    1553    |    230    | 0.33185185185185134 |  3   |
|    1553    |    125    | 0.25931034482758586 |  4   |
|    1553    |    248    | 0.25277777777777777 |  5   |
|    1553    |    276    | 0.24615384615384617 |  6   |
|    1553    |    294    | 0.24427480916030506 |  7   |
|    1553    |     83    | 0.23850574712643682 |  8   |
|    1553    |     72    |  0.2310838445807771 |  9   |
|    1553    |    204    | 0.22891566265060223 |  10  |
|   20400    |    226    |  0.7361111111111112 |  1   |
|   20400    |    247    |  0.3438320209973753 |  2   |
|   20400    |    230    | 0.33185185185185134 |  3   |
|   20400    |    125    | 0.25931034482758586 |  4   |
|   20400    |    248    | 0.25277777777777777 |

### Collaborative Filtering Model
In collaborative filtering, we would recommend items based on how similar users purchase items. For instance, if customer 1 and customer 2 bought similar items, e.g. 1 bought X, Y, Z and 2 bought X, Y, we would recommend an item Z to customer 2.

In [91]:
# I. COSINE similarity using purchase count
name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     2     | 0.11356961727142334  |  1   |
|    1553    |     35    | 0.07707769870758056  |  2   |
|    1553    |     1     | 0.07667785882949829  |  3   |
|    1553    |     33    | 0.060876095294952394 |  4   |
|    1553    |     17    | 0.059446310997009276 |  5   |
|    1553    |    148    | 0.05065848827362061  |  6   |
|    1553    |     21    | 0.04794074296951294  |  7   |
|    1553    |     13    | 0.042415738105773926 |  8   |
|    1553    |     47    |  0.0412045955657959  |  9   |
|    1553    |    233    | 0.04057788848876953  |  10  |
|   20400    |    280    | 0.09493756294250488  |  1   |
|   20400    |    215    | 0.046443164348602295 |  2   |
|   20400    |     26    | 0.04573565721511841  |  3   |
|   20400    |    122    | 0.04553091526031494  |  4   |
|   20400    |     6     | 0.04

In [93]:
# COSINE similarity using purchase dummy data
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     2     | 0.09617377519607544  |  1   |
|    1553    |     35    | 0.08738603591918945  |  2   |
|    1553    |     1     | 0.08446981906890869  |  3   |
|    1553    |     5     | 0.07363030910491944  |  4   |
|    1553    |     21    | 0.06667494773864746  |  5   |
|    1553    |     17    |  0.0598006010055542  |  6   |
|    1553    |     33    | 0.05541661977767944  |  7   |
|    1553    |     8     | 0.05355746746063232  |  8   |
|    1553    |     85    |  0.0485872745513916  |  9   |
|    1553    |     15    | 0.04795979261398316  |  10  |
|   20400    |     14    |         0.0          |  1   |
|   20400    |    126    |         0.0          |  2   |
|   20400    |    146    |         0.0          |  3   |
|   20400    |    122    |         0.0          |  4   |
|   20400    |     1     |     

In [94]:
# iii. COSINE similarity using scaled purchased count
name = 'cosine' 
target = 'scaled_purchase_freq' 
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)


+------------+-----------+-----------------------+------+
| customerId | productId |         score         | rank |
+------------+-----------+-----------------------+------+
|    1553    |    145    |          0.0          |  1   |
|    1553    |     19    |          0.0          |  2   |
|    1553    |    268    |          0.0          |  3   |
|    1553    |     17    |          0.0          |  4   |
|    1553    |    244    |          0.0          |  5   |
|    1553    |     61    |          0.0          |  6   |
|    1553    |     78    |          0.0          |  7   |
|    1553    |    122    |          0.0          |  8   |
|    1553    |    223    |          0.0          |  9   |
|    1553    |    119    |          0.0          |  10  |
|   20400    |     1     |  0.004158867597579956 |  1   |
|   20400    |     2     | 0.0040831840038299564 |  2   |
|   20400    |     0     | 0.0025605010986328126 |  3   |
|   20400    |     38    | 0.0022001171112060545 |  4   |
|   20400    |

In [95]:
# i. PEARSON similarity using purchase count

name = 'pearson'
target = 'purchase_count'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
|    1553    |    132    | 3.2636384030558023 |  1   |
|    1553    |     37    | 3.066406249999998  |  2   |
|    1553    |     34    | 3.022607401037604  |  3   |
|    1553    |     0     | 2.977848753633424  |  4   |
|    1553    |     27    | 2.8976377952755903 |  5   |
|    1553    |    248    | 2.877944051084065  |  6   |
|    1553    |     3     | 2.7594530502955132 |  7   |
|    1553    |    110    | 2.701149425287355  |  8   |
|    1553    |     10    | 2.6548393282523524 |  9   |
|    1553    |    230    | 2.6450958203386383 |  10  |
|   20400    |    132    | 3.2544571419931803 |  1   |
|   20400    |     37    | 3.065196394920347  |  2   |
|   20400    |     34    | 3.040650406504065  |  3   |
|   20400    |     0     | 2.9787908884906016 |  4   |
|   20400    |     27    | 2.896157394713304  |  5   |
|   20400 

In [96]:
# iii. PEARSON similarity using purchase dummy

name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
|    1553    |     47    |  0.0  |  1   |
|    1553    |    126    |  0.0  |  2   |
|    1553    |    146    |  0.0  |  3   |
|    1553    |    122    |  0.0  |  4   |
|    1553    |     1     |  0.0  |  5   |
|    1553    |    172    |  0.0  |  6   |
|    1553    |     34    |  0.0  |  7   |
|    1553    |     29    |  0.0  |  8   |
|    1553    |     57    |  0.0  |  9   |
|    1553    |     37    |  0.0  |  10  |
|   20400    |     14    |  0.0  |  1   |
|   20400    |    126    |  0.0  |  2   |
|   20400    |    146    |  0.0  |  3   |
|   20400    |    122    |  0.0  |  4   |
|   20400    |     1     |  0.0  |  5   |
|   20400    |    172    |  0.0  |  6   |
|   20400    |     34    |  0.0  |  7   |
|   20400    |     29    |  0.0  |  8   |
|   20400    |     57    |  0.0  |  9   |
|   20400    |     37    |  0.0  |  10  |
|   19750    |     14    |  0.0  |

In [97]:
# iii. PEARSON similarity using scaled purchase count

name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)


+------------+-----------+---------------------+------+
| customerId | productId |        score        | rank |
+------------+-----------+---------------------+------+
|    1553    |    226    |  0.7361111111111114 |  1   |
|    1553    |    247    |  0.3438320209973752 |  2   |
|    1553    |    230    | 0.33142941386611374 |  3   |
|    1553    |    125    | 0.25915108121674635 |  4   |
|    1553    |    248    |  0.2527777777777778 |  5   |
|    1553    |    276    | 0.24615384615384608 |  6   |
|    1553    |    294    | 0.24409355143554332 |  7   |
|    1553    |     83    |  0.2385057471264369 |  8   |
|    1553    |     72    | 0.23091834235045083 |  9   |
|    1553    |    204    | 0.22891566265060243 |  10  |
|   20400    |    226    |  0.7360998541778991 |  1   |
|   20400    |    247    |  0.3438276329034268 |  2   |
|   20400    |    230    |  0.3318420707296442 |  3   |
|   20400    |    125    |  0.2593046621207533 |  4   |
|   20400    |    248    | 0.25276674018965833 |

### Model Evaluation
For evaluating recommendation engines, we can use the concept of RMSE and precision-recall.

RMSE (Root Mean Squared Errors) - Measures the error of predicted values

Recall - What percentage of products that a user buys are actually recommended?

Precision - Out of all the recommended items, how many the user actually liked?

In [101]:
models_w_counts = [popularity, cos, pear]
models_w_dummy = [pop_dummy, cos_dummy, pear_dummy]
models_w_norm = [pop_norm, cos_norm, pear_norm]
names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts']
names_w_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy', 'Pearson Similarity on Purchase Dummy']
names_w_norm = ['Popularity Model on Scaled Purchase Counts', 'Cosine Similarity on Scaled Purchase Counts', 'Pearson Similarity on Scaled Purchase Counts']

In [103]:
# Model on purchase count
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0013696655132641297 | 0.0008614475201845439 |
|   2    | 0.0035322952710496198 |  0.003770717523335437 |
|   3    |  0.004060938100730498 |  0.006154516002505907 |
|   4    |  0.007857554786620561 |  0.017218489274524202 |
|   5    |  0.006862745098039233 |  0.018653663017223814 |
|   6    |  0.005971261053440989 |  0.019327991572175185 |
|   7    |  0.006220135112868704 |  0.02370657307013393  |
|   8    | 0.0058931660899654145 |  0.02556725984107153  |
|   9    |  0.005919197744457262 |  0.029243329942314763 |
|   10   | 0.0056084198385236645 |  0.031085856168491247 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0443758220475765

Per User RMSE (best)
+------------+----------------------+-------


Precision and recall summary statistics by cutoff
+--------+----------------------+---------------------+
| cutoff |    mean_precision    |     mean_recall     |
+--------+----------------------+---------------------+
|   1    | 0.11483564013840804  |  0.0661707046357781 |
|   2    | 0.09443483275663177  | 0.10750569345670986 |
|   3    |  0.0795367166474432  | 0.13395848290582346 |
|   4    |  0.0692942618223762  | 0.15317101497518903 |
|   5    | 0.06208189158016154  | 0.17158664942564206 |
|   6    | 0.05667291426374469  | 0.18678929976453676 |
|   7    | 0.05216057011039717  | 0.19967243645111424 |
|   8    |  0.0483077422145327  | 0.21065842948457636 |
|   9    | 0.045439254133025665 | 0.22248450559752594 |
|   10   | 0.04261822376009228  |  0.2309134447979826 |
+--------+----------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 1.8918169620472374

Per User RMSE (best)
+------------+--------------------+-------+
| customerId |        rmse        | count 


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |  0.001369665513264124 | 0.0008614475201845454 |
|   2    | 0.0034962514417531745 |  0.003760419286393593 |
|   3    |  0.00403690888119956  |  0.006136956188233271 |
|   4    |  0.007875576701268767 |  0.017242518494055167 |
|   5    |  0.006862745098039186 |  0.018653663017223824 |
|   6    |  0.005983275663206441 |  0.01935202079170619  |
|   7    |  0.006240731586752391 |  0.023718587679899383 |
|   8    |  0.005911188004613631 |  0.025615318280133553 |
|   9    |  0.005871139305395342 |  0.02895497930794334  |
|   10   |  0.005608419838523672 |  0.031085856168491428 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0416442875686065

Per User RMSE (best)
+------------+----------------------+-------

In [104]:
# Model on purchase dummy
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)

PROGRESS: Evaluate model Popularity Model on Purchase Dummy



Precision and recall summary statistics by cutoff
+--------+----------------------+-----------------------+
| cutoff |    mean_precision    |      mean_recall      |
+--------+----------------------+-----------------------+
|   1    | 0.004965815041381791 | 0.0026697967180155063 |
|   2    | 0.00705289672544082  | 0.0075272023166949465 |
|   3    | 0.008540242293390902 |  0.01319096740291419  |
|   4    | 0.007538682979489023 |  0.015420843400332432 |
|   5    | 0.007873335732277807 |  0.01947245629505424  |
|   6    | 0.01758426292431324  |  0.05849642421071027  |
|   7    | 0.016223718706626172 |  0.06299184882128378  |
|   8    | 0.015068369917236429 |  0.06641871242986745  |
|   9    | 0.014529606972931895 |  0.07151718159094922  |
|   10   | 0.014573587621446499 |   0.0793909698713581  |
+--------+----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |
+------------


Precision and recall summary statistics by cutoff
+--------+----------------------+---------------------+
| cutoff |    mean_precision    |     mean_recall     |
+--------+----------------------+---------------------+
|   1    | 0.12515293270960715  | 0.07156885610690958 |
|   2    | 0.10039582583663213  | 0.11216019411998207 |
|   3    |  0.0839390668106035  | 0.13911640671311434 |
|   4    | 0.07229219143576876  | 0.15907443709723992 |
|   5    | 0.06449802087081707  | 0.17572044936532843 |
|   6    | 0.05930190716084898  | 0.19438363301906597 |
|   7    | 0.05415103068935411  | 0.20650261135790676 |
|   8    | 0.050332853544440455 | 0.21882866906134907 |
|   9    | 0.04691535724281336  | 0.22857114500929457 |
|   10   | 0.04426772220223142  |  0.238904508219593  |
+--------+----------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9691769775115747

Per User RMSE (best)
+------------+--------------------+-------+
| customerId |        rmse        | count 


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |  0.00496581504138177  | 0.0026697967180155037 |
|   2    |  0.007052896725440803 |  0.007527202316694946 |
|   3    |  0.008540242293390815 |  0.013190967402914113 |
|   4    | 0.0075386829794890295 |  0.015420843400332382 |
|   5    |  0.007873335732277794 |  0.019472456295054275 |
|   6    |  0.01758426292431333  |  0.05849642421070974  |
|   7    |  0.016223718706626283 |  0.06299184882128422  |
|   8    |  0.015068369917236393 |   0.0664187124298674  |
|   9    |  0.01452960697293192  |  0.07151718159094896  |
|   10   |   0.0145735876214466  |  0.07939096987135848  |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |

In [105]:
# Model on Normalized purchase dummy
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on Scaled Purchase Counts



Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |  0.002581570455360341 |  0.001576431217879766 |
|   2    |  0.002402294729293645 | 0.0027430634780939555 |
|   3    |  0.002438149874506991 | 0.0041208427045644645 |
|   4    | 0.0022230190032269687 |  0.005056064408879059 |
|   5    | 0.0019361778415202691 |  0.005473299786357446 |
|   6    |  0.002019839847018045 |  0.006398021055288034 |
|   7    | 0.0020795984223736137 |  0.007677452153650631 |
|   8    | 0.0020168519182502667 |  0.008504510836571699 |
|   9    | 0.0026771841759292356 |  0.013058057365736706 |
|   10   | 0.0025959125134456737 |  0.014153858898970981 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.13437389791088916

Per User RMSE (best)
+------------+------------------------+----


Precision and recall summary statistics by cutoff
+--------+----------------------+---------------------+
| cutoff |    mean_precision    |     mean_recall     |
+--------+----------------------+---------------------+
|   1    | 0.06518465399784852  | 0.03646266445455433 |
|   2    | 0.05306561491574048  | 0.05902893189152128 |
|   3    | 0.04558384128122384  | 0.07491083546403716 |
|   4    | 0.03990677662244546  | 0.08575869979500651 |
|   5    | 0.03592685550376455  | 0.09526498941930209 |
|   6    |  0.0332377196127643  | 0.10551865034345374 |
|   7    | 0.03130666393484625  | 0.11592319520300066 |
|   8    | 0.02941914664754397  | 0.12380507342339381 |
|   9    | 0.027879367355882166 | 0.13158300276608526 |
|   10   | 0.026267479383291513 | 0.13743980104131148 |
+--------+----------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.16306207417582452

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |
+------------+------+----


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0025815704553603337 |  0.001576431217879766 |
|   2    |  0.002402294729293661 | 0.0027430634780939607 |
|   3    | 0.0024381498745069804 | 0.0041208427045644775 |
|   4    | 0.0022230190032269696 |  0.005056064408879037 |
|   5    | 0.0019361778415202568 |  0.005473299786357371 |
|   6    | 0.0020198398470180515 |  0.006398021055288052 |
|   7    |  0.00207959842237362  |  0.007677452153650687 |
|   8    |  0.002025815704553608 |  0.008540365981785017 |
|   9    | 0.0026931197960240547 |  0.013165622801376729 |
|   10   |  0.002595912513445668 |  0.014153858898971064 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.13408530253111237

Per User RMSE (best)
+------------+-----------------------+-----

Popularity v. Collaborative Filtering: We can see that the collaborative filtering algorithms work better than popularity model for purchase counts. Indeed, popularity model doesn’t give any personalizations as it only gives the same list of recommended items to every user.

Precision and recall: Looking at the summary above, we see that the precision and recall for Purchase Counts > Purchase Dummy > Normalized Purchase Counts. However, because the recommendation scores for the normalized purchase data is zero and constant, we choose the dummy. In fact, the RMSE isn’t much different between models on the dummy and those on the normalized data.

RMSE: Since RMSE is higher using pearson distance than cosine, we would choose model the smaller mean squared errors, which in this case would be cosine.

#### Therefore, we select the Cosine similarity on Purchase Dummy approach as our final model.

### Recommend

In [108]:
final_model = tc.item_similarity_recommender.create(tc.SFrame(data_dummy), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_dummy', similarity_type='cosine')
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     2     | 0.12324784994125366  |  1   |
|    1553    |     35    | 0.10447167158126831  |  2   |
|    1553    |     1     | 0.10348175764083863  |  3   |
|    1553    |     5     |  0.0906752586364746  |  4   |
|    1553    |     17    | 0.07659814357757569  |  5   |
|    1553    |     21    | 0.07491707801818848  |  6   |
|    1553    |     33    |  0.0668614387512207  |  7   |
|    1553    |     47    | 0.06058878898620605  |  8   |
|    1553    |     61    | 0.060317397117614746 |  9   |
|    1553    |     15    | 0.05949603319168091  |  10  |
|   20400    |     26    | 0.05812269449234009  |  1   |
|   20400    |     6     | 0.05361741781234741  |  2   |
|   20400    |    113    | 0.05312788486480713  |  3   |
|   20400    |     1     | 0.05210459232330322  |  4   |
|   20400    |     15    | 0.04

In [109]:
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(10000, 4)


Unnamed: 0,customerId,productId,score,rank
0,1553,2,0.123248,1
1,1553,35,0.104472,2
2,1553,1,0.103482,3
3,1553,5,0.090675,4
4,1553,17,0.076598,5


In [114]:
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates() \
        .sort_values('customerId').set_index('customerId')
    if print_csv:
        df_output.to_csv('./data/output/option1_recommendation.csv')
        print("An output file can be found in 'output' folder with name 'option1_recommendation.csv'")
    return df_output

In [115]:
df_output = create_output(pear_norm, users_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(1000, 1)


Unnamed: 0_level_0,recommendedProducts
customerId,Unnamed: 1_level_1
4,226|247|230|125|248|276|294|83|72|204
11,226|247|230|125|248|276|294|83|72|204
12,226|247|230|125|248|276|294|83|72|204
16,226|247|230|125|248|276|294|83|72|204
21,226|247|230|125|248|276|294|83|72|204


### Customer recommendation function
Given a customer Id return the list of recommended products

In [116]:
def customer_recomendation(customer_id):
    if customer_id not in df_output.index:
        print('Customer not found.')
        return customer_id
    return df_output.loc[customer_id]

In [118]:
customer_recomendation(4)

recommendedProducts    226|247|230|125|248|276|294|83|72|204
Name: 4, dtype: object

In [119]:
customer_recomendation(21)

recommendedProducts    226|247|230|125|248|276|294|83|72|204
Name: 21, dtype: object