In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import sys
sys.path.append("..")
import data_layer
import helper_functions

In [3]:
# data folder path
PATH_TO_DATA = 'data/'
# read csv files
# dec_df = pd.read_csv(PATH_TO_DATA + '2019-Dec.csv')
nov_df = pd.read_csv(PATH_TO_DATA + '2019-Nov.csv')
oct_df = pd.read_csv(PATH_TO_DATA + '2019-Oct.csv')

In [4]:
# filtering purchase data only 
# dec_df = dec_df.loc[dec_df['event_type'] == 'purchase']
nov_df = nov_df.loc[nov_df['event_type'] == 'purchase']
oct_df = oct_df.loc[oct_df['event_type'] == 'purchase']

In [5]:
# concate 2 month data
Final_data = pd.concat([oct_df, nov_df], ignore_index=True)
Final_data.shape

(1065266, 9)

In [6]:
# drop dublicate if any 
Final_data.drop_duplicates(inplace=True)
# final data
Final_data.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:02:14 UTC,purchase,1004856,2053013555631882655,electronics.smartphone,samsung,130.76,543272936,8187d148-3c41-46d4-b0c0-9c08cd9dc564
1,2019-10-01 00:04:37 UTC,purchase,1002532,2053013555631882655,electronics.smartphone,apple,642.69,551377651,3c80f0d6-e9ec-4181-8c5c-837a30be2d68
2,2019-10-01 00:06:02 UTC,purchase,5100816,2053013553375346967,,xiaomi,29.51,514591159,0e5dfc4b-2a55-43e6-8c05-97e1f07fbb56
3,2019-10-01 00:07:07 UTC,purchase,13800054,2053013557418656265,furniture.bathroom.toilet,santeri,54.42,555332717,1dea3ee2-2ded-42e8-8e7a-4e2ad6ae942f
4,2019-10-01 00:09:26 UTC,purchase,4804055,2053013554658804075,electronics.audio.headphone,apple,189.91,524601178,2af9b570-0942-4dcd-8f25-4d84fba82553


In [7]:
# Threshold data to only include users and models with min 4 products.
threshold_data_interations = helper_functions.threshold_interactions_df(Final_data, 'user_id', 'product_id', 4, 4)

Starting interactions info
Number of rows: 378642
Number of cols: 71122
Sparsity: 0.004%
Ending interactions info
Number of rows: 56150
Number of columns: 21245
Sparsity: 0.047%


In [8]:
data = threshold_data_interations[['product_id', 'user_id']]

In [9]:
data = threshold_data_interations[['product_id', 'user_id']].groupby(['user_id', 'product_id']).agg({'product_id': 'count'}).rename(columns={'product_id': 'purchase_count'}) \
    .reset_index().sort_values(by='user_id')

In [15]:
# getting unique items and users list, this will use when we recommend products to certain user
items = helper_functions.get_item_list(threshold_data_interations, "product_id")
users_list = helper_functions.get_user_list(threshold_data_interations, "user_id")

In [16]:
# Go from dataframe to interaction matrix
# Also, build index to ID mappers.
interations, uid_to_idx, idx_to_uid,\
mid_to_idx, idx_to_mid = helper_functions.df_to_matrix(threshold_data_interations, 'user_id', 'product_id')

interations

<56150x21245 sparse matrix of type '<class 'numpy.float64'>'
	with 410712 stored elements in Compressed Sparse Row format>

In [17]:
# train test split
# train, test, user_index = helper_functions.train_test_split(interations, 4, fraction=0.2)

In [40]:
train, test = train_test_split(data, test_size = .2)
print(train.shape, test.shape)

(328569, 3) (82143, 3)


In [41]:
# Using turicreate library, we convert dataframe to SFrame - this will be useful in the modeling part

train_data = tc.SFrame(train)
test_data = tc.SFrame(test)

In [43]:
test_data

user_id,product_id,purchase_count
564745688,5858981,1
520620948,1005159,3
578149517,5774072,1
579491283,5877382,1
537390974,1004833,1
540507047,5760783,1
565781552,5731119,1
522130011,5736325,1
569211045,5655257,1
203257482,5859090,1


In [44]:
# variables to define field names
user_id = 'user_id'
item_id = 'product_id'
target = 'purchase_count'
users_to_recommend = list(threshold_data_interations[user_id])
n_rec = 10 # number of items to recommend
n_display = 30

In [45]:
popularity_model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)

In [47]:
# Get recommendations for a list of users to recommend (from customers file)
# Printed below is head / top 30 rows for first 3 customers with 10 recommendations each

popularity_recomm = popularity_model.recommend(users=users_to_recommend, k=n_rec)
popularity_recomm.print_rows(5)

+-----------+------------+-------+------+
|  user_id  | product_id | score | rank |
+-----------+------------+-------+------+
| 543272936 |  17301130  |  35.0 |  1   |
| 543272936 |  6500548   |  29.0 |  2   |
| 543272936 |  12708501  |  27.0 |  3   |
| 543272936 |  14700842  |  25.0 |  4   |
| 543272936 |  1306613   |  25.0 |  5   |
+-----------+------------+-------+------+
[5565910 rows x 4 columns]



In [48]:
# Since turicreate is very accessible library, we can define a model selection function as below

def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

In [56]:
# variables to define field names
# constant variables include:
user_id = 'user_id'
item_id = 'product_id'
users_to_recommend = list(threshold_data_interations[user_id])
n_rec = 10 # number of items to recommend
n_display = 20 # to print the head / first few rows in a defined dataset

## 6. Collaborative Filtering Model

* In collaborative filtering, we would recommend items based on how similar users purchase items. For instance, if customer 1 and customer 2 bought similar items, e.g. 1 bought X, Y, Z and 2 bought X, Y, we would recommend an item Z to customer 2.

* To define similarity across users, we use the following steps:
    1. Create a user-item matrix, where index values represent unique customer IDs and column values represent unique product IDs
    
    2. Create an item-to-item similarity matrix. The idea is to calculate how similar a product is to another product. There are a number of ways of calculating this. In steps 6.1 and 6.2, we use cosine and pearson similarity measure, respectively.  
    
        * To calculate similarity between products X and Y, look at all customers who have rated both these items. For example, both X and Y have been rated by customers 1 and 2. 
        * We then create two item-vectors, v1 for item X and v2 for item Y, in the user-space of (1, 2) and then find the `cosine` or `pearson` angle/distance between these vectors. A zero angle or overlapping vectors with cosine value of 1 means total similarity (or per user, across all items, there is same rating) and an angle of 90 degree would mean cosine of 0 or no similarity.
        
    3. For each customer, we then predict his likelihood to buy a product (or his purchase counts) for products that he had not bought. 
    
        * For our example, we will calculate rating for user 2 in the case of item Z (target item). To calculate this we weigh the just-calculated similarity-measure between the target item and other items that customer has already bought. The weighing factor is the purchase counts given by the user to items already bought by him. 
        * We then scale this weighted sum with the sum of similarity-measures so that the calculated rating remains within a predefined limits. Thus, the predicted rating for item Z for user 2 would be calculated using similarity measures.

* While I wrote python scripts for all the process including finding similarity using python scripts (which can be found in `scripts` folder, we can use `turicreate` library for now to capture different measures like using `cosine` and `pearson` distance, and evaluate the best model.

### 6.1. `Cosine` similarity
* Similarity is the cosine of the angle between the 2 vectors of the item vectors of A and B
* It is defined by the following formula
![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTnRHSAx1c084UXF2wIHYwaHJLmq2qKtNk_YIv3RjHUO00xwlkt)
* Closer the vectors, smaller will be the angle and larger the cosine

In [54]:
# these variables will change accordingly
name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+-----------+------------+----------------------+------+
|  user_id  | product_id |        score         | rank |
+-----------+------------+----------------------+------+
| 543272936 |  1004833   |  3.668076127767563   |  1   |
| 543272936 |  1004767   |  2.7470069229602814  |  2   |
| 543272936 |  1004858   |  2.0656363666057587  |  3   |
| 543272936 |  1004750   |  1.8056735396385193  |  4   |
| 543272936 |  1004857   |  1.7164364755153656  |  5   |
| 543272936 |  1005100   |  1.7118978798389435  |  6   |
| 543272936 |  1004836   |  1.4483079314231873  |  7   |
| 543272936 |  1004834   |  1.3835522830486298  |  8   |
| 543272936 |  2701735   |  1.3820402324199677  |  9   |
| 543272936 |  1004238   |  1.253294199705124   |  10  |
| 551377651 |  1004434   | 0.07582694292068481  |  1   |
| 551377651 |  1004833   | 0.05733774767981635  |  2   |
| 551377651 |  1004767   | 0.056982212596469455 |  3   |
| 551377651 |  8800911   | 0.05633528365029229  |  4   |
| 551377651 |  1005169   | 0.04

### 6.2. `Pearson` similarity
* Similarity is the pearson coefficient between the two vectors.
* It is defined by the following formula
</br>

![](http://critical-numbers.group.shef.ac.uk/glossary/images/correlationKT1.png)

In [55]:
# these variables will change accordingly
name = 'pearson'
target = 'purchase_count'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+-----------+------------+-------+------+
|  user_id  | product_id | score | rank |
+-----------+------------+-------+------+
| 543272936 |  17301130  |  35.0 |  1   |
| 543272936 |  6500548   |  29.0 |  2   |
| 543272936 |  12708501  |  27.0 |  3   |
| 543272936 |  14700842  |  25.0 |  4   |
| 543272936 |  1306613   |  25.0 |  5   |
| 543272936 |  28300780  |  22.0 |  6   |
| 543272936 |  18000904  |  20.0 |  7   |
| 543272936 |  50300160  |  20.0 |  8   |
| 543272936 |  13700159  |  18.0 |  9   |
| 543272936 |  28720765  |  17.0 |  10  |
| 551377651 |  17301130  |  35.0 |  1   |
| 551377651 |  6500548   |  29.0 |  2   |
| 551377651 |  12708501  |  27.0 |  3   |
| 551377651 |  14700842  |  25.0 |  4   |
| 551377651 |  1306613   |  25.0 |  5   |
| 551377651 |  28300780  |  22.0 |  6   |
| 551377651 |  18000904  |  20.0 |  7   |
| 551377651 |  50300160  |  20.0 |  8   |
| 551377651 |  13700159  |  18.0 |  9   |
| 551377651 |  28720765  |  17.0 |  10  |
| 551377651 |  17301130  |  35.0 |

#### Note
* In collaborative filtering above, we used two approaches: cosine and pearson distance. We also got to apply them on training_data.
* We can see that the recommendations are different for each user. This suggests that personalization does exist. 
* But how good is this model compared to the baseline, and to each other? We need some means of evaluating a recommendation engine. Lets focus on that in the next section.

## 7. Model Evaluation
For evaluating recommendation engines, we can use the concept of precision-recall.

* RMSE (Root Mean Squared Errors)
    * Measures the error of predicted values
    * Lesser the RMSE value, better the recommendations
* Recall
    * What percentage of products that a user buys are actually recommended?
    * If a customer buys 5 products and the recommendation decided to show 3 of them, then the recall is 0.6
* Precision
    * Out of all the recommended items, how many the user actually liked?
    * If 5 products were recommended to the customer out of which he buys 4 of them, then precision is 0.8
    
* Why are both recall and precision important?
    * Consider a case where we recommend all products, so our customers will surely cover the items that they liked and bought. In this case, we have 100% recall! Does this mean our model is good?
    * We have to consider precision. If we recommend 300 items but user likes and buys only 3 of them, then precision is 0.1%! This very low precision indicates that the model is not great, despite their excellent recall.
    * So our aim has to be optimizing both recall and precision (to be close to 1 as possible).

Lets compare all the models we have built based on precision-recall characteristics:

In [58]:
models_w_counts = [popularity_model, cos, pear]
names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts']

In [59]:
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    |          0.0           |          0.0           |
|   3    |          0.0           |          0.0           |
|   4    |          0.0           |          0.0           |
|   5    |          0.0           |          0.0           |
|   6    |          0.0           |          0.0           |
|   7    | 3.907577965949367e-06  | 1.3676522880822783e-05 |
|   8    | 3.4191307202056957e-06 | 1.3676522880822783e-05 |
|   9    | 3.0392273068495095e-06 | 1.3676522880822783e-05 |
|   10   | 2.735304576164559e-06  | 1.3676522880822783e-05 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 1.5440161118616589

Per User RMSE (best)
+-----------+---


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.06209141387893563  | 0.035684631929059955 |
|   2    | 0.052463141770836205 | 0.05975919327922139  |
|   3    | 0.045579291920821476 |  0.0770122176313907  |
|   4    | 0.04141251128313149  | 0.09371952414254806  |
|   5    | 0.03810826335512605  |  0.1079691854693925  |
|   6    | 0.035276311350602295 | 0.11966847799720469  |
|   7    |  0.0330346641241355  |  0.1302117044033948  |
|   8    | 0.03112434694603231  | 0.14039219937908784  |
|   9    | 0.029471387194518233 |  0.1495703150110693  |
|   10   | 0.028121666347548573 | 0.15843346450398282  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 2.0622516357924594

Per User RMSE (best)
+-----------+---------------------+-------+
|  user_id  |         rmse  


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    |          0.0           |          0.0           |
|   3    |          0.0           |          0.0           |
|   4    |          0.0           |          0.0           |
|   5    |          0.0           |          0.0           |
|   6    |          0.0           |          0.0           |
|   7    | 3.907577965949367e-06  | 1.3676522880822783e-05 |
|   8    | 3.4191307202056957e-06 | 1.3676522880822783e-05 |
|   9    | 3.0392273068494917e-06 | 1.3676522880822783e-05 |
|   10   | 2.735304576164554e-06  | 1.3676522880822783e-05 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 1.550614476147725

Per User RMSE (best)
+-----------+----

## 8. Model Selection
### 8.1. Evaluation summary
* Based on RMSE


    1. Popularity on purchase counts: 1.5440161118616589
    2. Cosine similarity on purchase counts: 2.0622516357924594
    3. Pearson similarity on purchase counts: 1.5531102838192284

In [62]:
users_to_recommend = list(threshold_data_interations[user_id])

final_model = tc.item_similarity_recommender.create(tc.SFrame(data), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_count')

recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+-----------+------------+----------------------+------+
|  user_id  | product_id |        score         | rank |
+-----------+------------+----------------------+------+
| 543272936 |  1004833   | 0.08455404043197631  |  1   |
| 543272936 |  1004870   | 0.06114791631698609  |  2   |
| 543272936 |  1004836   | 0.05349932909011841  |  3   |
| 543272936 |  1004750   | 0.04826837778091431  |  4   |
| 543272936 |  1004873   | 0.03947700262069702  |  5   |
| 543272936 |  1005098   | 0.03814225196838379  |  6   |
| 543272936 |  1004857   | 0.03758982419967651  |  7   |
| 543272936 |  1004858   | 0.03724290132522583  |  8   |
| 543272936 |  1004209   | 0.031753814220428465 |  9   |
| 543272936 |  1002544   | 0.030005264282226562 |  10  |
| 551377651 |  1004833   | 0.03707390481775457  |  1   |
| 551377651 |  1004767   | 0.03470703146674416  |  2   |
| 551377651 |  1002544   |  0.0309004621072249  |  3   |
| 551377651 |  1004836   | 0.02744315971027721  |  4   |
| 551377651 |  1004249   | 0.02

In [63]:
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(5565910, 4)


Unnamed: 0,user_id,product_id,score,rank
0,543272936,1004833,0.084554,1
1,543272936,1004870,0.061148,2
2,543272936,1004836,0.053499,3
3,543272936,1004750,0.048268,4
4,543272936,1004873,0.039477,5


In [64]:
df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id].transform(lambda x: '|'.join(x.astype(str)))
df_output = df_rec[['user_id', 'recommendedProducts']].drop_duplicates().sort_values('user_id').set_index('user_id')

In [65]:
def customer_recomendation(customer_id):
    if customer_id not in df_output.index:
        print('Customer not found.')
        return customer_id
    return df_output.loc[customer_id]

In [66]:
import random
customer_recomendation(random.choice(users_to_recommend))

recommendedProducts    1004856|1004873|1005100|32801121|1004768|10025...
Name: 512577810, dtype: object