In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split

import sys
sys.path.append("..")
import scripts.data_layer as data_layer

## 2. Load data
Two datasets are used in this exercise, which can be found in `data` folder: 
* `recommend_1.csv` consisting of a list of 1000 customer IDs to recommend as output
* `trx_data.csv` consisting of user transactions

The format is as follows.

In [2]:
customers = pd.read_csv('../data/recommend_1.csv')
transactions = pd.read_csv('../data/trx_data.csv')

## 3. Data preparation
* Our goal here is to break down each list of items in the `products` column into rows and count the number of products bought by a user

In [3]:
# example 1: split product items
transactions['products'] = transactions['products'].apply(lambda x: [int(i) for i in x.split('|')])
transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index()


Unnamed: 0,customerId,0,1,2,3,4,5,6,7,8,9
0,0,20.0,,,,,,,,,
1,1,2.0,2.0,23.0,68.0,68.0,111.0,29.0,86.0,107.0,152.0


In [4]:
# example 2: organize a given table into a dataframe with customerId, single productId, and purchase count
pd.melt(transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index(), 
             id_vars=['customerId'],
             value_name='products') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['customerId', 'products']) \
    .agg({'products': 'count'}) \
    .rename(columns={'products': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'products': 'productId'})

Unnamed: 0,customerId,productId,purchase_count
0,0,20.0,1
1,1,2.0,2
2,1,23.0,1
3,1,29.0,1
4,1,68.0,2
5,1,86.0,1
6,1,107.0,1
7,1,111.0,1
8,1,152.0,1


### 3.1. Create data with user, item, and target field
* This table will be an input for our modeling later
    * In this case, our user is `customerId`, `productId`, and `purchase_count`

In [5]:
s=time.time()

data = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(), 
             id_vars=['customerId'],
             value_name='products') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['customerId', 'products']) \
    .agg({'products': 'count'}) \
    .rename(columns={'products': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'products': 'productId'})
data['productId'] = data['productId'].astype(np.int64)

print("Execution time:", round((time.time()-s)/60,2), "minutes")

Execution time: 0.32 minutes


In [17]:
print(data.shape)
print(data)
data.head()
data.to_excel('../output/data.xlsx')


(133585, 3)
        customerId  productId  purchase_count
0                0          1               2
1                0         13               1
2                0         19               3
3                0         20               1
4                0         31               2
5                0         52               1
6                0         69               2
7                0         93               3
8                0        136               2
9                0        157               1
10               0        198               1
11               0        216               1
12               0        255               2
13               0        256               1
14               0        260               5
15               1          2               6
16               1         23               1
17               1         25               1
18               1         29               1
19               1         61               2
20               1    

## 4. Split train and test set
* Splitting the data into training and testing sets is an important part of evaluating predictive modeling, in this case a collaborative filtering model. Typically, we use a larger portion of the data for training and a smaller portion for testing. 
* We use 80:20 ratio for our train-test set size.
* Our training portion will be used to develop a predictive model, while the other to evaluate the model's performance.
* Now that we have three datasets with purchase counts, purchase dummy, and scaled purchase counts, we would like to split each.

In [7]:
train, test = train_test_split(data, test_size = .2)
print(train.shape, test.shape)

(106868, 3) (26717, 3)


In [8]:
# Using turicreate library, we convert dataframe to SFrame - this will be useful in the modeling part

train_data = tc.SFrame(train)
test_data = tc.SFrame(test)

In [9]:
train_data

customerId,productId,purchase_count
13656,0,5
27244,44,1
8424,65,1
7359,60,2
11559,18,1
3104,232,1
1738,168,6
15880,65,1
22791,31,2
5254,85,1


In [10]:
test_data

customerId,productId,purchase_count
7933,20,1
263,25,1
4440,280,1
7977,205,1
1410,9,2
4153,80,1
7800,217,1
14036,13,1
21117,31,2
1077,2,3


## 5. Baseline Model
Before running a more complicated approach such as collaborative filtering, we would like to use a baseline model to compare and evaluate models. Since baseline typically uses a very simple approach, techniques used beyond this approach should be chosen if they show relatively better accuracy and complexity.

### 5.1. Using a Popularity model as a baseline
* The popularity model takes the most popular items for recommendation. These items are products with the highest number of sells across customers.
* We use `turicreate` library for running and evaluating both baseline and collaborative filtering models below
* Training data is used for model selection

#### Using purchase counts

In [29]:
# variables to define field names
user_id = 'customerId'
item_id = 'productId'
target = None
users_to_recommend = list(transactions[user_id])
n_rec = 10 # number of items to recommend
n_display = 30

In [30]:
popularity_model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)

In [31]:
# Get recommendations for a list of users to recommend (from customers file)
# Printed below is head / top 30 rows for first 3 customers with 10 recommendations each

popularity_recomm = popularity_model.recommend(users=users_to_recommend, k=n_rec)
popularity_recomm.print_rows(n_display)

+------------+-----------+--------+------+
| customerId | productId | score  | rank |
+------------+-----------+--------+------+
|     0      |     2     | 2893.0 |  1   |
|     0      |     5     | 1533.0 |  2   |
|     0      |     7     | 1116.0 |  3   |
|     0      |     0     | 1062.0 |  4   |
|     0      |     4     | 1002.0 |  5   |
|     0      |     9     | 980.0  |  6   |
|     0      |     14    | 976.0  |  7   |
|     0      |     15    | 967.0  |  8   |
|     0      |     17    | 912.0  |  9   |
|     0      |     21    | 911.0  |  10  |
|     1      |     1     | 3732.0 |  1   |
|     1      |     5     | 1533.0 |  2   |
|     1      |     7     | 1116.0 |  3   |
|     1      |     0     | 1062.0 |  4   |
|     1      |     13    | 1061.0 |  5   |
|     1      |     4     | 1002.0 |  6   |
|     1      |     9     | 980.0  |  7   |
|     1      |     14    | 976.0  |  8   |
|     1      |     15    | 967.0  |  9   |
|     1      |     20    | 931.0  |  10  |
|     2    

In [25]:
train.groupby(by=item_id)['purchase_count'].mean().sort_values(ascending=False).head(20)

productId
248    3.106383
132    3.096774
34     3.040816
37     3.034749
0      2.946328
27     2.832061
3      2.784810
110    2.781065
230    2.633094
32     2.609524
82     2.608511
10     2.604720
245    2.571429
226    2.545455
129    2.528409
58     2.500000
54     2.427046
173    2.386503
41     2.347692
83     2.344262
Name: purchase_count, dtype: float64

#### Notes
* Once we created the model, we predicted the recommendation items using scores by popularity. As you can tell for each model results above, the rows show the first 30 records from 1000 users with 10 recommendations. These 30 records include 3 users and their recommended items, along with score and descending ranks. 
* In the result, although different models have different recommendation list, each user is recommended the same list of 10 items. This is because popularity is calculated by taking the most popular items across all users.
* If a grouping example below, products 132, 248, 37, and 34 are the most popular (best-selling) across customers. Using their purchase counts divided by the number of customers, we see that these products are at least bought 3 times on average in the training set of transactions (same as the first popularity measure on `purchase_count` variable)