In [1]:
#Recommendation engine for customer purchase history
#import packages

import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split

import sys

In [2]:

customers = pd.read_csv('../medium/items-recommender/data/recommend_1.csv')
transactions = pd.read_csv('../medium/items-recommender/data/trx_data.csv')

In [3]:
print(customers.shape)
customers.head()

(1000, 1)


Unnamed: 0,customerId
0,1553
1,20400
2,19750
3,6334
4,27773


In [4]:
print(transactions.shape)
transactions.head()

(62483, 2)


Unnamed: 0,customerId,products
0,0,20
1,1,2|2|23|68|68|111|29|86|107|152
2,2,111|107|29|11|11|11|33|23
3,3,164|227
4,5,2|2


### Data Preparation

Our goal here is to break down each list of items in the products column into rows and count the number of products bought by a user

In [5]:
# example 1: split product items
transactions['products'] = transactions['products'].apply(lambda x: [int(i) for i in x.split('|')])
transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index()

Unnamed: 0,customerId,0,1,2,3,4,5,6,7,8,9
0,0,20.0,,,,,,,,,
1,1,2.0,2.0,23.0,68.0,68.0,111.0,29.0,86.0,107.0,152.0


In [6]:
# example 2: organize a given table into a dataframe with customerId, single productId, and purchase count
pd.melt(transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index(), 
             id_vars=['customerId'],
             value_name='products') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['customerId', 'products']) \
    .agg({'products': 'count'}) \
    .rename(columns={'products': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'products': 'productId'})

Unnamed: 0,customerId,productId,purchase_count
0,0,20.0,1
1,1,2.0,2
2,1,23.0,1
3,1,29.0,1
4,1,68.0,2
5,1,86.0,1
6,1,107.0,1
7,1,111.0,1
8,1,152.0,1


### Create data with user, item, and target field

This table will be an input for our modeling later
- In this case, our user is customerId, productId, and purchase_count

In [7]:

s=time.time()

data = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(), 
             id_vars=['customerId'],
             value_name='products') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['customerId', 'products']) \
    .agg({'products': 'count'}) \
    .rename(columns={'products': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'products': 'productId'})
data['productId'] = data['productId'].astype(np.int64)

print("Executionaa time:", round((time.time()-s)/60,2), "minutes")

Executionaa time: 0.23 minutes


In [8]:
data = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(), 
             id_vars=['customerId'],
             value_name='products')

In [9]:
data.head()

Unnamed: 0,customerId,variable,products
0,0,0,20.0
1,1,0,2.0
2,2,0,111.0
3,3,0,164.0
4,5,0,2.0


In [10]:
data = data.dropna().drop(['variable'], axis=1)

In [11]:
data.head()

Unnamed: 0,customerId,products
0,0,20.0
1,1,2.0
2,2,111.0
3,3,164.0
4,5,2.0


In [12]:
data = data.groupby(['customerId', 'products']).agg({'products': 'count'})

In [13]:
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,products
customerId,products,Unnamed: 2_level_1
0,1.0,2
0,13.0,1
0,19.0,3
0,20.0,1
0,31.0,2


In [14]:
data = data.rename(columns={'products': 'purchase_count'})

In [15]:
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,purchase_count
customerId,products,Unnamed: 2_level_1
0,1.0,2
0,13.0,1
0,19.0,3
0,20.0,1
0,31.0,2


In [16]:
data = data.reset_index()

In [17]:
data = data.rename(columns={'products': 'productId'})

In [18]:
data.head()

Unnamed: 0,customerId,productId,purchase_count
0,0,1.0,2
1,0,13.0,1
2,0,19.0,3
3,0,20.0,1
4,0,31.0,2


In [19]:
data['productId'] = data['productId'].astype(np.int64)

### Create Dummy

Dummy for marking whether a customer bought that item or not.
If one buys an item, then purchase_dummy are marked as 1
Why create a dummy instead of normalizing it, you ask?
- Normalizing the purchase count, say by each user, would not work because customers may have different buying frequency don't have the same taste
- However, we can normalize items by purchase frequency across all users.

In [20]:
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy

In [21]:
data_dummy = create_data_dummy(data)

### Normalize item values across users

we normalize purchase frequency of each item across users by first creating a user-item matrix as follows

In [22]:

df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')
df_matrix.head()

productId,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,2.0,,,,,,,,,...,,,,,,,,,,
1,,,6.0,,,,,,,,...,,,,1.0,,,1.0,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [23]:
print (df_matrix.shape)

(24429, 300)


In [24]:
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
print(df_matrix_norm.shape)
df_matrix_norm.head()

(24429, 300)


productId,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,0.1,,,,,,,,,...,,,,,,,,,,
1,,,0.166667,,,,,,,,...,,,,0.0,,,0.0,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [27]:
# create a table for input to the modeling

d = df_matrix_norm.reset_index()
d.index.names = ['scaled_purchase_freq']
data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()
print(data_norm.shape)
data_norm.head()

(133585, 3)


Unnamed: 0,customerId,productId,scaled_purchase_freq
9,9,0,0.133333
25,25,0,0.133333
32,33,0,0.133333
35,36,0,0.133333
43,44,0,0.133333


We can normalize the their purchase history, from 0-1 (with 1 being the most number of purchase for an item and 0 being 0 purchase count for that item).

### Split Train and Test set

- Splitting the data into training and testing sets is an important part of evaluating predictive modeling, in this case a collaborative filtering model. Typically, we use a larger portion of the data for training and a smaller portion for testing.
- We use 80:20 ratio for our train-test set size.
- Our training portion will be used to develop a predictive model, while the other to evaluate the model's performance.
- Now that we have three datasets with purchase counts, purchase dummy, and scaled purchase counts, we would like to split each.

In [28]:
train, test = train_test_split(data, test_size = .2)
print(train.shape, test.shape)

(106868, 3) (26717, 3)


Recommedation Engine from Scratch

In [34]:
n_users = data.customerId.unique().shape[0]
n_items = data.productId.unique().shape[0]

In [38]:
data_matrix = np.zeros((n_users, n_items))
for line in data.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

IndexError: index 24429 is out of bounds for axis 0 with size 24429

### Using Turicreate

In [27]:

# Using turicreate library, we convert dataframe to SFrame - this will be useful in the modeling part

train_data = tc.SFrame(train)
test_data = tc.SFrame(test)

In [29]:
def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [30]:
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

### Baseline Model

Before running a more complicated approach such as collaborative filtering, we would like to use a baseline model to compare and evaluate models. Since baseline typically uses a very simple approach, techniques used beyond this approach should be chosen if they show relatively better accuracy and complexity.

#### Popularity model as baseline

- The popularity model takes the most popular items for recommendation. These items are products with the highest number of sells across customers.
- We use turicreate library for running and evaluating both baseline and collaborative filtering models below
- Training data is used for model selection

In [31]:
#using purchase . count

In [32]:

# variables to define field names
user_id = 'customerId'
item_id = 'productId'
target = 'purchase_count'
users_to_recommend = list(transactions[user_id])
n_rec = 10 # number of items to recommend
n_display = 30

In [33]:

popularity_model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)

In [34]:
# Get recommendations for a list of users to recommend (from customers file)
# Printed below is head / top 30 rows for first 3 customers with 10 recommendations each

popularity_recomm = popularity_model.recommend(users=users_to_recommend, k=n_rec)
popularity_recomm.print_rows(n_display)

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
|     0      |    132    | 3.0701754385964914 |  1   |
|     0      |    248    | 3.0285714285714285 |  2   |
|     0      |     37    | 3.0116731517509727 |  3   |
|     0      |     0     | 2.979923518164436  |  4   |
|     0      |     34    | 2.9150579150579152 |  5   |
|     0      |     3     | 2.825806451612903  |  6   |
|     0      |    110    | 2.773006134969325  |  7   |
|     0      |     27    | 2.6793893129770994 |  8   |
|     0      |    230    | 2.6527777777777777 |  9   |
|     0      |     32    | 2.628140703517588  |  10  |
|     1      |    132    | 3.0701754385964914 |  1   |
|     1      |    248    | 3.0285714285714285 |  2   |
|     1      |     37    | 3.0116731517509727 |  3   |
|     1      |     0     | 2.979923518164436  |  4   |
|     1      |     34    | 2.9150579150579152 |  5   |
|     1   

In [35]:
# Since turicreate is very accessible library, we can define a model selection function as below

def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

In [36]:
# variables to define field names
# constant variables include:
user_id = 'customerId'
item_id = 'productId'
users_to_recommend = list(customers[user_id])
n_rec = 10 # number of items to recommend
n_display = 30 # to print the head / first few rows in a defined dataset

### Using Purchase dummy

In [None]:
# these variables will change accordingly
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)