In [31]:
#importing libraries
import numpy as np
import numpy.ma as ma
import pandas as pd
# We do this to ignore several specific Pandas warnings
import warnings
warnings.filterwarnings("ignore")
import sys
import scipy.sparse as sparse
import scipy.sparse.linalg as linalg
from scipy.sparse import coo_matrix, csr_matrix
from numpy import bincount, log, sqrt
import itertools
import time
from pathlib import Path
import pickle
# lightfm 
from lightfm import LightFM as lightfm
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from numpy import random as random
from lightfm.data import Dataset

Step 1: Data Preperation
In order to prepare the data in a format that works for a recommendation system, we'll need to arrange it in a matrix format where the product id are listed as the the columns, and user ids are listed as the rows.

For the "rating" part of the equation, although we do not have customers' "review" for each product they have purchased, since our expansive dataset covers a customers last 3-99 orders, we can reasonably believe that the number of times a customer has purchase a product throughout their order history is by nature a rating. 

In [35]:
#reading in data. 
prior = pd.read_csv('order_products__prior.csv')
orders = pd.read_csv('orders.csv')

Here, we can see that the prior dataset maps the customer to their order id, but what product is included in each order is hosted in another dataset, orders. We'll merge these two datasets together so that we have a raw dataset that contains both customers and products they have bought across multiple orders. 

In [42]:
#displaying the layout of the data sets
prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [43]:
#displaying the layout of the data sets
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [36]:
#merging prior order with order id
customer_orders_product= pd.merge(orders, prior, on="order_id")
#extracting only user and product id information 
customer_orders_product = customer_orders_product[["user_id", "product_id"]]
customer_orders_product.head()

Unnamed: 0,user_id,product_id
0,1,196
1,1,14084
2,1,12427
3,1,26088
4,1,26405


The dataframe customer_orders_product now hosts users and the products they've ordered in the past. Now we move on to count the number of times each user purchased a product as discussed.

In [37]:
#summarizing how many of each product each customer bought and creating a new column as "purchase."
customer_product_purchase=customer_orders_product.groupby(["user_id","product_id"]).size().reset_index(name='purchase')


With the 'customer_product_purchase' dataframe, which now includes three columns: user_id, product_id, purchase, we'll conduct some prelimitary exploration.

In [34]:
#how many unique products are in this dataset?
print ("Unique products in the dataset: " + str(len(customer_product_purchase['product_id'].unique())))
#how many unique customers are there?
print ("Unique customers in the dataset: " + str(len(customer_product_purchase['user_id'].unique())))
#frequency of purchase: what are the most frequently purchase items by customers?
customer_product_purchase.nlargest(20, 'purchase')

NameError: name 'customer_product_purchase' is not defined

LightFM requires that the rows and columns of the matrix be the consequtive integers in increasing order; However, our dataset already has user_id in this nature; we'll need to map product_id differently.

In [84]:
#checking number of unique users and user_id
def index_mapping_check(array):
    n = len(array) - 1 
    if sum(np.diff(sorted(array)) == 1) >= n:
        print ("data can be indexed as the consequtive integers")
    else:
        print ("please reformat data")
user_id=customer_product_purchase['user_id'].unique().astype('int')
product_id=customer_product_purchase['product_id'].unique().astype('int')
index_mapping_check(user_id)
index_mapping_check(product_id)

data can be indexed as the consequtive integers
please reformat data


In [90]:
def index_creation(array):
    """taking in an array of data and creating an index representing the array.
    returning 2 dictionaries: index_id and id_index."""
    index_id= {}
    id_index= {}
    for index, id in enumerate(array):
        id_index[id] = index
        index_id[index] = id
    
    return index_id,id_index

In [93]:
index_to_product_id,product_id_to_index=index_creation(product_id)

In [49]:
#since user_id and product_id are by nature categorical varaibles, we'll encode them as such to prep for the training matrix transformation.
customer_product_purchase['user_id']=customer_product_purchase['user_id'].astype('category')
customer_product_purchase['product_id']=customer_product_purchase['product_id'].astype('category')

In [106]:
#creating a train_matrix that has user_id on the rows, product_id as the columns, and purchase as the value.
customer_product_purchase_matrix = sparse.coo_matrix((customer_product_purchase['purchase'],(customer_product_purchase['user_id'].cat.codes.copy(),customer_product_purchase['product_id'].apply(lambda x: product_id_to_index[x]).cat.codes.copy())))
#saving the meatrix to the file
sparse.save_npz('matrix_user_product_purchase.npz', customer_product_purchase_matrix)

In [3]:
customer_product_purchase_matrix=sparse.load_npz('matrix_user_product_purchase.npz')

In [4]:
customer_product_purchase_matrix.shape

(206209, 49677)

Step 2: Modeling and Evaluation
With the prepared matrix, we'll now conduct modeling and evaluation with lightfm's built-in functins. 

In [9]:
#splitting train/test matrices with a test percentage of 20%
train_matrix, test_matrix=random_train_test_split(customer_product_purchase_matrix,test_percentage=0.2)

In [11]:
#creating a lightfm model instance with all default parameters except the loss function, where the default is logistic
model_collaborative_filtering= lightfm(loss = "warp")
#fitting the model
model_collaborative_filtering.fit(train_matrix, epochs=1, num_threads=3)

<lightfm.lightfm.LightFM at 0x1a15923310>

In [12]:
#evaluating the accuracy with auc. Since this part is iterative and time-consuming, we set a timer here to monitor how long it's been running.
start = time.time()
auc_collaborative_filtering = auc_score(model=model_collaborative_filtering,test_interactions = test_matrix, num_threads = 3, check_intersections = False)
end = time.time()

print("time for evaluation = {0:.{1}f} seconds".format(end - start, 2))
print("AUC score = {0:.{1}f}".format(auc_collaborative_filtering.mean(), 2))

time for evaluation = 394.94 seconds
AUC score = 0.94


Now for precision evaluation, we'll take the average order size from the original prior data as our k.

In [19]:
#precision evaluation
start = time.time()
Model_precision=precision_at_k(model = model_collaborative_filtering, 
                        test_interactions = test_matrix,k=10,
                        num_threads = 4, check_intersections = False)
end = time.time()
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

time taken for precision at k evaluation = 399.97 seconds
