In [1]:
import os # for reading the data-directory override from the environment
import time # for timing the evaluation step

import pandas as pd # pandas for data manipulation
import numpy as np # numpy for sure
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix # for constructing sparse matrix
# lightfm 
from lightfm import LightFM # model
from lightfm.evaluation import auc_score



In [2]:
%%time
# Root data directory. Defaults to the original local location, but can be
# redirected without editing the notebook by setting INSTACART_DATA_DIR.
path = os.environ.get('INSTACART_DATA_DIR', '/Users/Jeff/Documents/Insight/Data')

# Every table is read from the same dated snapshot folder under `path`.
snapshot_dir = path + '/instacart_2017_05_01'

aisles = pd.read_csv(snapshot_dir + '/aisles.csv')
departments = pd.read_csv(snapshot_dir + '/departments.csv')
orders = pd.read_csv(snapshot_dir + '/orders.csv')
order_products__prior = pd.read_csv(snapshot_dir + '/order_products__prior.csv')
order_products__train = pd.read_csv(snapshot_dir + '/order_products__train.csv')
products = pd.read_csv(snapshot_dir + '/products.csv')

# see 0_instacart_eda for more info

CPU times: user 8.72 s, sys: 1.96 s, total: 10.7 s
Wall time: 10.3 s


In [3]:
# Start with the most recent order by user: the rows of `orders` whose
# eval_set is 'train'. A vectorised comparison replaces the per-row lambda.
users_order_most_recent = orders.loc[orders['eval_set'] == 'train']
display(len(users_order_most_recent)) #206209 users total, 131209 for train

# Each user is expected to appear exactly once. Previously value_counts() was
# computed and thrown away, so the expectation was never actually checked.
assert users_order_most_recent['user_id'].is_unique, \
    "expected exactly one 'train' order per user"

display(users_order_most_recent.sort_values('user_id').head(10))

131209

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
10,1187899,1,train,11,4,8,14.0
25,1492625,2,train,15,1,11,30.0
49,2196797,5,train,5,0,11,6.0
74,525192,7,train,21,2,11,6.0
78,880375,8,train,4,1,14,10.0
82,1094988,9,train,4,6,10,30.0
88,1822501,10,train,6,0,19,30.0
115,1827621,13,train,13,0,21,8.0
129,2316178,14,train,14,2,19,11.0
200,2180313,17,train,41,3,10,30.0


In [4]:
%%time

# Attach the purchasing user to every product line of the prior orders.

# Keep only the orders flagged as 'prior'
is_prior_order = orders['eval_set'].eq('prior')
users_order_prior = orders.loc[is_prior_order]

# Size and preview of the product-level table before the join
print(len(order_products__prior))
display(order_products__prior.head(5))

# Join on order_id so that each product row carries its user_id
order_to_user = users_order_prior[['order_id', 'user_id']]
user_order_products_prior_train = order_products__prior.merge(order_to_user, on='order_id')

# Size and preview after the join
print(len(user_order_products_prior_train))
display(user_order_products_prior_train.head(5))

# Spot-check the join: the user shown for order 2 here should match the
# user_id attached to order 2 in the merged preview (202279?)
display(users_order_prior.loc[users_order_prior['order_id'].eq(2)])

32434489


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


32434489


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id
0,2,33120,1,1,202279
1,2,28985,2,1,202279
2,2,9327,3,0,202279
3,2,45918,4,1,202279
4,2,30035,5,0,202279


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
3355525,2,202279,prior,3,5,9,8.0


CPU times: user 4.81 s, sys: 2.95 s, total: 7.76 s
Wall time: 7.7 s


In [5]:
# Number of times each user bought each product across their prior orders.
# These counts serve as the "ratings" handed to lightfm.
user_product_pairs = user_order_products_prior_train[['user_id', 'product_id']]

# size() yields a Series indexed by (user_id, product_id); reset_index
# flattens it back into a long table whose count column is called 'freq'.
user_products_freq_train_long = (
    user_product_pairs
    .groupby(['user_id', 'product_id'])
    .size()
    .reset_index(name='freq')
)

display(user_products_freq_train_long.head(25))

Unnamed: 0,user_id,product_id,freq
0,1,196,10
1,1,10258,9
2,1,10326,1
3,1,12427,10
4,1,13032,3
5,1,13176,2
6,1,14084,1
7,1,17122,1
8,1,25133,8
9,1,26088,2


In [6]:
# Pivot the long user/product frequency table into a sparse user x product matrix.
# (A dense pivot via .unstack(fill_value=0) fails here, hence coo_matrix().)

user_ids = user_products_freq_train_long['user_id']
product_ids = user_products_freq_train_long['product_id']

# ids are shifted down by one to get 0-based positions (no id equals 0);
# the largest id therefore gives the size of each dimension.
num_rows = user_ids.max()
num_cols = product_ids.max()
row_ind = user_ids.to_numpy() - 1
col_ind = product_ids.to_numpy() - 1

# Purchase counts become the stored cell values
freq = user_products_freq_train_long['freq'].to_numpy()

# Assemble the sparse matrix in COOrdinate format
user_prod_interaction_train = coo_matrix(
    (freq, (row_ind, col_ind)),
    shape=(num_rows, num_cols),
)

user_prod_interaction_train

<206209x49688 sparse matrix of type '<class 'numpy.int64'>'
	with 13307953 stored elements in COOrdinate format>

In [7]:
# Density of the interaction matrix, read from the matrix itself rather than
# from numbers copied out of the previous cell's output (which would go stale
# if the data changed).
n_users, n_products = user_prod_interaction_train.shape
n_stored = user_prod_interaction_train.nnz

print(n_users * n_products) # 10 billion
print(n_stored / (n_users * n_products)) #only 0.1% have entries

10246112792
0.0012988294458744038


In [None]:
%%time
# Initialising model with warp loss function.
# WARP training is stochastic, so the seed is fixed to make a fresh
# Restart & Run All give comparable results.
SEED = 42
model = LightFM(loss = "warp", random_state = SEED)

# this fails on my local machine, but worked on the colab notebook!
model.fit(user_prod_interaction_train,epochs=1)

In [None]:
%%time

# auc metric score (ranging from 0 to 1)

# Build the held-out interaction matrix. Previously this cell referenced
# `user_to_product_interaction_test`, `start` and `end` without any of them
# being defined, so it raised NameError on a fresh kernel.
# NOTE(review): assumes the held-out set is each user's eval_set == 'train'
# order (products in order_products__train) — confirm this is the intended split.
test_orders = orders.loc[orders['eval_set'] == 'train', ['order_id', 'user_id']]
test_user_products = order_products__train.merge(test_orders, on='order_id')
test_freq_long = (
    test_user_products
    .groupby(['user_id', 'product_id'])
    .size()
    .reset_index(name='freq')
)

# The test matrix must have the same dimensions as the training matrix, using
# the same id - 1 positions. Pairs that fall outside those dimensions cannot
# be scored by the fitted model and are dropped.
n_users, n_items = user_prod_interaction_train.shape
test_rows = test_freq_long['user_id'].to_numpy() - 1
test_cols = test_freq_long['product_id'].to_numpy() - 1
in_range = (test_rows < n_users) & (test_cols < n_items)

user_to_product_interaction_test = coo_matrix(
    (test_freq_long['freq'].to_numpy()[in_range],
     (test_rows[in_range], test_cols[in_range])),
    shape=(n_users, n_items),
)

#===================
# Time only the scoring call itself
start = time.perf_counter()
auc_without_features = auc_score(model = model, 
                        test_interactions = user_to_product_interaction_test,check_intersections = False)
end = time.perf_counter()
#===================


print("time taken = {0:.{1}f} seconds".format(end - start, 2))
print("average AUC without adding item-feature interaction = {0:.{1}f}".format(auc_without_features.mean(), 2))