In [0]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve


In [0]:
# Load the purchase records (the first line of the CSV supplies the column
# names) and preview the first five rows.
# NOTE(review): the recorded output of this cell is a FileNotFoundError; the
# path is relative to the kernel's working directory -- confirm the file is
# present there before running the rest of the notebook.
grouped_purchased = pd.read_csv(
    'cvs_data_file/online-retail.csv',
    header=0,
)
grouped_purchased.head(n=5)

FileNotFoundError: ignored

In [0]:
# Build the customer x product purchase matrix from the long-format frame.
# NOTE(review): assumes CustomerID has no missing values (a NaN cannot be used
# as a category) -- TODO confirm against the input file.
customers = list(np.sort(grouped_purchased.CustomerID.unique())) # Get our unique customers (sorted)
products = list(grouped_purchased.StockCode.unique()) # Get our unique products that were purchased
quantity = list(grouped_purchased.Quantity) # All of our purchases

# `Series.astype('category', categories=...)` is no longer accepted by pandas;
# the category order has to be supplied through an explicit CategoricalDtype.
# The resulting integer code of each value is its position in the category
# list, so code i corresponds to customers[i] / products[i].
rows = grouped_purchased.CustomerID.astype(pd.CategoricalDtype(categories = customers)).cat.codes
# Get the associated row indices
cols = grouped_purchased.StockCode.astype(pd.CategoricalDtype(categories = products)).cat.codes
# Get the associated column indices

# One row per customer, one column per product, quantity as the cell value.
# Repeated (customer, product) pairs are summed by the constructor.
purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(products)))

In [0]:
# Sparsity: the percentage of customer/product cells with no recorded purchase.
n_customers, n_products = purchases_sparse.shape
matrix_size = n_customers * n_products  # Every possible customer/product pairing
num_purchases = purchases_sparse.nonzero()[0].size  # Pairings holding a non-zero entry
sparsity = 100 * (1 - num_purchases / matrix_size)
sparsity

In [0]:
import random

In [0]:
def make_train(ratings, pct_test = 0.2, seed = 0):
    """Mask a random share of the interactions to create a training matrix.

    Parameters
    ----------
    ratings : scipy sparse matrix
        User x item interaction matrix. It is copied, never modified.
        Assumes a format that supports item assignment (e.g. CSR) -- the
        masking step assigns zeros by (row, column) index.
    pct_test : float, default 0.2
        Fraction (0..1) of the non-zero interactions to hide from the
        training matrix. The count is rounded *up* to a whole interaction.
    seed : int, default 0
        Seed of the private random generator used to pick the hidden
        interactions. The default reproduces the historical selection.

    Returns
    -------
    training_set : sparse matrix
        Copy of ``ratings`` with the selected interactions removed.
    test_set : sparse matrix
        Copy of ``ratings`` with every non-zero entry set to 1. Note that it
        contains ALL interactions, including those kept in ``training_set``.
    users_altered : list
        Unique row indices that had at least one interaction hidden.

    Raises
    ------
    ValueError
        If ``pct_test`` is outside the closed interval [0, 1].
    """
    if not 0 <= pct_test <= 1:
        raise ValueError('pct_test must be between 0 and 1, got %r' % (pct_test,))

    test_set = ratings.copy() # Make a copy of the original set to be the test set.
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set.

    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # (user, item) index pairs

    # Round the number of hidden interactions up to the next whole number.
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs)))

    # A private generator keeps the split reproducible without reseeding the
    # global `random` module behind the caller's back.
    rng = random.Random(seed)
    samples = rng.sample(nonzero_pairs, num_samples) # Sample user-item pairs without replacement
    user_inds = [index[0] for index in samples] # Get the user row indices
    item_inds = [index[1] for index in samples] # Get the item column indices

    if samples: # Nothing to mask when pct_test is 0 or the matrix is empty
        training_set[user_inds, item_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    return training_set, test_set, list(set(user_inds)) # Output the unique list of user rows that were altered

In [0]:
product_train, product_test, product_users_altered = make_train(purchases_sparse, pct_test = 0.2)

In [0]:
# Load the item lookup table (the first line of the CSV supplies the column
# names) and preview the first five rows.
item_lookup = pd.read_csv(
    'cvs_data_file/item_lookup.csv',
    header=0,
)
item_lookup.head(n=5)

In [0]:
import pickle
from lightfm import LightFM
from lightfm.evaluation import precision_at_k,auc_score


# Instantiate and train the model.
# random_state fixes the model's initialisation/sampling seed so reruns are
# comparable. NOTE(review): with num_threads > 1 the fit may still not be
# bit-for-bit reproducible -- confirm if exact reproducibility matters.
model = LightFM(loss='warp', random_state=0)
model.fit_partial(product_train, epochs=40, num_threads=2)

# Persist the fitted model so it can be reloaded without retraining.
with open('saved_model','wb') as f:
    saved_model={'model':model}
    pickle.dump(saved_model, f)


# Evaluate the trained model.
train_auc = auc_score(model, product_train).mean()

# `product_test` is the full binarised interaction matrix, so it still
# contains every interaction the model was trained on. Scoring it directly
# would report a blend of train and test performance. Keep only the
# interactions that were hidden from training, and tell LightFM which
# positives are already known so they are left out of the ranking.
product_heldout = product_test - product_test.multiply(product_train != 0)
product_heldout.eliminate_zeros()
test_auc = auc_score(model, product_heldout, train_interactions=product_train).mean()

print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))