In [1]:
#importing libraries
import numpy as np
import numpy.ma as ma
import pandas as pd
# We do this to ignore several specific Pandas warnings
import warnings
warnings.filterwarnings("ignore")
import sys
import scipy.sparse as sparse
from scipy.sparse import coo_matrix, csr_matrix
from numpy import bincount, log, sqrt
import itertools
import time
import pickle
# lightfm 
from lightfm import LightFM as lightfm
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.data import Dataset
from sklearn.model_selection import train_test_split as train_test_split

Note:
Throughout the notebook, I applied pickle liberally because 1)many models and evaluations take a long time to run, especially with WARP loss funcions. 2)Pickle allows us to preserve the state of splie between train/test interactions as well as model's state of output for performance comparison.

Step 1: Data Preperation
In order to prepare the data in a format that works for a recommendation system, we'll need to arrange it in a matrix format where the product id are listed as the the columns, and user ids are listed as the rows.

For the "rating" part of the equation, although we do not have customers' "review" for each product they have purchased, since our expansive dataset covers a customers last 3-99 orders, we can reasonably believe that the number of times a customer has purchase a product throughout their order history is by nature a rating. 

In [56]:
#reading in data. 
prior = pd.read_csv('order_products__prior.csv')
orders = pd.read_csv('orders.csv')

Here, we can see that the prior dataset maps the customer to their order id, but what product is included in each order is hosted in another dataset, orders. We'll merge these two datasets together so that we have a raw dataset that contains both customers and products they have bought across multiple orders. 

In [None]:
#displaying the layout of the data sets
prior.head()

In [None]:
#displaying the layout of the data sets
orders.head()

In [42]:
#merging prior order with order id
customer_orders_product= pd.merge(orders, prior, on="order_id")
#extracting only user and product id information 
customer_orders_product = customer_orders_product[["user_id", "product_id"]]
customer_orders_product.head()

Unnamed: 0,user_id,product_id
0,1,196
1,1,14084
2,1,12427
3,1,26088
4,1,26405


The dataframe customer_orders_product now hosts users and the products they've ordered in the past. Now we move on to count the number of times each user purchased a product as discussed.

In [47]:
#summarizing how many of each product each customer bought and creating a new column as "purchase."
customer_product_purchase=customer_orders_product.groupby(["user_id","product_id"]).size().reset_index(name='purchase')
#pickle.dump(customer_product_purchase,open('customer_product_purchase.p','wb'))

With the 'customer_product_purchase' dataframe, which now includes three columns: user_id, product_id, purchase, we'll conduct some prelimitary exploration.

In [None]:
#how many unique products are in this dataset?
print ("Unique products in the dataset: " + str(len(customer_product_purchase['product_id'].unique())))
#how many unique customers are there?
print ("Unique customers in the dataset: " + str(len(customer_product_purchase['user_id'].unique())))
#frequency of purchase: what are the most frequently purchase items by customers?
customer_product_purchase.nlargest(20, 'purchase')

LightFM requires that the rows and columns of the matrix be the consequtive integers in increasing order; However, our dataset already has user_id in this nature; we'll need to map product_id differently.

In [5]:
#checking number of unique users and user_id
def index_mapping_check(array):
    n = len(array) - 1 
    if sum(np.diff(sorted(array)) == 1) >= n:
        print ("data can be indexed as the consequtive integers")
    else:
        print ("please reformat data")
user_id=customer_product_purchase['user_id'].unique().astype('int')
product_id=customer_product_purchase['product_id'].unique().astype('int')
index_mapping_check(user_id)
index_mapping_check(product_id)

data can be indexed as the consequtive integers
please reformat data


In [6]:
def index_creation(array):
    """taking in an array of data and creating an index representing the array.
    returning 2 dictionaries: index_id and id_index."""
    index_id= {}
    id_index= {}
    for index, id in enumerate(array):
        id_index[id] = index
        index_id[index] = id
    
    return index_id,id_index

In [7]:
index_to_product_id,product_id_to_index=index_creation(product_id)
pickle.dump(index_to_product_id, open( "index_to_product_id.p", "wb" ))
pickle.dump(product_id_to_index, open( "product_id_to_index.p", "wb" ))

In [115]:
product_id_to_index= pickle.load(open( "product_id_to_index.p", "rb" ))
index_to_product_id=pickle.load(open( "index_to_product_id.p", "rb" ))

In [8]:
#since user_id and product_id are by nature categorical varaibles, we'll encode them as such to prep for the training matrix transformation.
customer_product_purchase['user_id']=customer_product_purchase['user_id'].astype('category')
customer_product_purchase['product_id']=customer_product_purchase['product_id'].astype('category')

In [None]:
#creating a train_matrix that has user_id on the rows, product_id as the columns, and purchase as the value.
customer_product_purchase_matrix = sparse.coo_matrix((customer_product_purchase['purchase'],(customer_product_purchase['user_id'].cat.codes.copy(),customer_product_purchase['product_id'].apply(lambda x: product_id_to_index[x]).cat.codes.copy())))
#saving the meatrix to the file
sparse.save_npz('matrix_user_product_purchase.npz', customer_product_purchase_matrix)

In [213]:
#customer_product_purchase_matrix=sparse.load_npz('matrix_user_product_purchase.npz')

Step 2: Modeling and Evaluation
With the prepared matrix, we'll now conduct modeling and evaluation with lightFM's built-in functins. 

In [214]:
#splitting train/test matrices with a test percentage of 20% 
train_matrix, test_matrix=random_train_test_split(customer_product_purchase_matrix,test_percentage=0.2)

In [215]:
#saving the matrix here to preserve the state of the split
pickle.dump(train_matrix, open( "train_matrix.p", "wb" ) )
pickle.dump(test_matrix, open( "test_matrix.p", "wb" ) )

In [216]:
#creating a lightfm model instance with all default parameters except the loss function, where the default is logistic
model_collaborative_filtering= lightfm(loss = "warp")
#fitting the model
model_collaborative_filtering.fit(train_matrix, epochs=1, num_threads=4)
pickle.dump(model_collaborative_filtering, open( "model_collaborative_filtering.p", "wb" ) )

In [121]:
#model_collaborative_filtering=pickle.load(open( "model_collaborative_filtering.p", "rb" ))

LightFM provides a variety of methods to evaluate the accuracy of the model prediction. To get a "general idea" of how well the model fit, we'll first apply AUC score, which randomly takes a pair of postive(confirmed bought by the customer in our case) and negative(confirmed not bought by the customer in our case) and compare their recommendation scores. If the model is accruate, the recommendation score for the positive item should be higher than that of the negative item. A perfect score for AUC is 1, meaning the aforementioned scenario applied to all pairs. Correspondingly, the worst score for AUC is 0. 

In [None]:
#evaluating the accuracy with auc. Since this part is iterative and time-consuming, we set a timer here to monitor how long it's been running.
start = time.time()
auc_collaborative_filtering = auc_score(model=model_collaborative_filtering,test_interactions = test_matrix, num_threads = 3, check_intersections = False)
end = time.time()

print("time for evaluation = {0:.{1}f} seconds".format(end - start, 2))
print("AUC score = {0:.{1}f}".format(auc_collaborative_filtering.mean(), 2))
pickle.dump(auc_collaborative_filtering, open( "auc_collaborative_filtering.p", "wb" ) )

Now for precision evaluation, we'll take the average order size from the original prior data here and define k=10. Precison at k measures the percentage of total actually purchased items that ended up amongst top k of the recommendations. A perfect score is 1 and the worst score is 0.

In [None]:
#precision evaluation
start = time.time()
Model_precision_collaborative_filtering=precision_at_k(model = model_collaborative_filtering, 
                        test_interactions = test_matrix,k=10,
                        num_threads = 4, check_intersections = False)
end = time.time()
pickle.dump(Model_precision_collaborative_filtering, open( "Model_precision_collaborative_filtering.p", "wb" ) )
print("Precision at k score = {0:.{1}f}".format(auc_collaborative_filtering.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

To make our recommendation more understandable, we'll now convert the items id back to item names using the product dataset.

In [225]:
#products=pd.read_csv('products.csv')
products[(products.product_id==37117)]

Unnamed: 0,product_id,product_name,aisle_id,department_id
37116,37117,Health Elderberry Immune Defense Herbal Capsul...,47,11


In [98]:
product_dictionary=products.set_index('product_id')['product_name'].to_dict()
#pickle.dump(product_dictionary, open( "product_dictionary.p", "wb" ) )

In [120]:
def sample_recommendation(model, matrix, user_ids):
    """Takes in a lightFM model, user-item interaction matrix, and list of user ids.
    Output the known purchase from a user, and top 3 recommendation based on lightFM 
    prediction score."""

    n_users, n_items = matrix.shape
    items=range(n_items)
    for user_id in user_ids:
        known_positives = matrix.tocsr()[user_id].indices
        
        know_positives_products=[]
        for i in known_positives:
            know_positives_products.append(product_dictionary[index_to_product_id[i]])

        scores = model.predict(user_id, np.arange(n_items))
        top_items = np.argsort(-scores)
        top_items = top_items[:10]
        
        top_items_products=[]
        for i in top_items:
            top_items_products.append(product_dictionary[index_to_product_id[i]])
        

        print("User %s" % user_id)
        print("     Customer already have:")

        for x in know_positives_products:
            print("        %s" % x)

        print("     Recommended:")

        for x in top_items_products:
            print("        %s" % x)

In [144]:
def get_actual_purchase(matrix, user_ids):
    for user_id in user_ids:
        actual_purchase = matrix.tocsr()[user_id].indices  
        actual_purchase_products=[]
        for i in actual_purchase:
            actual_purchase_products.append(product_dictionary[index_to_product_id[i]])
        print("User %s" % user_id)
        print ("     Customer already have:")

        for x in actual_purchase_products:
            print("        %s" % x)

In [134]:
def get_similar_tags(model, product_id):
    # Define similarity as the cosine of the angle
    # between the tag latent vectors

    # Normalize the vectors to unit length
    tag_embeddings = (model.item_embeddings.T
                      / np.linalg.norm(model.item_embeddings, axis=1)).T

    query_embedding = tag_embeddings[product_id_to_index[product_id]]
    similarity = np.dot(tag_embeddings, query_embedding)
    most_similar = np.argsort(-similarity)[1:10]
    most_similar_products=[product_dictionary[index_to_product_id[i]]for i in most_similar]
    

    return most_similar_products

In [206]:
tag_embeddings = (model_collaborative_filtering.item_embeddings.T
                      / np.linalg.norm(model_collaborative_filtering.item_embeddings, axis=1)).T

query_embedding = tag_embeddings[product_id_to_index[37117]]
similarity = np.dot(tag_embeddings, query_embedding)
most_similar = np.argsort(-similarity)[1:10]

After testing out the collaborative fitering model, we'll add in the item and user features. In the previous step, we manually created a matrix of user-product interaction. With the new model, we'll try out a different method and use lightFM's built-in dataset tools to create and merge user-product interaction and user/product features.

In [44]:
#Creating a custome_gender dataframe that classifies customers that bought products from feminine care aisle as female, and others as male.
product_feature=products[["product_id","aisle_id"]]
customer_product_aisle=pd.merge(product_feature,customer_orders_product, on="product_id")
def label_gender (i):
    if i == 126:
        return 'f'
    else:
        return 'm'

customer_product_aisle['gender']=customer_product_aisle['aisle_id'].apply(lambda x: label_gender(x))
customer_gender=customer_product_aisle[['user_id','gender']]
customer_gender['gender']=customer_gender['gender'].astype('category')
#pickle.dump(customer_gender,open("customer_gender.p","wb"))

In [57]:
#starting a dataset incident and fitting it to the original cusotmer_product_purchase datafrmae
dataset = Dataset()
dataset.fit((x[0] for x in customer_product_purchase.itertuples(index=False)),
            (x[1] for x in customer_product_purchase.itertuples(index=False)))

#fitting the same dataframe with user feature
dataset.fit_partial(users=(x[0] for x in customer_gender.itertuples(index=False)),items=None,
            user_features=(x[1] for x in customer_gender.itertuples(index=False)))

#building user-product interaction as interaction. weights is another product from the function that's not as relevent ot our use case.
interactions, weights = dataset.build_interactions(((x[0], x[1])
                                                      for x in customer_product_purchase.itertuples(index=False)))
#build user feature
user_features = dataset.build_user_features((x[0], [x[1]])
                                              for x in customer_gender.itertuples(index=False))

#split the train-test matrices
train_interactions, test_interactions=random_train_test_split(interactions,test_percentage=0.2)

In [58]:
#creating a lightfm model instance with all default parameters except the loss function, where the default is logistic
model_hybrid= lightfm(loss = "warp")
#fitting the model with additional user features
model_hybrid.fit(train_interactions,
                 user_features=user_features, 
          epochs=1, 
          num_threads=4)
pickle.dump(model_hybrid,open('model_hybrid.p','wb'))

<lightfm.lightfm.LightFM at 0x10cf96a10>

In [63]:
#evaluating the accuracy with auc. Since this part is iterative and time-consuming, we set a timer here to monitor how long it's been running.
start = time.time()
auc_hybrid = auc_score(model=model_hybrid,test_interactions = train_interactions, num_threads = 4, check_intersections = False,user_features=user_features,item_features=None)
end = time.time()

print("time for evaluation = {0:.{1}f} seconds".format(end - start, 2))
print("AUC score for hybrid method= {0:.{1}f}".format(auc_hybrid.mean(), 2))
#pickle.dump(auc_hybrid,open("auc_hybrid.p", "wb"))

time for evaluation = 483.87 seconds


NameError: name 'auc_collaborative_filtering' is not defined

In [81]:
#precision evaluation
start = time.time()
model_precision_hybrid=precision_at_k(model = model_hybrid, 
                        test_interactions = test_interactions,k=10,user_features=user_features, item_features=None,
                        num_threads = 4, check_intersections = False)
end = time.time()
#pickle.dump(model_precision_hybrid, open("model_precision_hybrid.p", "wb" ) )
print("precision score for hybrid method= {0:.{1}f}".format(model_precision_hybrid.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

precision score for hybrid method= 0.05
time taken for precision at k evaluation = 396.53 seconds


Now we add on product feature to see if there's any additional improvement.

In [84]:
dataset2 = Dataset()
dataset2.fit((x[0] for x in customer_product_purchase.itertuples(index=False)),(x[1] for x in customer_product_purchase.itertuples(index=False)))

dataset2.fit_partial(users=(x[0] for x in customer_gender.itertuples(index=False)),
                     items=(x[0] for x in product_feature.itertuples(index=False)),
                     user_features=(x[1] for x in customer_gender.itertuples(index=False)),
                     item_features=(x[1] for x in product_feature.itertuples(index=False)))

interactions2, weights2 = dataset2.build_interactions(((x[0], x[1])
                                                      for x in customer_product_purchase.itertuples(index=False)))

user_features2 = dataset2.build_user_features((x[0], [x[1]])
                                              for x in customer_gender.itertuples(index=False))
item_features2 = dataset2.build_item_features((x[0], [x[1]])
                                              for x in product_feature.itertuples(index=False))

In [85]:
#splitting train/test matrices with a test percentage of 20%
train_matrix2, test_matrix2=random_train_test_split(interactions2,test_percentage=0.2)

In [86]:
#creating a lightfm model instance with all default parameters except the loss function, where the default is logistic
model_hybrid2= lightfm(loss = "warp")
#fitting the model
model_hybrid2.fit(train_matrix2,
                  user_features=user_features2,
                  item_features=item_features2,
                  epochs=1, 
                  num_threads=4)
#pickle.dump(model_hybrid2, open("model_hybrid2.p", "wb" ) )

In [101]:
#evaluating the accuracy with auc. Since this part is iterative and time-consuming, we set a timer here to monitor how long it's been running.
start = time.time()
auc_hybrid2 = auc_score(model=model_hybrid2,test_interactions = test_matrix2, num_threads = 4, check_intersections = False,user_features=user_features2,item_features=item_features2)
end = time.time()
#pickle.dump(auc_hybrid2, open("auc_hybrid2.p", "wb" ) )
print("time for evaluation = {0:.{1}f} seconds".format(end - start, 2))
print("AUC score for hybrid method= {0:.{1}f}".format(auc_hybrid2.mean(), 2))

time for evaluation = 451.82 seconds
AUC score for hybrid method= 0.92


In [132]:
#precision evaluation
start = time.time()
model_precision_hybrid2=precision_at_k(model = model_hybrid2, 
                        test_interactions = test_matrix2,k=10,user_features=user_features2, item_features=item_features2,
                        num_threads = 4, check_intersections = False)
end = time.time()
#pickle.dump(model_precision_hybrid2, open("model_precision_hybrid2.p", "wb" ) )
print("precision score for hybrid method= {0:.{1}f}".format(model_precision_hybrid2.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

precision score for hybrid method= 0.04874
time taken for precision at k evaluation = 455.46 seconds


So using both item and user feature didn't really improve the model performance beyond using user feature alone. What about using item feature by itself?

In [171]:
dataset3 = Dataset()
dataset3.fit((x[0] for x in customer_product_purchase.itertuples(index=False)),(x[1] for x in customer_product_purchase.itertuples(index=False)))

dataset3.fit_partial(items=(x[0] for x in product_feature.itertuples(index=False)),
                     item_features=(x[1] for x in product_feature.itertuples(index=False)))

interactions3, weights3 = dataset3.build_interactions(((x[0], x[1])
                                                      for x in customer_product_purchase.itertuples(index=False)))

item_features3 = dataset3.build_item_features((x[0], [x[1]])
                                              for x in product_feature.itertuples(index=False))

In [72]:
train_matrix3, test_matrix3=random_train_test_split(interactions3,test_percentage=0.2)
#pickle.dump(train_matrix3, open( "train_matrix3.p", "wb" ) )
#pickle.dump(test_matrix3, open( "test_matrix3.p", "wb" ) )

In [173]:
#creating a lightfm model instance with all default parameters except the loss function, where the default is logistic
model_hybrid3=lightfm(loss = "warp")
#fitting the model
model_hybrid3.fit(train_matrix3,
                  item_features=item_features3,
                  epochs=1, 
                  num_threads=4)
pickle.dump(model_hybrid3, open("model_hybrid3.p", "wb" ))

In [75]:
#evaluating the accuracy with auc. Since this part is iterative and time-consuming, we set a timer here to monitor how long it's been running.
start = time.time()
auc_hybrid3 = auc_score(model=model_hybrid3,test_interactions = test_matrix3, num_threads = 4, check_intersections = False,item_features=item_features3)
end = time.time()
pickle.dump(auc_hybrid3, open("auc_hybrid3.p", "wb" ) )
print("time for evaluation = {0:.{1}f} seconds".format(end - start, 2))
print("AUC score for hybrid method= {0:.{1}f}".format(auc_hybrid3.mean(), 2))

time for evaluation = 459.28 seconds
AUC score for hybrid method= 0.93


In [76]:
#precision evaluation
start = time.time()
model_precision_hybrid3=precision_at_k(model = model_hybrid3, 
                        test_interactions = test_matrix3,k=10,item_features=item_features3,
                        num_threads = 4, check_intersections = False)
end = time.time()
pickle.dump(model_precision_hybrid3, open("model_precision_hybrid3.p", "wb" ) )
print("precision score for hybrid method= {0:.{1}f}".format(model_precision_hybrid3.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

precision score for hybrid method= 0.05
time taken for precision at k evaluation = 453.17 seconds


## Apply to Train
Note: This part of the code was not presented.

In the previous section, we applied the lightFM model on the "prior" dataset. The data in the prior dataset is aggregation in natare, and the grain of the dataset is user.
In the "train" dataset, we have one order from each individual user, which provides a unique opporunity for us to test how lightFM would perform on an order basis.

In [57]:
#reading into the dataset
train=pd.read_csv('order_products__train.csv')
#merging prior order with order id
customer_orders_product_train= pd.merge(orders, train, on="order_id")
#extracting only user and product id information 
customer_orders_product_train = customer_orders_product_train[["user_id", "product_id"]]
customer_product_purchase_train=customer_orders_product_train.groupby(["user_id","product_id"]).size().reset_index(name='purchase')
customer_product_purchase_train["user_id"]=customer_orders_product_train["user_id"].astype('category')
customer_product_purchase_train["product_id"]=customer_orders_product_train["product_id"].astype('category')
pickle.dump(customer_product_purchase_train, open("customer_product_purchase_train.p", "wb" ) )

In [58]:
#creating a dataset using the lightFM native methods
dataset_train = Dataset()
dataset_train.fit((x[0] for x in customer_product_purchase_train.itertuples(index=False)),(x[1] for x in customer_product_purchase_train.itertuples(index=False)))

dataset_train.fit_partial(users=(x[0] for x in customer_gender.itertuples(index=False)),
                     items=(x[0] for x in product_feature.itertuples(index=False)),
                     user_features=(x[1] for x in customer_gender.itertuples(index=False)),
                     item_features=(x[1] for x in product_feature.itertuples(index=False)))

user_features_train = dataset_train.build_user_features((x[0], [x[1]])
                                              for x in customer_gender.itertuples(index=False))
item_features_train = dataset_train.build_user_features((x[0], [x[1]])
                                              for x in product_feature.itertuples(index=False))

interactions_train, weights_train = dataset_train.build_interactions(((x[0], x[1])
                                                      for x in customer_product_purchase_train.itertuples(index=False)))

train_matrix_modeling, test_matrix_testing=random_train_test_split(interactions_train,test_percentage=0.2)

In [60]:
#creating a lightfm model instance with all default parameters except the loss function, where the default is logistic
model_hybrid_train= lightfm(loss = "warp")
#fitting the model with user features and item features on the 'train' dataset
model_hybrid_train.fit(train_matrix_modeling,
                 user_features=user_features_train,
                 item_features=item_features_train,
                       epochs=1, 
                       num_threads=4)
pickle.dump(model_hybrid_train, open("model_hybrid_train.p", "wb" ) )

In [61]:
#evaluating the accuracy with auc. Since this part is iterative and time-consuming, we set a timer here to monitor how long it's been running.
start = time.time()
auc_hybrid_train = auc_score(model=model_hybrid_train,test_interactions = test_matrix_testing, num_threads = 4, check_intersections = False,user_features=user_features_train,item_features=item_features_train)
end = time.time()

print("time for evaluation = {0:.{1}f} seconds".format(end - start, 2))
print("AUC score for hybrid method= {0:.{1}f}".format(auc_hybrid_train.mean(), 2))
pickle.dump(auc_hybrid_train, open("auc_hybrid_train.p", "wb" ) )

time for evaluation = 192.90 seconds
AUC score for hybrid method= 0.92


The sad moment when we realized that applying user and item feature to the 'train' dataset gets us an abismal precision at k score...

In [62]:
#precision evaluation
start = time.time()
model_precision_hybrid_train=precision_at_k(model = model_hybrid_train, 
                        test_interactions = test_matrix_testing,k=10,user_features=user_features_train, item_features=item_features_train,
                        num_threads = 4, check_intersections = False)
end = time.time()
pickle.dump(model_precision_hybrid_train, open("model_precision_hybrid_train.p", "wb" ) )
print("precision score for hybrid method= {0:.{1}f}".format(model_precision_hybrid_train.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

precision score for hybrid method= 0.01
time taken for precision at k evaluation = 197.40 seconds


Looking at the results above...maybe we are creating too much noise with additional features since a lot of items have missing features. let's try it again with only user.

In [63]:
dataset_train_noitem = Dataset()
dataset_train_noitem.fit((x[0] for x in customer_product_purchase_train.itertuples(index=False)),(x[1] for x in customer_product_purchase_train.itertuples(index=False)))

dataset_train_noitem.fit_partial(users=(x[0] for x in customer_gender.itertuples(index=False)),
                     user_features=(x[1] for x in customer_gender.itertuples(index=False)))

user_features_train_noitem = dataset_train.build_user_features((x[0], [x[1]])
                                              for x in customer_gender.itertuples(index=False))

interactions_train_noitem, weights_train_noitem = dataset_train.build_interactions(((x[0], x[1])
                                                      for x in customer_product_purchase_train.itertuples(index=False)))

train_matrix_noitem, test_matrix_noitem=random_train_test_split(interactions_train_noitem,test_percentage=0.2)

In [66]:
#creating a lightfm model instance with all default parameters except the loss function, where the default is logistic
model_hybrid_train_noitem= lightfm(loss = "warp")
#fitting the model
model_hybrid_train_noitem.fit(train_matrix_noitem,
                 user_features=user_features_train_noitem,
                 item_features=None,
          epochs=1, 
          num_threads=4)
#pickle.dump(model_hybrid_train_noitem, open("model_hybrid_train_noitem.p", "wb" ) )

In [67]:
#evaluating the accuracy with auc. Since this part is iterative and time-consuming, we set a timer here to monitor how long it's been running.
start = time.time()
auc_hybrid_train_noitem= auc_score(model=model_hybrid_train_noitem,test_interactions = test_matrix_noitem, num_threads = 4, check_intersections = False,user_features=user_features_train_noitem,item_features=None)
end = time.time()
#pickle.dump(auc_hybrid_train_noitem, open("auc_hybrid_train_noitem.p", "wb" ) )
print("time for evaluation = {0:.{1}f} seconds".format(end - start, 2))
print("AUC score for hybrid method= {0:.{1}f}".format(auc_hybrid_train_noitem.mean(), 2))

time for evaluation = 180.77 seconds
AUC score for hybrid method= 0.92


In [68]:
#precision evaluation
start = time.time()
model_precision_hybrid_train_noitem=precision_at_k(model = model_hybrid_train_noitem, 
                        test_interactions = test_matrix_noitem,k=10,user_features=user_features_train_noitem, item_features=None,
                        num_threads = 4, check_intersections = False)
end = time.time()
#pickle.dump(model_precision_hybrid_train_noitem, open("model_precision_hybrid_train_noitem.p", "wb" ) )
print("precision score for hybrid method= {0:.{1}f}".format(model_precision_hybrid_train_noitem.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

precision score for hybrid method= 0.02
time taken for precision at k evaluation = 181.45 seconds
