## Load dataset
1. Ratings matrix
2. Item Feature matrix
3. User Feature matrix

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import pickle

In [2]:
# Load the main ratings matrix
f = open("data/X_mat_jupyter.data", 'rb')
X_mat = pickle.load(f)
f.close() # save the sparse matrix to a file

X_mat

<30755x359966 sparse matrix of type '<class 'numpy.int8'>'
	with 7377418 stored elements in Compressed Sparse Row format>

In [3]:
# Load feature matrices
X_item_features = pd.read_csv('data/X_item_features.csv')
X_user_features = pd.read_csv('data/X_user_features.csv')

## Run model

In [4]:
# Install LightFM if you didn't install it.
# !pip install LightFM

In [5]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import auc_score
from lightfm.evaluation import reciprocal_rank
import lightfm.cross_validation

In [6]:
(tr, ts) = lightfm.cross_validation.random_train_test_split(X_mat)

In [7]:
# Populate Feature Matrices
item_features_col_names = X_item_features.columns.values.tolist()
user_features_col_names = X_user_features.columns.values.tolist()
item_features_col_names.remove('item_id')
item_features_col_names.remove('Unnamed: 0')
user_features_col_names.remove('user_id')
user_features_col_names.remove('Unnamed: 0')
item_features = csr_matrix(X_item_features[item_features_col_names])
user_features = csr_matrix(X_user_features[user_features_col_names])

In [8]:
X_item_features.shape

(359966, 41)

In [9]:
X_user_features.shape

(30755, 42)

In [10]:
print(len(item_features_col_names))
print(len(user_features_col_names))

39
40


In [11]:
# Set the number of threads; you can increase this
# if you have more physical cores available.
NUM_THREADS = 8
NUM_COMPONENTS = 79
NUM_EPOCHS = 3 
ITEM_ALPHA = 1e-3 # L2 penalty on item features
USER_ALPHA = 1e-3 # L2 penalty on user features.

In [12]:
# Define a new model instance
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                user_alpha=USER_ALPHA,
                no_components=NUM_COMPONENTS,
                random_state=0)

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
model = model.fit(tr,
                  item_features=item_features,
                  user_features=user_features,
                  epochs=NUM_EPOCHS,
                  num_threads=NUM_THREADS)

## Recommendation System Evaluation

In [19]:
# Test score
test_auc = auc_score(model,
                    ts,
                    train_interactions=tr,
                    item_features=item_features,
                     user_features=user_features,
                    num_threads=NUM_THREADS).mean()
print('Hybrid test set AUC: %s' % test_auc)

Hybrid test set AUC: 0.83231


## Traditional Evaluation Metrics
### Set the median of the recommendation as a threshold

In [35]:
from sklearn.metrics import accuracy_score, auc, recall_score, precision_score, f1_score,  roc_curve

In [13]:
test_pred = model.predict_rank(ts, train_interactions=tr, item_features=item_features,
                  user_features=user_features, num_threads=NUM_THREADS)

# import pickle
# f = open("test_pred.data", 'wb')
# pickle.dump(pred, f)
# f.close() # save the sparse matrix to a file

In [44]:
threshold = np.median(test_pred.data)
print("Accuracy: ", accuracy_score(ts.tocsr().data>1, test_pred.data<threshold))
print("Precision: ", precision_score(ts.tocsr().data>1, test_pred.data<threshold))
print("Recall: ", recall_score(ts.tocsr().data>1, test_pred.data<threshold))
print("F1: ", f1_score(ts.tocsr().data>1, test_pred.data<threshold))
fpr, tpr, _ = roc_curve(ts.tocsr().data>1, test_pred.data<threshold)
print("AUC: ", auc(fpr, tpr))

Accuracy:  0.545850717459
Precision:  0.5487558898
Recall:  0.545568484123
F1:  0.547157545031
AUC:  0.545852365935


In [45]:
threshold = np.median(test_pred.data)*0.8
print("Accuracy: ", accuracy_score(ts.tocsr().data>1, test_pred.data<threshold))
print("Precision: ", precision_score(ts.tocsr().data>1, test_pred.data<threshold))
print("Recall: ", recall_score(ts.tocsr().data>1, test_pred.data<threshold))
print("F1: ", f1_score(ts.tocsr().data>1, test_pred.data<threshold))
fpr, tpr, _ = roc_curve(ts.tocsr().data>1, test_pred.data<threshold)
print("AUC: ", auc(fpr, tpr))

Accuracy:  0.544466764804
Precision:  0.552284873039
Recall:  0.497482567996
F1:  0.523453264342
AUC:  0.544741191287


In [46]:
threshold = np.median(test_pred.data)*0.9
print("Accuracy: ", accuracy_score(ts.tocsr().data>1, test_pred.data<threshold))
print("Precision: ", precision_score(ts.tocsr().data>1, test_pred.data<threshold))
print("Recall: ", recall_score(ts.tocsr().data>1, test_pred.data<threshold))
print("F1: ", f1_score(ts.tocsr().data>1, test_pred.data<threshold))
fpr, tpr, _ = roc_curve(ts.tocsr().data>1, test_pred.data<threshold)
print("AUC: ", auc(fpr, tpr))


Accuracy:  0.545386463018
Precision:  0.550508337126
Recall:  0.52329028902
F1:  0.536554359128
AUC:  0.545515522909


In [47]:
threshold = np.median(test_pred.data)*1.2
print("Accuracy: ", accuracy_score(ts.tocsr().data>1, test_pred.data<threshold))
print("Precision: ", precision_score(ts.tocsr().data>1, test_pred.data<threshold))
print("Recall: ", recall_score(ts.tocsr().data>1, test_pred.data<threshold))
print("F1: ", f1_score(ts.tocsr().data>1, test_pred.data<threshold))
fpr, tpr, _ = roc_curve(ts.tocsr().data>1, test_pred.data<threshold)
print("AUC: ", auc(fpr, tpr))

Accuracy:  0.546165190541
Precision:  0.545913798745
Recall:  0.580056493977
F1:  0.562467493433
AUC:  0.54596723736


In [48]:
threshold = np.median(test_pred.data)*1.4
print("Accuracy: ", accuracy_score(ts.tocsr().data>1, test_pred.data<threshold))
print("Precision: ", precision_score(ts.tocsr().data>1, test_pred.data<threshold))
print("Recall: ", recall_score(ts.tocsr().data>1, test_pred.data<threshold))
print("F1: ", f1_score(ts.tocsr().data>1, test_pred.data<threshold))
fpr, tpr, _ = roc_curve(ts.tocsr().data>1, test_pred.data<threshold)
print("AUC: ", auc(fpr, tpr))

Accuracy:  0.54585817264
Precision:  0.543980878676
Recall:  0.599629932105
F1:  0.570451440773
AUC:  0.54554410119
