# Purpose

The purpose of this notebook is to build a factorization machine model using the MovieLens dataset. This consists of the following steps:
1. Load the MovieLens data
2. Preprocess the data and format it into a sparse matrix
3. Train/test split the sparse data
4. Calculate baseline scores for the popularity model vs. the factorization machine model
5. Tune the model

In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
cd ../

/Users/scottcronin/gh/recommender_deployed


In [3]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import copy
import pandas as pd
import pickle
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
import scipy.sparse as scs
from sklearn.base import TransformerMixin
from sklearn.externals import joblib
from sklearn.metrics.pairwise import cosine_similarity
from lightfm import LightFM, cross_validation, evaluation

sns.set_context('notebook', font_scale=1.4)

  return f(*args, **kwds)


# Load Data

In [4]:
# Read the '::'-separated ratings file. It has no header row, so all four column
# names are supplied here and the timestamp column is dropped at load time.
ratings_path = 'data/ratings.dat'
all_columns = ['uid', 'iid', 'rating', 'timestamp']
interactions = pd.read_csv(
    ratings_path,
    sep='::',
    engine='python',
    header=None,
    names=all_columns,
    usecols=all_columns[:-1],
)

# Preview five random rows, then report the frame's dimensions.
display(interactions.sample(5))
n_rows, n_cols = interactions.shape
print('Shape: {:>9,} x {}'.format(n_rows, n_cols))

Unnamed: 0,uid,iid,rating
3172023,22908,7153,3.5
1491751,10961,2324,5.0
5364696,38328,628,4.5
9581748,68670,1339,3.0
6510704,46568,3735,4.0


Shape: 10,000,054 x 3


# Preprocess data

In [5]:
from app.preprocess import Preprocessor
# Fit the project's Preprocessor on the raw ratings and transform them in one step.
# min_rating=4.0 is the same threshold as the `rating >=4.0` filter used later when
# checking a user's historical likes; presumably lower ratings are not treated as
# positive interactions — TODO confirm in app.preprocess.
pp = Preprocessor(min_rating=4.0)
# `csr` is what the model is fitted on below; presumably a scipy sparse matrix — confirm.
csr = pp.fit_transform(interactions)

# Build a model

In [6]:
from app.models import FM
# LightFM model with 30 components, WARP loss and a learning rate of 0.05.
# NOTE(review): no random_state is passed, so repeated runs may not reproduce
# the same embeddings or scores.
lfm = LightFM(no_components=30, loss='warp', learning_rate=0.05)
# Project wrapper that bundles the LightFM model with the already-fitted preprocessor.
fm = FM(fm_model=lfm, preprocessor=pp)
# Train for 3 epochs on the full preprocessed matrix (no held-out split is applied here).
fm.fit(csr, epochs=3)

# Calculate cosine similarities on item embedding vectors

In [10]:
# Pairwise cosine similarity between the item embedding vectors held by the wrapped model.
cs = cosine_similarity(fm.model.item_embeddings)
# Row i lists item indices ordered from most to least similar to item i:
# argsort is ascending, so the scores are negated to get a descending ranking.
sims = np.argsort(-cs)

In [40]:
# Take the first 20 entries of the wrapper's popularity ranking
# (presumably item indices ordered by popularity — confirm in app.models).
# An earlier variant hand-picked item ids and mapped them through fm.iid_to_idx instead.
pop_idxs = fm.pop_model[:20]
# Poster lookup, indexed below as POSTERS[iid]; presumably maps an item id to a poster path.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
POSTERS = joblib.load('app/objects/posters.pkl.gz')
# URL prefix that each poster path is appended to.
BASE_URL = 'https://image.tmdb.org/t/p/w200'

In [41]:
pop_idxs

array([622,  80, 528,   7,  23,  75,  22, 116, 141,  19, 133,  81, 118,
       770,  25,  14, 120,  48, 285,  83])

In [44]:
# First ten entries of the similarity ranking for the item at index 23 (23 is the fifth
# entry of the pop_idxs listing shown above). The item itself presumably ranks first,
# since its self-similarity is maximal — confirm before presenting these as "similar items".
idxs = sims[23, :][:10]
# Map each item index back to its item id, look up the poster path and build the full URL.
urls = [BASE_URL + POSTERS[fm.idx_to_iid[idx]] for idx in idxs]

In [45]:
from IPython.display import HTML
# Render every poster URL as its own inline <img> element in the cell output.
for url in urls:
    display(HTML('<img src="{}">'.format(url)))

Let's build a popularity model simply by zeroing out the user and item embedding vectors.

In [25]:
# Fetch historical likes for user 12244 through the FM wrapper. The second argument (10)
# is presumably the number of items to return — the check further down shows 10 entries.
likes = fm.get_historical_likes(12244, 10)

In [26]:
# Fixed user id (the same one passed to get_historical_likes) so the check is repeatable;
# to spot-check a random user instead, sample one value from interactions.uid.
uid = 12244
# All raw ratings by this user at or above 4.0, the same threshold given to the Preprocessor.
t = interactions.query("uid == @uid and rating >=4.0")

In [32]:
# Sanity check: each returned like should be among the user's raw ratings >= 4.0,
# so every row of the resulting boolean Series is expected to be True.
pd.Series(likes).isin(t.iid)

0    True
1    True
2    True
3    True
4    True
5    True
6    True
7    True
8    True
9    True
dtype: bool

In [34]:
# NOTE(review): the recorded run of this cell failed with
# "PicklingError: Can't pickle <class 'app.models.fm.FM'>: it's not the same object as app.models.fm.FM".
# Likely cause: `%autoreload 2` re-imported app.models after `fm` was constructed, so the
# instance's class no longer matches the class in the module. Re-create `fm` (or restart
# the kernel) before dumping — TODO confirm.
joblib.dump(fm, 'app/objects/joblib_0.pkl')


PicklingError: Can't pickle <class 'app.models.fm.FM'>: it's not the same object as app.models.fm.FM

In [7]:
# Popularity baseline: clone the trained LightFM model (the original `lfm` is left
# untouched), then overwrite both embedding matrices with zeros in place. Every other
# fitted attribute on the copy stays as it was; presumably the remaining bias terms
# drive the ranking — confirm against LightFM's scoring.
pop = copy.deepcopy(lfm)
for embedding_name in ('user_embeddings', 'item_embeddings'):
    getattr(pop, embedding_name)[:, :] = 0.0

In [8]:
def evaluate_model(model, train, test, num_threads=2):
    """Score a fitted model with LightFM's reciprocal-rank and AUC metrics.

    Both metrics receive exactly the same arguments, so the two numbers are
    computed on the same data and are directly comparable.

    Parameters
    ----------
    model
        Fitted model accepted by ``lightfm.evaluation`` (here a ``LightFM``
        instance or a modified copy of one).
    train
        Interactions forwarded as ``train_interactions``.
    test
        Interactions forwarded as ``test_interactions``.
    num_threads : int, optional
        Number of threads handed to both metric functions. Defaults to 2,
        the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(model_rr, model_auc)``: the results of
        ``evaluation.reciprocal_rank`` and ``evaluation.auc_score``, in that
        order. Callers in this notebook take ``.mean()`` of each.
    """
    # Build the keyword arguments once so the two metric calls cannot drift apart.
    shared_kwargs = dict(
        model=model,
        test_interactions=test,
        train_interactions=train,
        num_threads=num_threads,
    )
    model_rr = evaluation.reciprocal_rank(**shared_kwargs)
    model_auc = evaluation.auc_score(**shared_kwargs)
    return model_rr, model_auc

In [9]:
%%time
# Score the trained model and the popularity baseline with the same train/test inputs.
# NOTE(review): `tr` and `te` are not assigned in any cell visible in this notebook, and
# `fm.fit` above is called on the full `csr`, so on Restart & Run All this cell raises
# NameError. Recreate the split (e.g. with the imported lightfm `cross_validation` module)
# and fit on the training part before trusting the scores reported below.
fm_rr, fm_auc = evaluate_model(lfm, tr, te)
pop_rr, pop_auc = evaluate_model(pop, tr, te)

CPU times: user 4min 53s, sys: 1.82 s, total: 4min 55s
Wall time: 4min 56s


In [24]:
# One shared layout for both models: a header line followed by two tab-indented,
# right-aligned metric lines, each value shown with three significant digits.
report_template = '{:>10}:\n\t{:>20}: {:0.3}\n\t{:>20}: {:0.3}'

for label, rr_scores, auc_scores in (
    ('Factorization Machine', fm_rr, fm_auc),
    ('Popularity Model', pop_rr, pop_auc),
):
    print(report_template.format(
        label,
        'Mean Reciprocal Rank', rr_scores.mean(),
        'Mean ROC-AUC', auc_scores.mean(),
    ))

Factorization Machine:
	Mean Reciprocal Rank: 0.41
	        Mean ROC-AUC: 0.972
Popularity Model:
	Mean Reciprocal Rank: 0.281
	        Mean ROC-AUC: 0.944


In [38]:
# Persist the trained factorization machine and the popularity baseline.
#
# The notebook changes into the repository root near the top (`cd ../`) and reads
# 'data/ratings.dat' and 'app/objects/posters.pkl.gz' relative to that directory.
# The previous os.path.dirname(os.getcwd()) therefore pointed one level ABOVE the
# repository, so the models directory is now anchored on the working directory
# like every other path in the notebook.
model_dir = os.path.join(os.getcwd(), 'models')
# Create the target directory when it is missing so the dump cannot fail on it.
os.makedirs(model_dir, exist_ok=True)

for model, basename in (
    (lfm, 'fm_no_side_data.pkl.gz'),
    (pop, 'popularity.pkl.gz'),
):
    fn = os.path.join(model_dir, basename)
    # Assign the return value to `_` so the list of written files is not echoed as cell output.
    _ = joblib.dump(model, fn)