# Purpose

The purpose of this notebook is to build a factorization machine model using the movielens dataset. This consists of the following steps:
1. Load in movielens data
2. preprocess the data, and format into sparse matrix
3. train test split the sparse data
4. Calculate baseline scores for popularity vs factorization machine model
5. model tuning

In [4]:
cd ../

/Users/scottcronin/gh/recommender_deployed


In [5]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import copy
import pandas as pd
import pickle
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
import scipy.sparse as scs
from sklearn.base import TransformerMixin
from sklearn.externals import joblib
from lightfm import LightFM, cross_validation, evaluation

sns.set_context('notebook', font_scale=1.4)

In [6]:
interactions = pd.read_csv('data/ratings.dat',
                           sep='::', engine='python',
                           header=None,
                           names=['uid', 'iid', 'rating', 'timestamp'],
                           usecols=['uid', 'iid', 'rating'],
                          )
display(interactions.sample(5))
print('Shape: {:>9,} x {}'.format(*interactions.shape))

Unnamed: 0,uid,iid,rating
2187027,15969,246,4.0
4675054,33395,4478,4.0
5167350,36957,488,4.0
4087236,29268,3745,4.0
6446600,46071,3873,3.0


Shape: 10,000,054 x 3


In [18]:
meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

In [63]:
from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv())

True

In [59]:
left = pd.DataFrame.from_dict(pp.iid_to_idx, orient='index')
right = links.set_index('movieId')[['tmdbId']]
left.join(right).dropna().astype('int64').set_index(0).todict()

Unnamed: 0_level_0,tmdbId
0,Unnamed: 1_level_1
0,11066
1,1642
2,8467
3,6950
4,2164
5,193
6,888
7,13
8,10714
9,8587


In [65]:
import tmdbsimple as tmdb
tmdb.API_KEY = os.environ['TMDB_API_KEY']

In [80]:
poster = tmdb.Movies(603).info()['poster_path']

In [81]:
poster

'/hEpWvX6Bp79eLxY1kX5ZZJcme5U.jpg'

In [82]:
from IPython.display import HTML

In [88]:
%timeit HTML('<img src="https://image.tmdb.org/t/p/w600_and_h900_bestv2/hEpWvX6Bp79eLxY1kX5ZZJcme5U.jpg">')

6.63 µs ± 147 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [79]:
movie

{'adult': False,
 'backdrop_path': '/7u3pxc0K1wx32IleAkLv78MKgrw.jpg',
 'belongs_to_collection': {'backdrop_path': '/RhUxjzNojIJsdZSYTn0CQvdKsn.jpg',
  'id': 2344,
  'name': 'The Matrix Collection',
  'poster_path': '/lh4aGpd3U9rm9B8Oqr6CUgQLtZL.jpg'},
 'budget': 63000000,
 'genres': [{'id': 28, 'name': 'Action'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'http://www.warnerbros.com/matrix',
 'id': 603,
 'imdb_id': 'tt0133093',
 'original_language': 'en',
 'original_title': 'The Matrix',
 'overview': 'Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.',
 'popularity': 34.566899,
 'poster_path': '/hEpWvX6Bp79eLxY1kX5ZZJcme5U.jpg',
 'production_companies': [{'id': 79,
   'logo_path': '/tpFpsqbleCzEE2p5EgvUq6ozfCA.png',
   'name': 'Village Roadshow Pictures',
   'origin_country': ''},
  {'id': 372,
   'logo_path': None,
   'name': 'Groucho II Film

'The Matrix'

In [50]:
TMDB_API_KEYa

0          0
tmdbId    37
dtype: int64

In [32]:
iid_to_
i = {idx: iid for iid, idx in pp.iid_to_idx.items()}

In [36]:
max(i.values())

65133

In [17]:
links = pd.read_csv('data/links.csv')
meta = pd.read_csv('data/movies_metadata.csv', usecols=['id', ''])
display(links.sample(5))
display(meta.sample(5))
print('Shape: {:>9,} x {}'.format(*meta.shape))

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,movieId,imdbId,tmdbId
24304,113812,34555,252956.0
12557,58627,1023111,8456.0
1416,1453,118691,17894.0
37297,153760,4306116,341007.0
22009,105538,2577990,159015.0


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
14364,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",http://www.wer-frueher-stirbt-ist-laenger-tot.de/,1697,tt0780180,de,Wer früher stirbt ist länger tot,In this black comedy set in small-town Bavaria...,...,2006-08-17,0.0,105.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Released,,Grave Decisions,False,7.2,30.0
3937,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,2613,tt0105165,en,Prelude to a Kiss,A couple fall in love despite the girl's pessi...,...,1992-07-10,22697691.0,105.0,"[{'iso_639_1': 'nl', 'name': 'Nederlands'}, {'...",Released,,Prelude to a Kiss,False,5.4,31.0
33626,False,,0,[],http://rebootfilm.com,143602,tt2090594,en,Reboot,Set within a dystopian world that is a collisi...,...,2012-12-12,0.0,40.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Your world is about to be reset.,Reboot,False,4.5,2.0
13332,False,,0,[],,250093,tt0068912,en,The Man,When the President and Speaker of the House ar...,...,1972-07-19,0.0,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It took an accident to make this man President...,The Man,False,8.0,1.0
26091,False,,0,"[{'id': 35, 'name': 'Comedy'}]",http://www.billburr.com/,46967,tt1717578,en,Bill Burr: Let It Go,It’s always been a dream of mine to do a show ...,...,2010-09-18,0.0,65.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Bill Burr: Let It Go,False,8.0,21.0


Shape:    45,466 x 24


In [5]:
class Preprocessor(TransformerMixin):
    def __init__(self, copy=True, min_rating=4.0):
        self.copy = copy
        self.min_rating = min_rating
        self.uid_to_idx = None
        self.iid_to_idx = None
    
    def fit(self, df, y=None, **kwargs):
        self._validate_df(df)
        if self.copy:
            df = df.copy()
        df = self._filter_interactions_to_min_rating(df)
        df = self._drop_duplicate_user_item_interactions(df)

        # create uid to indx mapping
        uniq_uids = df['uid'].unique()
        self.uid_to_idx = dict(zip(uniq_uids, np.arange(len(uniq_uids))))

        # create iid to indx mapping
        uniq_iids = df['iid'].unique()
        self.iid_to_idx = dict(zip(uniq_iids, np.arange(len(uniq_iids))))        
        return self
    
    def transform(self, df, **kwargs):
        self._validate_df(df)
        if self.copy:
            df = df.copy()

        df = self._filter_interactions_to_min_rating(df)
        df = self._drop_duplicate_user_item_interactions(df)
        
        # generate sparse matrix
        row = df['uid'].map(self.uid_to_idx)
        col = df['iid'].map(self.iid_to_idx)
        assert len(row) == len(col)
        data = np.ones(len(row))
        shape = (len(self.uid_to_idx), len(self.iid_to_idx))
        csr = scs.coo_matrix((data, (row, col)), shape=shape).tocsr()
        return csr

    def _drop_duplicate_user_item_interactions(self, df):
        if df.duplicated().sum() != 0:
            df = df.drop_duplicated()
        return df
    
    def _filter_interactions_to_min_rating(self, df):
        df = df.loc[df['rating'] >= self.min_rating, ['uid', 'iid']]
        return df
    
    def _validate_df(self, df):
        assert 'uid' in df.columns
        assert 'iid' in df.columns
        assert 'rating' in df.columns

In [13]:
pp = Preprocessor(min_rating=4.0)
csr = pp.fit_transform(interactions)

Lets begin by creating a simple train test split

In [7]:
tr, te = cross_validation.random_train_test_split(csr)

Let's build a model with train and evaluate it with test

In [6]:
%%time
lfm = LightFM(no_components=30, loss='warp', learning_rate=0.05)
lfm.fit(tr, epochs=3)

CPU times: user 26.9 s, sys: 237 ms, total: 27.2 s
Wall time: 27.3 s


Let's build a popularity model simply by deleting user and item embedding vectors

In [7]:
pop = copy.deepcopy(lfm)
pop.user_embeddings[:, :] = 0.0
pop.item_embeddings[:, :] = 0.0

In [8]:
def evaluate_model(model, train, test):
    model_rr = evaluation.reciprocal_rank(
        model=model,
        test_interactions=test,
        train_interactions=train,
        num_threads=2
    )
    model_auc = evaluation.auc_score(
        model=model,
        test_interactions=test,
        train_interactions=train,
        num_threads=2
    )
    return model_rr, model_auc

In [9]:
%%time
fm_rr, fm_auc = evaluate_model(lfm, tr, te)
pop_rr, pop_auc = evaluate_model(pop, tr, te)

CPU times: user 4min 53s, sys: 1.82 s, total: 4min 55s
Wall time: 4min 56s


In [24]:
print('{:>10}:\n\t{:>20}: {:0.3}\n\t{:>20}: {:0.3}'.format(
    'Factorization Machine',
        'Mean Reciprocal Rank', fm_rr.mean(),
        'Mean ROC-AUC', fm_auc.mean()
))
print('{:>10}:\n\t{:>20}: {:0.3}\n\t{:>20}: {:0.3}'.format(
    'Popularity Model',
        'Mean Reciprocal Rank', pop_rr.mean(),
        'Mean ROC-AUC', pop_auc.mean()
))

Factorization Machine:
	Mean Reciprocal Rank: 0.41
	        Mean ROC-AUC: 0.972
Popularity Model:
	Mean Reciprocal Rank: 0.281
	        Mean ROC-AUC: 0.944


In [38]:
fn = os.path.join(
    os.path.dirname(os.getcwd()),
    'models/fm_no_side_data.pkl.gz'
)
_ = joblib.dump(lfm, fn)

fn = os.path.join(
    os.path.dirname(os.getcwd()),
    'models/popularity.pkl.gz'
)
_ = joblib.dump(pop, fn)