In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lightfm import LightFM
import math
import re

In [2]:
# load the three MovieLens tables (ratings, movies, tags) from CSV
rating, movie, tag = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/movielens/ratings.csv",
        "../data/movielens/movies.csv",
        "../data/movielens/tags.csv",
    )
)

In [3]:
# preview the first five rows of the movie table
movie.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Preprocessing - Features

In [8]:
# extract year
def extract_year(s):
    """Return the release year parsed from a title such as "Toy Story (1995)".

    The year is read from the last parenthesised group in ``s`` that consists
    only of digits, so "(500) Days of Summer (2009)" yields 2009.

    Parameters
    ----------
    s : str
        Movie title.

    Returns
    -------
    int or str
        The year as an int, or the sentinel string 'UNKNOWN' when no
        digits-only parenthesised group is found (no year at all, or a range
        such as "(2007-)"). In that case the title is also printed so it can
        be fixed by hand.
    """
    try:
        # Raw string: '\(' is not a valid escape sequence in a plain literal.
        return int(re.findall(r'\(([0-9]+)\)', s)[-1])
    except (IndexError, TypeError, ValueError):
        # IndexError: nothing matched; TypeError: `s` is not a string
        # (e.g. a missing title); ValueError: int() rejected the match.
        # Anything else (including KeyboardInterrupt) is allowed to propagate.
        print(s)
        return 'UNKNOWN'
# parse a year out of every title; unparsable titles are printed by extract_year
movie['year'] = movie['title'].apply(extract_year)

Big Bang Theory, The (2007-)
Fawlty Towers (1975-1979)
Hyena Road
The Lovers and the Despot
Stranger Things
Women of '69, Unboxed


In [12]:
# manually impute year
# Titles whose year could not be parsed automatically, mapped to a hand-picked value.
# NOTE(review): some values differ from the year shown in the title itself
# (e.g. 1970 for "Fawlty Towers (1975-1979)", 2010 for "(2007-)") -- confirm
# these are intended.
years = {"Big Bang Theory, The (2007-)": 2010,
"Fawlty Towers (1975-1979)": 1970,
"Hyena Road": 2010,
"The Lovers and the Despot": 2016,
"Stranger Things": 2010,
"Women of '69, Unboxed": 2010}

# Overwrite the year for the listed titles in one vectorised assignment
# instead of walking the frame row by row.
needs_manual_year = movie['title'].isin(years)
movie.loc[needs_manual_year, 'year'] = movie.loc[needs_manual_year, 'title'].map(years)

# Fail here, with the offending titles, if any year is still non-numeric;
# otherwise the arithmetic on `year` further down would raise an opaque TypeError.
year_numeric = pd.to_numeric(movie['year'], errors='coerce')
unresolved_titles = movie.loc[year_numeric.isna(), 'title'].tolist()
if unresolved_titles:
    raise ValueError(f"No year could be determined for: {unresolved_titles}")

# Store the column as a clean integer dtype rather than mixed objects.
movie['year'] = year_numeric.astype(int)

In [14]:
# round each year down to the start of its decade (e.g. 1995 -> 1990)
movie['decade'] = movie['year'].apply(lambda year: 10 * math.floor(year / 10))

In [15]:
# turn the pipe-separated genre string into a list of genre names;
# the '(no genres listed)' placeholder is first replaced by "NULL"
movie['genres'] = (
    movie['genres']
    .replace('(no genres listed)', "NULL")
    .apply(lambda genre_string: genre_string.split("|"))
)

In [71]:
# build one feature list per movie: its genre names followed by its decade
movie_features = movie.apply(lambda row: [*row['genres'], row['decade']], axis=1)


### Preprocessing - User-Item Interaction

In [18]:
# re-index movie ids and user ids as consecutive integers starting at 0
# movies: position in the movie table; users: order of first appearance in the ratings
movie_id_mapping = dict(zip(movie.movieId, range(len(movie))))
user_id_mapping = {raw_id: new_id for new_id, raw_id in enumerate(rating.userId.unique())}

# plain dict lookups, so an id missing from a mapping raises KeyError
rating['userId'] = rating['userId'].apply(user_id_mapping.__getitem__)
rating['movieId'] = rating['movieId'].apply(movie_id_mapping.__getitem__)

In [20]:
# binarise the explicit ratings: below the threshold -> 0, everything else -> 1
threshold = 3
rating['rating'] = rating['rating'].apply(lambda score: int(not score < threshold))

# Build dataset - Interaction Matrix, Feature Matrix

In [73]:
from lightfm.data import Dataset

# register every user id, item id and item-feature name with the LightFM dataset
dataset = Dataset()
dataset.fit(
    users=set(rating['userId']),
    items=movie_id_mapping.values(),
    # distinct feature names (genres and decades) across all movies
    item_features={
        feature_name
        for feature_list in movie_features
        for feature_name in feature_list
    },
)


In [74]:
# build interactions matrix from (user_id, item_id) or (user_id, item_id, weight)
# Only rows whose binarised rating equals 1 are passed on as interactions.
positive_pairs = rating.loc[rating['rating'] == 1, ['userId', 'movieId']]

# itertuples(index=False, name=None) yields plain (userId, movieId) tuples
# without the per-row Series construction that apply(tuple, axis=1) incurs.
interaction_tuples = list(positive_pairs.itertuples(index=False, name=None))
# The original, misspelled name is kept as an alias in case other cells use it.
iteraction_tuples = interaction_tuples

interactions, weights = dataset.build_interactions(interaction_tuples)

In [76]:
# build item features from (item id, [list of feature names]) or (item id, {feature name: feature weight})
# the item id is the movie's position in `movie_features`
feature_tuples = [
    (item_id, feature_names)
    for item_id, feature_names in enumerate(movie_features)
]
item_features = dataset.build_item_features(feature_tuples)

# Model

In [79]:
# Hybrid LightFM model trained with the WARP ranking loss.
model = LightFM(
    no_components=200, # embedding dims
    learning_rate=0.05,
    loss='warp',
    # Fixed seed so that reruns start from the same random state.
    # NOTE(review): training uses num_threads=4 below; multi-threaded updates
    # may still cause small run-to-run differences -- confirm if exact
    # reproducibility is required.
    random_state=42,
)

# Fit on the positive interactions, using the genre/decade item features.
model.fit(interactions,
          item_features=item_features,
          epochs=20,
          num_threads=4,
          verbose=True)

Epoch: 100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


<lightfm.lightfm.LightFM at 0x7f9e209abd90>

# Evaluation

In [81]:
from lightfm.evaluation import auc_score

ROC AUC: the probability that a randomly chosen positive example is scored higher than a randomly chosen negative example.

In [82]:
# Per-user ROC AUC of the fitted model.
# NOTE(review): `interactions` is the same matrix the model was fitted on, so
# this measures fit to the training data, not performance on held-out data.
auc_scores = auc_score(
    model,
    test_interactions=interactions,
    item_features=item_features,
    num_threads=4,
)

In [83]:
# Display the per-user AUC values (one entry per scored user).
# NOTE(review): the original note says a user with no interactions gets
# auc=0.5 (random guessing); whether such users appear in this array at all
# depends on auc_score's `preserve_rows` argument -- confirm against the
# LightFM docs for the installed version.
auc_scores

array([0.97538936, 0.9947653 , 0.949396  , 0.9849852 , 0.9853942 ,
       0.9777733 , 0.9908762 , 0.98355365, 0.97805727, 0.9689699 ,
       0.958389  , 0.9448924 , 0.97703576, 0.98695457, 0.9902244 ,
       0.9790397 , 0.98421156, 0.99252945, 0.9893276 , 0.96330225,
       0.98676604, 0.98534805, 0.9802795 , 0.9939064 , 0.9870993 ,
       0.98657703, 0.99190557, 0.984891  , 0.9621694 , 0.9930819 ,
       0.971728  , 0.9963636 , 0.97870487, 0.9874509 , 0.98038423,
       0.9902052 , 0.97637933, 0.97134405, 0.9946865 , 0.99354464,
       0.98418766, 0.9956515 , 0.97438705, 0.99484676, 0.9658105 ,
       0.9377263 , 0.99767464, 0.98657876, 0.9886416 , 0.9979028 ,
       0.9562175 , 0.9787872 , 0.9624948 , 0.9405936 , 0.9950117 ,
       0.97256184, 0.9838564 , 0.9680385 , 0.97934234, 0.96774447,
       0.9773135 , 0.98989815, 0.98132455, 0.9981988 , 0.9960673 ,
       0.991018  , 0.9877294 , 0.97643065, 0.9891365 , 0.98873425,
       0.9671357 , 0.9924924 , 0.9928606 , 0.9818546 , 0.96361

In [84]:
# average the per-user AUC values into a single summary number
auc_scores.mean()

0.98141104