In [1]:
import numpy as np
import pandas as pd
from lightfm import LightFM



In [2]:
# read_csv
rating_df = pd.read_csv('the-movies-dataset/ratings_small.csv')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
# sort data frame by timestemp for splitting
rating_df = rating_df.sort_values('timestamp')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
52635,383,21,3.0,789652009
52641,383,47,5.0,789652009
52684,383,1079,3.0,789652009
56907,409,21,5.0,828212412
56909,409,25,4.0,828212412


In [4]:
# map user id and movie id to integer starting from 0 to N (num of users) and M (num of movies)
from sklearn.preprocessing import LabelEncoder
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

user_ids = user_encoder.fit_transform(rating_df.userId)
movie_ids = movie_encoder.fit_transform(rating_df.movieId)

In [5]:
# train/val split
num_train = int(len(user_ids) * 0.8)
train_user_ids = user_ids[:num_train]
train_movie_ids = movie_ids[:num_train]
train_ratings = rating_df.rating.values[:num_train]
val_user_ids = user_ids[num_train:]
val_movie_ids = movie_ids[num_train:]
val_ratings = rating_df.rating.values[num_train:]

In [6]:
# set up user-movie matrix
num_users = user_ids.max() + 1
num_movies = movie_ids.max() + 1
user2movie = np.zeros([num_users, num_movies])
user2movie[train_user_ids, train_movie_ids] = train_ratings

In [7]:
# credit to https://gist.github.com/bwhite/3726239
def dcg_at_k(r, k):
    '''
    Compute DCG
    args:
        r: np.array, to be evaluated
        k: int, number of entries to be considered
    
    returns:
        dcg: float, computed dcg
        
    '''
    r = r[:k]
    dcg = np.sum(r / np.log2(np.arange(2, len(r) + 2)))
    return dcg


In [8]:
def ndcg_at_k(r, k, method=0):
    '''
    Compute NDCG
    args:
        r: np.array, to be evaluated
        k: int, number of entries to be considered
    
    returns:
        dcg: float, computed ndcg
        
    '''
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)

    return dcg_at_k(r, k) / dcg_max

In [9]:
# compute average ndcg for all users
def evaluate_prediction(predictions):
    '''
    Return the average ndcg for each users
    args:
        predictions: np.array user-item predictions
    returns:
        ndcg: float, computed NDCG
    '''
    ndcgs = []
    # iterate
    for target_user in np.unique(val_user_ids):
        # get movie ids and ratings associated with the target user.
        target_val_movie_ids = val_movie_ids[val_user_ids == target_user] 
        target_val_ratings = val_ratings[val_user_ids == target_user] 

        # compute ndcg for this user
        ndcg = ndcg_at_k(target_val_ratings[np.argsort(-predictions[val_user_ids == target_user])], k=30)
        ndcgs.append(ndcg)
    ndcg = np.mean(ndcgs)
    return ndcg


In [10]:
# change to csr_matrix
from scipy.sparse import csr_matrix
user2movie = csr_matrix(user2movie)
user2movie

<671x9066 sparse matrix of type '<class 'numpy.float64'>'
	with 80003 stored elements in Compressed Sparse Row format>

In [11]:
# links
links_df = pd.read_csv('the-movies-dataset/links_small.csv')
links_df.dtypes

movieId      int64
imdbId       int64
tmdbId     float64
dtype: object

In [12]:
# movie meta data
movie_df = pd.read_csv('the-movies-dataset/movies_metadata.csv')
movie_df['id'] = pd.to_numeric(movie_df.id, errors='coerce')
movie_df = movie_df.rename(columns={'id':'tmdbId'}).merge(links_df.loc[:,['movieId','tmdbId']], on='tmdbId', how='left' )
movie_df = movie_df.dropna(subset=['movieId'])
movie_df = movie_df.loc[movie_df.movieId.isin(rating_df.movieId.unique())]
movie_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,tmdbId,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,movieId
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844.0,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,2.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602.0,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,3.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357.0,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,4.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862.0,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,5.0


In [13]:
# transform movie id with label encoder
movie_df['movieId'] = movie_encoder.transform(movie_df.movieId)
movie_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,tmdbId,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,movieId
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844.0,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602.0,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,2
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357.0,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,3
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862.0,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,4


In [14]:
# overview titles
import re
movie_df.loc[:,['overview','title']] = movie_df.loc[:,['overview','title']].fillna('')
overview_titles = movie_df.apply(lambda x:x['overview'].lower() + ' ' + x['title'].lower() , axis=1).values
overview_titles = [re.sub(r'[^a-zA-Z ]+', ' ', ot).split(' ') for ot in overview_titles]
overview_titles[:5]

[['led',
  'by',
  'woody',
  '',
  'andy',
  's',
  'toys',
  'live',
  'happily',
  'in',
  'his',
  'room',
  'until',
  'andy',
  's',
  'birthday',
  'brings',
  'buzz',
  'lightyear',
  'onto',
  'the',
  'scene',
  '',
  'afraid',
  'of',
  'losing',
  'his',
  'place',
  'in',
  'andy',
  's',
  'heart',
  '',
  'woody',
  'plots',
  'against',
  'buzz',
  '',
  'but',
  'when',
  'circumstances',
  'separate',
  'buzz',
  'and',
  'woody',
  'from',
  'their',
  'owner',
  '',
  'the',
  'duo',
  'eventually',
  'learns',
  'to',
  'put',
  'aside',
  'their',
  'differences',
  '',
  'toy',
  'story'],
 ['when',
  'siblings',
  'judy',
  'and',
  'peter',
  'discover',
  'an',
  'enchanted',
  'board',
  'game',
  'that',
  'opens',
  'the',
  'door',
  'to',
  'a',
  'magical',
  'world',
  '',
  'they',
  'unwittingly',
  'invite',
  'alan',
  '',
  '',
  'an',
  'adult',
  'who',
  's',
  'been',
  'trapped',
  'inside',
  'the',
  'game',
  'for',
  '',
  '',
  'years',
 

In [15]:
# map word to index
from collections import Counter
counter = Counter(np.hstack(overview_titles))
max_words = 10000
word2index = { word:idx for idx, (word, count) in enumerate(sorted(counter.items(),key=lambda x:x[1], reverse=True)) if idx < max_words}

In [16]:
# count matrix
count_matrix = np.zeros( [len(overview_titles), max_words], dtype=np.int32)
for idx, ot in enumerate(overview_titles):
    for word in ot:
        if word in word2index:
            count_matrix[idx][word2index[word]] += 1
count_matrix

array([[ 6,  2,  0, ...,  0,  0,  0],
       [11,  3,  1, ...,  0,  0,  0],
       [ 5,  4,  4, ...,  0,  0,  0],
       ...,
       [ 8,  4,  0, ...,  0,  0,  0],
       [23, 14,  5, ...,  0,  0,  0],
       [ 2,  1,  1, ...,  0,  0,  0]], dtype=int32)

In [17]:
#tf-idf
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit_transform(count_matrix)
tfidf_matrix = tfidf_matrix.toarray()

In [18]:
# built sparse matrix for tfidf features
movie_meta_ids = movie_df.movieId.values
movie_features = np.zeros([num_movies, max_words])
movie_features[movie_meta_ids,:] = tfidf_matrix
movie_features = csr_matrix(movie_features)

In [19]:
# training
ITEM_ALPHA = 1e-6
model = LightFM(no_components=32, loss='warp', item_alpha=ITEM_ALPHA)
model.fit(interactions=user2movie,epochs=2, item_features=movie_features)

<lightfm.lightfm.LightFM at 0x115ec4ef0>

In [20]:
# prediction & evaluation
predictions = model.predict(user_ids=val_user_ids, item_ids=val_movie_ids, item_features=movie_features)
evaluate_prediction(predictions)

0.8524726058298665