In [1]:
# Import packages
%matplotlib inline
import math
import pandas as pd
import matplotlib.pyplot as plt
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from CFModel import CFModel

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Define constants
K_FACTORS = 100   # number of embedding dimensions (latent factors) for users and movies
TEST_USER = 2000  # user_id of the user inspected in the prediction cells
# Seed handed to `ratings.sample(..., random_state=RNG_SEED)` when the training
# rows are shuffled. It was referenced there but never defined, so a fresh
# "Restart & Run All" stopped with a NameError. Any fixed integer works.
# NOTE(review): the seed behind the stored cell outputs is not recorded in this notebook.
RNG_SEED = 42

Note:
* **K_FACTORS** is the number of embedding dimensions (latent factors) used for both movies and users; the value is chosen arbitrarily. I'll explain this later.
* **TEST_USER** is the ID of an arbitrarily chosen test user in the dataset. Here, I chose the user with ID 2000.

## Loading Datasets
Similar to what I did for the previous notebook, I loaded the 3 datasets into 3 dataframes: *ratings*, *users*, and *movies*. Additionally, to make it easy to use series from the *ratings* dataframe as training inputs and output to the Keras model, I set *max_userid* as the max value of user_id in the ratings and *max_movieid* as the max value of movie_id in the ratings.

In [3]:
# Load the ratings dataset (tab-separated), keeping only the columns used in this notebook
ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])
# Largest raw ids; both are passed to CFModel when the model is built.
# `.max()` is unaffected by repeated values, so de-duplicating first is unnecessary.
max_userid = ratings['user_id'].max()
max_movieid = ratings['movie_id'].max()
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 5 columns):
user_id         1000209 non-null int64
movie_id        1000209 non-null int64
rating          1000209 non-null int64
user_emb_id     1000209 non-null int64
movie_emb_id    1000209 non-null int64
dtypes: int64(5)
memory usage: 38.2 MB
None


In [4]:
# Load the users dataset (tab-separated), keeping only the listed columns
users = pd.read_csv('users.csv', sep='\t', encoding='latin-1',
                    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
user_id     6040 non-null int64
gender      6040 non-null object
zipcode     6040 non-null object
age_desc    6040 non-null object
occ_desc    6040 non-null object
dtypes: int64(1), object(4)
memory usage: 236.0+ KB
None


In [5]:
# Load the movies dataset (tab-separated), keeping only the listed columns
movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1',
                     usecols=['movie_id', 'title', 'genres'])
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
movie_id    3883 non-null int64
title       3883 non-null object
genres      3883 non-null object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB
None


In [6]:
# Create training set: shuffle every rating row once with a fixed seed, then
# pull the two model inputs and the target out of the shuffled frame so the
# three arrays stay row-aligned.
shuffled_ratings = ratings.sample(frac=1., random_state=RNG_SEED)

# Each print below passes ONE pre-formatted string, so the text is identical
# under Python 2 (print statement) and Python 3 (print function). The former
# `print 'a', b` statements are a SyntaxError on Python 3.

# User embedding ids, in shuffled row order
Users = shuffled_ratings['user_emb_id'].values
print('Users: {} , shape = {}'.format(Users, Users.shape))

# Movie embedding ids, in the same row order
Movies = shuffled_ratings['movie_emb_id'].values
print('Movies: {} , shape = {}'.format(Movies, Movies.shape))

# Ratings (the training target), in the same row order
Ratings = shuffled_ratings['rating'].values
print('Ratings: {} , shape = {}'.format(Ratings, Ratings.shape))

Users: [1284 1682 2367 ... 2937 2076 4665] , shape = (1000209,)
Movies: [1093 3071   44 ... 1135 1731 1953] , shape = (1000209,)
Ratings: [4 5 4 ... 5 5 1] , shape = (1000209,)


## Matrix Factorization for Collaborative Filtering

The idea behind matrix factorization models is that the attitudes or preferences of a user can be determined by a small number of hidden factors. We call these factors **embeddings**.

Intuitively, we can understand embeddings as low-dimensional hidden factors for movies and users. For example, say we have 3-dimensional embeddings for both movies and users (i.e., **K_FACTORS = 3**).

For instance, for movie A, the 3 numbers in the movie embedding matrix represent 3 different characteristics about the movie, such as:
* How recent is the movie A?
* How many special effects are in movie A?
* How CGI-driven is movie A? 

For user B, the 3 numbers in the user embedding matrix represent:
* How much does user B like drama movies?
* How likely is user B to give a 5-star rating?
* How often does user B watch movies?

![embedding-layers](images/embedding-layers.png)

In [7]:
# Build the collaborative-filtering model from the largest user id, the largest
# movie id and the number of latent factors
model = CFModel(max_userid, max_movieid, K_FACTORS)
# Optimise mean squared error with the AdaMax optimizer
model.compile(optimizer='adamax', loss='mse')

  self.add(Merge([P, Q], mode='dot', dot_axes=1))


In [9]:
# Train model
# EarlyStopping watches val_loss and stops training once it has gone 2 epochs
# without improving. ModelCheckpoint with save_best_only=True rewrites
# weights.h5 only when its monitored quantity improves (Keras monitors
# val_loss by default).
callbacks = [EarlyStopping(monitor='val_loss', patience=2),
             ModelCheckpoint('weights.h5', save_best_only=True)]
# `epochs` is the Keras 2 name for the deprecated `nb_epoch` argument.
# validation_split=.1 holds out 10% of the samples for validation.
history = model.fit([Users, Movies], Ratings, epochs=30, validation_split=.1,
                    verbose=2, callbacks=callbacks)



Train on 900188 samples, validate on 100021 samples
Epoch 1/30
 - 387s - loss: 8.2727 - val_loss: 2.2829
Epoch 2/30
 - 316s - loss: 1.4957 - val_loss: 1.1248
Epoch 3/30
 - 319s - loss: 1.0059 - val_loss: 0.9370
Epoch 4/30
 - 317s - loss: 0.8957 - val_loss: 0.8764
Epoch 5/30
 - 316s - loss: 0.8495 - val_loss: 0.8461
Epoch 6/30
 - 355s - loss: 0.8182 - val_loss: 0.8228
Epoch 7/30
 - 395s - loss: 0.7921 - val_loss: 0.8045
Epoch 8/30
 - 395s - loss: 0.7695 - val_loss: 0.7921
Epoch 9/30
 - 394s - loss: 0.7477 - val_loss: 0.7807
Epoch 10/30
 - 406s - loss: 0.7269 - val_loss: 0.7700
Epoch 11/30
 - 371s - loss: 0.7060 - val_loss: 0.7614
Epoch 12/30
 - 332s - loss: 0.6849 - val_loss: 0.7543
Epoch 13/30
 - 319s - loss: 0.6639 - val_loss: 0.7483
Epoch 14/30
 - 340s - loss: 0.6428 - val_loss: 0.7458
Epoch 15/30
 - 358s - loss: 0.6218 - val_loss: 0.7428
Epoch 16/30
 - 315s - loss: 0.6009 - val_loss: 0.7433
Epoch 17/30
 - 314s - loss: 0.5801 - val_loss: 0.7424
Epoch 18/30
 - 314s - loss: 0.5596 - va

In [13]:
# Show the best validation RMSE
# The training loss is MSE, so the square root of the smallest val_loss is the
# best RMSE. Comparing (value, index) tuples resolves ties to the earliest epoch.
min_val_loss, idx = min((val, idx) for (idx, val) in enumerate(history.history['val_loss']))
# One pre-formatted string prints identically under Python 2 and Python 3
# (the former `print 'a', b` statement is a SyntaxError on Python 3).
# idx is 0-based, epochs are reported 1-based.
print('Minimum RMSE at epoch {:d} = {:.4f}'.format(idx + 1, math.sqrt(min_val_loss)))

Minimum RMSE at epoch 17 = 0.8616


In [14]:
# Use the pre-trained model
# A second CFModel is built with the same constructor arguments as the model
# trained above -- presumably so the saved weights match its layers (CFModel is
# defined outside this notebook; confirm there).
trained_model = CFModel(max_userid, max_movieid, K_FACTORS)
# Load weights
# weights.h5 is the file the ModelCheckpoint callback writes during training.
trained_model.load_weights('weights.h5')

In [15]:
# Display the profile row of the fixed test user (TEST_USER)
users.loc[users['user_id'] == TEST_USER]

Unnamed: 0,user_id,gender,zipcode,age_desc,occ_desc
1999,2000,M,44685,18-24,college/grad student


In [16]:
# Predict the rating for a (User ID, Movie ID) pair with the trained model
def predict_rating(user_id, movie_id):
    """Return ``trained_model.rate`` for the given user and movie ids.

    Both ids are shifted down by one before the call -- presumably mapping the
    1-based dataset ids onto the 0-based embedding ids (confirm against CFModel).
    """
    user_index = user_id - 1
    movie_index = movie_id - 1
    return trained_model.rate(user_index, movie_index)

In [17]:
# Ratings the test user actually gave. `.loc[...]` selects rows and columns in
# one step and `.copy()` makes the result an independent frame, so adding the
# `prediction` column below does not assign into a filtered slice of `ratings`
# (which raises pandas' SettingWithCopyWarning).
user_ratings = ratings.loc[ratings['user_id'] == TEST_USER,
                           ['user_id', 'movie_id', 'rating']].copy()
# Model prediction for every movie the user rated, stored next to the true rating
user_ratings['prediction'] = user_ratings.apply(
    lambda row: predict_rating(TEST_USER, row['movie_id']), axis=1)
# Highest true ratings first, joined with each movie's title and genres;
# the bare expression is the cell's displayed output.
(user_ratings
 .sort_values(by='rating', ascending=False)
 .merge(movies, on='movie_id', how='inner', suffixes=['_u', '_m'])
 .head(20))

Unnamed: 0,user_id,movie_id,rating,prediction,title,genres
0,2000,1639,5,3.724665,Chasing Amy (1997),Drama|Romance
1,2000,2529,5,3.803218,Planet of the Apes (1968),Action|Sci-Fi
2,2000,1136,5,4.495121,Monty Python and the Holy Grail (1974),Comedy
3,2000,2321,5,4.010493,Pleasantville (1998),Comedy
4,2000,2858,5,4.253924,American Beauty (1999),Comedy|Drama
5,2000,2501,5,4.206387,October Sky (1999),Drama
6,2000,2804,5,4.35367,"Christmas Story, A (1983)",Comedy|Drama
7,2000,1688,5,3.710508,Anastasia (1997),Animation|Children's|Musical
8,2000,1653,5,4.089375,Gattaca (1997),Drama|Sci-Fi|Thriller
9,2000,527,5,5.046471,Schindler's List (1993),Drama|War


In [18]:
# Candidate movies: every movie present in `ratings` that the test user has
# not rated. `~mask` is the idiomatic negation of a boolean Series (replacing
# `== False`), and `.copy()` makes the frame independent before a column is added.
unrated_mask = ~ratings['movie_id'].isin(user_ratings['movie_id'])
recommendations = ratings.loc[unrated_mask, ['movie_id']].drop_duplicates().copy()
# Predicted rating of the test user for each candidate movie
recommendations['prediction'] = recommendations.apply(
    lambda row: predict_rating(TEST_USER, row['movie_id']), axis=1)
# Top 20 candidates by predicted rating, joined with each movie's title and genres;
# the bare expression is the cell's displayed output.
(recommendations
 .sort_values(by='prediction', ascending=False)
 .merge(movies, on='movie_id', how='inner', suffixes=['_u', '_m'])
 .head(20))

Unnamed: 0,movie_id,prediction,title,genres
0,953,4.868923,It's a Wonderful Life (1946),Drama
1,668,4.866858,Pather Panchali (1955),Drama
2,1423,4.859523,Hearts and Minds (1996),Drama
3,3307,4.834415,City Lights (1931),Comedy|Drama|Romance
4,649,4.802675,Cold Fever (Á köldum klaka) (1994),Comedy|Drama
5,669,4.797451,Aparajito (1956),Drama
6,326,4.784828,To Live (Huozhe) (1994),Drama
7,3092,4.761148,Chushingura (1962),Drama
8,3022,4.753003,"General, The (1927)",Comedy
9,2351,4.720692,Nights of Cabiria (Le Notti di Cabiria) (1957),Drama
