In [1]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the ratings table (tab-separated, latin-1 encoded; first 100000 rows only)
ratings = pd.read_csv(
    'ratings.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'],
    nrows=100000,
)
# Largest user / movie IDs present in the loaded ratings
max_userid = ratings['user_id'].max()
max_movieid = ratings['movie_id'].max()

# Load the users table (same file format and row cap)
users = pd.read_csv(
    'users.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'],
    nrows=100000,
)

# Load the movies table (same file format and row cap)
movies = pd.read_csv(
    'movies.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['movie_id', 'title', 'genres'],
    nrows=100000,
)

In [2]:
# Preview the ratings table (first rows only, instead of dumping the whole frame)
print(ratings.head(20))

# Shuffle all rows once. The fixed random_state (an arbitrary seed) makes the row
# order reproducible across kernel restarts, and with it the rows that end up in
# the validation split when these arrays are later passed to model.fit.
shuffled_ratings = ratings.sample(frac=1., random_state=42)

# User embedding indices, in shuffled row order
Users = shuffled_ratings['user_emb_id'].values
print('Users:', Users, ', shape =', Users.shape)

# Movie embedding indices, in the same shuffled row order
Movies = shuffled_ratings['movie_emb_id'].values
print('Movies:', Movies, ', shape =', Movies.shape)

# Rating values (the training targets), in the same shuffled row order
Ratings = shuffled_ratings['rating'].values
print('Ratings:', Ratings, ', shape =', Ratings.shape)

       user_id  movie_id  rating  user_emb_id  movie_emb_id
0            1      1193       5            0          1192
1            1       661       3            0           660
2            1       914       3            0           913
3            1      3408       4            0          3407
4            1      2355       5            0          2354
5            1      1197       3            0          1196
6            1      1287       5            0          1286
7            1      2804       5            0          2803
8            1       594       4            0           593
9            1       919       4            0           918
10           1       595       5            0           594
11           1       938       4            0           937
12           1      2398       4            0          2397
13           1      2918       4            0          2917
14           1      1035       5            0          1034
15           1      2791       4        

In [3]:
# Import Keras libraries
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
# Import CF Model Architecture
from CFModel import CFModel

Using TensorFlow backend.


In [4]:
# Define constants
K_FACTORS = 100 # Size of each embedding vector for movies and users (passed to CFModel)
TEST_USER = 2 # user_id of the test user inspected in the cells below

In [5]:
# Define the model from the largest user/movie IDs and the embedding size
model = CFModel(max_userid, max_movieid, K_FACTORS)
# Print the layer-by-layer summary of the model
model.summary()
# Compile the model using MSE as the loss function and the AdaMax learning algorithm
# NOTE(review): 'accuracy' is tracked as well, but the targets are numeric ratings
# fitted with MSE, so the reported accuracy is probably not meaningful -- confirm.
model.compile(loss='mse', optimizer='adamax',metrics=['accuracy'])

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 1, 100)        66900       embedding_input_1[0][0]          
____________________________________________________________________________________________________
reshape_1 (Reshape)              (None, 100)           0           embedding_1[0][0]                
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 1, 100)        395200      embedding_input_2[0][0]          
____________________________________________________________________________________________________
reshape_2 (Reshape)              (None, 100)           0           embedding_2[0][0]                
Total params: 462100
______________________________________________________________________

In [6]:
# Training callbacks:
#  - EarlyStopping watches 'val_loss' with a patience of 2 epochs
#  - ModelCheckpoint writes to 'weights.h5', keeping only the best result so far
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2),
    ModelCheckpoint(filepath='weights.h5', save_best_only=True),
]

# Train for up to 30 epochs, holding out 10% of the samples for validation
# NOTE(review): nb_epoch is the older Keras spelling of this argument; newer
# releases call it `epochs` -- confirm against the installed Keras version.
history = model.fit(
    [Users, Movies],
    Ratings,
    nb_epoch=30,
    validation_split=0.1,
    verbose=2,
    callbacks=callbacks,
)

Train on 90000 samples, validate on 10000 samples
Epoch 1/30
52s - loss: 10.3527 - acc: 0.0498 - val_loss: 5.8085 - val_acc: 0.1429
Epoch 2/30
51s - loss: 4.0265 - acc: 0.2117 - val_loss: 3.0161 - val_acc: 0.2597
Epoch 3/30
52s - loss: 2.4122 - acc: 0.2880 - val_loss: 2.1507 - val_acc: 0.3030
Epoch 4/30
52s - loss: 1.8231 - acc: 0.3193 - val_loss: 1.7660 - val_acc: 0.3272
Epoch 5/30
57s - loss: 1.5448 - acc: 0.3372 - val_loss: 1.5623 - val_acc: 0.3393
Epoch 6/30
74s - loss: 1.3918 - acc: 0.3465 - val_loss: 1.4415 - val_acc: 0.3448
Epoch 7/30
73s - loss: 1.3012 - acc: 0.3528 - val_loss: 1.3663 - val_acc: 0.3502
Epoch 8/30
73s - loss: 1.2395 - acc: 0.3562 - val_loss: 1.3127 - val_acc: 0.3552
Epoch 9/30
73s - loss: 1.1996 - acc: 0.3600 - val_loss: 1.2769 - val_acc: 0.3573
Epoch 10/30
73s - loss: 1.1705 - acc: 0.3626 - val_loss: 1.2500 - val_acc: 0.3602
Epoch 11/30
73s - loss: 1.1495 - acc: 0.3627 - val_loss: 1.2302 - val_acc: 0.3604
Epoch 12/30
73s - loss: 1.1337 - acc: 0.3639 - val_loss:

In [7]:
# Report the lowest recorded validation loss as an RMSE (square root of the
# loss value), together with the 1-based epoch at which it first occurred
min_val_loss = min(history.history['val_loss'])
idx = history.history['val_loss'].index(min_val_loss)
print('Minimum RMSE at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(math.sqrt(min_val_loss)))

Minimum RMSE at epoch 30 = 1.0703


In [8]:
# Build a fresh CFModel with the same arguments as the model trained above
trained_model = CFModel(max_userid, max_movieid, K_FACTORS)
# Restore its parameters from the 'weights.h5' checkpoint file
trained_model.load_weights('weights.h5')

In [9]:
# Display the profile row of the test user (TEST_USER)
users.loc[users['user_id'].eq(TEST_USER)]

Unnamed: 0,user_id,gender,zipcode,age_desc,occ_desc
1,2,M,70072,56+,self-employed


In [10]:
def predict_rating(user_id, movie_id):
    """Return the model's predicted rating for a (user, movie) pair.

    Both IDs are shifted down by one before being handed to
    ``trained_model.rate``; the ratings preview shows the same offset between
    ``user_id``/``movie_id`` and ``user_emb_id``/``movie_emb_id``.

    Args:
        user_id: User ID as found in the ``user_id`` column.
        movie_id: Movie ID as found in the ``movie_id`` column.

    Returns:
        Whatever ``trained_model.rate`` returns for the shifted indices.
    """
    user_index = user_id - 1
    movie_index = movie_id - 1
    return trained_model.rate(user_index, movie_index)

In [11]:
# Ratings the test user actually gave. Rows and columns are selected in a single
# .loc call and then copied, so user_ratings is an independent frame: adding the
# 'prediction' column below no longer assigns into a chained selection of
# `ratings` (the pattern that raises pandas' SettingWithCopyWarning).
user_ratings = ratings.loc[ratings['user_id'] == TEST_USER, ['user_id', 'movie_id', 'rating']].copy()
# Model prediction for every movie the test user rated
user_ratings['prediction'] = user_ratings.apply(lambda x: predict_rating(TEST_USER, x['movie_id']), axis=1)
# Highest actual ratings first, joined with the movie metadata; show the top 20
user_ratings.sort_values(by='rating', 
                         ascending=False).merge(movies, 
                                                on='movie_id', 
                                                how='inner', 
                                                suffixes=['_u', '_m']).head(20)

Unnamed: 0,user_id,movie_id,rating,prediction,title,genres
0,2,1357,5,3.733144,Shine (1996),Drama|Romance
1,2,2236,5,3.72242,Simon Birch (1998),Drama
2,2,3147,5,3.714902,"Green Mile, The (1999)",Drama|Thriller
3,2,1293,5,3.693522,Gandhi (1982),Drama
4,2,110,5,3.67662,Braveheart (1995),Action|Drama|War
5,2,3471,5,3.734725,Close Encounters of the Third Kind (1977),Drama|Sci-Fi
6,2,1945,5,3.725213,On the Waterfront (1954),Crime|Drama
7,2,1225,5,3.809141,Amadeus (1984),Drama
8,2,515,5,3.625121,"Remains of the Day, The (1993)",Drama
9,2,480,5,3.717022,Jurassic Park (1993),Action|Adventure|Sci-Fi


In [12]:
# Candidate movies: every distinct movie_id in `ratings` that the test user has
# not rated. The mask is negated with `~` rather than compared to False, and the
# result is copied so the column assignment below works on an independent frame
# instead of a chained selection of `ratings`.
recommendations = (
    ratings.loc[~ratings['movie_id'].isin(user_ratings['movie_id']), ['movie_id']]
    .drop_duplicates()
    .copy()
)
# Predicted rating of the test user for each candidate movie
recommendations['prediction'] = recommendations.apply(lambda x: predict_rating(TEST_USER, x['movie_id']), axis=1)
# Highest predictions first, joined with the movie metadata; show the top 20
recommendations.sort_values(by='prediction',
                          ascending=False).merge(movies,
                                                 on='movie_id',
                                                 how='inner',
                                                 suffixes=['_u', '_m']).head(20)

Unnamed: 0,movie_id,prediction,title,genres
0,3719,4.039097,Love's Labour's Lost (2000),Comedy|Romance
1,1670,3.971614,Welcome To Sarajevo (1997),Drama|War
2,484,3.961718,Lassie (1994),Adventure|Children's
3,2652,3.959218,"Curse of Frankenstein, The (1957)",Horror
4,3310,3.95576,"Kid, The (1921)",Action
5,1743,3.946225,Arguing the World (1996),Documentary
6,3597,3.937067,Whipped (2000),Comedy
7,1234,3.934803,"Sting, The (1973)",Comedy|Crime
8,1482,3.923779,"Van, The (1996)",Comedy|Drama
9,3496,3.917943,Madame Sousatzka (1988),Drama
