In [1]:
import kimports; reload(kimports); from kimports import *
import kutils; reload(kutils)

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


<module 'kutils' from 'kutils.pyc'>

In [2]:
# Data directory (relative to the notebook's working directory); the
# ratings / movies / tags CSV files are read from here.
path = '../data/movielense/small'

In [53]:
# Load the three tables from <path>/<name>.csv, one DataFrame per file.
ratings, movies, tags = (
    pd.read_csv('{}/{}.csv'.format(path, table_name))
    for table_name in ('ratings', 'movies', 'tags')
)

In [54]:
# Index movies by movieId so a row can be fetched by id with movies.loc[id].
# Guarded so that re-running this cell is a no-op: after the first run
# 'movieId' is the index rather than a column, and an unconditional
# set_index('movieId') would raise KeyError.
if movies.index.name != 'movieId':
    movies = movies.set_index('movieId')

In [63]:
# Title of the row at integer position 31 (iloc is positional, not a movieId lookup).
movies.iloc[31]['title']

'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)'

In [71]:
# Label-based lookup: the row whose index value (movieId) is 164979.
movies.loc[164979]

title     Women of '69, Unboxed
genres              Documentary
Name: 164979, dtype: object

In [72]:
# Raw user / movie ids are remapped to contiguous 0..n-1 indexes (numbered in
# order of first appearance in `ratings`) so the embedding layers only need
# one row per id that actually occurs.
distinct_user_ids = ratings['userId'].unique()
distinct_movie_ids = ratings['movieId'].unique()
userid2idx = dict(zip(distinct_user_ids, range(len(distinct_user_ids))))
movieid2idx = dict(zip(distinct_movie_ids, range(len(distinct_movie_ids))))

# Derived columns: contiguous movie index, human-readable title (looked up in
# `movies` by movieId), and contiguous user index.
ratings['seqMovieId'] = ratings['movieId'].map(lambda movie_id: movieid2idx[movie_id])
ratings['title'] = ratings['movieId'].map(lambda movie_id: movies.loc[movie_id]['title'])
ratings['seqUserId'] = ratings['userId'].map(lambda user_id: userid2idx[user_id])

In [73]:
# Length of each user / movie embedding vector used by the models below.
num_factors = 50

# Random ~80/20 train/validation split over rating rows. A dedicated, seeded
# generator makes the split (and therefore the sample counts and losses
# reported by fit) identical on every run, without touching NumPy's global
# random state.
split_rng = np.random.RandomState(42)
mask = split_rng.rand(len(ratings)) < 0.8
train_ratings = ratings[mask]
valid_ratings = ratings[~mask]

In [74]:
def fit(model, nb_epoch, lr):
    """Train `model` on the training split, validating on the held-out split.

    Reads the notebook-level `train_ratings` and `valid_ratings` frames; the
    model is fed `[seqUserId, seqMovieId]` and fitted against `rating`.

    Args:
        model: a compiled Keras model taking [user index, movie index] inputs.
        nb_epoch: number of epochs to train for in this call.
        lr: learning rate to use for this call.

    Returns:
        None. Progress is printed by Keras.
    """
    # Update the optimizer's learning-rate variable in place. Rebinding the
    # attribute (`model.optimizer.lr = lr`) would swap the backend variable for
    # a plain float; the training function Keras compiles on the first fit()
    # keeps whatever it captured, so a later call asking for a different lr
    # would silently keep training at the old rate.
    keras.backend.set_value(model.optimizer.lr, lr)
    model.fit([train_ratings['seqUserId'], train_ratings['seqMovieId']], train_ratings['rating'],
          validation_data=([valid_ratings['seqUserId'], valid_ratings['seqMovieId']], valid_ratings['rating']),
          batch_size=64, nb_epoch=nb_epoch)

# Basic model, which overfits

In [7]:
# Plain matrix factorisation: the prediction is the dot product of a user
# embedding and a movie embedding, each num_factors wide.
n_users = ratings['seqUserId'].nunique()
n_movies = ratings['seqMovieId'].nunique()

movie_input = keras.layers.Input(shape=(1,), dtype='int64', name='movie_input')
user_input = keras.layers.Input(shape=(1,), dtype='int64', name='user_input')

user_embedding = keras.layers.Embedding(n_users, num_factors, input_length=1)(user_input)
movie_embedding = keras.layers.Embedding(n_movies, num_factors, input_length=1)(movie_input)

user_movie_dot = keras.layers.merge([user_embedding, movie_embedding], mode='dot')
layers = keras.layers.Flatten()(user_movie_dot)

model = keras.models.Model([user_input, movie_input], layers)
model.compile(optimizer=keras.optimizers.Adam(0.001), loss='mse')

In [8]:
# One epoch with lr=0.001, the same rate the optimizer was compiled with.
fit(model, 1, 0.001)

Train on 80080 samples, validate on 19924 samples
Epoch 1/1


In [9]:
# Ten more epochs on the same model, asking for a 10x larger learning rate (0.01).
fit(model, 10, 0.01)

Train on 80080 samples, validate on 19924 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Batch normalization

In [10]:
# Same dot-product model, with the dot product passed through
# BatchNormalization, flattened, and then given 5% dropout on the output.
n_users = ratings['seqUserId'].nunique()
n_movies = ratings['seqMovieId'].nunique()

movie_input = keras.layers.Input(shape=(1,), dtype='int64', name='movie_input')
user_input = keras.layers.Input(shape=(1,), dtype='int64', name='user_input')

user_embedding = keras.layers.Embedding(n_users, num_factors, input_length=1)(user_input)
movie_embedding = keras.layers.Embedding(n_movies, num_factors, input_length=1)(movie_input)

user_movie_dot = keras.layers.merge([user_embedding, movie_embedding], mode='dot')
normalized = keras.layers.normalization.BatchNormalization()(user_movie_dot)
flattened = keras.layers.Flatten()(normalized)
layers = keras.layers.core.Dropout(0.05)(flattened)

model = keras.models.Model([user_input, movie_input], layers)
model.compile(optimizer=keras.optimizers.Adam(0.001), loss='mse')

In [11]:
# One epoch with lr=0.001, the same rate the optimizer was compiled with.
fit(model, 1, 0.001)

Train on 80080 samples, validate on 19924 samples
Epoch 1/1


In [12]:
# Ten more epochs on the same model, asking for a 10x larger learning rate (0.01).
fit(model, 10, 0.01)

Train on 80080 samples, validate on 19924 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Regularization in embedding layers

In [13]:
# Dot-product model with an L2 weight penalty (0.01) on both embedding
# matrices, plus BatchNormalization on the dot product. Each embedding is
# given its own regularizer instance.
n_users = ratings['seqUserId'].nunique()
n_movies = ratings['seqMovieId'].nunique()

movie_input = keras.layers.Input(shape=(1,), dtype='int64', name='movie_input')
user_input = keras.layers.Input(shape=(1,), dtype='int64', name='user_input')

user_embedding = keras.layers.Embedding(
    n_users, num_factors, input_length=1,
    W_regularizer=keras.regularizers.l2(0.01))(user_input)
movie_embedding = keras.layers.Embedding(
    n_movies, num_factors, input_length=1,
    W_regularizer=keras.regularizers.l2(0.01))(movie_input)

user_movie_dot = keras.layers.merge([user_embedding, movie_embedding], mode='dot')
normalized = keras.layers.normalization.BatchNormalization()(user_movie_dot)
layers = keras.layers.Flatten()(normalized)

model = keras.models.Model([user_input, movie_input], layers)
model.compile(optimizer=keras.optimizers.Adam(0.001), loss='mse')

In [14]:
# One epoch with lr=0.001, the same rate the optimizer was compiled with.
fit(model, 1, 0.001)

Train on 80080 samples, validate on 19924 samples
Epoch 1/1


In [15]:
# Ten more epochs on the same model, asking for a 10x larger learning rate (0.01).
fit(model, 10, 0.01)

Train on 80080 samples, validate on 19924 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Bias units

In [75]:
# Matrix factorisation with per-user and per-movie bias terms:
#   prediction = dot(user_vector, movie_vector) + user_bias + movie_bias
n_users = ratings['seqUserId'].nunique()
n_movies = ratings['seqMovieId'].nunique()

movie_input = keras.layers.Input(shape=(1,), dtype='int64', name='movie_input')
user_input = keras.layers.Input(shape=(1,), dtype='int64', name='user_input')

# Latent-factor embeddings, each with its own L2 penalty (0.001).
user_embedding = keras.layers.Embedding(
    n_users, num_factors, input_length=1,
    W_regularizer=keras.regularizers.l2(0.001))(user_input)
movie_embedding = keras.layers.Embedding(
    n_movies, num_factors, input_length=1,
    W_regularizer=keras.regularizers.l2(0.001))(movie_input)

# Biases are unregularized width-1 embeddings, flattened so they can be
# summed with the flattened dot product.
b_user_lookup = keras.layers.Embedding(n_users, 1, input_length=1)(user_input)
b_movie_lookup = keras.layers.Embedding(n_movies, 1, input_length=1)(movie_input)
b_user_embedding = keras.layers.Flatten()(b_user_lookup)
b_movie_embedding = keras.layers.Flatten()(b_movie_lookup)

user_movie_dot = keras.layers.merge([user_embedding, movie_embedding], mode='dot')
interaction = keras.layers.Flatten()(user_movie_dot)
with_user_bias = keras.layers.merge([interaction, b_user_embedding], mode='sum')
layers = keras.layers.merge([with_user_bias, b_movie_embedding], mode='sum')

# (A BatchNormalization layer on the output is left disabled here.)
#layers = keras.layers.normalization.BatchNormalization()(layers)

model = keras.models.Model([user_input, movie_input], layers)
model.compile(optimizer=keras.optimizers.Adam(0.001), loss='mse')

In [76]:
# One epoch with lr=0.001, the same rate the optimizer was compiled with.
fit(model, 1, 0.001)

Train on 80031 samples, validate on 19973 samples
Epoch 1/1


In [77]:
# Ten more epochs on the same model, asking for a 10x larger learning rate (0.01).
fit(model, 10, 0.01)

Train on 80031 samples, validate on 19973 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Look at what the model learned

## First, the movie bias

In [90]:
# Wrap the movie-bias branch in its own model (movie index in, bias out) and
# evaluate it once for every distinct movie index present in the ratings.
distinct_movie_idxs = ratings['seqMovieId'].unique()
bias_model = keras.models.Model(movie_input, b_movie_embedding)
bias_preds = bias_model.predict(distinct_movie_idxs)

In [94]:
# bias_preds holds one row per distinct seqMovieId, in order of first
# appearance in `ratings`. Take the titles in that same per-movie order.
# Deduplicating on the title string instead would yield fewer entries whenever
# two different movies share a title, and every title after the first such
# duplicate would be paired with the wrong movie's bias.
titles_by_movie = ratings.drop_duplicates(subset='seqMovieId')['title']
assert len(titles_by_movie) == len(bias_preds), "titles and bias predictions are misaligned"

# (title, bias) pairs sorted from lowest to highest learned bias.
movie_ratings = sorted(zip(titles_by_movie, bias_preds), key=lambda x: x[1])

In [95]:
# The ten movies with the lowest learned bias (list is sorted ascending).
movie_ratings[:10]

[('Spy Kids 3-D: Game Over (2003)', array([-0.57035518], dtype=float32)),
 ("Joe's Apartment (1996)", array([-0.511823], dtype=float32)),
 ('Rage: Carrie 2, The (1999)', array([-0.47340706], dtype=float32)),
 ('Baby Geniuses (1999)', array([-0.44225287], dtype=float32)),
 ('Beverly Hills Chihuahua (2008)', array([-0.43715361], dtype=float32)),
 ('In the Army Now (1994)', array([-0.42980075], dtype=float32)),
 ('Vampires Suck (2010)', array([-0.42295581], dtype=float32)),
 ('Carnosaur 3: Primal Species (1996)', array([-0.40701455], dtype=float32)),
 ('Jaws: The Revenge (1987)', array([-0.40035567], dtype=float32)),
 ('Tekkonkinkreet (Tekkon kinkur\xc3\xaeto) (2006)',
  array([-0.3926492], dtype=float32))]

In [96]:
# The ten movies with the highest learned bias (list is sorted ascending).
movie_ratings[-10:]

[('Forrest Gump (1994)', array([ 2.00717521], dtype=float32)),
 ('Silence of the Lambs, The (1991)', array([ 2.01407266], dtype=float32)),
 ('Godfather: Part II, The (1974)', array([ 2.02144289], dtype=float32)),
 ('Fargo (1996)', array([ 2.07270026], dtype=float32)),
 ('Star Wars: Episode IV - A New Hope (1977)',
  array([ 2.07493281], dtype=float32)),
 ("Schindler's List (1993)", array([ 2.09836292], dtype=float32)),
 ('Usual Suspects, The (1995)', array([ 2.1232934], dtype=float32)),
 ('Pulp Fiction (1994)', array([ 2.16603613], dtype=float32)),
 ('Godfather, The (1972)', array([ 2.2109468], dtype=float32)),
 ('Shawshank Redemption, The (1994)', array([ 2.42214465], dtype=float32))]