In [1]:
try:
  %tensorflow_version 2.x  # Colab only.
except Exception:
  pass

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys, os

`%tensorflow_version` only switches the major version: `1.x` or `2.x`.
You set: `2.x  # Colab only.`. This will be interpreted as: `2.x`.


TensorFlow 2.x selected.


In [4]:
!wget -nc http://files.grouplens.org/datasets/movielens/ml-25m.zip

--2019-12-14 22:32:36--  http://files.grouplens.org/datasets/movielens/ml-25m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 261978986 (250M) [application/zip]
Saving to: ‘ml-25m.zip’


2019-12-14 22:32:56 (13.0 MB/s) - ‘ml-25m.zip’ saved [261978986/261978986]



In [5]:
!unzip -n ml-25m.zip

Archive:  ml-25m.zip
   creating: ml-25m/
  inflating: ml-25m/tags.csv         
  inflating: ml-25m/links.csv        
  inflating: ml-25m/README.txt       
  inflating: ml-25m/ratings.csv      
  inflating: ml-25m/genome-tags.csv  
  inflating: ml-25m/genome-scores.csv  
  inflating: ml-25m/movies.csv       


In [6]:
!ls

ml-25m	ml-25m.zip  sample_data


In [0]:
df = pd.read_csv('ml-25m/ratings.csv')

In [8]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [11]:
#userID not numbered
df.userId = pd.Categorical(df.userId)
df['new_user_id'] = df.userId.cat.codes
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,new_user_id
0,1,296,5.0,1147880044,0
1,1,306,3.5,1147868817,0
2,1,307,5.0,1147868828,0
3,1,665,5.0,1147878820,0
4,1,899,3.5,1147868510,0


In [15]:
df.movieId = pd.Categorical(df.movieId)
df['new_movie_id'] = df.movieId.cat.codes
df.tail()

Unnamed: 0,userId,movieId,rating,timestamp,new_user_id,new_movie_id
25000090,162541,50872,4.5,1240953372,162540,11359
25000091,162541,55768,2.5,1240951998,162540,11925
25000092,162541,56176,2.0,1240950697,162540,11972
25000093,162541,58559,4.0,1240953434,162540,12216
25000094,162541,63876,5.0,1240952515,162540,12819


In [18]:
user_ids = df['new_user_id'].values
movie_ids = df['new_movie_id'].values
ratings = df['rating'].values

N = len(set(user_ids))
M = len(set(movie_ids))

#Embedding dim
K = 10

print(N, M, K)

162541 59047 10


In [0]:
# Make the network

# since we will concat user and movie ids, we need the functional, and not sequential API

# user input
u = Input(shape = (1,))
# movie input
m = Input(shape = (1,))
# user embedding
u_emb = Embedding(N, K)(u)
# movie embedding
m_emb = Embedding(M, K)(m)
# Flatten embeddings
u_emb = Flatten()(u_emb)
m_emb = Flatten()(m_emb)
# Concatenate user-movie embeddings into a feature vector
x = Concatenate()([u_emb,m_emb])
# from now on, it's just a regular ANN
x = Dense(1024, activation = 'relu')(x)
# output layer
x = Dense(1)(x)

In [0]:
# Compile

model = Model(inputs = [u, m], outputs = x)
model.compile(loss = 'mse', opimizer = SGD(lr = .08, momentum = .9))

In [0]:
# split the data
user_ids, movie_ids, ratings = shuffle(user_ids, movie_ids, ratings)
Ntrain = int(.8*len(ratings))

train_user = user_ids[:Ntrain]
train_movie = movie_ids[:Ntrain]
train_ratings = ratings[:Ntrain]

test_user = user_ids[Ntrain:]
test_movie = movie_ids[Ntrain:]
test_ratings = ratings[Ntrain:]

# center the ratings
avg_rating = train_ratings.mean()
# reduce by the mean
train_ratings = train_ratings - avg_rating
test_ratings = test_ratings - avg_rating

In [36]:
#tf.keras.models.load_model('model.h5')

history = model.fit([train_user, train_movie], train_ratings,
                    epochs = 1,
                    batch_size = 1024,
                    validation_data = ([test_user, test_movie], test_ratings))

Train on 20000076 samples, validate on 5000019 samples


In [0]:
tf.keras.models.save_model(model, 'model.h5')

In [38]:
RMSE = np.sqrt(.66)
RMSE

0.812403840463596

Not bad according to 
https://datascience.stackexchange.com/questions/29740/benchmark-result-for-movielens-dataset :
RMSE = .8 - .81