In [14]:
import numpy as np
import pandas as pd
from time import time

from keras.models import Model
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense , merge
from keras.layers.merge import dot,add
from keras.utils.vis_utils import model_to_dot
from keras.callbacks import ReduceLROnPlateau
from keras.optimizers import Adam,SGD,Adagrad,Adadelta,RMSprop
from keras.layers import Dropout, Flatten,Activation,Input,Embedding
print("Packages imported")

Packages imported


In [15]:
print('reading rating data...')
tic = time()
data = np.loadtxt('./ml-1m/ratings.dat', skiprows=0,  delimiter='::').astype('int32')
print("reading user data...")
datContent = [i.strip().split('::') for i in open("./ml-1m/users.dat",encoding="ISO-8859-1").readlines()]
user_data = pd.DataFrame(datContent,columns=['userId','gender','occupation','age','zip'])
print('data read in', time() - tic, 'seconds')

reading rating data...
reading user data...
data read in 5.817676782608032 seconds


In [16]:
df =pd.DataFrame(data)
df.columns=['userId','movieId','rating','timestampe']
df.head()

Unnamed: 0,userId,movieId,rating,timestampe
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [18]:
user_data.userId=user_data.userId.astype(int)
df = df.merge(user_data,on='userId',how='left')
df.head()

Unnamed: 0,userId,movieId,rating,timestampe,gender_x,occupation_x,age_x,zip_x,gender_y,occupation_y,age_y,zip_y
0,1,1193,5,978300760,F,1,10,48067,F,1,10,48067
1,1,661,3,978302109,F,1,10,48067,F,1,10,48067
2,1,914,3,978301968,F,1,10,48067,F,1,10,48067
3,1,3408,4,978300275,F,1,10,48067,F,1,10,48067
4,1,2355,5,978824291,F,1,10,48067,F,1,10,48067


In [19]:
users = df.userId.unique()
movies = df.movieId.unique()

userid2idx = {o:i for i,o in enumerate(users)}
movieid2idx = {o:i for i,o in enumerate(movies)}


df['userId'] = df['userId'].apply(lambda x: userid2idx[x])
df['movieId'] = df['movieId'].apply(lambda x: movieid2idx[x])

In [21]:
split = np.random.rand(len(df)) < 0.8
train = df[split]
valid = df[~split]
print(train.shape , valid.shape)

(800151, 12) (200058, 12)


In [22]:
n_movies=len(df['movieId'].unique())
n_users=len(df['userId'].unique())
n_latent_factors=64  

In [33]:
user_input=Input(shape=(1,),name='user_input',dtype='int64')
user_embedding=Embedding(n_users,n_latent_factors,name='user_embedding')(user_input)
user_vec =Flatten(name='FlattenUsers')(user_embedding)

In [24]:
movie_input=Input(shape=(1,),name='movie_input',dtype='int64')
movie_embedding=Embedding(n_movies,n_latent_factors,name='movie_embedding')(movie_input)
movie_vec=Flatten(name='FlattenMovies')(movie_embedding)


In [25]:
sim_item_user=dot([user_vec,movie_vec],name='Simalarity-Dot-Product',axes=1)
model =Model([user_input, movie_input],sim_item_user)
model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 64)        386560      user_input[0][0]                 
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 64)        237184      movie_input[0][0]                
______________________________________________________________________________________________

In [27]:
model.compile(optimizer=Adam(lr=1e-4),loss='mse')
train.shape
batch_size=128
epochs=10

History = model.fit([train.userId,train.movieId,],train.rating, batch_size=batch_size,
                              epochs =epochs, validation_data = ([valid.userId,valid.movieId],valid.rating),
                              verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
model.predict([pd.Series([0]*len(movieid2idx)),pd.Series(movieid2idx.values())])

array([[4.6896791e+00],
       [3.7299058e+00],
       [4.3715343e+00],
       ...,
       [3.9169991e-01],
       [3.9832401e-01],
       [4.2947684e-03]], dtype=float32)