In [1]:
import numpy as np
import pandas as pd
from time import time

from keras.models import Model
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense , merge
from keras.layers.merge import dot,add
from keras.utils.vis_utils import model_to_dot
from keras.callbacks import ReduceLROnPlateau
from keras.optimizers import Adam,SGD,Adagrad,Adadelta,RMSprop
from keras.layers import Dropout, Flatten,Activation,Input,Embedding
print("Packages imported")

Packages imported


In [2]:
print('reading rating data...')
tic = time()
data = np.loadtxt('./ml-1m/ratings.dat', skiprows=0,  delimiter='::').astype('int32')
print("reading user data...")
datContent = [i.strip().split('::') for i in open("./ml-1m/users.dat",encoding="ISO-8859-1").readlines()]
user_data = pd.DataFrame(datContent,columns=['userId','gender','occupation','age','zip'])
print('data read in', time() - tic, 'seconds')

reading rating data...
reading user data...
data read in 5.810754776000977 seconds


In [3]:
df =pd.DataFrame(data)
df.columns=['userId','movieId','rating','timestampe']
df.head()

Unnamed: 0,userId,movieId,rating,timestampe
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
user_data.userId=user_data.userId.astype(int)
df = df.merge(user_data,on='userId',how='left')
df.head()

Unnamed: 0,userId,movieId,rating,timestampe,gender,occupation,age,zip
0,1,1193,5,978300760,F,1,10,48067
1,1,661,3,978302109,F,1,10,48067
2,1,914,3,978301968,F,1,10,48067
3,1,3408,4,978300275,F,1,10,48067
4,1,2355,5,978824291,F,1,10,48067


In [5]:
df.age.value_counts()

4     131032
0     130499
7     105425
1      85351
17     72816
20     60397
12     57214
2      50068
14     49109
16     46021
6      37205
3      31623
10     23290
15     22951
5      21850
11     20563
19     14904
13     13754
18     12086
9      11345
8       2706
Name: age, dtype: int64

In [6]:
users = df.userId.unique()
movies = df.movieId.unique()
occupations = df.occupation.unique()
gender = df.gender.unique()
age = df.age.unique()
zip_ = df.zip.unique()


userid2idx = {o:i for i,o in enumerate(users)}
movieid2idx = {o:i for i,o in enumerate(movies)}
occupationid2idx = {o:i for i,o in enumerate(occupations)}

zipid2idx = {o:i for i,o in enumerate(zip_)}
genderid2idx = {o:i for i,o in enumerate(gender)}
ageid2idx = {o:i for i,o in enumerate(age)}


In [7]:
df['userId'] = df['userId'].apply(lambda x: userid2idx[x])
df['movieId'] = df['movieId'].apply(lambda x: movieid2idx[x])
df['occupation'] = df['occupation'].apply(lambda x: occupationid2idx[x])

df['zip'] = df['zip'].apply(lambda x: zipid2idx[x])
df['gender'] = df['gender'].apply(lambda x: genderid2idx[x])
df['age'] = df['age'].apply(lambda x: ageid2idx[x])




In [8]:
df.head()

Unnamed: 0,userId,movieId,rating,timestampe,gender,occupation,age,zip
0,0,0,5,978300760,0,0,0,0
1,0,1,3,978302109,0,0,0,0
2,0,2,3,978301968,0,0,0,0
3,0,3,4,978300275,0,0,0,0
4,0,4,5,978824291,0,0,0,0


In [9]:
split = np.random.rand(len(df)) < 0.8
train = df[split]
valid = df[~split]
print(train.shape , valid.shape)

(801104, 8) (199105, 8)


In [10]:
n_movies=len(df['movieId'].unique())
n_users=len(df['userId'].unique())
n_occu=len(df['occupation'].unique())

n_gender=len(df['gender'].unique())
n_zip=len(df['zip'].unique())
n_age=len(df['age'].unique())

n_latent_factors=64  

In [11]:
user_input=Input(shape=(1,),name='user_input',dtype='int64')
user_embedding=Embedding(n_users,n_latent_factors,name='user_embedding')(user_input)
user_vec =Flatten(name='FlattenUsers')(user_embedding)

In [12]:
user_occu =Input(shape=(1,),name='user_occu',dtype='int64')
occu_embedding=Embedding(n_occu,n_latent_factors,name='occu_embedding')(user_occu)
occu_vec =Flatten(name='FlattenOccu')(occu_embedding)

In [13]:
movie_input=Input(shape=(1,),name='movie_input',dtype='int64')
movie_embedding=Embedding(n_movies,n_latent_factors,name='movie_embedding')(movie_input)
movie_vec=Flatten(name='FlattenMovies')(movie_embedding)


In [14]:
gender_input=Input(shape=(1,),name='gender_input',dtype='int64')
gender_embedding=Embedding(n_gender,n_latent_factors,name='gender_embedding')(gender_input)
gender_vec=Flatten(name='FlattenGender')(gender_embedding)


In [15]:
zip_input=Input(shape=(1,),name='zip_input',dtype='int64')
zip_embedding=Embedding(n_zip ,n_latent_factors,name='zip_embedding')(zip_input)
zip_vec=Flatten(name='FlattenZip')(zip_embedding)


In [16]:
age_input=Input(shape=(1,),name='age_input',dtype='int64')
age_embedding=Embedding(n_age ,n_latent_factors,name='age_embedding')(age_input)
age_vec=Flatten(name='FlattenAge')(age_embedding)


In [17]:
sim_item_user=dot([user_vec,movie_vec],name='Simalarity-Dot-Product',axes=1)
sim_occ_user=dot([user_vec,occu_vec],name='Simalarity-Dot-Product-Occu',axes=1)

sim_gender_user=dot([user_vec,gender_vec],name='Simalarity-Dot-Product-gender',axes=1)
sim_age_user=dot([user_vec,age_vec],name='Simalarity-Dot-Product-age',axes=1)

add_layer_sim = add([sim_item_user,sim_occ_user,sim_gender_user,sim_age_user])


model =Model([user_input, movie_input,user_occu,gender_input,age_input],add_layer_sim)
model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_occu (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
gender_input (InputLayer)       [(None, 1)]          0                                            
______________________________________________________________________________________________

In [18]:
model.compile(optimizer=Adam(lr=1e-4),loss='mse')
train.shape
batch_size=128
epochs=10

History = model.fit([train.userId,train.movieId,train.occupation,train.gender,train.age],train.rating, batch_size=batch_size,
                    epochs =epochs, 
                    validation_data = ([valid.userId,valid.movieId,valid.occupation,valid.gender,valid.age],valid.rating),
                              verbose = 1)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
