In [0]:
import numpy as np
import pandas as pd
import os
import tensorflow as T
import keras
from keras import backend as K
from keras import initializers
from keras.initializers import RandomNormal
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape,  Flatten, Dropout
from keras.optimizers import Adagrad, Adam, SGD, RMSprop, Adamax
from keras.regularizers import l2
from keras.layers import Multiply, Concatenate
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from time import time
import multiprocessing as mp
import sys
import math
import argparse
import matplotlib.pyplot as plt
from IPython.display import display, HTML

Using TensorFlow backend.


In [0]:
def binning(col, cut_points, labels=None):
    """Bucket a numeric column into intervals delimited by ``cut_points``.

    The column's own minimum and maximum are used as the outermost bin
    edges, so ``len(cut_points) + 1`` bins are produced and the smallest
    value is included in the first bin (``include_lowest=True``).

    Args:
        col: pandas Series (or any object with ``min``/``max`` that
            ``pd.cut`` accepts) holding the values to bucket.
        cut_points: interior bin edges. Any sequence is accepted (list,
            tuple, NumPy array). Together with the column min/max the edges
            are passed to ``pd.cut`` unchanged, so they are expected to be
            increasing and to lie between the column's min and max.
        labels: one label per bin. When omitted (``None``, ``False`` or an
            empty sequence) the bins are labelled ``0 .. len(cut_points)``.

    Returns:
        The result of ``pd.cut`` for ``col`` over the computed edges.
    """
    # Normalise to a list so tuples / arrays can be concatenated with the
    # outer edges (list + tuple would raise TypeError).
    inner_edges = list(cut_points)
    break_points = [col.min()] + inner_edges + [col.max()]

    # Explicit emptiness test: truth-testing a NumPy array of labels raises
    # "truth value of an array is ambiguous".
    if labels is None or labels is False or len(labels) == 0:
        labels = list(range(len(inner_edges) + 1))

    return pd.cut(col, bins=break_points, labels=labels, include_lowest=True)

In [0]:
def get_gmf_model_occupation(num_users, num_items,num_occupations,latent_oc_dim, latent_dim,do,num_genre):
    """Build the (uncompiled) GMF branch with genre and occupation side features.

    The user and item embeddings are multiplied element-wise; the product is
    concatenated with the occupation embedding and the raw genre vector,
    passed through dropout and fed to a single ReLU unit.

    Args:
        num_users: size of the user embedding table (``input_dim``).
        num_items: size of the item embedding table (``input_dim``).
        num_occupations: size of the occupation embedding table.
        latent_oc_dim: width of the occupation embedding.
        latent_dim: width of the user and item embeddings.
        do: dropout rate applied to the concatenated feature vector.
        num_genre: length of the genre input vector.

    Returns:
        A Keras ``Model`` whose inputs are, in order,
        ``[user_input, item_input, genre_input, occupation_input]`` and whose
        output is the ``prediction`` layer (shape ``(batch, 1)``).
    """
    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')
    genre_input = Input(shape=(num_genre,), dtype='float32', name='genre_input')
    # NOTE(review): declared float32 although it feeds an Embedding lookup;
    # kept as-is so callers' input dtypes stay valid - confirm this is intended.
    occupation_input = Input(shape=(1,), dtype='float32', name='occupation_input')

    user_embedding = Embedding(input_dim=num_users, output_dim=latent_dim,
                               name='user_embedding', input_length=1)
    item_embedding = Embedding(input_dim=num_items, output_dim=latent_dim,
                               name='item_embedding', input_length=1)
    occupation_embedding = Embedding(input_dim=num_occupations, output_dim=latent_oc_dim,
                                     name='occupation_embedding', input_length=1)

    # Embedding outputs are (batch, 1, dim); flatten them to (batch, dim).
    user_latent = Flatten()(user_embedding(user_input))
    item_latent = Flatten()(item_embedding(item_input))
    occupation_latent = Flatten()(occupation_embedding(occupation_input))

    # Element-wise product of user and item embeddings
    interaction = Multiply()([user_latent, item_latent])

    features = Concatenate()([interaction, occupation_latent, genre_input])
    # The dropout output must be what the prediction layer consumes;
    # previously the layer was created but bypassed, so `do` had no effect.
    features = Dropout(do)(features)
    prediction = Dense(1, activation='relu', kernel_initializer='lecun_uniform',
                       name='prediction')(features)

    # `inputs=` / `outputs=` are the Keras 2 keyword names.
    return Model(inputs=[user_input, item_input, genre_input, occupation_input],
                 outputs=prediction)

In [0]:
def get_ncf_model_genre(num_users, num_items,num_genre, latent_dim,latent_genre_dim,hidden_dim,do,num_occupations,latent_oc_dim):
    """Build the (uncompiled) MLP / NCF branch with genre and occupation side features.

    User, item and occupation embeddings are concatenated with the raw genre
    vector and passed through two ReLU hidden layers (``hidden_dim`` and 10
    units), each preceded and followed by dropout, ending in a single ReLU
    unit.

    Args:
        num_users: size of the user embedding table (``input_dim``).
        num_items: size of the item embedding table (``input_dim``).
        num_genre: length of the genre input vector.
        latent_dim: width of the user and item embeddings.
        latent_genre_dim: currently unused (genres are fed as a raw vector,
            not embedded); kept for signature compatibility.
        hidden_dim: number of units in the first hidden layer.
        do: dropout rate used for every dropout layer.
        num_occupations: size of the occupation embedding table.
        latent_oc_dim: width of the occupation embedding.

    Returns:
        A Keras ``Model`` whose inputs are, in order,
        ``[user_input_ncf, item_input_ncf, genre_input_ncf,
        occupation_input_ncf]`` and whose output is the ``prediction_ncf``
        layer (shape ``(batch, 1)``).
    """
    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name='user_input_ncf')
    item_input = Input(shape=(1,), dtype='int32', name='item_input_ncf')
    genre_input = Input(shape=(num_genre,), dtype='float32', name='genre_input_ncf')
    # NOTE(review): declared float32 although it feeds an Embedding lookup;
    # kept as-is so callers' input dtypes stay valid - confirm this is intended.
    occupation_input = Input(shape=(1,), dtype='float32', name='occupation_input_ncf')

    user_embedding = Embedding(input_dim=num_users, output_dim=latent_dim,
                               name='user_embedding_ncf', input_length=1)
    item_embedding = Embedding(input_dim=num_items, output_dim=latent_dim,
                               name='item_embedding_ncf', input_length=1)
    occupation_embedding = Embedding(input_dim=num_occupations, output_dim=latent_oc_dim,
                                     name='occupation_embedding_ncf', input_length=1)

    # Embedding outputs are (batch, 1, dim); flatten them to (batch, dim).
    user_latent = Flatten()(user_embedding(user_input))
    item_latent = Flatten()(item_embedding(item_input))
    occupation_latent = Flatten()(occupation_embedding(occupation_input))

    # Concatenate user and item embeddings with the genre vector and the
    # occupation embedding.
    features = Concatenate()([user_latent, item_latent, genre_input, occupation_latent])
    # The first dropout must feed the first hidden layer; previously it was
    # created but bypassed.
    features = Dropout(do)(features)
    hidden = Dense(hidden_dim, activation='relu')(features)
    hidden = Dropout(do)(hidden)
    hidden = Dense(10, activation='relu')(hidden)
    hidden = Dropout(do)(hidden)
    prediction = Dense(1, activation='relu', kernel_initializer='lecun_uniform',
                       name='prediction_ncf')(hidden)

    # `inputs=` / `outputs=` are the Keras 2 keyword names.
    return Model(inputs=[user_input, item_input, genre_input, occupation_input],
                 outputs=prediction)

In [0]:
def combination_model(num_users, num_items,num_occupations,num_genre,latent_oc_dim, latent_dim,do,hidden_dim,latent_genre_dim):
    """Combine the NCF and GMF branches into one (uncompiled) rating model.

    Both branches are built with their own embeddings; their scalar outputs
    are concatenated and mapped to the final prediction by one ReLU unit.

    Args:
        num_users: size of each user embedding table.
        num_items: size of each item embedding table.
        num_occupations: size of each occupation embedding table.
        num_genre: length of the genre input vector.
        latent_oc_dim: width of the occupation embeddings.
        latent_dim: width of the user and item embeddings.
        do: dropout rate forwarded to both branches.
        hidden_dim: width of the NCF branch's first hidden layer.
        latent_genre_dim: forwarded to the NCF branch builder.

    Returns:
        A Keras ``Model`` whose inputs are the NCF branch inputs followed by
        the GMF branch inputs (eight tensors in total) and whose output is
        the ``prediction_comb`` layer.
    """
    ncf_model = get_ncf_model_genre(num_users, num_items, num_genre, latent_dim,
                                    latent_genre_dim, hidden_dim, do,
                                    num_occupations, latent_oc_dim)
    gmf_model = get_gmf_model_occupation(num_users, num_items, num_occupations,
                                         latent_oc_dim, latent_dim, do, num_genre)

    merged = Concatenate()([ncf_model.output, gmf_model.output])
    prediction = Dense(1, activation='relu', kernel_initializer='lecun_uniform',
                       name='prediction_comb')(merged)

    # Order matters: callers must feed the NCF inputs first, then the GMF
    # inputs. (Avoids shadowing the built-in `list`.)
    combined_inputs = list(ncf_model.inputs) + list(gmf_model.inputs)

    # `inputs=` / `outputs=` are the Keras 2 keyword names.
    return Model(inputs=combined_inputs, outputs=prediction)




In [0]:
!pip install -U -q PyDrive ## you will have install for every colab session

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import files

# 1. Authenticate and create the PyDrive client.
# The statements below are order-dependent: the Colab user is authenticated
# first, then the application-default credentials are attached to the
# GoogleAuth object, and only then is the Drive client built from it.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client used by later cells to fetch the data files.
drive = GoogleDrive(gauth)

In [0]:
# Local file names used by the notebook.

# Rating splits (train / test).
RATING_DATA_FILE_TRAIN, RATING_DATA_FILE_TEST = 'u1.base', 'u1.test'

# Side-information tables: users, items and the occupation vocabulary.
RATING_DATA_FILE_USER, RATING_DATA_FILE_ITEM = 'u.user', 'u.item'
RATING_DATA_FILE_OCCUPATION = 'u.occupation'

# Derived / intermediate data files.
RATING_DATA_GEN_FILE, RATINGS_GEN_CSV_FILE = 'u_gen.data', 'u_genr.data'
RATINGS_CSV_FILE_NORM = 'u_norm.data'

# Model checkpoint: full file name and the stem used to build variants.
MODEL_WEIGHTS_FILE_CORE = 'u_emb_weights'
MODEL_WEIGHTS_FILE = 'u_emb_weights.h5'

In [0]:
# Fetch the five data files from Google Drive by their file ids.
# Each pair of lines creates a Drive file handle for a hard-coded id and then
# calls GetContentFile with the local file name constant that later cells read.
# NOTE(review): the ids are hard-coded; access presumably depends on the
# authenticated account being allowed to read them - confirm before sharing.

# Training split of the ratings.
rating_file_import_train = drive.CreateFile({'id':'1SClzmjAoOCDY5kYWGHaA27RCkzqupddU'})
rating_file_import_train.GetContentFile(RATING_DATA_FILE_TRAIN)
# Test split of the ratings.
rating_file_import_test = drive.CreateFile({'id':'1ELSUkW4DWU7TEWSto6nDBTewWLwHVCh2'})
rating_file_import_test.GetContentFile(RATING_DATA_FILE_TEST)
# User table.
rating_file_import_user = drive.CreateFile({'id':'1m2UzDHT1lsLNE8TB6r3c7IIk2-iqbYfi'})
rating_file_import_user.GetContentFile(RATING_DATA_FILE_USER)
# Item (movie) table.
rating_file_import_item = drive.CreateFile({'id':'1F4E6SnymhtDs1F0kx0hR8cvBy6BquRrb'})
rating_file_import_item.GetContentFile(RATING_DATA_FILE_ITEM)
# Occupation vocabulary.
rating_file_import_occupation = drive.CreateFile({'id':'1POeYU9Vq67BWOZ0zBxg1poUiKhGXtaEZ'})
rating_file_import_occupation.GetContentFile(RATING_DATA_FILE_OCCUPATION)


In [0]:
# Load the training ratings (tab-separated, no header row).
ratings = pd.read_csv(
    RATING_DATA_FILE_TRAIN,
    names=['userid', 'movieid', 'rating', 'timestamp'],
    sep='\t',
    encoding='latin-1',
    engine='python',
)

# Largest raw ids seen in the training split.
max_userid = ratings['userid'].max()
max_movieid = ratings['movieid'].max()

# Raw ids start at 1; shift them to 0-based indices for the embedding layers.
ratings['user_emb_id'] = ratings['userid'].sub(1)
ratings['movie_emb_id'] = ratings['movieid'].sub(1)

print('{} ratings loaded'.format(len(ratings)))

80000 ratings loaded


In [0]:
# Load the held-out test ratings (same layout as the training file).
test_ratings = pd.read_csv(
    RATING_DATA_FILE_TEST,
    names=['userid', 'movieid', 'rating', 'timestamp'],
    sep='\t',
    encoding='latin-1',
    engine='python',
)

# Raw ids start at 1; shift them to 0-based indices for the embedding layers.
test_ratings['user_emb_id'] = test_ratings['userid'].sub(1)
test_ratings['movie_emb_id'] = test_ratings['movieid'].sub(1)

print('{} ratings loaded'.format(len(test_ratings)))

20000 ratings loaded


In [0]:
# Movie table: five descriptive columns followed by 19 genre indicator
# columns (pipe-separated, no header row).
items = pd.read_csv(
    RATING_DATA_FILE_ITEM,
    sep='|',
    engine='python',
    encoding='latin-1',
    names=[
        # descriptive columns
        'movieid', 'movie title', 'release date', 'video release date', 'IMDb URL',
        # genre indicator columns
        'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ],
)


In [0]:
# User table (pipe-separated, no header row).
users = pd.read_csv(
    RATING_DATA_FILE_USER,
    sep='|',
    engine='python',
    encoding='latin-1',
    names=['userid', 'age', 'gender', 'occupation', 'zip_code'],
)

# Bucket ages into five bins labelled 0..4, split at 12, 20, 30 and 45.
users['age_bin'] = binning(
    users['age'],
    cut_points=[12, 20, 30, 45],
    labels=[0, 1, 2, 3, 4],
)

In [0]:
# Occupation vocabulary: one occupation name per line.
occupations = pd.read_csv(
    RATING_DATA_FILE_OCCUPATION,
    sep='\t',
    engine='python',
    encoding='latin-1',
    names=['occupation'],
)

# Give every occupation a 0-based integer id in file order.
occupations['occupation_id'] = pd.Series(range(len(occupations)))

In [0]:
# Keep only the 19 genre indicator columns of the movie table.
genres = items.loc[:, [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]]

In [0]:
# Enrich every training rating with user, movie and occupation attributes.
# Each merge uses pandas' defaults (inner join on the columns the two frames
# share).
ratings = (
    ratings
    .merge(users)
    .merge(items)
    .merge(occupations)
)


In [0]:
# Pull the model inputs and the target out of the merged training frame.
Users = ratings.loc[:, 'user_emb_id'].values
Movies = ratings.loc[:, 'movie_emb_id'].values

# One row of 19 genre indicators per rating.
Genres = ratings.loc[:, [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]].values

# Occupation ids are not extracted here; only the age bucket is.
Ages = ratings.loc[:, 'age_bin'].values

# Regression target.
Ratings = ratings.loc[:, 'rating'].values

In [119]:
# --- Build and train the combined NCF + GMF model ---------------------------

# Hyper-parameters (previously inlined as magic numbers).
DROPOUT_RATE = 0.5
LATENT_DIM = 20        # width of the user / item embeddings
HIDDEN_DIM = 20        # width of the first NCF hidden layer
LATENT_SIDE_DIM = 2    # width of the embedding fed by the 4th ("occupation") input
LATENT_GENRE_DIM = 5   # forwarded to the NCF builder
NUM_AGE_BINS = 5       # age buckets 0..4 are what is fed to the "occupation" input
N_EPOCHS = 30
BATCH_SIZE = 32

# Embedding tables must cover every 0-based id, i.e. max id + 1 rows.
# Using `Users.shape[0]` / `Movies.shape[0]` sized them by the number of
# rating rows instead of the number of distinct users / movies. The test
# split is included so its ids stay in range for later predictions.
num_users = int(max(Users.max(), test_ratings['user_emb_id'].max())) + 1
num_items = int(max(Movies.max(), test_ratings['movie_emb_id'].max())) + 1

combained_modal = combination_model(
    num_users, num_items, NUM_AGE_BINS, Genres.shape[1],
    LATENT_SIDE_DIM, LATENT_DIM, DROPOUT_RATE, HIDDEN_DIM, LATENT_GENRE_DIM,
)
combained_modal.compile(loss='mse', optimizer=Adamax(), metrics=['mae'])

# Same file name as before: '<core>_ncf_0.5_20_20.h5'.
checkpoint_path = (MODEL_WEIGHTS_FILE_CORE + '_ncf_' + str(DROPOUT_RATE)
                   + '_' + str(LATENT_DIM) + '_' + str(HIDDEN_DIM) + '.h5')
callbacks_ncf = [
    EarlyStopping('val_loss', patience=5),
    ModelCheckpoint(checkpoint_path, save_best_only=True),
]

# The combined model expects the NCF branch inputs followed by the GMF branch
# inputs; both branches take (user, item, genre vector, age bucket).
branch_inputs = [Users, Movies, Genres, Ages]
history_history_ncf = combained_modal.fit(
    branch_inputs + branch_inputs,
    Ratings,
    epochs=N_EPOCHS,           # `nb_epoch` is the deprecated Keras 1 name
    validation_split=.1,
    verbose=1,
    callbacks=callbacks_ncf,
    batch_size=BATCH_SIZE,
)

  # Remove the CWD from sys.path while we load stuff.
  """


Train on 72000 samples, validate on 8000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30

KeyboardInterrupt: ignored