In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from tensorflow.keras.layers import Dense , Flatten ,Embedding,Input,Concatenate,Flatten,Dropout,Activation,BatchNormalization
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
def read_data(path):
    files = {}
    for filename in path.glob('*'):
        if filename.suffix == '.csv':
            files[filename.stem] = pd.read_csv(filename)
        elif filename.suffix == '.dat':
            if filename.stem == 'ratings':
                columns = ['userId', 'movieId', 'rating', 'timestamp']
            else:
                columns = ['movieId', 'title', 'genres']
            data = pd.read_csv(filename, sep='::', names=columns, engine='python')
            files[filename.stem] = data
    return files['ratings'], files['movies']

In [3]:
file_path=Path.cwd()/ 'ml-latest-small'
ratings, movies = read_data(file_path)

In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
def matrix_preview(ratings, n=20):
    user_groups = ratings.groupby('userId')['rating'].count()
    top_users = user_groups.sort_values(ascending=False)[:n]
    movie_groups = ratings.groupby('movieId')['rating'].count()
    top_movies = movie_groups.sort_values(ascending=False)[:n]
    top = (
        ratings.
        join(top_users, rsuffix='_r', how='inner', on='userId').
        join(top_movies, rsuffix='_r', how='inner', on='movieId'))
    return pd.crosstab(top.userId, top.movieId, top.rating, aggfunc=np.sum)

In [7]:
matrix_preview(ratings,10)

movieId,1,260,296,318,356,480,527,589,593,2571
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15,2.0,5.0,5.0,2.0,1.0,3.0,4.0,4.0,5.0,5.0
30,4.0,4.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,3.0
73,5.0,4.5,5.0,5.0,5.0,4.0,5.0,3.0,4.5,4.5
311,3.0,4.0,3.0,4.5,5.0,4.5,5.0,4.5,2.0,4.0
380,4.0,4.0,5.0,4.0,5.0,4.0,,4.0,5.0,5.0
452,3.5,4.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,2.0
468,4.0,3.5,3.5,3.5,3.0,2.5,,,3.0,3.0
547,3.5,,5.0,5.0,2.0,3.0,5.0,,5.0,3.5
564,4.0,2.0,5.0,,3.0,5.0,4.0,5.0,5.0,3.0
624,5.0,5.0,5.0,,3.0,3.0,,3.0,5.0,2.0


In [8]:
def create_dataset(ratings):  
    unique_users = ratings.userId.unique()
    user_to_index = {old: new for new, old in enumerate(unique_users)}
    new_users = ratings.userId.map(user_to_index)
    
    unique_movies = ratings.movieId.unique()
    movie_to_index = {old: new for new, old in enumerate(unique_movies)}
    new_movies = ratings.movieId.map(movie_to_index)
    
    n_users = unique_users.shape[0]
    n_movies = unique_movies.shape[0]
    
    X = pd.DataFrame({'user_id': new_users,'movie_id': new_movies})
    y = ratings['rating'].astype(np.float32)
    return (n_users, n_movies), (X, y), (user_to_index, movie_to_index)

In [9]:
(n_users, n_movies), (X, y), _ = create_dataset(ratings)


In [29]:
def gen_ncf(n_users,n_movies,n_factors=64,n_hidden=128):
    user_input=Input(shape=(1,)) 
    user_embedding=Embedding(n_users, n_factors, input_length=1)(user_input)
    user_embedding=Flatten()(user_embedding)
    movie_input=Input(shape=(1,)) 
    movie_embedding=Embedding(n_movies, n_factors, input_length=1)(movie_input)
    movie_embedding=Flatten()(movie_embedding)
    um_vector=Concatenate(axis=1)([user_embedding,movie_embedding])
    um_vector=Dense(n_hidden, use_bias=True)(um_vector)
 #   um_vector=BatchNormalization()(um_vector)
    um_vector=Activation("relu")(um_vector)
    um_vector=Dropout(0.5)(um_vector)
    um_vector=Dense(0.5*n_hidden,use_bias=True)(um_vector)
 #   um_vector=BatchNormalization()(um_vector)
    um_vector=Activation("relu")(um_vector)
    um_vector=Dropout(0.5)(um_vector)
    um_vector=Dense(0.25*n_hidden, activation='relu')(um_vector)
    um_vector=Dropout(0.5)(um_vector)
    output=Dense(1)(um_vector)
    model=Model([user_input,movie_input], output)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [30]:
ncf=gen_ncf(n_users,n_movies)
ncf.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 1, 64)        42944       input_9[0][0]                    
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 1, 64)        580224      input_10[0][0]                   
____________________________________________________________________________________________

In [31]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
datasets = {'train': (X_train, y_train), 'val': (X_valid, y_valid)}
dataset_sizes = {'train': len(X_train), 'val': len(X_valid)}

In [32]:
callback = EarlyStopping(monitor='val_loss', patience=5)

ncf.fit([X_train.user_id.values,X_train.movie_id.values], y_train.values,epochs=20,batch_size=200,shuffle=True,
        callbacks=[callback],
        validation_data=(([X_valid.user_id.values,X_valid.movie_id.values],y_valid.values)))

Train on 80003 samples, validate on 20001 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


<tensorflow.python.keras.callbacks.History at 0x1423d5eee08>

In [41]:
y_predict=ncf.predict([X_valid.user_id.values,X_valid.movie_id.values])
y_predict[y_predict>=5]=5

In [42]:

from sklearn.metrics import mean_squared_error
rmse=mean_squared_error(y_valid.values, y_predict)**0.5
print(rmse)

0.9098748029794335
