In [1]:
pip install keras-rectified-adam



In [2]:
import keras
import os, sys
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from sklearn.metrics import mean_squared_error
from keras.models import Model
from keras.layers import Flatten, Dense, Input, Embedding, Dot
from keras.layers import GlobalAveragePooling1D, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras_radam import RAdam

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
movie = pd.read_csv('/content/drive/My Drive/MovieLens/movie.csv')
rating = pd.read_csv('/content/drive/My Drive/MovieLens/rating.csv')
movie = movie.loc[:,["movieId","genres"]]
rating = rating.loc[:,["userId","movieId","rating"]]
data = pd.merge(rating, movie)

In [4]:
n_movies = len(data['movieId'].unique())
n_users = len(data['userId'].unique())
print(n_movies, n_users, len(rating))

26744 138493 20000263


In [5]:
genres_split = data.genres.str.split('|').tolist()
genres_unique = set()
for movie_genres in genres_split:
    for genre in movie_genres:
        genres_unique.add(genre)
genres2idx = {o:i+1 for i,o in enumerate(genres_unique)}
genres_split = [[genres2idx[x] for x in movie_genres] for movie_genres in genres_split]
padded_genres = tf.keras.preprocessing.sequence.pad_sequences(
    genres_split, padding="post"
)

users = data.userId.unique()
movies = data.movieId.unique()

userid2idx = {o:i for i,o in enumerate(users)}
movieid2idx = {o:i for i,o in enumerate(movies)}

data['userId'] = data['userId'].apply(lambda x: userid2idx[x])
data['movieId'] = data['movieId'].apply(lambda x: movieid2idx[x])

data = data.drop(columns=['genres'])
for i in range(10):
    data['genre'+str(i)] = padded_genres[:,i]

split = np.random.rand(len(data)) < 0.9
train = data[split]
valid = data[~split]
data.head(10)

Unnamed: 0,userId,movieId,rating,genre0,genre1,genre2,genre3,genre4,genre5,genre6,genre7,genre8,genre9
0,0,0,3.5,1,7,9,0,0,0,0,0,0,0
1,1,0,3.0,1,7,9,0,0,0,0,0,0,0
2,2,0,3.0,1,7,9,0,0,0,0,0,0,0
3,3,0,3.0,1,7,9,0,0,0,0,0,0,0
4,4,0,3.0,1,7,9,0,0,0,0,0,0,0
5,5,0,3.0,1,7,9,0,0,0,0,0,0,0
6,6,0,1.0,1,7,9,0,0,0,0,0,0,0
7,7,0,3.5,1,7,9,0,0,0,0,0,0,0
8,8,0,2.0,1,7,9,0,0,0,0,0,0,0
9,9,0,4.0,1,7,9,0,0,0,0,0,0,0


In [6]:
n_user_embedding = 32
n_movie_embedding = int(n_user_embedding * 0.75)
n_genres_embedding = int(n_user_embedding * 0.25)
keras.backend.clear_session()

user_input = Input(shape=(1,), name='user_input', dtype='int64')
user_embedding = Embedding(n_users, n_user_embedding, name='user_embedding')(user_input)
user_vector = Flatten()(user_embedding)

movie_input = Input(shape=(1,), name='movie_input', dtype='int64')
movie_embedding = Embedding(n_movies, n_movie_embedding, name='movie_embedding')(movie_input)
movie_vector = Flatten()(movie_embedding)

genres_input = Input(shape=(len(padded_genres[0]),), name='genres_input', dtype='int64') # input = (10, 1)
genres_embedding = Embedding(len(genres_unique)+1, n_genres_embedding, mask_zero=True, name='genres_embedding')(genres_input) # input = 21
genres_average_embedding = GlobalAveragePooling1D()(genres_embedding)
genres_vector = Flatten()(genres_average_embedding)

concat_layer = tf.concat([user_vector, movie_vector, genres_vector], 1, name='concat_layer')
concat_vector = BatchNormalization()(concat_layer)
mlp_layer = Dense(16, activation='relu')(concat_layer)

user_embedding2 = Embedding(n_users, n_user_embedding, name='user_embedding2')(user_input)
user_vector2 = Flatten()(user_embedding2)
movie_embedding2 = Embedding(n_movies, n_movie_embedding, name='movie_embedding2')(movie_input)
movie_vector2 = Flatten()(movie_embedding2)
genres_embedding2 = Embedding(len(genres_unique)+1, n_genres_embedding, mask_zero=True, name='genres_embedding2')(genres_input) # input = 21
genres_average_embedding2 = GlobalAveragePooling1D()(genres_embedding2)
genres_vector2 = Flatten()(genres_average_embedding2)
matrix_factorization = Dot(axes=1)([user_vector2, tf.concat([movie_vector2, genres_vector2], 1)])

output_layer = tf.concat([matrix_factorization, mlp_layer], 1)
output = Dense(1, activation='sigmoid')(output_layer)
output = output*4.5 + 0.5

model = Model([user_input, movie_input, genres_input], output)
# age 추가 실험?
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
genres_input (InputLayer)       [(None, 10)]         0                                            
__________________________________________________________________________________________________
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
genres_embedding2 (Embedding)   (None, 10, 8)        168         genres_input[0][0]               
______________________________________________________________________________________________

In [None]:
model.compile(optimizer=Adam(), loss='mse')
batch_size = 4096*8
epochs=10

history = model.fit([train.userId, train.movieId, train.iloc[:,3:13]],train.rating,
                batch_size=batch_size, epochs=epochs,
                validation_data = ([valid.userId, valid.movieId, valid.iloc[:,3:13]],valid.rating), verbose = 1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

In [None]:
valid_predictions = model.predict([valid.userId, valid.movieId, valid.iloc[:,3:13]], batch_size = batch_size)
valid_rmse = mean_squared_error(valid.rating, valid_predictions, squared=False) # squared=False -> RMSE
print(valid_rmse)