In [1]:
pip install keras-rectified-adam



In [2]:
import keras
import os, sys
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from keras.models import Model
from keras.layers.merge import dot
from keras.layers import Flatten, Dense, Input, Embedding
from keras.layers import GlobalAveragePooling1D, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras_radam import RAdam

# Mount Google Drive at /content/drive; the MovieLens CSV files read later in
# this notebook are loaded from '/content/drive/My Drive/MovieLens/'.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the two MovieLens tables from the mounted Drive and keep only the
# columns this notebook uses.
movie = pd.read_csv('/content/drive/My Drive/MovieLens/movie.csv')[["movieId", "genres"]]
rating = pd.read_csv('/content/drive/My Drive/MovieLens/rating.csv')[["userId", "movieId", "rating"]]

# Attach each movie's genre string to every rating row. `movieId` is the only
# column the two frames share, so it is the join key (inner join).
data = rating.merge(movie, on="movieId")

# Number of distinct movies / users present in the merged ratings.
n_movies = data['movieId'].unique().size
n_users = data['userId'].unique().size

In [4]:
# Rescale ratings linearly so that 0.5 maps to 0 and 5.0 maps to 1, the range
# of the model's sigmoid output.
# NOTE: this overwrites the column in place, so re-running the cell without
# reloading `data` rescales the values a second time.
data['rating'] = data['rating'].sub(0.5).div(4.5)

In [5]:
# Encode each rating row's '|'-separated genre string as a fixed-length
# sequence of integer genre indices.
genres_split = data.genres.str.split('|').tolist()

# Collect every distinct genre name that occurs in the data.
genres_unique = set()
for movie_genres in genres_split:
    genres_unique.update(movie_genres)

# Index 0 is reserved for padding, so real genres are numbered from 1.
# Sorting makes the name -> index mapping deterministic: iterating the set
# directly yields an order that depends on string hashing, which can differ
# between interpreter runs and would silently change the encoding.
genres2idx = {genre: idx for idx, genre in enumerate(sorted(genres_unique), start=1)}
genres_split = [[genres2idx[genre] for genre in movie_genres] for movie_genres in genres_split]

# Right-pad every sequence up to the length of the longest genre list.
padded_genres = tf.keras.preprocessing.sequence.pad_sequences(
    genres_split, padding="post"
)

In [6]:
# Hyper-parameters: embedding widths per feature, hidden layer width and the
# dropout rate applied to every embedding branch.
n_user_embedding = 32
n_movie_embedding = 16
n_genres_embedding = 16
n_hidden_layer = 32
dropout_rate = 0.25

# User branch: one user index -> embedding -> flatten -> dropout -> batch norm.
user_input = Input(shape=(1,), name='user_input', dtype='int64')
user_embedding = Embedding(n_users, n_user_embedding, name='user_embedding')(user_input)
user_vector = BatchNormalization()(Dropout(dropout_rate)(Flatten()(user_embedding)))

# Movie branch: same structure as the user branch.
movie_input = Input(shape=(1,), name='movie_input', dtype='int64')
movie_embedding = Embedding(n_movies, n_movie_embedding, name='movie_embedding')(movie_input)
movie_vector = BatchNormalization()(Dropout(dropout_rate)(Flatten()(movie_embedding)))

# Genre branch: a padded sequence of genre indices per sample. Index 0 is the
# padding value and is masked out (mask_zero=True), hence the vocabulary size
# of len(genres_unique) + 1. The per-genre embeddings are averaged into a
# single vector.
genres_input = Input(shape=(padded_genres.shape[1],), name='genres_input', dtype='int64')
genres_embedding = Embedding(len(genres_unique)+1, n_genres_embedding, mask_zero=True, name='genres_embedding')(genres_input)
genres_average_embedding = GlobalAveragePooling1D()(genres_embedding)
genres_vector = BatchNormalization()(Dropout(dropout_rate)(Flatten()(genres_average_embedding)))

# Join the three feature vectors along the feature axis. A Keras layer is used
# here (instead of calling tf.concat on symbolic Keras tensors) so that the
# graph consists only of Keras layers, like the rest of the model.
concat_layer = keras.layers.Concatenate(axis=1, name='concat_layer')(
    [user_vector, movie_vector, genres_vector]
)
hidden_layer = Dense(n_hidden_layer, activation='relu')(concat_layer)
# Sigmoid output in [0, 1].
output = Dense(1, activation='sigmoid')(hidden_layer)

model = Model([user_input, movie_input, genres_input], output)
# Idea: experiment with adding an age feature?
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
genres_input (InputLayer)       [(None, 10)]         0                                            
__________________________________________________________________________________________________
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
genres_embedding (Embedding)    (None, 10, 16)       336         genres_input[0][0]               
______________________________________________________________________________________________

In [7]:
# Map raw user / movie ids to contiguous 0-based indices (in order of first
# appearance) so they can be used as rows of the embedding tables.
users = data.userId.unique()
movies = data.movieId.unique()

userid2idx = {raw_id: idx for idx, raw_id in enumerate(users)}
movieid2idx = {raw_id: idx for idx, raw_id in enumerate(movies)}

# Series.map with a dict performs the lookup in bulk instead of calling a
# Python lambda once per row.
data['userId'] = data['userId'].map(userid2idx)
data['movieId'] = data['movieId'].map(movieid2idx)

# Replace the raw genre string with one integer column per padded genre slot.
# The number of slots is taken from padded_genres rather than hard-coded.
data = data.drop(columns=['genres'])
for i in range(padded_genres.shape[1]):
    data['genre'+str(i)] = padded_genres[:,i]

In [8]:
# Random row-wise split: roughly 90% of the ratings for training, the rest for
# validation. A fixed seed makes the split identical on every run, so
# validation losses are comparable between runs.
split = np.random.default_rng(42).random(len(data)) < 0.9
train = data[split]
valid = data[~split]

In [9]:
# Compile with the RAdam optimizer and mean-squared-error loss on the rescaled
# ratings, then train with validation on the held-out rows.
model.compile(optimizer=RAdam(), loss='mse')
batch_size=32768
epochs=50

# Select the genre slot columns ('genre0', 'genre1', ...) by name rather than
# by a fixed positional slice, so the selection does not depend on the column
# order or on the number of genre slots.
genre_columns = [
    col for col in train.columns
    if str(col).startswith('genre') and str(col)[len('genre'):].isdigit()
]


def _model_inputs(frame):
    """Return [user indices, movie indices, genre index matrix] in the model's input order."""
    return [frame.userId, frame.movieId, frame[genre_columns]]


history = model.fit(_model_inputs(train), train.rating,
                    batch_size=batch_size, epochs=epochs,
                    validation_data=(_model_inputs(valid), valid.rating), verbose = 1)

Epoch 1/50
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50

KeyboardInterrupt: ignored