In [None]:
import os, sys
from google.colab import drive
# Mount Google Drive under /content/drive (prompts for authorization on first run).
drive.mount('/content/drive')

# Link the `my_env` folder on Drive to a short local path and put that path
# first on sys.path so modules stored there can be imported.
my_path = '/content/notebooks'
try:
    os.symlink('/content/drive/My Drive/Colab Notebooks/my_env', my_path)
except FileExistsError:
    # The link is left over from an earlier run of this cell in the same runtime.
    # Any other OSError (e.g. a permission problem) is allowed to surface instead
    # of being reported as "already linked".
    print('\nAlready linked...')
sys.path.insert(0, my_path)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
import glob
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Dropout, Activation, BatchNormalization, LeakyReLU, concatenate
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.regularizers import l2

from bokeh.plotting import output_notebook, figure, show
from bokeh.layouts import row, column
from bokeh.resources import INLINE
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import DataTable, TableColumn
output_notebook(INLINE)

In [None]:
# Root folder of the project on the mounted Drive (relative to the working
# directory); every data and model path in this notebook is built from it.
base_path = 'drive/My Drive/RecSys/'

# List the files available in the ml-25m data folder.
display(glob.glob(base_path + 'data/ml-25m/*'))

['drive/My Drive/RecSys/data/ml-25m/README.txt',
 'drive/My Drive/RecSys/data/ml-25m/links.csv',
 'drive/My Drive/RecSys/data/ml-25m/tags.csv',
 'drive/My Drive/RecSys/data/ml-25m/ratings.csv',
 'drive/My Drive/RecSys/data/ml-25m/genome-tags.csv',
 'drive/My Drive/RecSys/data/ml-25m/genome-scores.csv',
 'drive/My Drive/RecSys/data/ml-25m/movies.csv',
 'drive/My Drive/RecSys/data/ml-25m/ratings_v1.csv',
 'drive/My Drive/RecSys/data/ml-25m/movies_v1.csv',
 'drive/My Drive/RecSys/data/ml-25m/valid.csv',
 'drive/My Drive/RecSys/data/ml-25m/train.csv',
 'drive/My Drive/RecSys/data/ml-25m/test_a.csv',
 'drive/My Drive/RecSys/data/ml-25m/test_q.csv',
 'drive/My Drive/RecSys/data/ml-25m/movies_v2.csv']

In [None]:
# Ratings table read from ratings_v1.csv.
ratings = pd.read_csv(base_path + 'data/ml-25m/ratings_v1.csv')

# Training and validation splits.
train = pd.read_csv(base_path + 'data/ml-25m/train.csv')
valid = pd.read_csv(base_path + 'data/ml-25m/valid.csv')

# Test split, stored as two files (presumably "query" and "answer" parts — TODO confirm).
test_q = pd.read_csv(base_path + 'data/ml-25m/test_q.csv')
test_a = pd.read_csv(base_path + 'data/ml-25m/test_a.csv')

In [None]:
# Size of the validation split as loaded, plus a look at its first rows.
print(valid.shape)
valid.head(3)

(2491865, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,8,963,5.0,859381992
1,8,195,4.0,859382015
2,8,1079,5.0,859382042


## 1. Preprocessing

Drop every rating in the validation and test sets whose movie never appears in the training set, for memory efficiency.

In [None]:
# Distinct movie ids that occur in the training split; used below to filter the other splits.
seen_data = train.movieId.unique()

In [None]:
def remove_unseen(df, seen = None):
    """Keep only the rows of `df` whose movieId belongs to `seen`.

    Parameters
    ----------
    df : DataFrame with a `movieId` column.
    seen : collection of movie ids to keep. When omitted, the notebook-level
        `seen_data` array is used, so existing one-argument calls behave as before.

    Returns
    -------
    A new DataFrame with the filtered rows and a fresh 0..n-1 index; `df` itself
    is not modified.
    """
    if seen is None:
        # Fall back to the global computed from the training split.
        seen = seen_data
    kept = df[df.movieId.isin(seen)]
    return kept.reset_index(drop = True)

In [None]:
# Apply the same filter to the validation split and to test_q.
valid, test_q = map(remove_unseen, (valid, test_q))

In [None]:
# Size of the validation split after filtering, plus a look at its first rows.
print(valid.shape)
valid.head(3)

(2490440, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,8,963,5.0,859381992
1,8,195,4.0,859382015
2,8,1079,5.0,859382042


In [None]:
def remove_invalid(df_q, df_a):
    """Return the rows of `df_q` whose userId also occurs in `df_a`.

    The original index labels of `df_q` are kept (no reset), and neither input
    frame is modified.
    """
    users_in_a = df_a.userId.unique()
    has_answer = df_q.userId.isin(users_in_a)
    return df_q.loc[has_answer]

In [None]:
# Keep only the test_q rows of users that also appear in test_a, then inspect the result.
test_q = remove_invalid(test_q, test_a)
print(test_q.shape)
test_q.head(3)

(2166301, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,9,1209,3.0,1227570828
1,9,1210,3.0,1227570836
2,9,1211,3.5,1227570841


## 2. U-AutoRec

In [None]:
# idx_item_map: position -> original movieId (array of the distinct training movie ids).
idx_item_map = train.movieId.unique()
# item_idx_map: original movieId -> position, i.e. the inverse lookup.
item_idx_map = {e: i for i, e in enumerate(idx_item_map)}
# Number of distinct training movies; passed below as the model's input/output width.
n_item = idx_item_map.shape[0]
print(n_item)

56946


In [None]:
def embedd_movie_idx(df, mapping = None):
    """Return a copy of `df` whose `movieId` column is translated through `mapping`.

    Parameters
    ----------
    df : DataFrame with a `movieId` column.
    mapping : dict from original movie id to column index. When omitted, the
        notebook-level `item_idx_map` is used, so existing one-argument calls
        behave as before.

    Returns
    -------
    A new DataFrame; `df` itself is not modified. Ids missing from `mapping`
    become NaN (standard `Series.map` behavior).
    """
    if mapping is None:
        # Fall back to the global lookup built from the training split.
        mapping = item_idx_map
    return df.assign(movieId = df.movieId.map(mapping))

In [None]:
# Replace original movie ids by their positions in idx_item_map in all three splits.
train, valid, test_q = map(embedd_movie_idx, (train, valid, test_q))

In [None]:
def make_interaction(df, user_based = True):
    """Collapse a ratings table into one row per user (or per movie).

    With `user_based` true the result has columns userId, movieId, rating, where
    movieId and rating hold Python lists of that user's movies and ratings.
    Otherwise the grouping key is movieId and the lists hold userId and rating.
    """
    if user_based:
        key, partner = 'userId', 'movieId'
    else:
        key, partner = 'movieId', 'userId'
    grouped = df.groupby(key, as_index = False)
    return grouped[[partner, 'rating']].agg(list)

In [None]:
def generate_batch(df, n_item, epochs = 10, batch_size = 256, predict = False):
    """Yield dense rating batches built from a one-row-per-user interaction frame.

    Parameters
    ----------
    df : DataFrame indexed 0..n-1 whose `movieId` and `rating` cells hold, for
        each row, the column indices and the matching rating values.
    n_item : width of every batch (number of columns).
    epochs : the data is traversed `epochs + 1` times (one pass more than
        requested — presumably head-room so the consumer never exhausts the
        generator; TODO confirm).
    batch_size : maximum number of rows per batch; the last batch of a pass is
        smaller when the row count is not a multiple of `batch_size`.
    predict : when false, rows are reshuffled before every pass and each item is
        the pair `(batch, batch)` (input and target are the same array); when
        true, the row order is kept and each item is just `batch`.

    Yields
    ------
    float arrays of shape (rows_in_batch, n_item), zero wherever no rating is given.

    An empty batch is never yielded. The previous implementation emitted an
    extra zero-row batch at the end of every pass whenever the row count was an
    exact multiple of `batch_size`.
    """
    n_user = df.index.size
    for _ in range(epochs + 1):
        if not predict:
            # Shuffle here: Keras' own `shuffle` option does not apply when the
            # data comes from a generator with steps_per_epoch set.
            df = df.sample(frac = 1).reset_index(drop = True)
        for start in range(0, n_user, batch_size):
            stop = min(start + batch_size, n_user)
            batch = np.zeros(shape = (stop - start, n_item))
            for row, idx in enumerate(range(start, stop)):
                # Scatter this row's ratings into the columns given by its movie indices.
                batch[row, df.at[idx, 'movieId']] = df.at[idx, 'rating']
            if predict:
                yield batch
            else:
                yield batch, batch

In [None]:
def masked_mse(y_true, y_pred, masked_value = 0):
    """Per-row mean squared error over the entries of `y_true` that differ from `masked_value`.

    Entries equal to `masked_value` contribute nothing to the numerator and are
    not counted in the denominator. The denominator is floored at 1, so a row
    with no unmasked entry yields 0 instead of dividing by zero.
    Reduction is over the last axis.
    """
    observed = K.cast_to_floatx(K.not_equal(y_true, masked_value))
    squared_error = K.square(observed * (y_true - y_pred))
    n_observed = K.maximum(K.sum(observed, axis = -1), 1)
    return K.sum(squared_error, axis = -1) / n_observed

In [None]:
def masked_rmse_clip(y_true, y_pred, masked_value = 0, min_rating = 1, max_rating = 5):
    """Per-row RMSE over unmasked entries, with predictions clipped to the rating range.

    Parameters
    ----------
    y_true, y_pred : tensors of the same shape; reduction is over the last axis.
    masked_value : entries of `y_true` equal to this value are ignored.
    min_rating, max_rating : bounds applied to `y_pred` before the error is
        computed. The defaults (1, 5) reproduce the previously hard-coded range.
        NOTE(review): the ratings shown in this notebook include half-star values
        such as 3.5, so the true scale may start below 1 — confirm the minimum
        rating in the data and pass `min_rating` accordingly.

    Returns
    -------
    Tensor with one RMSE value per row. The count of unmasked entries is floored
    at 1, so a fully masked row yields 0.
    """
    observed = K.cast_to_floatx(K.not_equal(y_true, masked_value))
    clipped_pred = K.clip(y_pred, min_rating, max_rating)
    squared_error = K.square(observed * (y_true - clipped_pred))
    n_observed = K.maximum(K.sum(observed, axis = -1), 1)
    return K.sqrt(K.sum(squared_error, axis = -1) / n_observed)

In [None]:
def build_AutoRec(n_item, latent_dim, first_activation, last_activation, reg):
    """Build and compile a one-hidden-layer autoencoder over `n_item`-wide inputs.

    Parameters
    ----------
    n_item : width of the input and of the reconstructed output.
    latent_dim : number of units in the hidden ('encoder') layer.
    first_activation : activation of the encoder layer.
    last_activation : activation of the decoder layer.
    reg : L2 penalty factor applied to the kernels of both Dense layers.

    Returns
    -------
    A Keras Model compiled with the 'nadam' optimizer, `masked_mse` as loss and
    `masked_rmse_clip` as metric.
    """
    rating_input = Input(shape = (n_item, ), name = 'input')
    encoder = Dense(latent_dim, activation = first_activation,
                    kernel_regularizer = l2(reg), name = 'encoder')
    decoder = Dense(n_item, activation = last_activation,
                    kernel_regularizer = l2(reg), name = 'decoder')
    # The encoder is applied first, then the decoder, as a plain two-layer stack.
    reconstruction = decoder(encoder(rating_input))

    model = Model(inputs = rating_input, outputs = reconstruction)
    model.compile(optimizer = 'nadam', loss = masked_mse, metrics = [masked_rmse_clip])
    return model

In [None]:
# Instantiate the autoencoder with a 500-unit hidden layer and print its layer summary.
AutoRec = build_AutoRec(n_item, latent_dim = 500, first_activation = 'elu', last_activation = 'elu', reg = 0.001)
AutoRec.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 56946)]           0         
_________________________________________________________________
encoder (Dense)              (None, 500)               28473500  
_________________________________________________________________
decoder (Dense)              (None, 56946)             28529946  
Total params: 57,003,446
Trainable params: 57,003,446
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training length and batch size shared by the generators and by `fit`.
epochs = 100
batch_size = 256

# One row per user. The generators iterate over these frames, so the number of
# batches per pass is derived from their length.
train_interaction = make_interaction(train)
valid_interaction = make_interaction(valid)

# Ceiling division: a partial last batch counts as a step, but no extra step is
# added when the user count is an exact multiple of batch_size
# (`n // batch_size + 1` would overcount by one in that case).
steps_per_epoch = (len(train_interaction) + batch_size - 1) // batch_size
validation_steps = (len(valid_interaction) + batch_size - 1) // batch_size

# Batch generators consumed by `AutoRec.fit`.
train_gen = generate_batch(train_interaction, n_item, epochs, batch_size)
valid_gen = generate_batch(valid_interaction, n_item, epochs, batch_size)

In [None]:
%%time
# Early-stopping and best-checkpoint callbacks, both keyed on the validation loss.
# NOTE(review): neither callback is passed to `AutoRec.fit` below (the call has no
# `callbacks=` argument), so as written they have no effect on training — confirm
# whether that is intentional. Enabling early stopping could make `hist.history`
# shorter than `epochs`.
early_stopping = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 5, patience = 5)
model_checkpoint = ModelCheckpoint(base_path + 'model/AutoRec.h5', monitor = 'val_loss', mode = 'min', save_best_only = True)

# Train from the batch generators; each epoch draws `steps_per_epoch` training
# batches and `validation_steps` validation batches. `hist` keeps the per-epoch
# loss/metric history.
hist = AutoRec.fit(x = train_gen, epochs = epochs, steps_per_epoch = steps_per_epoch, 
                   validation_data = valid_gen, validation_steps = validation_steps, validation_batch_size = batch_size, 
                   verbose = 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Persist the model as it stands after training.
# NOTE(review): this is the same file path that `model_checkpoint` is configured with.
AutoRec.save(base_path + 'model/AutoRec.h5')

In [None]:
# Reload the saved model without compiling it, so it comes back without the
# loss/metric configuration; presumably it is only used for inference from here on.
AutoRec = load_model(base_path + 'model/AutoRec.h5', compile = False)

In [None]:
def _make_loss_figure(title):
    """Return an empty 500x300 figure titled `title` with bold 'epochs' / 'loss' axis labels."""
    fig = figure(plot_width = 500, plot_height = 300, title = title,
                 background_fill_color = '#FAFAFA', toolbar_location = None)
    fig.xaxis.axis_label = 'epochs'
    fig.xaxis.axis_label_text_font_style = 'bold'
    fig.yaxis.axis_label = 'loss'
    fig.yaxis.axis_label_text_font_style = 'bold'
    return fig

p1 = _make_loss_figure('MSE')
p2 = _make_loss_figure('RMSE')

# Use the number of epochs actually recorded in the history rather than the
# configured `epochs`, so the cell still works if the history is shorter.
n_recorded = len(hist.history['loss'])
X = [i+1 for i in range(n_recorded)]
Y1 = hist.history['loss']
Y2 = hist.history['val_loss']
Z1 = hist.history['masked_rmse_clip']
Z2 = hist.history['val_masked_rmse_clip']

# Thin the curves: keep epoch 10 and then every 5th epoch after it.
sampled = np.arange(9, n_recorded, 5)
X, Y1, Y2, Z1, Z2 = map(lambda L: np.take(L, sampled), (X, Y1, Y2, Z1, Z2))
source = ColumnDataSource(dict(epoch = X, mse = Y1, val_mse = Y2, rmse_clip = Z1, val_rmse_clip = Z2))

# Left panel: training vs. validation loss (masked MSE).
p1.line(x = 'epoch', y = 'mse', source = source, line_width = 2, line_color = 'salmon', legend_label = 'mse')
p1.line(x = 'epoch', y = 'val_mse', source = source, line_width = 2, line_color = 'skyblue', legend_label = 'val_mse')
p1.xaxis.ticker = X

# Right panel: training vs. validation clipped RMSE.
p2.line(x = 'epoch', y = 'rmse_clip', source = source, line_width = 2, line_color = 'salmon', legend_label = 'rmse')
p2.line(x = 'epoch', y = 'val_rmse_clip', source = source, line_width = 2, line_color = 'skyblue', legend_label = 'val_rmse')
p2.xaxis.ticker = X

# Table with the same sampled values shown in the two plots.
columns = [TableColumn(field = 'epoch', title = 'epoch'),
           TableColumn(field = 'mse', title = 'loss'),
           TableColumn(field = 'val_mse', title = 'val_loss'),
           TableColumn(field = 'rmse_clip', title = 'clip_rmse'),
           TableColumn(field = 'val_rmse_clip', title = 'val_clip_rmse')]
table = DataTable(source = source, columns = columns, width = 1000, height = 300)

show(column(row(p1, p2), table))

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# pd.DataFrame(hist.history).to_csv(base_path + 'model/AutoRec_history.csv')