In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from scipy.sparse import lil_matrix, csr_matrix, save_npz, load_npz


import tensorflow as tf
Model = tf.keras.models.Model
Layers = tf.keras.layers
Input, Embedding = Layers.Input, Layers.Embedding
Concatenate, Flatten = Layers.Concatenate, Layers.Flatten
Dropout, BatchNorm = Layers.Dropout, Layers.BatchNormalization
l2 = tf.keras.regularizers.l2
SGD, Adam = tf.keras.optimizers.SGD, tf.keras.optimizers.Adam

## Load Data

In [7]:
df = pd.read_csv('../../data/movielens/edited_rating.csv')

In [8]:
N = df.userId.max() + 1
M = df.movie_idx.max() + 1

## Create Train/Test

In [10]:
df = shuffle(df)
cutoff = int(0.8 * len(df))
df_train = df.iloc[:cutoff]
df_test = df.iloc[cutoff:]

## Convert to Sparse Matrices

Since we have to feed the entire ratings matrix to our Autoencoder, we will convert to sparse matrix for memory management.

In [None]:
# lil_matrix is better for updates
A = lil_matrix((N, M))

count = 0

def update_train(row):
    global count
    count += 1
    if count % 100000 == 0:
        print(f'Processed {count}')
    i = int(row.userId)
    j = int(row.movie_idx)
    A[i, j] = row.rating

df_train.apply(update_train, axis=1)

# csr matrix is better for i/o
A = A.tocsv()
mask = (A > 0)
save_npz('train-sparse.npz', A)

In [None]:
A_test = lil_matrix((N, M))

count = 0

def update_test(row):
    global count
    count += 1
    if count % 100000 == 0:
        print(f'Processed {count}')
    i = int(row.userId)
    j = int(row.movie_idx)
    A_test[i, j] = row.rating

df_test.apply(update_test, axis=1)

A_test = A_test.tocsr()
mask_test = (A_test > 0)
save_npz('test-sparse.npz', A_test)

## Initialize Params

In [3]:
batch_size = 128
epochs = 20
reg = 0.0001

In [None]:
mask = (A > 0) * 1.0
mask_test = (A_test > 0) * 1.0

N, M = A.shape
mu = A.sum() / mask.sum()

## Define Model

In [2]:
i = Input(shape=(M,))

x = Dropout(0.7)(i)

x = Dense(700, activation='tanh', kernel_regularizer=l2(reg))(x)

x = Dense(M, kernel_regularizer=l2(reg))(x)


NameError: name 'N' is not defined

## Compile Model

In [17]:
model.compile(
    loss='mse',
    optimizer=SGD(lr=0.01, momentum=0.9),
    metrics=['mse']
)

## Train Model

In [None]:
r = model.fit(
    x=[df_train.userId.values, df_train.movie_idx.values],
    y=df_train.rating.values - mu,
    epochs=epochs,
    batch_size=128,
    validation_data=(
        [df_test.userId.values, df_test.movie_idx.values],
        df_test.rating.values - mu
    )
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 16000210 samples, validate on 4000053 samples
Epoch 1/25

## Plot Results

In [None]:
# plot loss
plt.plot(r.history['loss'], label='train loss')
plt.plot(r.history['val_loss'], label='test loss')
plt.legend()
plt.show()

In [None]:
# plot mse
plt.plot(r.history['mean_squared_error'], label='train mse')
plt.plot(r.history['val_mean_squared_error'], label='test mse')
plt.legend()
plt.show()