In [None]:
# Set seeds to make results reproducible with TensorFlow and Keras.
# Both seeds are set before any Keras module is imported further down.
from numpy.random import seed
# seed NumPy's global random number generator
seed(1)
# NOTE(review): importing `set_random_seed` from the top-level `tensorflow`
# package presumably requires TensorFlow 1.x -- confirm the pinned version.
from tensorflow import set_random_seed
# seed TensorFlow's random operations
set_random_seed(2)
# other imports
from keras.layers import Input, Dense, multiply, Lambda
from keras.models import Model
from keras import regularizers
from keras.datasets import mnist
from keras import backend as K
import numpy as np
import matplotlib.pyplot as plt
import pickle
import numpy as np
import pandas as pd
from keras import backend as K
from keras import optimizers
from keras import metrics
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Location of the data set on disk, relative to this notebook
path = "../../data/"
dataset = "recsys17/"

# One sub-folder per processing stage of the data set
raw_path, interim_path, processed_path = (
    path + dataset + stage for stage in ("raw/", "interim/", "processed/")
)

# Tab-separated train/test split that is used for validation
valid_train, valid_test = (
    pd.read_csv(processed_path + file_name, header=0, sep='\t')
    for file_name in ("valid_train_14d.csv", "valid_test_14d.csv")
)

In [None]:
# Read the item vocabulary; the first CSV column becomes the frame's index
unqiue_train_items_df = pd.read_csv(interim_path + 'vocabulary.csv', index_col=0)
# Mapping: index -> item_id
unqiue_train_items_dict = unqiue_train_items_df.to_dict('dict')["item_id"]
# Inverted mapping: item_id -> index (used as position in the session vectors)
unqiue_train_items_dict_inv = dict(
    (item_id, vocab_index) for vocab_index, item_id in unqiue_train_items_dict.items()
)
print("Vocabulary size: " + str(len(unqiue_train_items_dict_inv)))

# encode data
def encode_data(session_df, unqiue_train_items_dict_inv):
    """Multi-hot encode every session of a click log.

    # Arguments
        session_df (DataFrame): one row per interaction; must provide the
            columns "session_id" and "item_id".
        unqiue_train_items_dict_inv (dict): maps item_id -> column index of
            the item in the encoded vector.
    # Returns
        ndarray of shape (number of sessions, vocabulary size) and dtype int.
        Row i belongs to the i-th session in groupby order and holds a 1 in
        the column of every item that occurs in that session. An empty
        `session_df` yields an array with zero rows.
    # Raises
        KeyError: if a session contains an item that is not in the vocabulary.
    """
    vocab_size = len(unqiue_train_items_dict_inv)
    session_groups = session_df.groupby("session_id")
    n_sessions = len(session_groups)
    print(str(n_sessions) + " sessions to encode.")

    # Allocate the result once instead of stacking per-session vectors; this
    # also gives a well-defined (0, vocab_size) result for empty input.
    session_vectors = np.zeros((n_sessions, vocab_size), dtype=int)

    for s_counter, (session_id, session_group) in enumerate(session_groups, start=1):
        # Look up the vocabulary index of every item of the session; iterating
        # the column directly avoids the per-row overhead of iterrows().
        item_indices = [unqiue_train_items_dict_inv[item_id]
                        for item_id in session_group["item_id"]]
        session_vectors[s_counter - 1, item_indices] = 1

        if s_counter % 1000 == 0:
            print(str(n_sessions - s_counter) + " sessions remaining to encode.")

    return session_vectors

In [None]:
# Encode the training split: one row per session, one column per vocabulary item
enc_valid_train = encode_data(valid_train, unqiue_train_items_dict_inv)
# Display (number of sessions, vocabulary size) as the cell output
enc_valid_train.shape

In [None]:
# Encode the test split with the same vocabulary as the training split
enc_valid_test = encode_data(valid_test, unqiue_train_items_dict_inv)
# Display (number of sessions, vocabulary size) as the cell output
enc_valid_test.shape

# Define (Denoising) Autoencoder

In [None]:
def train_ae(model_path, input_size, train_set, valid_set, encoding_dim = 100, hidden_dim = 256,
             attention=False, dae=False, noise_factor=0.5, epochs=50, batch_size=256, patience=2):
    """Build and train a (denoising) autoencoder on encoded session vectors.

    # Arguments
        model_path (str): path prefix under which a checkpoint is written
            after every epoch ("[att_]ae-XX.model" or "[att_]dae-XX.model").
        input_size (int): length of an input vector (size of the vocabulary).
        train_set (ndarray): training vectors; also used as reconstruction target.
        valid_set (ndarray): validation vectors; also used as reconstruction target.
        encoding_dim (int): size of the latent (bottleneck) layer.
        hidden_dim (int): size of the hidden layers of encoder and decoder.
        attention (bool): insert a softmax attention layer whose output is
            multiplied element-wise with the first hidden layer.
        dae (bool): train a denoising autoencoder, i.e. corrupt the inputs
            with Gaussian noise while keeping the clean vectors as targets.
        noise_factor (float): scale of the Gaussian noise; only used if `dae`.
        epochs (int): maximum number of training epochs.
        batch_size (int): mini-batch size.
        patience (int): epochs without `val_loss` improvement before early stopping.
    # Returns
        (encoder, decoder): Keras models that map an input vector to its latent
        representation and a latent vector back to the input space. They share
        their layers (and therefore weights) with the trained autoencoder.
    """
    # input placeholder; size of vocabulary
    input_vec = Input(shape=(input_size, ))

    l2_reg = regularizers.l2(1e-4)

    # "encoded" is the encoded representation of the inputs
    encoded = Dense(hidden_dim, activation='relu', activity_regularizer=l2_reg)(input_vec)

    if attention:
        # attention weights over the hidden units, applied multiplicatively
        attention_probs = Dense(hidden_dim, activation='softmax', name='attention_vec')(encoded)
        encoded = multiply([encoded, attention_probs], name='attention_mul')

    encoded = Dense(encoding_dim, activation='relu')(encoded)

    # "decoded" is the lossy reconstruction of the input
    decoded = Dense(hidden_dim, activation='relu')(encoded)
    decoded = Dense(input_size, activation='sigmoid')(decoded)

    # this model maps an input to its reconstruction
    autoencoder = Model(input_vec, decoded)

    # separate encoder model: maps an input to its encoded representation
    encoder = Model(input_vec, encoded)

    # separate decoder model: re-applies the two trained decoder layers
    # (the last two layers of the autoencoder) to a latent input
    encoded_input = Input(shape=(encoding_dim,))
    decoder_layer1 = autoencoder.layers[-2]
    decoder_layer2 = autoencoder.layers[-1]
    decoder = Model(encoded_input, decoder_layer2(decoder_layer1(encoded_input)))

    # checkpoint file name encodes the model variant and the epoch
    model_name = model_path
    if attention:
        model_name = model_name + "att_"

    if dae:
        model_name = model_name + 'dae-{epoch:02d}.model'
    else:
        model_name = model_name + 'ae-{epoch:02d}.model'

    # callbacks: stop early on stagnating validation loss, checkpoint every epoch
    es = EarlyStopping(monitor='val_loss', patience=patience)
    mc = ModelCheckpoint(filepath=model_name,
                         monitor='val_loss',
                         save_best_only=False,
                         save_weights_only=False,
                         mode='auto',
                         period=1)
    callbacks = [es, mc]

    # RMSprop optimizer with a Kullback-Leibler divergence reconstruction loss.
    # NOTE(review): the targets are 0/1 vectors rather than probability
    # distributions -- confirm that KL divergence (and not binary
    # cross-entropy) is the intended loss.
    opt = optimizers.RMSprop(lr=0.01, rho=0.9, epsilon=None, decay=0.0)
    autoencoder.compile(optimizer=opt, loss='kullback_leibler_divergence')

    train_in = train_set
    valid_in = valid_set
    if dae:
        # corrupt the inputs with Gaussian noise and clip back to [0, 1];
        # the clean vectors remain the reconstruction targets
        train_in = train_set + noise_factor * np.random.normal(loc=0.0, scale=1.0, size=train_set.shape)
        valid_in = valid_set + noise_factor * np.random.normal(loc=0.0, scale=1.0, size=valid_set.shape)
        train_in = np.clip(train_in, 0., 1.)
        valid_in = np.clip(valid_in, 0., 1.)

    autoencoder.fit(train_in, train_set,
                    callbacks=callbacks,
                    epochs=epochs,
                    batch_size=batch_size,
                    shuffle=True,
                    validation_data=(valid_in, valid_set),
                    verbose=2)

    return encoder, decoder

# Train normal Autoencoder

* Set **attention = False** to train **without** the attention mechanism
* Set **attention = True** to train **with** the attention mechanism

In [None]:
attention = False
model_path = interim_path + "models/"
vocab_size = len(unqiue_train_items_dict_inv)  # size of vocab
encoder, decoder = train_ae(model_path, vocab_size, enc_valid_train, enc_valid_test, attention = attention)

# the models are written below model_path, so report that folder
print("Finished training. Storing model to: " + model_path)
model_name = "ae_"
if attention:
    model_name = "att_" + model_name
# context managers make sure the files are flushed and closed after writing
with open(model_path + model_name + "encoder.model", 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file, protocol=4)
with open(model_path + model_name + "decoder.model", 'wb') as decoder_file:
    pickle.dump(decoder, decoder_file, protocol=4)
# release the Keras/TensorFlow graph before the next model is built
K.clear_session()

# Train Denoising Autoencoder

* Set **attention = False** to train **without** the attention mechanism
* Set **attention = True** to train **with** the attention mechanism

In [None]:
attention = False
model_path = interim_path + "models/"
vocab_size = len(unqiue_train_items_dict_inv)  # size of vocab
dae_encoder, dae_decoder = train_ae(model_path, vocab_size, enc_valid_train, enc_valid_test, dae = True, attention = attention)

# the models are written below model_path, so report that folder
print("Finished training. Storing model to: " + model_path)
model_name = "dae_"
if attention:
    model_name = "att_" + model_name
# context managers make sure the files are flushed and closed after writing
with open(model_path + model_name + "encoder.model", 'wb') as encoder_file:
    pickle.dump(dae_encoder, encoder_file, protocol=4)
with open(model_path + model_name + "decoder.model", 'wb') as decoder_file:
    pickle.dump(dae_decoder, decoder_file, protocol=4)
# release the Keras/TensorFlow graph before the next model is built
K.clear_session()

# Define Variational Autoencoder

In [None]:
# reparameterization trick
# instead of sampling from Q(z|X), sample epsilon = N(0,I)
# z = z_mean + sqrt(var) * epsilon
def sampling(args):
    """Draw a latent vector with the reparameterization trick.

    # Arguments
        args (tensor): mean and log of variance of Q(z|X)
    # Returns
        z (tensor): sampled latent vector
    """
    mean, log_var = args
    # first dimension is read from the tensor at run time, second one statically
    noise_shape = (K.shape(mean)[0], K.int_shape(mean)[1])
    # random_normal defaults to mean = 0 and std = 1.0
    noise = K.random_normal(shape=noise_shape)
    # log-variance -> standard deviation
    std = K.exp(0.5 * log_var)

    return mean + std * noise

def train_vae(model_path, input_size, 
             train_set, valid_set, 
             encoding_dim = 100, hidden_dim = 256, 
             attention=False,
             epochs=50, batch_size=256, patience=2):
    """Build and train a variational autoencoder on encoded session vectors.

    # Arguments
        model_path (str): path prefix under which a checkpoint
            ("[att_]vae-XX.model") is written after every epoch.
        input_size (int): length of an input vector (size of the vocabulary).
        train_set (ndarray): training vectors.
        valid_set (ndarray): validation vectors.
        encoding_dim (int): size of the latent space.
        hidden_dim (int): size of the hidden layers of encoder and decoder.
        attention (bool): insert a softmax attention layer whose output is
            multiplied element-wise with the hidden encoder layer.
        epochs (int): maximum number of training epochs.
        batch_size (int): mini-batch size.
        patience (int): epochs without `val_loss` improvement before early stopping.
    # Returns
        (encoder, generator): Keras models that map an input vector to the
        mean of its latent distribution and a latent vector back to the input
        space. Both reuse the layers of the trained VAE.
    """
    # VAE model = encoder + decoder
    # build encoder model
    x = Input(shape=(input_size, ), name='encoder_input')
    h = Dense(hidden_dim, activation='relu')(x)

    if attention:
        # attention weights over the hidden units, applied multiplicatively
        attention_probs = Dense(hidden_dim, activation='softmax', name='attention_vec')(h)
        h = multiply([h, attention_probs], name='attention_mul')

    z_mean = Dense(encoding_dim, name='z_mean')(h)
    z_log_var = Dense(encoding_dim, name='z_log_var')(h)

    # use reparameterization trick to push the sampling out as input
    # note that "output_shape" isn't necessary with the TensorFlow backend
    z = Lambda(sampling, name='z')([z_mean, z_log_var])

    # we instantiate these layers separately so as to reuse them later
    decoder_h = Dense(hidden_dim, activation='relu')
    decoder_mean = Dense(input_size, activation='sigmoid')
    h_decoded = decoder_h(z)
    x_decoded_mean = decoder_mean(h_decoded)

    # instantiate VAE model
    vae = Model(x, x_decoded_mean)

    # VAE loss = reconstruction term (cross-entropy scaled by the input size)
    # + KL term computed from z_mean and z_log_var
    xent_loss = input_size * metrics.binary_crossentropy(x, x_decoded_mean)
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    vae_loss = K.mean(xent_loss + kl_loss)

    # the loss is attached to the model itself, hence compile() gets no loss
    # and fit() gets no target values
    vae.add_loss(vae_loss)
    vae.compile(optimizer='rmsprop')
    vae.summary()

    # checkpoint file name encodes the model variant and the epoch; the "att_"
    # prefix (as in train_ae) keeps attention and non-attention runs from
    # overwriting each other's checkpoints
    model_name = model_path
    if attention:
        model_name = model_name + "att_"
    model_name = model_name + 'vae-{epoch:02d}.model'

    # callbacks: stop early on stagnating validation loss, checkpoint every epoch
    es = EarlyStopping(monitor='val_loss', patience=patience)
    mc = ModelCheckpoint(filepath=model_name, 
                         monitor='val_loss', 
                         save_best_only=False, 
                         save_weights_only=False, 
                         mode='auto', 
                         period=1)
    callbacks = [es, mc]

    vae.fit(train_set,
        shuffle=True,
        epochs=epochs,
        callbacks=callbacks,
        batch_size=batch_size,
        verbose=2,
        validation_data=(valid_set, None))

    # build a model to project inputs on the latent space
    encoder = Model(x, z_mean)

    # build a generator that decodes a latent vector with the trained decoder layers
    decoder_input = Input(shape=(encoding_dim,))
    _h_decoded = decoder_h(decoder_input)
    _x_decoded_mean = decoder_mean(_h_decoded)
    generator = Model(decoder_input, _x_decoded_mean)

    return encoder, generator

# Train Variational Autoencoder

* Set **attention = False** to train **without** the attention mechanism
* Set **attention = True** to train **with** the attention mechanism

In [None]:
attention = False
model_path = interim_path + "models/"
vocab_size = len(unqiue_train_items_dict_inv)  # size of vocab
vae_encoder, vae_decoder = train_vae(model_path, vocab_size, enc_valid_train, enc_valid_test, attention = attention)

# the models are written below model_path, so report that folder
print("Finished training. Storing model to: " + model_path)
model_name = "vae_"
if attention:
    model_name = "att_" + model_name
# context managers make sure the files are flushed and closed after writing
with open(model_path + model_name + "encoder.model", 'wb') as encoder_file:
    pickle.dump(vae_encoder, encoder_file, protocol=4)
with open(model_path + model_name + "decoder.model", 'wb') as decoder_file:
    pickle.dump(vae_decoder, decoder_file, protocol=4)
# release the Keras/TensorFlow graph before the next model is built
K.clear_session()

# Infer test session vectors

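Every test session is divided into sub-sessions of increasing length (its first item, its first two items, … up to all items but the last one), and a latent vector is inferred for each sub-session.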


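* Set **algo = "ae"** to infer latent session vectors using the trained **Autoencoder**
* Set **algo = "dae"** to infer latent session vectors using the trained **Denoising Autoencoder**
* Set **algo = "vae"** to infer latent session vectors using the trained **Variational Autoencoder**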


* Set **attention = False** to infer latent session vectors using the model that was trained **without** the attention mechanism
* Set **attention = True** to infer latent session vectors using the model that was trained **with** the attention mechanism

In [None]:
algo = "vae" # ae | dae | vae
attention = False

# defined here as well so that this cell does not depend on a training cell
# having been executed in the same kernel session
model_path = interim_path + "models/"
vocab_size = len(unqiue_train_items_dict_inv)  # size of vocab

test = pd.read_csv(processed_path + "test_14d.csv", header=0, sep='\t')
test_session_groups = test.groupby("session_id")
print(str(len(test_session_groups)) + " sessions to test.")

model_to_load = algo + "_encoder.model"
if attention:
    model_to_load = "att_" + model_to_load

# NOTE: unpickling can execute arbitrary code; only load model files that
# were written by this notebook.
with open(model_path + model_to_load, 'rb') as model_file:
    encoder = pickle.load(model_file)

infer_columns = ["session_id", "latent_session_vector", "input_items", "remaining_items"]
# rows are collected in a list and turned into a DataFrame once at the end;
# appending to a DataFrame row by row copies the whole frame every time
infer_records = []

s_counter = 0
for session_id, session_group in test_session_groups:
    session_items = session_group["item_id"].values
    # vector length = len(unqiue_train_items)
    session_vector = np.zeros((vocab_size,), dtype=int)
    # go from item to item and grow the session vector;
    # the last item is left out so that there is always something to predict
    for item_counter, item_id in enumerate(session_items[:-1]):
        item_index = unqiue_train_items_dict_inv[item_id]
        session_vector[item_index] = 1
        # make prediction for the sub-session seen so far
        test_in = session_vector.reshape((1, vocab_size))
        latent_session_vec = encoder.predict(test_in)

        # get input and remaining (expected) items
        current_input_set = session_items[:item_counter+1]
        remaining_test_set = session_items[item_counter+1:]

        infer_records.append({
            "session_id": session_id,
            "latent_session_vector": ','.join(map(str, latent_session_vec[0])),
            "input_items":  ','.join(map(str, current_input_set)),
            "remaining_items":  ','.join(map(str, remaining_test_set))
        })
    s_counter += 1
    if (s_counter % 1000 == 0):
        print(str(len(test_session_groups) - s_counter) + " test sessions remaining to infer latent vector.")

infer_df = pd.DataFrame(infer_records, columns = infer_columns)

infer_path = interim_path + "infer/"
infer_file_name = algo + "_encoder_test.csv"

if attention:
    infer_file_name = "att_" + infer_file_name
infer_df.to_csv(infer_path + infer_file_name, sep='\t', header=False, index=False)
K.clear_session()

In [None]:
algo = "vae" # ae | dae | vae
# NOTE: attention = True requires an encoder that was trained and stored with
# attention = True ("att_" + algo + "_encoder.model")
attention = True

# defined here as well so that this cell does not depend on a training cell
# having been executed in the same kernel session
model_path = interim_path + "models/"
vocab_size = len(unqiue_train_items_dict_inv)  # size of vocab

train = pd.read_csv(processed_path + "train_14d.csv", header=0, sep='\t')
train_session_groups = train.groupby("session_id")
print(str(len(train_session_groups)) + " sessions to test.")

model_to_load = algo + "_encoder.model"
if attention:
    model_to_load = "att_" + model_to_load

# NOTE: unpickling can execute arbitrary code; only load model files that
# were written by this notebook.
with open(model_path + model_to_load, 'rb') as model_file:
    encoder = pickle.load(model_file)

infer_columns = ["session_id", "latent_session_vector", "items"]
# rows are collected in a list and turned into a DataFrame once at the end;
# appending to a DataFrame row by row copies the whole frame every time
infer_records = []

s_counter = 0
for session_id, session_group in train_session_groups:
    session_items = session_group["item_id"].values
    # vector length = len(unqiue_train_items)
    session_vector = np.zeros((vocab_size,), dtype=int)
    # init train session vector: set a 1 for every item of the session
    item_indices = [unqiue_train_items_dict_inv[item_id] for item_id in session_items]
    session_vector[item_indices] = 1

    # one latent vector per complete training session
    test_in = session_vector.reshape((1, vocab_size))
    latent_session_vec = encoder.predict(test_in)

    infer_records.append({
        "session_id": session_id,
        "latent_session_vector": ','.join(map(str, latent_session_vec[0])),
        "items":  ','.join(map(str, session_items)),
    })
    s_counter += 1
    if (s_counter % 1000 == 0):
        print(str(len(train_session_groups) - s_counter) + " train sessions remaining to infer latent vector.")

infer_df = pd.DataFrame(infer_records, columns = infer_columns)

infer_path = interim_path + "infer/"
infer_file_name = algo + "_encoder_train.csv"

if attention:
    infer_file_name = "att_" + infer_file_name

infer_df.to_csv(infer_path + infer_file_name, sep='\t', header=False, index=False)
K.clear_session()