1) Metric - ndcg, recall

2) Loss Function

In [None]:
import tensorflow as tf
# Fail fast unless the installed TensorFlow is exactly version 2.0.0.
assert tf.__version__ == '2.0.0'

In [None]:
import os
from os.path import basename, normpath
import urllib.request
import requests
import tarfile
import tempfile
import zipfile
from tqdm import tqdm
import itertools

from glob import glob

In [None]:
import scipy.sparse as sp
import pandas as pd
import numpy as np

In [None]:
def argsort_sparse(m_sp, R=None):
    '''
    Collect, for every row of a sparse matrix, the column indices of its
    nonzero entries.

    Despite the name, the stored values do not influence the order: within
    each row the indices are returned in ascending column order.

    Parameters
    ----------
    m_sp : scipy sparse matrix

    R(optional) : int
        maximum number of indices kept per row

    Returns
    -------
    numpy array with one entry per row. When every row keeps the same number
    of indices the result is a regular 2-D integer array; otherwise it is a
    1-D object array whose elements are lists of column indices.
    '''
    n_rows = m_sp.shape[0]
    row_inds, col_inds = m_sp.nonzero()

    # Sort once by (row, column) and cut each row's run out of the sorted
    # array. The previous loop copied the remaining tuple list for every
    # row, which costs roughly n_rows * nnz on large matrices.
    order = np.lexsort((col_inds, row_inds))
    sorted_cols = col_inds[order]
    per_row = np.bincount(row_inds, minlength=n_rows)
    offsets = np.concatenate(([0], np.cumsum(per_row)))

    # Slicing with R=None keeps the whole row.
    results = [
        sorted_cols[offsets[r]:offsets[r + 1]][:R]
        for r in range(n_rows)
    ]

    if len({len(cols) for cols in results}) <= 1:
        return np.array(results)

    # Ragged rows: build the object array explicitly, because recent NumPy
    # refuses to create one implicitly from sequences of different lengths.
    ragged = np.empty(n_rows, dtype=object)
    for r, cols in enumerate(results):
        ragged[r] = cols.tolist()
    return ragged


    

def recall_at_r(x_true, x_predicted, R):
    '''
    Recall@R for every row.

    For each row, the R highest-scored columns of ``x_predicted`` are looked
    up in ``x_true``; the number of hits is divided by
    ``min(R, number of nonzero entries in that row of x_true)``.

    Parameters
    ----------
    x_true : scipy sparse matrix or numpy array
        true data; every nonzero entry counts as a relevant item
    x_predicted : numpy array
        predicted scores with the same shape as ``x_true``; a larger score
        ranks the item earlier
    R : int
        hyper-parameter for this metric (cut-off rank)

    Returns
    -------
    1-D numpy array holding Recall@R per row. Rows without any relevant
    item yield ``nan``.

    '''
    # x_predicted is already dense with this shape, so a dense relevance
    # mask does not change the order of magnitude of memory used.
    truth = x_true.toarray() if sp.issparse(x_true) else np.asarray(x_true)
    relevant = truth != 0

    # Negate the scores so that argsort puts the best-scored items first;
    # the stable sort keeps ties in column order.
    ranking = np.argsort(-np.asarray(x_predicted), axis=-1, kind='stable')
    hit = np.take_along_axis(relevant, ranking[:, :R], axis=-1).sum(axis=-1)
    maxhit = np.minimum(relevant.sum(axis=-1), R)

    # Divide only where a hit is possible; the remaining rows stay nan
    # without triggering a 0/0 runtime warning.
    recall = np.full(hit.shape, np.nan)
    np.divide(hit, maxhit, out=recall, where=maxhit > 0)
    return recall

def dcg(x_true, x_predicted, R):
    '''
    Discounted cumulative gain at rank R for every row, with binary gains.

    Each row of ``x_predicted`` is ranked from highest to lowest score. An
    item at rank ``k`` (1-based) contributes ``1 / log2(k + 1)`` when the
    matching entry of ``x_true`` is nonzero and nothing otherwise.

    Parameters
    ----------
    x_true : scipy sparse matrix or numpy array
        true data; every nonzero entry counts as a relevant item
    x_predicted : numpy array
        predicted scores with the same shape as ``x_true``
    R : int
        cut-off rank; when it exceeds the number of columns, all columns
        are used

    Returns
    -------
    1-D numpy array holding DCG@R per row
    '''
    truth = x_true.toarray() if sp.issparse(x_true) else np.asarray(x_true)
    relevant = truth != 0

    # Negate the scores so that argsort puts the best-scored items first;
    # the stable sort keeps ties in column order.
    ranking = np.argsort(-np.asarray(x_predicted), axis=-1, kind='stable')
    selected = np.take_along_axis(relevant, ranking[:, :R], axis=-1)

    # Size the discount vector from the columns actually selected, so a
    # cut-off larger than the number of columns still broadcasts.
    denominator = np.log2(np.arange(2, selected.shape[-1] + 2))
    return (selected / denominator).sum(axis=-1)

    


In [None]:
# Directory that receives downloaded archives and their extracted contents.
DATASETS_DIR = "./"

# Identifiers of the supported datasets.
ML_20M = "ml-20m"
ML_20M_ALT = "ml-20m_alt"
NETFLIX = "netflix"
LASTFM = "lastfm"
PINTEREST = "pinterest"
DATASETS = [ML_20M, NETFLIX, LASTFM, PINTEREST, ML_20M_ALT]

# Archive location per dataset; only three of the datasets have an entry.
DOWNLOAD_URL = {}
DOWNLOAD_URL[ML_20M] = "http://files.grouplens.org/datasets/movielens/ml-20m.zip"
DOWNLOAD_URL[NETFLIX] = "https://archive.org/download/nf_prize_dataset.tar/nf_prize_dataset.tar.gz"
DOWNLOAD_URL[LASTFM] = "http://mtg.upf.edu/static/datasets/last.fm/lastfm-dataset-360K.tar.gz"

In [None]:
def download_file(url, filename):
    """
    Download ``url`` to ``filename`` in 8 KiB chunks, showing progress.

    A tqdm progress bar is shown when the server reports a Content-Length;
    otherwise a warning is logged and a text spinner is printed.

    The data is first written to ``filename + '.part'`` and moved into place
    only after the transfer completed, so an interrupted download never
    leaves a truncated file under the final name.

    :param url: Address of the file to fetch
    :param filename: Destination path of the downloaded file
    """
    # Imported locally so the block does not depend on a file-level import.
    import logging

    os.makedirs(DATASETS_DIR, exist_ok=True)

    block_sz = 8192
    partial = filename + '.part'
    try:
        with urllib.request.urlopen(url) as u, open(partial, 'wb') as f:
            content_length = u.info().get_all("Content-Length")
            if content_length:
                with tqdm(
                        total=int(content_length[0]),
                        desc=basename(normpath(filename)),
                        unit='B',
                        unit_scale=True) as pbar:
                    # iter(callable, sentinel) stops at the empty read that
                    # marks the end of the stream.
                    for buff in iter(lambda: u.read(block_sz), b''):
                        f.write(buff)
                        pbar.update(len(buff))
            else:
                logging.getLogger(__name__).warning("No content length information")
                spinner = itertools.cycle('/–\\|')
                for buff in iter(lambda: u.read(block_sz), b''):
                    print(next(spinner), end='\r')
                    f.write(buff)
        os.replace(partial, filename)
    finally:
        # Only present when the transfer did not finish.
        if os.path.exists(partial):
            os.remove(partial)



def extract_file(path, to_directory):
    """
    Extract a compressed archive.

    The format is chosen from the suffix of ``path``: ``tar.gz``, ``tar`` or
    ``zip``.

    :param path: Path to compressed file
    :param to_directory: Directory that is going to store the extracted files
    :raises ValueError: If the suffix matches none of the supported formats
    """
    if path.endswith("tar.gz"):
        tar_mode = "r:gz"
    elif path.endswith("tar"):
        tar_mode = "r:"
    elif path.endswith("zip"):
        with zipfile.ZipFile(path, 'r') as zip_ref:
            zip_ref.extractall(to_directory)
        return
    else:
        raise ValueError(
            "Could not extract {} as no appropriate extractor is found".format(path))

    # The archives are fetched from the network. Where the running Python
    # provides tarfile extraction filters, use the 'data' filter so members
    # cannot be written outside of to_directory.
    extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}
    with tarfile.open(path, tar_mode) as tar:
        tar.extractall(path=to_directory, **extract_kwargs)

def download_movielens():
    """
    Fetch the MovieLens-20M archive and unpack it into DATASETS_DIR.

    The archive is stored as ``ml-20m_alt.zip`` inside DATASETS_DIR and is
    downloaded only when no file exists at that path; extraction runs on
    every call.
    """
    archive = os.path.join(DATASETS_DIR, '{}.zip'.format(ML_20M_ALT))
    already_present = glob(archive)
    if not already_present:
        download_file(DOWNLOAD_URL[ML_20M], archive)

    extract_file(archive, DATASETS_DIR)

def download_lastfm():
    """
    Fetch the Last.fm 360K archive and unpack it into DATASETS_DIR.

    The archive is stored as ``lastfm.tar.gz`` inside DATASETS_DIR and is
    downloaded only when no file exists at that path; extraction runs on
    every call.
    """
    archive = os.path.join(DATASETS_DIR, '{}.tar.gz'.format(LASTFM))
    already_present = glob(archive)
    if not already_present:
        download_file(DOWNLOAD_URL[LASTFM], archive)

    extract_file(archive, DATASETS_DIR)


In [None]:
# Download (if needed) and extract the MovieLens archive into DATASETS_DIR.
download_movielens()

In [None]:
# Exploratory cell: load the raw ratings table and display a per-row mask of
# missing ratings (the mask is not stored).
df = pd.read_csv('./ml-20m/ratings.csv')
df.rating.isna()

In [None]:
# Exploratory cell: display the rating column.
df.rating

In [None]:
def make_feedback_implicit(feedback, threshold):
    """
    Binarize ratings.

    Returns a list with 1 for every rating that is ``>= threshold`` and 0
    for every other rating, in the order of ``feedback``.
    """
    implicit = []
    for rating in feedback:
        if rating >= threshold:
            implicit.append(1)
        else:
            implicit.append(0)
    return implicit

def parse_movielens(threshold=4, **kwargs):
    '''
    Convert the MovieLens ratings table into a binary user x movie matrix
    and save it as ``./binary/bin_ml-20m.npz``.

    Ratings ``>= threshold`` become 1; all other (including missing)
    ratings are dropped from the sparse matrix. Users and movies are
    numbered in order of first appearance in the ratings file. The archive
    is downloaded first when ``./ml-20m/ratings.csv`` does not exist.

    Parameters
    ----------
    threshold : number
        minimum rating that counts as positive feedback. Earlier versions
        ignored this argument and always used 3.5.
    **kwargs
        accepted for backward compatibility and ignored

    Returns
    -------
    None
    '''
    source_file = './ml-20m/ratings.csv'
    if not glob(source_file):
        download_movielens()

    df = pd.read_csv(source_file)

    # Missing ratings count as 0, i.e. never as positive feedback.
    rating = df['rating'].fillna(0.)
    implicit = (rating >= threshold).to_numpy().astype(np.int64)

    # factorize numbers the ids in order of first appearance, replacing a
    # per-row dictionary lookup over the whole table.
    user_idx, users = pd.factorize(df['userId'])
    movie_idx, movies = pd.factorize(df['movieId'])

    m_sp = sp.csr_matrix(
        (implicit, (user_idx, movie_idx)),
        shape=(len(users), len(movies))
    )
    m_sp.eliminate_zeros()

    os.makedirs('./binary', exist_ok=True)
    sp.save_npz('./binary/bin_ml-20m.npz', m_sp)

In [None]:
def parse_lastfm():
    """
    Convert the Last.fm play table into a sparse user x item matrix and
    save it as ``./binary/lastfm.npz``.

    An item is the pair (MusicId, ArtistName), both taken as strings. Users
    and items are numbered in sorted order. Every row of the table adds a 1
    at its (user, item) position; the Plays column is not used.
    """
    plays = pd.read_csv(
        './lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv',
        delimiter='\t',
        names=['UserId', 'MusicId', 'ArtistName', 'Plays'],
    )

    music_ids = map(str, plays['MusicId'])
    artist_names = map(str, plays['ArtistName'])
    music_artist_pair = list(zip(music_ids, artist_names))

    # Number users and items by their position in sorted order.
    ordered_users = sorted(set(plays['UserId']))
    ordered_items = sorted(set(music_artist_pair))
    user_id_dict = dict(zip(ordered_users, range(len(ordered_users))))
    item_id_dict = dict(zip(ordered_items, range(len(ordered_items))))

    user_idxs = [user_id_dict[user] for user in plays['UserId']]
    item_idxs = [item_id_dict[item] for item in music_artist_pair]

    ones = [1] * plays.shape[0]
    matrix_shape = (len(user_id_dict), len(item_id_dict))
    m_sp = sp.csr_matrix((ones, (user_idxs, item_idxs)), shape=matrix_shape)

    if not os.path.isdir('./binary'):
        os.mkdir('./binary')
    sp.save_npz('./binary/lastfm.npz', m_sp)


In [None]:
# Build ./binary/bin_ml-20m.npz from the ratings table (default arguments).
parse_movielens()

In [None]:
# Load the sparse matrix written by parse_movielens().
data = sp.load_npz('./binary/bin_ml-20m.npz')

In [None]:
# Rows of the matrix are users, columns are items (movies).
num_users, num_items = data.shape


In [None]:
# Display the number of items.
num_items

In [None]:
def make_generator(sparse_matrix, batch_size):
    """
    Build a generator function that streams row batches in COO form.

    The returned callable creates a generator that walks the matrix in
    consecutive slices of ``batch_size`` rows (the last slice may be
    shorter) and starts over from the first row after the last batch, so it
    never ends on its own. Each item is a tuple ``(idxs, vals)``: ``idxs``
    has one ``[row, col]`` pair per stored entry, with rows counted from the
    start of the batch, and ``vals`` holds the matching values.

    For a matrix without rows the generator finishes immediately instead of
    looping forever without yielding.

    :param sparse_matrix: scipy sparse matrix that supports row slicing
    :param batch_size: Number of rows per batch, must be positive
    :raises ValueError: If ``batch_size`` is not positive
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    n, _ = sparse_matrix.shape

    def generator():
        if n == 0:
            return
        while True:
            # A slice that runs past the last row is simply shorter, so the
            # final partial batch needs no special case.
            for start in range(0, n, batch_size):
                batch = sparse_matrix[start:start + batch_size].tocoo()
                idxs = np.stack([batch.row, batch.col], axis=1)
                yield (idxs, batch.data)

    return generator

In [None]:
# Number of matrix rows (users) per batch.
BATCH_SIZE = 500

In [None]:
# Generator function over COO batches of the data, plus the column count.
gen = make_generator(data, BATCH_SIZE)
_, movies = data.shape

In [None]:
# Dataset of SparseTensors built from the (indices, values) batches of `gen`.
# NOTE(review): dense_shape is always (BATCH_SIZE, movies), also for a shorter last batch - confirm this is intended.
dataset_v1 = tf.data.Dataset.from_generator(gen, output_types=(tf.int64, tf.float32)).map(lambda i, v: tf.sparse.SparseTensor(i, v, (BATCH_SIZE, movies)))

In [None]:
def make_generator_v2(sparse_matrix, batch_size):
    """
    Build a generator function that streams row batches as dense arrays.

    The returned callable creates a generator that walks the matrix in
    consecutive slices of ``batch_size`` rows (the last slice may be
    shorter), yields each slice converted with ``toarray()``, and starts
    over from the first row after the last batch, so it never ends on its
    own.

    For a matrix without rows the generator finishes immediately instead of
    looping forever without yielding.

    :param sparse_matrix: scipy sparse matrix that supports row slicing
    :param batch_size: Number of rows per batch, must be positive
    :raises ValueError: If ``batch_size`` is not positive
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    n, _ = sparse_matrix.shape

    def generator():
        if n == 0:
            return
        while True:
            # A slice that runs past the last row is simply shorter, so the
            # final partial batch needs no special case.
            for start in range(0, n, batch_size):
                yield sparse_matrix[start:start + batch_size].toarray()

    return generator

In [None]:
# Generator function over dense batches of the data.
gen_v2 = make_generator_v2(data, BATCH_SIZE)

In [None]:
# Dataset of dense int32 batches; it is as endless as the generator behind it.
dataset_v2 = tf.data.Dataset.from_generator(gen_v2, output_types=tf.int32)

Ratings ==> User * Movies

Movie_id ==> Movie Title

In [None]:
class Sparse2Dense(tf.keras.layers.Dense):
    """
    Dense layer that takes a rank-2 ``tf.sparse.SparseTensor`` as input.

    The input is multiplied with the layer kernel through
    ``tf.sparse.sparse_dense_matmul``; bias and activation are applied as
    configured on the layer.
    """

    def call(self, x):
        """
        Apply the layer to a sparse input.

        :param x: Rank-2 ``tf.sparse.SparseTensor``
        :raises TypeError: If ``x`` is not a ``tf.sparse.SparseTensor``
        :raises NotImplementedError: If the rank of ``x`` is not 2
        """
        # Raise explicitly instead of asserting: assert statements are
        # removed when Python runs with -O.
        if not isinstance(x, tf.sparse.SparseTensor):
            raise TypeError(
                "Sparse2Dense expects a tf.sparse.SparseTensor, got {}".format(type(x)))
        if len(x.shape) != 2:
            raise NotImplementedError("input rank should be 2")

        outputs = tf.sparse.sparse_dense_matmul(x, self.kernel)
        if self.use_bias:
            outputs = tf.nn.bias_add(outputs, self.bias)
        if self.activation is not None:
            return self.activation(outputs)  # pylint: disable=not-callable
        return outputs

In [None]:
class VAE_CF(tf.keras.Model):
    """
    Variational autoencoder whose encoder starts with a Sparse2Dense layer.

    The encoder emits ``2 * latent_dims`` values per row, split into mean
    and log-variance; the decoder maps a latent sample to ``items`` outputs.
    None of the layers is given an activation.
    """

    def __init__(self, items, hidden_dims, latent_dims, *args, **kwargs):
        super().__init__(*args, **kwargs)

        encoder_stack = [
            Sparse2Dense(hidden_dims),
            tf.keras.layers.Dense(hidden_dims),
            tf.keras.layers.Dense(2 * latent_dims),
        ]
        self.encoder = tf.keras.Sequential(encoder_stack)

        decoder_stack = [
            tf.keras.layers.Dense(hidden_dims),
            tf.keras.layers.Dense(hidden_dims),
            tf.keras.layers.Dense(items),
        ]
        self.decoder = tf.keras.Sequential(decoder_stack)

    def encode(self, x):
        """Return (mean, log_var, z), with z sampled from the posterior."""
        stats = self.encoder(x)
        mean, log_var = tf.split(stats, num_or_size_splits=2, axis=1)
        sample = self.reparameterize(mean, log_var)
        return mean, log_var, sample

    def decode(self, z):
        """Map latent vectors to one output per item."""
        return self.decoder(z)

    def reparameterize(self, mean, log_var):
        """Draw mean + std * noise with standard-normal noise."""
        n_rows = tf.shape(mean)[0]
        n_latent = tf.shape(mean)[1]
        noise = tf.random.normal(shape=(n_rows, n_latent))
        std = tf.exp(log_var * .5)
        return mean + std * noise

    def call(self, inputs):
        """Reconstruct the inputs and register the KL term as a model loss."""
        mean, log_var, z = self.encode(inputs)
        reconstructed = self.decode(z)
        # The KL term is averaged over all batch and latent entries.
        kl_terms = log_var - tf.exp(log_var) - tf.square(mean) + 1
        self.add_loss(-.5 * tf.math.reduce_mean(kl_terms))
        return reconstructed


In [None]:
def encoder_layer(items, hidden_dims, latent_dims, batch_size):
    """
    Build the encoder as a functional Keras model.

    The model takes a sparse input of width ``items`` with a fixed batch
    size and returns ``2 * latent_dims`` values per row after a Sparse2Dense
    layer and two Dense layers.
    """
    sparse_in = tf.keras.layers.Input(
        shape=(items,), sparse=True, batch_size=batch_size)

    hidden = Sparse2Dense(hidden_dims)(sparse_in)
    hidden = tf.keras.layers.Dense(hidden_dims)(hidden)
    stats = tf.keras.layers.Dense(2 * latent_dims)(hidden)

    return tf.keras.Model(inputs=sparse_in, outputs=stats)

def decoder_layer(items, hidden_dims, latent_dims, batch_size=None):
    """
    Build the decoder as a functional Keras model.

    The model takes latent vectors of width ``latent_dims`` and returns
    ``items`` values per row after three Dense layers.

    :param items: Width of the output
    :param hidden_dims: Width of the two hidden Dense layers
    :param latent_dims: Width of the latent input
    :param batch_size: Fixed batch size of the input layer; the module-level
        BATCH_SIZE is used when omitted, as before
    """
    # Mirrors encoder_layer, which already takes the batch size as an
    # argument; the default keeps existing three-argument calls unchanged.
    if batch_size is None:
        batch_size = BATCH_SIZE

    inputs = tf.keras.layers.Input(shape=(latent_dims,), batch_size=batch_size)
    x = tf.keras.layers.Dense(hidden_dims)(inputs)
    x = tf.keras.layers.Dense(hidden_dims)(x)
    outputs = tf.keras.layers.Dense(items)(x)

    return tf.keras.Model(inputs=inputs, outputs=outputs)

In [None]:
class VAE_CF(tf.keras.Model):
    """
    Variational autoencoder built from the functional sub-models returned by
    ``encoder_layer`` and ``decoder_layer``.

    The encoder output is split into mean and log-variance; the decoder maps
    a latent sample to ``items`` outputs.
    """

    def __init__(self, items, hidden_dims, latent_dims, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The encoder is built for the module-level BATCH_SIZE.
        self.encoder = encoder_layer(items, hidden_dims, latent_dims, BATCH_SIZE)
        self.decoder = decoder_layer(items, hidden_dims, latent_dims)

    def encode(self, x):
        """Return (mean, log_var, z), with z sampled from the posterior."""
        stats = self.encoder(x)
        mean, log_var = tf.split(stats, num_or_size_splits=2, axis=1)
        sample = self.reparameterize(mean, log_var)
        return mean, log_var, sample

    def decode(self, z):
        """Map latent vectors to one output per item."""
        return self.decoder(z)

    def reparameterize(self, mean, log_var):
        """Draw mean + std * noise with standard-normal noise."""
        n_rows = tf.shape(mean)[0]
        n_latent = tf.shape(mean)[1]
        noise = tf.random.normal(shape=(n_rows, n_latent))
        std = tf.exp(log_var * .5)
        return mean + std * noise

    def call(self, inputs):
        """Reconstruct the inputs and register the KL term as a model loss."""
        mean, log_var, z = self.encode(inputs)
        reconstructed = self.decode(z)
        # The KL term is averaged over all batch and latent entries.
        kl_terms = log_var - tf.exp(log_var) - tf.square(mean) + 1
        self.add_loss(-.5 * tf.math.reduce_mean(kl_terms))
        return reconstructed

In [None]:
class VAE_CF(tf.keras.Model):
    """
    Variational autoencoder built from plain Dense layers.

    The encoder emits ``2 * latent_dims`` values per row, split into mean
    and log-variance; the decoder maps a latent sample to ``items`` outputs.
    None of the layers is given an activation.
    """

    def __init__(self, items, hidden_dims, latent_dims, *args, **kwargs):
        super().__init__(*args, **kwargs)

        encoder_stack = [
            tf.keras.layers.Dense(hidden_dims),
            tf.keras.layers.Dense(hidden_dims),
            tf.keras.layers.Dense(2 * latent_dims),
        ]
        self.encoder = tf.keras.Sequential(encoder_stack)

        decoder_stack = [
            tf.keras.layers.Dense(hidden_dims),
            tf.keras.layers.Dense(hidden_dims),
            tf.keras.layers.Dense(items),
        ]
        self.decoder = tf.keras.Sequential(decoder_stack)

    def encode(self, x):
        """Return (mean, log_var, z), with z sampled from the posterior."""
        stats = self.encoder(x)
        mean, log_var = tf.split(stats, num_or_size_splits=2, axis=1)
        sample = self.reparameterize(mean, log_var)
        return mean, log_var, sample

    def decode(self, z):
        """Map latent vectors to one output per item."""
        return self.decoder(z)

    def reparameterize(self, mean, log_var):
        """Draw mean + std * noise with standard-normal noise."""
        n_rows = tf.shape(mean)[0]
        n_latent = tf.shape(mean)[1]
        noise = tf.random.normal(shape=(n_rows, n_latent))
        std = tf.exp(log_var * .5)
        return mean + std * noise

    def call(self, inputs):
        """Reconstruct the inputs and register the KL term as a model loss."""
        mean, log_var, z = self.encode(inputs)
        reconstructed = self.decode(z)
        # The KL term is averaged over all batch and latent entries.
        kl_terms = log_var - tf.exp(log_var) - tf.square(mean) + 1
        self.add_loss(-.5 * tf.math.reduce_mean(kl_terms))
        return reconstructed

In [None]:
# Notes only: candidate loss functions grouped by feedback type. The string
# below is a bare expression and has no effect on the program.
'''
explicit
mse
poisson

implicit
weighted mse
multinomial
sigmoid_cross_entropy
'''

In [None]:
def get_loss_fn(loss_fn_name='multinomial'):
    """
    Return the loss callable selected by name (case-insensitive).

    Supported names are ``'mse'``, ``'multinomial'`` (categorical
    cross-entropy on logits) and ``'sigmoid_ce'`` (element-wise sigmoid
    cross-entropy on logits). ``'poisson'`` and ``'weighted_mse'`` are
    recognised but not implemented.

    :param loss_fn_name: Name of the loss
    :raises NotImplementedError: For ``'poisson'`` and ``'weighted_mse'``
    :raises ValueError: For any other unknown name
    """
    # Normalise once so that validation and dispatch agree. Previously a
    # name such as 'MSE' passed the check and then fell through to the
    # 'weighted_mse' error.
    name = loss_fn_name.lower()

    if name == 'mse':
        return tf.keras.losses.MeanSquaredError()
    if name == 'multinomial':
        return tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    if name == 'sigmoid_ce':
        return lambda label, logit: tf.nn.sigmoid_cross_entropy_with_logits(label, logit)
    if name in ('poisson', 'weighted_mse'):
        raise NotImplementedError(name)

    raise ValueError("unknown loss function name: {!r}".format(loss_fn_name))
        

In [None]:
# Default loss: 'multinomial'.
loss_fn = get_loss_fn()

In [None]:
# Model with hidden width 400 and latent width 200, plus the training objects:
# Adam optimizer, default ('multinomial') loss and a running mean of the loss.
vae_cf = VAE_CF(num_items, 400, 200)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss_fn = get_loss_fn()
loss_metric = tf.keras.metrics.Mean()

In [None]:
# Run the model once on the first batch of dataset_v1.
# NOTE(review): dataset_v1 yields SparseTensors while the latest VAE_CF definition uses plain Dense layers - confirm this cell still runs.
vae_cf(next(iter(dataset_v1)))

In [None]:
# Scratch cell: build a SparseTensor from indices, values and dense_shape.
# NOTE(review): index [3, 4] lies outside dense_shape [3, 4] - presumably only an API experiment.
tf.sparse.SparseTensor([[1,2],[3,4]], [1, 2], [3,4])

In [None]:
@tf.function
def train_step(model, x, loss_fn, optimizer):
    """
    Run one optimisation step and return the loss.

    The loss is ``loss_fn(x, model(x))`` plus the sum of the losses the
    model registered during the forward pass. A sparse ``x`` is passed to
    the model as is and converted to dense only for the loss computation.

    :param model: Keras model to train
    :param x: Input batch, dense tensor or ``tf.sparse.SparseTensor``
    :param loss_fn: Callable ``loss_fn(target, logit)``
    :param optimizer: Keras optimizer that applies the gradients
    :return: Loss of this step
    """
    with tf.GradientTape() as tape:
        logit = model(x)
        if isinstance(x, tf.sparse.SparseTensor):
            x = tf.sparse.to_dense(x)
        loss = loss_fn(x, logit)
        loss += sum(model.losses)

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    return loss

In [None]:
EPOCHS = 100

# The generator behind dataset_v2 restarts forever, so iterating the dataset
# directly would never finish the first epoch. Bound every epoch to the number
# of batches that covers all users once (the last batch may be shorter).
steps_per_epoch = (num_users + BATCH_SIZE - 1) // BATCH_SIZE

for epoch in range(EPOCHS):

    for step, x in enumerate(dataset_v2.take(steps_per_epoch)):
        loss = train_step(vae_cf, x, loss_fn, optimizer)
        loss_metric(loss)

        if step % 2 == 0:
            print("step {}, mean loss = {}".format(step, loss_metric.result()))

In [None]:
# Print the layer and parameter overview of the model.
vae_cf.summary()