In [None]:
!cp -r  "drive/My Drive/Projects/Hafid/" .

In [None]:
import os
import random

import numpy as np
import pandas as pd

from scipy import sparse


  import pandas.util.testing as tm


In [None]:
# Work from inside the "Hafid" folder copied in the first cell, so that the
# relative paths used below ('ratings.csv', 'model_weights.pt', 'trace.csv')
# resolve there.
os.chdir("Hafid")

# I/ Pre-processing

## 1) Read .csv file

In [None]:
# Load the ratings file (first line is the header row).
print("Reading csv..")
df = pd.read_csv('ratings.csv', header=0)
print('Done.')

# Order the ratings by timestamp; later cells rely on this ordering.
print('Sorting the dataframe by date ..')
df = df.sort_values(by='timestamp')
print('Done..')

# Keep only the ratings strictly above 3.5.
df = df.loc[df['rating'] > 3.5]

print("Total no of ratings: ", len(df))
print("Total Number of users: ", np.unique(df['userId']).size)
print("Total Number of movies: ", np.unique(df['movieId']).size)

Reading csv..
Done.
Sorting the dataframe by date ..
Done..
Total no of ratings:  9995410
Total Number of users:  138287
Total Number of movies:  20720


In [None]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
4182421,28507,1176,4.0,789652004
18950936,131160,47,5.0,789652009
12341168,85252,21,4.0,822873600
12341169,85252,22,4.0,822873600
12341173,85252,32,4.0,822873600


## 2) Filter users with few ratings

In [None]:
def get_count(tp, id):
    """Count how many rows of ``tp`` share each value of column ``id``.

    :param tp: DataFrame containing a column named ``id``
    :param id: name of the column to group by
    :return: Series of row counts, indexed by the distinct values of ``tp[id]``
    """
    # Group with the default ``as_index=True``. With ``as_index=False``,
    # pandas >= 1.1 makes ``size()`` return a DataFrame (columns ``id`` and
    # ``size``) on a fresh RangeIndex, which breaks callers that read the ids
    # from ``.index`` and compare the counts directly.
    return tp.groupby(id).size()

def filter_triplets(tp, min_uc=5, min_sc=0):
    """Drop rarely-rated items and low-activity users from a ratings frame.

    :param tp: DataFrame with at least ``userId`` and ``movieId`` columns
    :param min_uc: keep only users that have at least this many rows
                   (a value <= 0 disables the user filter)
    :param min_sc: keep only items that have at least this many rows
                   (a value <= 0 disables the item filter)
    :return: ``(tp, usercount, itemcount)``: the filtered frame and the
             per-user / per-item row counts recomputed on it with ``get_count``
    """
    # Only keep the triplets for items which appear in at least min_sc rows.
    if min_sc > 0:
        itemcount = get_count(tp, 'movieId')
        # The counts are keyed by movieId, so the kept ids must be matched
        # against the 'movieId' column (the original matched them against
        # 'userId', which filtered on the wrong key).
        tp = tp[tp['movieId'].isin(itemcount.index[itemcount >= min_sc])]

    # Only keep the triplets for users who appear in at least min_uc rows.
    # After doing this, some of the items will have fewer than min_sc rows,
    # but that should only be a small proportion.
    if min_uc > 0:
        usercount = get_count(tp, 'userId')
        tp = tp[tp['userId'].isin(usercount.index[usercount >= min_uc])]

    # Update both usercount and itemcount after filtering
    usercount, itemcount = get_count(tp, 'userId'), get_count(tp, 'movieId')

    return tp, usercount, itemcount

# Run the filter with its defaults (min_uc=5, min_sc=0): only the user-side
# threshold is active, the item-side filter is skipped.
raw_data, user_activity, item_popularity = filter_triplets(df)

## 3) Reindex user_id and movie_id

The following cell is added so that we keep the same items and users as in the RecVAE notebook.

In [None]:
# User ids are read from the index of the per-user counts returned by filter_triplets.
unique_uid = user_activity.index

# Shuffle the user ids with a fixed seed so the permutation is the same on every run.
np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

n_users = unique_uid.size
n_heldout_users = 10_000

# Keep everything except the last 2 * n_heldout_users ids of the shuffled order.
# NOTE(review): presumably those two groups are the validation and test users
# of the RecVAE split -- confirm against that notebook.
users = unique_uid[:n_users - n_heldout_users * 2]

# Restrict the ratings to the kept users.
raw_data = raw_data.loc[(raw_data['userId']).isin(users)]

In [None]:
# Map every remaining movieId to a contiguous column index 0..n_items-1,
# following the sorted order of the original ids.
movie_id_dict = {u_id: i for i, u_id in enumerate(np.unique(raw_data["movieId"]))}

# Build a new frame instead of assigning a column into raw_data: raw_data is a
# filtered selection of an earlier frame, and writing into it can trigger
# pandas' SettingWithCopyWarning. Every id is a key of movie_id_dict (the dict
# was built from this very column), so ``map`` leaves no missing values.
raw_data = raw_data.assign(movieId=raw_data["movieId"].map(movie_id_dict))

n_items = len(movie_id_dict)
n_users = raw_data["userId"].nunique()

## 4) Get data as a sparse matrix

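In the next cell, we build, for each "real" user, a set of "virtual" users: snapshots of that user's rating history at successive points in time. With the user's ratings in chronological order, a snapshot is taken after the first 50 ratings, then after 60, 70, and so on, for as long as at least 10 further ratings remain; a final snapshot always contains all of the user's ratings. A user with 50 ratings or fewer therefore gets only the final snapshot. Each snapshot is stored as a 1 × `n_items` sparse row with a 1 for every movie rated so far, and is keyed by the timestamp of the last rating it includes.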

**TODO:** this section should be optimized.

In [None]:
from tqdm.notebook import tqdm


FIRST_SNAPSHOT_SIZE = 50  # number of ratings in a user's first intermediate snapshot
SNAPSHOT_STEP = 10        # additional ratings between two consecutive snapshots


def _ratings_row(movie_ids, n_items):
    """Build a 1 x n_items CSR row holding 1.0 in every column listed in ``movie_ids``."""
    return sparse.csr_matrix((np.ones_like(movie_ids),
                              (np.zeros_like(movie_ids), movie_ids)),
                             dtype='float64', shape=(1, n_items))


# users_data_dict[user_id][time_stamp] -> sparse row with the movies the user
# had rated up to (and including) the rating made at ``time_stamp``.
users_data_dict = {}

for user_id, user_df in tqdm(raw_data.groupby("userId")):
    # Convert each column once per user instead of once per snapshot.
    movie_ids = user_df["movieId"].tolist()
    time_stamps = user_df["timestamp"].tolist()
    n_rows = len(movie_ids)  # number of movies rated by this user

    # Snapshots are keyed by the timestamp of their last rating, so two
    # snapshots ending on the same timestamp share one entry (the later,
    # larger one overwrites the earlier one).
    snapshots = {}

    # Intermediate snapshots: the first 50, 60, 70, ... ratings. The range is
    # empty for users with fewer than 60 ratings, which covers the
    # "n_rows <= 50" case without a separate branch.
    for k in range((n_rows - FIRST_SNAPSHOT_SIZE) // SNAPSHOT_STEP):
        size = FIRST_SNAPSHOT_SIZE + SNAPSHOT_STEP * k
        snapshots[time_stamps[size - 1]] = _ratings_row(movie_ids[:size], n_items)

    # Final snapshot: the user's complete history.
    snapshots[time_stamps[-1]] = _ratings_row(movie_ids, n_items)

    users_data_dict[user_id] = snapshots


HBox(children=(FloatProgress(value=0.0, max=116677.0), HTML(value='')))




In [None]:
# Enumerate every (real user id, snapshot timestamp) pair in dictionary order;
# a pair's position in this list is its virtual user id.
virtual_keys = [
    (real_id, stamp)
    for real_id, snapshots in users_data_dict.items()
    for stamp in snapshots
]

# Two-way lookup between virtual ids and (real id, timestamp) pairs.
virtual_userid_2_real_userid = dict(enumerate(virtual_keys))
real_userid_2_virtual_userid = {
    pair: virtual_id for virtual_id, pair in enumerate(virtual_keys)
}

n_virtual_users = len(virtual_userid_2_real_userid)

# Stack the per-snapshot rows so that row i belongs to virtual user i.
virtual_users_data = sparse.vstack(
    [users_data_dict[real_id][stamp] for real_id, stamp in virtual_keys]
)

# II/ Build and Load model

This part should be moved to an external file in the future.

## 1) model definition

In [None]:
import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F

# Seed the Python, NumPy and PyTorch random number generators with one value.
seed = 1337
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def swish_(x):
    """In-place swish: overwrite ``x`` with ``x * sigmoid(x)`` and return it."""
    x *= torch.sigmoid(x)
    return x

def swish(x):
    """Return ``x * sigmoid(x)`` as a new tensor; ``x`` itself is left unchanged."""
    gate = torch.sigmoid(x)
    return x * gate

def kl(q_distr, p_distr, weights, eps=1e-7):
    """Weighted KL-style divergence between two diagonal Gaussians.

    :param q_distr: pair ``(mu_q, logvar_q)``
    :param p_distr: pair ``(mu_p, logvar_p)``
    :param weights: factor applied to each row's sum over the last dimension
    :param eps: constant added to ``exp(logvar_p)`` in the denominator
    :return: 0.5 times the mean of the weighted per-row sums
    """
    mu_q, logvar_q = q_distr
    mu_p, logvar_p = p_distr

    var_q = logvar_q.exp()
    var_p = logvar_p.exp() + eps
    squared_shift = (mu_q - mu_p).pow(2)

    per_dim = (var_q + squared_shift) / var_p + logvar_p - logvar_q - 1
    per_row = per_dim.sum(dim=-1) * weights
    return 0.5 * per_row.mean()

def simple_kl(mu_q, logvar_q, logvar_p_scale, norm):
    """Weighted KL-style term of ``(mu_q, logvar_q)`` against a zero-mean prior.

    ``mu_q ** 2`` and ``exp(logvar_q)`` are both divided by ``logvar_p_scale``;
    no ``log(logvar_p_scale)`` term is included (it was commented out in the
    original expression).

    :param mu_q: means
    :param logvar_q: log-variances
    :param logvar_p_scale: divisor applied to the two terms above
    :param norm: factor applied to each row's sum over the last dimension
    :return: mean over rows of ``-0.5 * row_sum * norm``
    """
    per_dim = (1 + logvar_q
               - mu_q.pow(2) / logvar_p_scale
               - logvar_q.exp() / logvar_p_scale)
    per_row = -0.5 * per_dim.sum(dim=-1) * norm
    return per_row.mean()

def log_norm_pdf(x, mu, logvar):
    """Element-wise log-density of ``x`` under a Gaussian with mean ``mu`` and log-variance ``logvar``."""
    squared_dev = (x - mu).pow(2)
    return -0.5 * (logvar + np.log(2 * np.pi) + squared_dev / logvar.exp())

def log_norm_std_pdf(x):
    """Element-wise log-density of ``x`` under a standard normal (mean 0, variance 1)."""
    log_two_pi = np.log(2 * np.pi)
    return -0.5 * (log_two_pi + x.pow(2))


class DeterministicDecoder(nn.Linear):
    """Plain linear decoder whose ``forward`` returns ``(output, 0)``.

    The constant second element mirrors the two-value return of
    ``StochasticDecoder.forward``, so callers can unpack either decoder the
    same way.
    """

    def __init__(self, *args):
        super().__init__(*args)

    def forward(self, *args):
        return super().forward(*args), 0


class StochasticDecoder(nn.Linear):
    """Linear decoder with one learnable log-variance per weight entry.

    In training mode ``forward`` adds Gaussian noise to the linear output and
    returns a KL-style penalty computed from the weights and log-variances; in
    evaluation mode it returns the plain linear output and ``0``.
    """

    def __init__(self, in_features, out_features, bias=True):
        super().__init__(in_features, out_features, bias)
        self.in_features = in_features
        self.out_features = out_features
        # Same shape as the weight matrix, every entry initialised to -2.
        self.logvar = nn.Parameter(torch.full((out_features, in_features), -2.0))

    def forward(self, input):
        mean = F.linear(input, self.weight, self.bias)
        if not self.training:
            return mean, 0

        weight_var = torch.exp(self.logvar)
        noise = torch.randn_like(mean)
        # Per-output noise scale: sqrt(sum_j input_j^2 * exp(logvar_ij)).
        noise_scale = torch.sqrt(F.linear(input * input, weight_var))
        output = mean + noise * noise_scale

        penalty_terms = 1 + self.logvar - self.weight.pow(2) - self.logvar.exp()
        penalty = (-0.5 * penalty_terms).sum(dim=-1).mean()
        return output, penalty


class GaussianMixturePrior(nn.Module):
    """Equal-weight mixture of ``gaussians_number`` diagonal Gaussians.

    Means and log-variances are stored as ``(latent_dim, gaussians_number)``
    parameters initialised to zero; ``forward`` reads them detached.
    """

    def __init__(self, latent_dim, gaussians_number):
        super().__init__()

        self.gaussians_number = gaussians_number
        self.mu_prior = nn.Parameter(torch.zeros(latent_dim, gaussians_number))
        self.logvar_prior = nn.Parameter(torch.zeros(latent_dim, gaussians_number))

    def forward(self, z):
        """Return, per latent coordinate of ``z``, the log of the mixture density."""
        component_mu = self.mu_prior.detach()[None, ...]
        component_logvar = self.logvar_prior.detach()[None, ...]
        # Each component carries weight 1 / gaussians_number.
        log_weight = -np.log(self.gaussians_number)

        per_component = log_norm_pdf(x=z[:, :, None],
                                     mu=component_mu,
                                     logvar=component_logvar).add(log_weight)
        return torch.logsumexp(per_component, dim=-1)


class GaussianMixturePriorWithAprPost(nn.Module):
    """Three-component mixture prior with a per-user Gaussian component.

    ``forward`` mixes:
      1. a Gaussian with ``mu_prior`` / ``logvar_prior`` (both initialised to 0),
         log-weight ``log(1/5 - 1/20)``;
      2. a per-user Gaussian looked up in the ``user_mu`` / ``user_logvar``
         embeddings, log-weight ``log(4/5 - 1/20)``;
      3. a Gaussian with ``mu_prior`` and ``logvar_uniform_prior`` (initialised
         to 10), log-weight ``log(1/10)``.
    All parameters are read detached inside ``forward``.
    """

    def __init__(self, latent_dim, input_count):
        super().__init__()

        self.gaussians_number = 1
        param_shape = (latent_dim, self.gaussians_number)

        self.mu_prior = nn.Parameter(torch.zeros(param_shape))
        self.logvar_prior = nn.Parameter(torch.zeros(param_shape))
        self.logvar_uniform_prior = nn.Parameter(torch.full(param_shape, 10.0))

        self.user_mu = nn.Embedding(input_count, latent_dim)
        self.user_logvar = nn.Embedding(input_count, latent_dim)

    def forward(self, z, idx):
        """Return, per latent coordinate of ``z``, the log of the mixture density for users ``idx``."""
        x = z[:, :, None]
        shared_mu = self.mu_prior[None, :, :].detach()

        standard_part = log_norm_pdf(
            x=x,
            mu=shared_mu,
            logvar=self.logvar_prior[None, :, :].detach(),
        ).add(np.log(1/5 - 1/20))

        user_part = log_norm_pdf(
            x=x,
            mu=self.user_mu(idx)[:, :, None].detach(),
            logvar=self.user_logvar(idx)[:, :, None].detach(),
        ).add(np.log(4/5 - 1/20))

        wide_part = log_norm_pdf(
            x=x,
            mu=shared_mu,
            logvar=self.logvar_uniform_prior[None, :, :].detach(),
        ).add(np.log(1/10))

        stacked = torch.cat([standard_part, user_part, wide_part], dim=-1)
        return torch.logsumexp(stacked, dim=-1)


class VAE(nn.Module):
    """Variational auto-encoder over rating rows with a per-user mixture prior.

    :param hidden_dim: width of the five hidden encoder layers
    :param latent_dim: size of the latent code (``mu`` / ``logvar``)
    :param matrix_dim: pair ``(n_rows, n_cols)``; ``n_cols`` sizes the encoder
                       input and decoder output, ``n_rows`` sizes the per-user
                       embeddings of the prior
    :param axis: stored as ``self.axis`` and forwarded to ``generate`` in
                 ``set_embeddings``
    """

    def __init__(self, hidden_dim, latent_dim, matrix_dim, axis):
        super(VAE, self).__init__()

        self.fc1 = nn.Linear(matrix_dim[1], hidden_dim)
        self.ln1 = nn.LayerNorm(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.ln3 = nn.LayerNorm(hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, hidden_dim)
        self.ln4 = nn.LayerNorm(hidden_dim)
        self.fc5 = nn.Linear(hidden_dim, hidden_dim)
        self.ln5 = nn.LayerNorm(hidden_dim)
        self.fc21 = nn.Linear(hidden_dim, latent_dim)  # produces mu
        self.fc22 = nn.Linear(hidden_dim, latent_dim)  # produces logvar

        self.prior = GaussianMixturePriorWithAprPost(latent_dim, matrix_dim[0])
        self.decoder = DeterministicDecoder(latent_dim, matrix_dim[1])

        self.axis = axis

    def encode(self, x, dropout_rate):
        """Encode rating rows into ``(mu, logvar)``.

        Each row is divided by its L2 norm, passed through dropout, then
        through five swish + LayerNorm layers in which every layer also
        receives the sum of all previous layers' outputs.

        NOTE: an all-zero row has a zero norm, so the division yields NaN.
        """
        norm = x.pow(2).sum(dim=-1).sqrt()
        x = x / norm[:, None]

        x = F.dropout(x, p=dropout_rate, training=self.training)

        h1 = self.ln1(swish(self.fc1(x)))
        h2 = self.ln2(swish(self.fc2(h1) + h1))
        h3 = self.ln3(swish(self.fc3(h2) + h1 + h2))
        h4 = self.ln4(swish(self.fc4(h3) + h1 + h2 + h3))
        h5 = self.ln5(swish(self.fc5(h4) + h1 + h2 + h3 + h4))
        return self.fc21(h5), self.fc22(h5)

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * exp(0.5 * logvar)`` in training mode; return ``mu`` otherwise."""
        if self.training:
            std = torch.exp(0.5*logvar)
            eps = torch.randn_like(std)
            return eps.mul(std).add_(mu)
        else:
            return mu

    def decode(self, z):
        """Run the decoder; returns ``(logits, decoder_loss)``."""
        return self.decoder(z)

    def forward(self, user_ratings, user_idx, beta=1, dropout_rate=0.5, calculate_loss=True, mode=None):
        """Encode, sample, decode and optionally compute the loss.

        :param user_ratings: rating rows fed to the encoder
        :param user_idx: user indices passed to the prior (used in ``'pr'`` mode)
        :param beta: weight of the KL term in ``'pr'`` mode
        :param dropout_rate: encoder input dropout, applied in ``'pr'`` mode only
        :param calculate_loss: when False, return the decoder logits only
        :param mode: ``'pr'`` (dropout and KL term) or ``'mf'`` (no dropout, no KL term)
        :return: ``((NLL, KLD), loss)`` when ``calculate_loss`` is True,
                 otherwise the decoder logits
        :raises ValueError: if ``mode`` is neither ``'pr'`` nor ``'mf'``
        """
        # Fail early with a clear message. The signature default is None, and
        # an unrecognised mode used to surface later as an UnboundLocalError
        # on ``mu``.
        if mode == 'pr':
            encoder_dropout = dropout_rate
        elif mode == 'mf':
            encoder_dropout = 0
        else:
            raise ValueError("mode must be 'pr' or 'mf', got {!r}".format(mode))

        mu, logvar = self.encode(user_ratings, dropout_rate=encoder_dropout)
        z = self.reparameterize(mu, logvar)
        x_pred, decoder_loss = self.decode(z)

        if not calculate_loss:
            return x_pred

        NLL = -(F.log_softmax(x_pred, dim=-1) * user_ratings).sum(dim=-1).mean()

        if mode == 'pr':
            # The KL estimate is scaled by the sum of each rating row.
            norm = user_ratings.sum(dim=-1)
            KLD = -(self.prior(z, user_idx) - log_norm_pdf(z, mu, logvar)).sum(dim=-1).mul(norm).mean()
            loss = NLL + beta * KLD + decoder_loss
        else:  # mode == 'mf'
            KLD = NLL * 0
            loss = NLL + decoder_loss

        return (NLL, KLD), loss

    def set_embeddings(self, train_data, momentum=0, weight=None):
        """Refresh the prior's per-user ``mu`` / ``logvar`` embeddings from the encoder.

        With a truthy ``weight`` the stored and newly encoded Gaussians are
        combined by precision weighting; otherwise they are blended linearly
        with ``momentum`` (``momentum=0`` overwrites the stored values).

        NOTE(review): ``generate`` and the batch methods ``get_ratings_to_dev``
        / ``get_idx`` are not defined in the visible part of this notebook;
        they must be provided before this method can be called.
        """
        istraining = self.training
        self.eval()

        for batch in generate(batch_size=500, device=device, data_1=train_data, axis=self.axis):

            user_ratings = batch.get_ratings_to_dev()
            users_idx = batch.get_idx()

            new_user_mu, new_user_logvar = self.encode(user_ratings, 0)

            old_user_mu = self.prior.user_mu.weight.data[users_idx,:].detach()
            old_user_logvar = self.prior.user_logvar.weight.data[users_idx,:].detach()

            if weight:
                old_user_var = torch.exp(old_user_logvar)
                new_user_var = torch.exp(new_user_logvar)

                post_user_var = 1 / (1 / old_user_var + weight / new_user_var)
                post_user_mu = (old_user_mu / old_user_var + weight * new_user_mu / new_user_var) * post_user_var

                self.prior.user_mu.weight.data[users_idx,:] = post_user_mu
                self.prior.user_logvar.weight.data[users_idx,:] = torch.log(post_user_var + new_user_var)
            else:
                self.prior.user_mu.weight.data[users_idx,:] = momentum * old_user_mu + (1-momentum) * new_user_mu
                self.prior.user_logvar.weight.data[users_idx,:] = momentum * old_user_logvar + (1-momentum) * new_user_logvar

        # Restore whichever mode the module was in on entry.
        self.train(istraining)


## 2) batch generator

In [None]:
from torch.utils.data import Dataset, DataLoader


bz = 128

class RatingsDataset(Dataset):
    """Dataset serving one row of a sparse ratings matrix per item."""

    def __init__(self, data_matrix):
        """
        :param data_matrix: sparse.csr_matrix with one row per user
        """
        self.n_users, self.n_items = data_matrix.shape
        self.data_matrix = data_matrix

    def __len__(self):
        return self.n_users

    def __getitem__(self, idx):
        """Return ``(dense row of shape (1, n_items), tensor([idx]))``."""
        # Read from the matrix given to the constructor. The original indexed
        # the notebook-level ``virtual_users_data`` global, so the dataset
        # ignored its own argument and failed whenever that global was absent.
        dense_row = self.data_matrix[idx].toarray()
        return (torch.Tensor(dense_row),
                torch.tensor([idx], dtype=torch.long))


# Serve the stacked virtual-user matrix in batches of `bz` rows; no shuffle
# argument is passed, so DataLoader's default ordering is used.
ratings_dataset = RatingsDataset(virtual_users_data)
ratings_loader = DataLoader(ratings_dataset, batch_size = bz)

## 3) load model

In [None]:
# Layer sizes passed to the VAE constructor below.
hidden_dim = 600
latent_dim = 200

path_to_model = "model_weights.pt"

# matrix_dim=(n_users, n_items): n_items sizes the encoder input and decoder
# output, n_users sizes the per-user embeddings of the prior.
model_i = VAE(hidden_dim, latent_dim, (n_users, n_items), 'users').to(device)

# Load the saved weights, mapping the stored tensors onto `device`.
model_i.load_state_dict(torch.load(path_to_model, map_location=device))



<All keys matched successfully>

# III/ Compute embeddings

In [None]:
# One latent mean vector per virtual user, filled in batch by batch.
virtual_user_embeds = np.zeros((n_virtual_users, latent_dim))

# Inference only: switch to evaluation mode and disable autograd so that no
# computation graph is built (and kept alive) for each batch.
model_i.eval()
with torch.no_grad():
    for user_ratings, users_idx in tqdm(ratings_loader):
        mu, logvar = model_i.encode(user_ratings.to(device), dropout_rate=0)

        # Flatten explicitly instead of relying on squeeze(): one index and
        # one embedding row per item of the batch, whatever the batch size.
        batch_rows = users_idx.numpy().reshape(-1)
        mu = mu.cpu().numpy().reshape(batch_rows.size, -1)
        virtual_user_embeds[batch_rows] = mu

HBox(children=(FloatProgress(value=0.0, max=4269.0), HTML(value='')))




## Rearrange the embeddings by user and timestamp

In [None]:
# users_embeds[real user id][timestamp] -> embedding row of the matching virtual user
users_embeds = {
    real_id: {
        stamp: virtual_user_embeds[real_userid_2_virtual_userid[(real_id, stamp)]]
        for stamp in snapshots
    }
    for real_id, snapshots in users_data_dict.items()
}


## Write csv file

In [None]:
import csv

def make_trace_csv(users_embeds, file_path):
    """
    Write one CSV row per (user, timestamp) embedding.

    The header is ``userId, timestamp, x0, x1, ...``; the number of ``x``
    columns is taken from the module-level ``latent_dim``.

    :param users_embeds: nested dictionary with user_id as the level-1 key
                         and timestamp as the level-2 key;
                         the values are the embedding vectors
    :param file_path: path of the CSV file to write
    """
    with open(file_path, mode="w", newline="") as out_file:
        writer = csv.writer(out_file,
                            delimiter=',',
                            quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)

        header = ["userId", "timestamp"]
        header.extend("x" + str(dim) for dim in range(latent_dim))
        writer.writerow(header)

        for user_id in tqdm(users_embeds):
            for stamp, vector in users_embeds[user_id].items():
                record = [user_id, int(stamp)]
                record.extend(vector[dim] for dim in range(latent_dim))
                writer.writerow(record)



In [None]:
# Write every (user, timestamp) embedding to "trace.csv" in the current working directory.
make_trace_csv(users_embeds, "trace.csv")

HBox(children=(FloatProgress(value=0.0, max=116677.0), HTML(value='')))




In [None]:
!cp "trace.csv" "../drive/My Drive/Projects/Hafid/trace.csv"

In [None]:
import pandas as pd 

trace_df = pd.read_csv("trace.csv")

trace_df.head()

Unnamed: 0,userId,timestamp,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36,x37,...,x160,x161,x162,x163,x164,x165,x166,x167,x168,x169,x170,x171,x172,x173,x174,x175,x176,x177,x178,x179,x180,x181,x182,x183,x184,x185,x186,x187,x188,x189,x190,x191,x192,x193,x194,x195,x196,x197,x198,x199
0,1,1112485508,0.493056,1.093938,-0.446583,0.352506,-0.01488,-0.632651,-0.213995,0.36964,-0.409159,-0.212136,0.363747,1.100469,0.300124,-0.768745,-0.182506,0.181067,-0.553114,-0.396183,0.179962,0.942014,0.526881,-0.330593,-0.029539,0.225445,0.170554,-0.119273,0.323323,1.053104,0.200025,-0.503903,0.123351,0.64972,-0.205826,0.593935,0.311344,0.13072,0.501609,-0.224939,...,0.070893,-0.464899,0.821674,-0.313038,-0.711266,-0.021609,-0.560014,-0.606836,0.681886,-0.242006,0.432082,-1.147012,-0.660456,0.365239,0.634163,-0.059413,0.095486,0.002517,-0.012485,-0.105754,0.376812,-0.087076,-0.720512,-0.107861,-0.408668,-0.123125,-0.570351,-0.28588,-0.436593,-0.285102,0.052823,-0.582531,-0.652738,-0.082333,-0.527581,0.251437,-0.128489,0.234728,-1.34835,0.260969
1,1,1112485734,0.573485,1.032271,-0.339061,0.373223,0.103776,-0.485489,-0.34741,0.371366,-0.381123,-0.224447,0.326191,1.355446,0.358249,-0.577043,-0.255397,0.074611,-0.355852,-0.339301,-0.023107,0.975696,0.290334,-0.480143,-0.081887,0.469132,0.133688,-0.209479,0.26606,1.070603,-0.15034,-0.601572,0.137125,0.902918,-0.022838,0.629643,0.405035,0.082423,0.55679,0.009736,...,-0.185285,-0.449455,0.604276,-0.240385,-0.749006,-0.025311,-0.443807,-0.521229,0.573603,-0.09107,0.279699,-0.983924,-0.728989,0.284881,0.438248,-0.035065,0.134324,0.14166,-0.032133,-0.052023,0.578776,-0.185704,-0.748251,-0.305122,-0.337713,-0.264886,-0.462135,0.019662,-0.530419,-0.369241,0.248272,-0.351693,-0.735107,0.100103,-0.436432,0.428435,-0.067896,0.105642,-1.451921,0.132499
2,1,1112485843,0.405805,0.886442,0.033165,0.261497,0.160979,-0.531054,-0.336695,0.252138,-0.403862,-0.06434,0.295789,1.253506,0.302927,-0.258976,-0.223385,0.122597,-0.471714,-0.301951,-0.024571,0.845121,0.207956,-0.409257,-0.040918,0.446006,-0.137074,-0.24171,0.147558,1.027061,-0.055817,-0.572271,-0.010878,0.983186,0.017941,0.530678,0.532163,0.139361,0.596523,-0.104566,...,-0.231482,-0.410192,0.4702,-0.15567,-0.457331,0.06681,-0.329037,-0.610132,0.236062,-0.08325,0.266341,-0.902071,-0.75649,0.177758,0.498448,0.181375,-0.020044,0.115953,0.105378,0.339643,0.582278,-0.147272,-0.563057,-0.407199,-0.383249,-0.42005,-0.521682,-0.077245,-0.639748,-0.320546,0.291629,-0.393074,-0.588999,0.055727,-0.500402,0.609026,-0.119148,0.235834,-1.264074,0.07347
3,1,1112486138,0.520921,1.000646,0.18599,0.480544,0.381678,-0.583675,-0.294344,0.192643,-0.33709,-0.152306,0.318268,0.917508,0.36062,-0.227877,-0.231118,-0.027407,-0.519991,-0.544839,0.004927,0.860907,0.096391,-0.308187,-0.210851,0.330233,0.17746,-0.17024,0.358302,0.983105,0.026153,-0.477025,0.152022,0.838375,0.175319,0.565987,0.544301,0.026304,0.579356,-0.117789,...,-0.098935,-0.400902,0.525317,-0.12908,-0.505837,0.151903,-0.448778,-0.293221,0.449568,-0.16353,0.231447,-0.872612,-0.667738,0.237615,0.379363,0.175642,-0.014782,-0.003482,0.277156,0.235513,0.544801,-0.46152,-0.502414,-0.38232,-0.443739,-0.280066,-0.609116,0.136833,-0.545068,-0.308164,-0.043295,-0.596315,-0.451072,0.052201,-0.430775,0.769548,-0.068722,0.237031,-1.2824,0.115054
4,2,974821014,0.507684,1.741585,-0.142395,0.017246,0.325601,0.353155,-0.055823,-0.070061,-0.683004,-0.659156,1.487311,0.4583,-0.089832,-1.026922,-0.299981,0.095798,-0.341952,-0.335215,-0.12744,1.563931,-0.456964,-0.809072,-0.922549,-0.94225,-0.020608,0.484043,-0.454481,0.232934,-1.127691,-1.255953,-0.184261,1.165801,0.668482,0.271541,-0.126179,0.589353,0.143285,-0.014382,...,0.896552,-0.573243,0.323982,-0.314303,-0.684044,-0.420814,0.090265,-0.543867,1.245018,-0.407704,0.636745,-1.241678,-0.520423,-0.251477,1.257084,0.013701,-0.94001,0.168714,0.384057,-0.069983,-0.648262,0.050695,-0.151449,-0.854381,0.433794,0.124927,-0.642527,0.147918,-0.964187,-0.514361,0.274111,-0.874183,0.068298,0.709699,0.096169,0.205263,-0.005275,-0.348959,-0.522,0.619956
