In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GroupKFold
from sklearn.metrics import ndcg_score

import torch
from torch import nn
import torch.optim as optim
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

import os
from tqdm import tqdm
import random
import time

import traceback

In [2]:
random_seed = 42

In [3]:
data_path = '../../../data/All.csv'
df = pd.read_csv(data_path)

In [4]:
sampled_df = df['SessionId'].sample(n=1000, random_state=random_seed)
df = df[df['SessionId'].isin(sampled_df)]

In [5]:
class GRU4REC(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1, final_act='tanh',
                 dropout_hidden=.5, dropout_input=0, batch_size=50, embedding_dim=-1, use_cuda=False):
        super(GRU4REC, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.dropout_hidden = dropout_hidden
        self.dropout_input = dropout_input
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.use_cuda = use_cuda
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        self.onehot_buffer = self.init_emb()
        self.h2o = nn.Linear(hidden_size, output_size)
        self.create_final_activation(final_act)
        if self.embedding_dim != -1:
            self.look_up = nn.Embedding(input_size, self.embedding_dim)
            self.gru = nn.GRU(self.embedding_dim, self.hidden_size, self.num_layers, dropout=self.dropout_hidden)
        else:
            self.gru = nn.GRU(self.input_size, self.hidden_size, self.num_layers, dropout=self.dropout_hidden)
        self = self.to(self.device)

    def create_final_activation(self, final_act):
        if final_act == 'tanh':
            self.final_activation = nn.Tanh()
        elif final_act == 'relu':
            self.final_activation = nn.ReLU()
        elif final_act == 'softmax':
            self.final_activation = nn.Softmax()
        elif final_act == 'softmax_logit':
            self.final_activation = nn.LogSoftmax()
        elif final_act.startswith('elu-'):
            self.final_activation = nn.ELU(alpha=float(final_act.split('-')[1]))
        elif final_act.startswith('leaky-'):
            self.final_activation = nn.LeakyReLU(negative_slope=float(final_act.split('-')[1]))

    def forward(self, input, hidden):

        if self.embedding_dim == -1:
            embedded = self.onehot_encode(input)
            if self.training and self.dropout_input > 0: embedded = self.embedding_dropout(embedded)
            embedded = embedded.unsqueeze(0)
        else:
            embedded = input.unsqueeze(0)
            embedded = self.look_up(embedded)

        output, hidden = self.gru(embedded, hidden) #(num_layer, B, H)
        output = output.view(-1, output.size(-1))  #(B,H)
        logit = self.final_activation(self.h2o(output))

        return logit, hidden

    def init_emb(self):
        onehot_buffer = torch.FloatTensor(self.batch_size, self.output_size)
        onehot_buffer = onehot_buffer.to(self.device)
        return onehot_buffer

    def onehot_encode(self, input):
        self.onehot_buffer.zero_()
        index = input.view(-1, 1)
        one_hot = self.onehot_buffer.scatter_(1, index, 1)
        return one_hot

    def embedding_dropout(self, input):
        p_drop = torch.Tensor(input.size(0), 1).fill_(1 - self.dropout_input)
        mask = torch.bernoulli(p_drop).expand_as(input) / (1 - self.dropout_input)
        mask = mask.to(self.device)
        input = input * mask
        return input

    def init_hidden(self):
        try:
            h0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size).to(self.device)
        except:
            self.device = 'cpu'
            h0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size).to(self.device)
        return h0

In [6]:
class Dataset(object):
    def __init__(self, df, sep=',', session_key='SessionId', item_key='ItemId', time_key='Time', n_sample=-1, itemmap=None, itemstamp=None, time_sort=False):
        # Read csv
        self.df = df
        self.session_key = session_key
        self.item_key = item_key
        self.time_key = time_key
        self.time_sort = time_sort
        if n_sample > 0:
            self.df = self.df[:n_sample]

        self.add_item_indices(itemmap=itemmap)
        self.df.sort_values([session_key, time_key], inplace=True)
        self.click_offsets = self.get_click_offset()
        self.session_idx_arr = self.order_session_idx()

    def add_item_indices(self, itemmap=None):
        if itemmap is None:
            item_ids = self.df[self.item_key].unique()
            item2idx = pd.Series(data=np.arange(len(item_ids)),
                                 index=item_ids)
            itemmap = pd.DataFrame({self.item_key: item_ids,
                                   'item_idx': item2idx[item_ids].values})
        self.itemmap = itemmap
        self.df = pd.merge(self.df, self.itemmap, on=self.item_key, how='inner')

    def get_click_offset(self):
        offsets = np.zeros(self.df[self.session_key].nunique() + 1, dtype=np.int32)
        offsets[1:] = self.df.groupby(self.session_key).size().cumsum()
        return offsets

    def order_session_idx(self):
        if self.time_sort:
            sessions_start_time = self.df.groupby(self.session_key)[self.time_key].min().values
            session_idx_arr = np.argsort(sessions_start_time)
        else:
            session_idx_arr = np.arange(self.df[self.session_key].nunique())
        return session_idx_arr

    @property
    def items(self):
        return self.itemmap[self.item_key].unique()


class DataLoader():
    def __init__(self, dataset, batch_size=50, isTrain=True):
        self.dataset = dataset
        self.batch_size = batch_size
        self.isTrain = isTrain
        
        self.total_items = len(set(self.dataset.df['item_idx']))
        item_counts = dataset.df['item_idx'].value_counts()
        self.item_probabilities = item_counts / item_counts.sum()

    def __iter__(self):
        df = self.dataset.df
        click_offsets = self.dataset.click_offsets
        session_idx_arr = self.dataset.session_idx_arr

        iters = np.arange(self.batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        mask = []
        finished = False
        batch_items = set()

        while not finished:
            minlen = (end - start).min()
            idx_target = df.item_idx.values[start]

            for i in range(minlen - 1):
                idx_input = idx_target
                idx_target = df.item_idx.values[start + i + 1]
                input = torch.LongTensor(idx_input)
                target = torch.LongTensor(idx_target)
                
                batch_items.update(idx_input.tolist())
                batch_items.update(idx_target.tolist())

                if self.isTrain:
                    idx_negative_samples = self.generate_negative_samples(batch_items, num_samples=10)
                    negative_samples = torch.LongTensor(idx_negative_samples)
                    yield input, target, negative_samples, mask
                    
                else:
                    yield input, target, mask

            start = start + (minlen - 1)
            mask = np.arange(len(iters))[(end - start) <= 1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]

    def generate_negative_samples(self, batch_items, num_samples):
        try:
            all_items = set(range(self.total_items))
            available_items = list(all_items - batch_items)
            if not available_items:
                available_items = list(all_items)
            weights = self.item_probabilities[available_items].values
            idx_negative_samples = random.choices(available_items, weights=weights, k=num_samples)
            
            return idx_negative_samples
    
        except Exception as e:
            print("An error occurred:", e)
            print("all_items:", all_items)
            print("batch_items:", batch_items)
            print("available_items:", available_items)
            print("weights:", weights)
            traceback.print_exc()
            raise e

In [7]:
class Optimizer:
    def __init__(self, params, optimizer_type='Adagrad', lr=.05,
                 momentum=0, weight_decay=0, eps=1e-6):
        if optimizer_type == 'RMSProp':
            self.optimizer = optim.RMSprop(params, lr=lr, eps=eps, weight_decay=weight_decay, momentum=momentum)
        elif optimizer_type == 'Adagrad':
            self.optimizer = optim.Adagrad(params, lr=lr, weight_decay=weight_decay)
        elif optimizer_type == 'Adadelta':
            self.optimizer = optim.Adadelta(params, lr=lr, eps=eps, weight_decay=weight_decay)
        elif optimizer_type == 'Adam':
            self.optimizer = optim.Adam(params, lr=lr, eps=eps, weight_decay=weight_decay)
        elif optimizer_type == 'SparseAdam':
            self.optimizer = optim.SparseAdam(params, lr=lr, eps=eps)
        elif optimizer_type == 'SGD':
            self.optimizer = optim.SGD(params, lr=lr, momentum=momentum, weight_decay=weight_decay)
        else:
            raise NotImplementedError

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self):
        self.optimizer.step()

In [8]:
class Trainer(object):
    def __init__(self, model, train_data, eval_data, optim, use_cuda, loss_func, batch_size, k):
        self.model = model
        self.train_data = train_data
        self.eval_data = eval_data
        self.optim = optim
        self.loss_func = LossFunctionNeg()
        self.evaluation = Evaluation(self.model, self.loss_func, use_cuda, k = k)
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        self.batch_size = batch_size

    def train(self, start_epoch, end_epoch, start_time=None):
        if start_time is None:
            self.start_time = time.time()
        else:
            self.start_time = start_time

        for epoch in range(start_epoch, end_epoch + 1):
            st = time.time()
            train_loss = self.train_epoch(epoch)
            ndcg = self.evaluation.eval(self.eval_data, self.batch_size)

        return ndcg

    def train_epoch(self, epoch):
        self.model.train()
        losses = []
    
        def reset_hidden(hidden, mask):
            if len(mask) != 0:
                hidden[:, mask, :] = 0
            return hidden
    
        hidden = self.model.init_hidden()
        dataloader = DataLoader(self.train_data, self.batch_size)
        
        for ii, (input, target, negative_samples, mask) in tqdm(enumerate(dataloader), total=len(dataloader.dataset.df) // dataloader.batch_size, miniters = 1000):
            input = input.to(self.device)
            target = target.to(self.device)
            negative_samples = negative_samples.to(self.device)
    
            self.optim.zero_grad()
            hidden = reset_hidden(hidden, mask).detach()
            logit, hidden = self.model(input, hidden)
    
            pos_logits = logit[:, target.view(-1)]
            neg_logits = logit[:, negative_samples.view(-1)]
    
            loss = self.loss_func(pos_logits, neg_logits)
            losses.append(loss.item())
    
            loss.backward()
            self.optim.step()
    
        mean_losses = np.mean(losses)
        return mean_losses

In [9]:
class Evaluation(object):
    def __init__(self, model, loss_func, use_cuda, k=100):
        self.model = model
        self.loss_func = LossFunction()
        self.topk = k
        self.device = torch.device('cuda' if use_cuda else 'cpu')

    def eval(self, eval_data, batch_size):
        self.model.eval()
        ndcgs = []
        dataloader = DataLoader(eval_data, batch_size, isTrain=False)
        with torch.no_grad():
            hidden = self.model.init_hidden()
            for ii, (input, target, mask) in tqdm(enumerate(dataloader), total=len(dataloader.dataset.df) // dataloader.batch_size, miniters = 1000):

                input = input.to(self.device)
                target = target.to(self.device)
                logit, hidden = self.model(input, hidden)

                ndcg = evaluate_ndcg(logit, target, k=self.topk)
                ndcgs.append(ndcg)
                
        mean_ndcg = np.mean(ndcgs)

        return mean_ndcg

In [10]:
def evaluate_ndcg(logits, targets, k):
    probs = torch.softmax(logits, dim=1).cpu().detach().numpy()

    targets_one_hot = np.zeros_like(probs)
    targets_one_hot[np.arange(len(targets)), targets.cpu().numpy()] = 1

    ndcg = ndcg_score(targets_one_hot, probs, k=k)

    return ndcg

In [11]:
class LossFunction(nn.Module):
    def __init__(self, loss_type='BPR'):
        super(LossFunction, self).__init__()
        self.loss_type = loss_type
        if loss_type == 'TOP1':
            self._loss_fn = TOP1Loss()
        elif loss_type == 'BPR':
            self._loss_fn = BPRLoss()
        else:
            raise NotImplementedError

    def forward(self, logit):
        return self._loss_fn(logit)

class BPRLoss(nn.Module):
    def __init__(self):
        super(BPRLoss, self).__init__()

    def forward(self, logit):
        diff = logit.diag().view(-1, 1).expand_as(logit) - logit
        # final loss
        loss = -torch.mean(F.logsigmoid(diff))
        return loss

class TOP1Loss(nn.Module):
    def __init__(self):
        super(TOP1Loss, self).__init__()
    def forward(self, logit):
        diff = -(logit.diag().view(-1, 1).expand_as(logit) - logit)
        loss = torch.sigmoid(diff).mean() + torch.sigmoid(logit ** 2).mean()
        return loss

In [12]:
class LossFunctionNeg(nn.Module):
    def __init__(self, loss_type='BPR'):
        super(LossFunctionNeg, self).__init__()
        self.loss_type = loss_type
        if loss_type == 'TOP1':
            self._loss_fn = TOP1LossNeg()
        elif loss_type == 'BPR':
            self._loss_fn = BPRLossNeg()
        else:
            raise NotImplementedError

    def forward(self, pos_logits, neg_logits):
        return self._loss_fn(pos_logits, neg_logits)

class BPRLossNeg(nn.Module):
    def __init__(self):
        super(BPRLossNeg, self).__init__()

    def forward(self, pos_logits, neg_logits):
        
        pos_logits_expanded = pos_logits.unsqueeze(2)
        neg_logits_expanded = neg_logits.unsqueeze(1)

        diff = pos_logits_expanded - neg_logits_expanded
        loss = -torch.mean(F.logsigmoid(diff))

        return loss

class TOP1LossNeg(nn.Module):
    def __init__(self):
        super(TOP1LossNeg, self).__init__()
    def forward(self, logit):
        diff = -(logit.diag().view(-1, 1).expand_as(logit) - logit)
        loss = torch.sigmoid(diff).mean() + torch.sigmoid(logit ** 2).mean()
        return loss

In [13]:
def get_recall(indices, targets):
    targets = targets.view(-1, 1).expand_as(indices)
    hits = (targets == indices).nonzero()
    if len(hits) == 0:
        return 0
    n_hits = (targets == indices).nonzero()[:, :-1].size(0)
    recall = float(n_hits) / targets.size(0)
    return recall


def get_mrr(indices, targets):
    tmp = targets.view(-1, 1)
    targets = tmp.expand_as(indices)
    hits = (targets == indices).nonzero()
    ranks = hits[:, -1] + 1
    ranks = ranks.float()
    rranks = torch.reciprocal(ranks)
    mrr = torch.sum(rranks).data / targets.size(0)
    return mrr


def evaluate(indices, targets, k=20):
    _, indices = torch.topk(indices, k, -1)
    recall = get_recall(indices, targets)
    mrr = get_mrr(indices, targets)
    return recall, mrr

In [14]:
sigma = None

In [15]:
def init_model(model, sigma):
    if sigma is not None:
        for p in model.parameters():
            if sigma != -1 and args.sigma != -2:
                sigma = sigma
                p.data.uniform_(-sigma, sigma)
            elif len(list(p.size())) > 1:
                sigma = np.sqrt(6.0 / (p.size(0) + p.size(1)))
                if args.sigma == -1:
                    p.data.uniform_(-sigma, sigma)
                else:
                    p.data.uniform_(0, sigma)

In [16]:
input_size = df['ItemId'].nunique()
hidden_size = 100
output_size = input_size
final_act = 'tanh'
num_layers = 3
use_cuda = torch.cuda.is_available()
batch_size = 50
dropout_input = 0
dropout_hidden = 0.5
embedding_dim = -1

device = torch.device('cuda' if use_cuda else 'cpu')

optimizer_type = 'Adagrad'
lr = 0.01
weight_decay = 0
momentum = 0
eps = 1e-6

loss_type = 'BPR'
loss_function = LossFunction(loss_type=loss_type)
k = 100

n_epochs = 1

In [17]:
ndcgs = []

In [18]:
group_kfold = GroupKFold(n_splits=5)

for fold, (train_index, valid_index) in enumerate(group_kfold.split(df, groups=df['SessionId'])):
    print(f"Starting fold {fold + 1}")

    train_data_df = df.iloc[train_index]
    valid_data_df = df.iloc[valid_index]
    train_data = Dataset(train_data_df)
    valid_data = Dataset(valid_data_df)
    
    model = GRU4REC(input_size = input_size, hidden_size = hidden_size, output_size = output_size, final_act=final_act, num_layers=num_layers, use_cuda=use_cuda, batch_size=batch_size, dropout_input=dropout_input, dropout_hidden=dropout_hidden, embedding_dim=embedding_dim)
    init_model(model, sigma)
    optimizer = Optimizer(model.parameters(), optimizer_type=optimizer_type, lr=lr, weight_decay=weight_decay, momentum=momentum, eps=eps)
    trainer = Trainer(model, train_data=train_data, eval_data=valid_data, optim=optimizer, use_cuda=use_cuda, loss_func=loss_function, batch_size=batch_size, k=k)
    ndcg = trainer.train(0, n_epochs - 1)
    
    ndcgs.append(ndcg)

Starting fold 1


 92%|███████████████████████████████████████████████████████████████████████▉      | 3090/3350 [02:02<00:10, 25.15it/s]
 74%|███████████████████████████████████████████████████████████▎                    | 621/837 [01:41<00:35,  6.13it/s]


Starting fold 2


 92%|███████████████████████████████████████████████████████████████████████▌      | 3071/3350 [02:13<00:12, 23.03it/s]
 70%|████████████████████████████████████████████████████████▍                       | 590/837 [01:40<00:42,  5.85it/s]


Starting fold 3


 92%|████████████████████████████████████████████████████████████████████████      | 3095/3350 [05:41<00:28,  9.07it/s]
 76%|█████████████████████████████████████████████████████████████                   | 639/837 [01:48<00:33,  5.91it/s]


Starting fold 4


 92%|███████████████████████████████████████████████████████████████████████▌      | 3072/3350 [02:11<00:11, 23.33it/s]
 69%|███████████████████████████████████████████████████████▏                        | 578/837 [01:38<00:44,  5.86it/s]


Starting fold 5


 92%|███████████████████████████████████████████████████████████████████████▉      | 3090/3350 [02:12<00:11, 23.26it/s]
 69%|███████████████████████████████████████████████████████▌                        | 581/837 [01:42<00:44,  5.69it/s]


In [19]:
average_ndcg = np.mean(ndcgs)
print("Average NDCG@100:", average_ndcg)

Average NDCG@100: 0.021980521933475616
