In [18]:
import numpy as np
import pandas as pd
import torch
from torch import nn

### Data preprocessing

In [19]:
def load_dataset_timestamp(n_users, n_context, seq_len):
    act_list = []
    time_list = []
    user_list = []

    max_timestamp = -1.0
    min_timestamp = float('inf')

    with open('gowalla_user_activity.txt', 'r') as raw_file:
        for line in raw_file:
            t_item_list = []
            t_time_list = []
            user = int(line.split(':')[0])
            entries = line.split()[1:]
            for a_entry in entries:
                item, time_stamp = a_entry.split(':')
                t_item_list.append(int(item.strip()))
                t_time_list.append(int(time_stamp.strip()))

                if min_timestamp > int(time_stamp.strip()):
                    min_timestamp = int(time_stamp.strip())
                if max_timestamp < int(time_stamp.strip()):
                    max_timestamp = int(time_stamp.strip())

            act_list.append(t_item_list[0: seq_len])
            time_list.append(t_time_list[0: seq_len])
            user_list.append(user)

    new_time_list = []
    num_bins = 0

    times_bins = np.linspace(min_timestamp, max_timestamp + 1, num=num_bins, dtype=np.int32)
    for a_time_list in time_list:
        temp_time_list = (np.digitize(np.asarray(a_time_list), times_bins) - 1).tolist()
        new_time_list.append(temp_time_list)

    all_examples = []
    for i in range(0, len(act_list)):
        train_act_seq = act_list[i][:-2]
        train_time_seq = new_time_list[i][:-2]

        train_act_label = act_list[i][-2]
        train_time_label = new_time_list[i][-2]

        test_act_seq = act_list[i][1:-1]
        test_time_seq = new_time_list[i][1:-1]

        test_act_label = act_list[i][-1]
        test_time_label = new_time_list[i][-1]

        entry = {
            'train_act_seq': train_act_seq,
            'train_time_seq': train_time_seq,
            'train_act_label': train_act_label,
            'train_time_label': train_time_label,
            'test_act_seq': test_act_seq,
            'test_time_seq': test_time_seq,
            'test_act_label': test_act_label,
            'test_time_label': test_time_label,
            'seq_len': len(train_act_seq),
            'user': user_list[i]
        }

        all_examples.append(entry)

    return all_examples

In [20]:
data_examples = load_dataset_timestamp(20001, 128, 100)

### Dataset class

In [21]:
class UserDataset():
  def __init__(self, data, max_len):
    self.data = data
    self.max_len = max_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):

    user = self.data[idx]
    seq_len = user['seq_len']

    tr_act_seq = np.zeros((self.max_len,)).astype('int32')
    tr_act_seq[:seq_len] = np.array(user['train_act_seq'])
    tr_act_seq = np.transpose(tr_act_seq)

    tr_time_seq = np.zeros((self.max_len,)).astype('int32')
    tr_time_seq[:seq_len] = user['train_time_seq']
    tr_time_seq = np.transpose(tr_time_seq)

    t_act_seq = np.zeros((self.max_len, )).astype('int32')
    t_act_seq[:seq_len] = user['test_act_seq']
    t_act_seq = np.transpose(t_act_seq)

    t_time_seq = np.zeros((self.max_len, )).astype('int32')
    t_time_seq[:seq_len] = user['test_time_seq']
    t_time_seq = np.transpose(t_time_seq)


    return user['user'], tr_act_seq, \
    tr_time_seq, user['train_act_label'], \
    user['train_time_label'], t_act_seq, \
    t_time_seq, user['test_act_label'], \
    user['test_time_label'], user['seq_len']

### Baseline model

In [22]:
class RecModel(nn.Module):
  def __init__(self, num_classes):
    super(RecModel, self).__init__()
    self.fc1 = nn.Linear(128, num_classes)
    self.rnn = nn.RNN(128, 128, batch_first = True)
    self.norm = nn.BatchNorm1d(128)

  def forward(self, x, seq_len):

    x, h = self.rnn(x)
    hx = torch.zeros(x.shape[0], x.shape[2])
    for i in range(hx.shape[0]):
      hx[i] = x[i][seq_len[i] - 1]
    hx = self.norm(hx)

    x = self.fc1(hx)

    return x

### Dataset initialization

In [23]:
num_classes = 186
item_emb  = nn.init.xavier_uniform_(torch.empty(num_classes, 128))

In [24]:
from torch.utils.data import DataLoader
from torch.utils.data import Subset

user_dataset = UserDataset(data_examples, 100)

n = len(user_dataset)

indices = np.arange(n)
indices = np.random.permutation(indices)

train_indices = indices [:int(0.8*n)]
test_indices = indices[int(0.8*n):]

user_train_dataset = Subset(user_dataset, train_indices)
user_test_dataset = Subset(user_dataset, test_indices)

user_train_dataloader = DataLoader(user_train_dataset, batch_size=64, shuffle=True)
user_test_dataloader = DataLoader(user_test_dataset, batch_size=64, shuffle=True)

In [25]:
labels = np.arange(0, num_classes)

### Metrics

In [26]:
def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)


def mapk(y_prob, y, k=10):
    predicted = [np.argsort(p_)[-k:][::-1] for p_ in y_prob]
    actual = [[y_] for y_ in y]
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])



def hits_k(y_prob, y, k=10):
    acc = []
    for p_, y_ in zip(y_prob, y):
        top_k = p_.argsort()[-k:][::-1]
        acc += [1. if y_ in top_k else 0.]
    return sum(acc) / len(acc)

In [27]:
from sklearn.metrics import top_k_accuracy_score
from sklearn.metrics import ndcg_score

def get_metrics_(probs, labels_batch, test_one_hot):
    hits1 = top_k_accuracy_score(labels_batch, probs.cpu().detach().numpy(), k=1, labels = labels)
    hits5 = top_k_accuracy_score(labels_batch, probs.cpu().detach().numpy(), k=5, labels = labels)
    hits10 = top_k_accuracy_score(labels_batch, probs.cpu().detach().numpy(), k=10, labels = labels)
    hits20 = top_k_accuracy_score(labels_batch, probs.cpu().detach().numpy(), k=20, labels = labels)
    hits50= top_k_accuracy_score(labels_batch, probs.cpu().detach().numpy(), k=50, labels = labels)
    hits100 = top_k_accuracy_score(labels_batch, probs.cpu().detach().numpy(), k=100, labels = labels)

    map1 = mapk(y_prob=probs.cpu().detach().numpy(), y = labels_batch, k=1)
    map5 = mapk(y_prob=probs.cpu().detach().numpy(), y = labels_batch, k=5)
    map10 = mapk(y_prob=probs.cpu().detach().numpy(), y = labels_batch, k=10)
    map20 = mapk(y_prob=probs.cpu().detach().numpy(), y = labels_batch, k=20)
    map50 = mapk(y_prob=probs.cpu().detach().numpy(), y = labels_batch, k=50)
    map100 = mapk(y_prob=probs.cpu().detach().numpy(), y = labels_batch, k=100)

    ndcg1 = ndcg_score(test_one_hot, probs.cpu().detach().numpy(), k=1)
    ndcg5 = ndcg_score(test_one_hot, probs.cpu().detach().numpy(), k=5)
    ndcg10 = ndcg_score(test_one_hot, probs.cpu().detach().numpy(), k=10)
    ndcg20 = ndcg_score(test_one_hot, probs.cpu().detach().numpy(), k=20)
    ndcg50 = ndcg_score(test_one_hot, probs.cpu().detach().numpy(), k=50)
    ndcg100 = ndcg_score(test_one_hot, probs.cpu().detach().numpy(), k=100)
    return hits1, hits5, hits10, hits20, hits50, hits100, map1, map5, map10, map20, map50, map100, ndcg1, ndcg5, ndcg10, ndcg20, ndcg50, ndcg100

In [28]:
def print_metrics(hits1, hits5, hits10, hits20, hits50, hits100, map1, map5, map10, map20, map50, map100, ndcg1, ndcg5, ndcg10, ndcg20, ndcg50, ndcg100):
    print(f'hits@1: {hits1:.6f}, hits@5: {hits5:.6f}, hits@10: {hits10:.6f}, hits@20: {hits20:.6f}')
    print(f'hits@50: {hits50:.6f}, hits@100: {hits100:.6f}')
    print(f'map@1: {map1:.6f}, map@5: {map5:.6f}, map@10: {map10:.6f}, map@20: {map20:.6f}')
    print(f'map@50: {map50:.6f}, map@100: {map100:.6f}')
    print(f'ndcg@1: {ndcg1:.6f}, ndcg@5: {ndcg5:.6f}, ndcg@10: {ndcg10:.6f}, ndcg@20: {ndcg20:.6f}')
    print(f'ndcg@50: {ndcg50:.6f}, ndcg@100: {ndcg100:.6f}')

### Train and test

In [29]:
def test(model):
  metrics_val = []
  model.eval()
  index = 0

  for user, train_input, train_time, train_label, train_time_label, test_input, test_time, test_label, test_time_label, seq_len in user_test_dataloader:

      test_comb_input = np.concatenate([np.expand_dims(test_input, axis=-1),
                                                np.expand_dims(test_time, axis=-1)], axis=2)
      model_input = test_comb_input
      model_output = test_label
      test_rnn_input_emb = item_emb[model_input[:, :, 0]]
      test_probs = model(test_rnn_input_emb, seq_len)

      test_pred = torch.argmax(test_probs, axis = 1)

      test_one_hot = torch.zeros(len(test_probs), num_classes)
      test_one_hot[torch.arange(len(test_one_hot)), test_label] = 1
      loss = loss_fn(test_probs, test_one_hot)

      hits1, hits5, hits10, hits20, hits50, hits100, map1, map5, map10, map20, map50, map100, \
      ndcg1, ndcg5, ndcg10, ndcg20, ndcg50, ndcg100 = get_metrics_(test_probs, test_label, test_one_hot)

      metrics_val.append([hits1, hits5, hits10, hits20, hits50, hits100, map1, map5, map10, map20, map50, map100, ndcg1, ndcg5, ndcg10, ndcg20, ndcg50, ndcg100])


  mean = torch.Tensor(metrics_val).mean(axis=0)
  test_hits1, test_hits5, test_hits10, test_hits20, test_hits50, test_hits100, \
  test_map1, test_map5, test_map10, test_map20, test_map50, test_map100, test_ndcg1, test_ndcg5, test_ndcg10, test_ndcg20, test_ndcg50, test_ndcg100 = mean
  return test_hits1, test_hits5, test_hits10, test_hits20, test_hits50, test_hits100, test_map1, test_map5, test_map10, test_map20, test_map50, test_map100,\
  test_ndcg1, test_ndcg5, test_ndcg10, test_ndcg20, test_ndcg50, test_ndcg100


In [31]:
loss_fn = nn.CrossEntropyLoss()
model  = RecModel(num_classes)

model.train()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.97, last_epoch=-1)


for i in range(20):
  losses = []
  hits_1_scores = []

  for user, train_input, train_time, train_label, train_time_label, test_input, test_time, test_label, test_time_label, seq_len in user_train_dataloader:
      optimizer.zero_grad()
      comb_input = np.concatenate([np.expand_dims(train_input, axis=-1),
                                                np.expand_dims(train_time, axis=-1)], axis=2)
      model_input = comb_input
      model_output = train_label
      rnn_input_emb = item_emb[model_input[:, :, 0]]


      probs = model(rnn_input_emb, seq_len)
      pred = torch.argmax(probs, axis = 1)

      one_hot = torch.zeros(len(probs), num_classes)
      one_hot[torch.arange(len(one_hot)), model_output] = 1

      loss = loss_fn(probs, one_hot)
      losses.append(loss)
      loss.backward()
      optimizer.step()
      losses.append(loss)


  mean_loss = torch.Tensor(losses).mean(axis=0)
  mean_hits = torch.Tensor(hits_1_scores).mean(axis=0).item()
  print(f'Epoch: {i} Loss: {mean_loss.item()}')
hits1, hits5, hits10, hits20, hits50, hits100, map1, map5, map10, map20, map50, map100, ndcg1, ndcg5, ndcg10, ndcg20, ndcg50, ndcg100 = test(model)
print_metrics(hits1, hits5, hits10, hits20, hits50, hits100, map1, map5, map10, map20, map50, map100, ndcg1, ndcg5, ndcg10, ndcg20, ndcg50, ndcg100)

Epoch: 0 Loss: 1.5317010879516602
Epoch: 1 Loss: 0.9467215538024902
Epoch: 2 Loss: 0.8689113259315491
Epoch: 3 Loss: 0.8289505243301392
Epoch: 4 Loss: 0.7923804521560669
Epoch: 5 Loss: 0.7688073515892029
Epoch: 6 Loss: 0.744735062122345
Epoch: 7 Loss: 0.7252783179283142
Epoch: 8 Loss: 0.7110788226127625
Epoch: 9 Loss: 0.6926876306533813
Epoch: 10 Loss: 0.6846606731414795
Epoch: 11 Loss: 0.6701433062553406
Epoch: 12 Loss: 0.6578955054283142
Epoch: 13 Loss: 0.6444637179374695
Epoch: 14 Loss: 0.6355376243591309
Epoch: 15 Loss: 0.6251114010810852
Epoch: 16 Loss: 0.6153188347816467
Epoch: 17 Loss: 0.6113913655281067
Epoch: 18 Loss: 0.5987008213996887
Epoch: 19 Loss: 0.6001372337341309
hits@1: 0.777853, hits@5: 0.912961, hits@10: 0.942475, hits@20: 0.963061
hits@50: 0.987118, hits@100: 0.996032
map@1: 0.777853, map@5: 0.834172, map@10: 0.838151, map@20: 0.839562
map@50: 0.840351, map@100: 0.840481
ndcg@1: 0.777853, ndcg@5: 0.854113, ndcg@10: 0.863697, ndcg@20: 0.868878
ndcg@50: 0.873691, ndc

In [33]:
hits1, hits5, hits10, hits20, hits50, hits100, map1, map5, map10, map20, map50, map100, ndcg1, ndcg5, ndcg10, ndcg20, ndcg50, ndcg100 = test(model)
print_metrics(hits1, hits5, hits10, hits20, hits50, hits100, map1, map5, map10, map20, map50, map100, ndcg1, ndcg5, ndcg10, ndcg20, ndcg50, ndcg100)

hits@1: 0.777387, hits@5: 0.912961, hits@10: 0.942475, hits@20: 0.963061
hits@50: 0.987118, hits@100: 0.995799
map@1: 0.777387, map@5: 0.833931, map@10: 0.837911, map@20: 0.839321
map@50: 0.840110, map@100: 0.840236
ndcg@1: 0.777387, ndcg@5: 0.853937, ndcg@10: 0.863520, ndcg@20: 0.868702
ndcg@50: 0.873515, ndcg@100: 0.874928


In [34]:
torch.save(model.state_dict(), 'rnn_baseline_model.pth')