In [1]:
import os 
import pickle
import tempfile
import wget
import zipfile
from collections import Counter
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas as pd
from tqdm import trange

from dotmap import DotMap
import random
import math

Below are the list of arguments and parameters used in the process of loading and preprocessing the dataset. Feel free to alter the values according to your experiment.

In [2]:
args = DotMap()

args.dataset_root = Path('/ssd003/projects/aieng/public/recsys_datasets/kasandr/de')
args.preprocessed = Path(f'./Data/kasandr/preprocessed')
args.min_uc = 5
args.max_uc = 1000
args.min_mc = 5

args.eval_set_size = 500

args.train_batch_size = 128
args.val_batch_size = 128
args.test_batch_size = 128

args.negative_sample_size = 30

args.data_seed = 98765

args.user_sample = 0.2

In [3]:
DATA_PATH = "/ssd003/projects/aieng/public/recsys_datasets/kasandr/de"

train_path = os.path.join(DATA_PATH, r'train_de.csv')
test_path = os.path.join(DATA_PATH, r'test_de.csv')

df = pd.read_csv(train_path, delimiter = '\t')


In [4]:
# user_list = list(set(df['userid']))
# sampled_users = random.sample(user_list, math.floor(len(user_list)*args.user_sample))
# df = df[df['userid'].isin(sampled_users)]

df = df[df['rating'] == 1]

In [5]:
df

Unnamed: 0,userid,offerid,countrycode,category,merchant,utcdate,rating
44,5bafdc0592dff7fa8c2898888d7fa183865a8a425d15f8...,b24526a20f4e4412d9301c328ff71f61,de,138001,9f6c66333880924b1d7db5d2903ee94f8c1d8d958d3afb...,2016-06-02 17:50:48.0,1
56,6eb2fe43a01f9daf19af5873cbc279c07dffb10790da96...,2bfd670e616b8f088797487eb6b71515,de,100354123,ab8863ef55e574c0093451ca02bc1842bd88e0fc32cf67...,2016-06-02 17:51:20.0,1
80,7c61831be004421505903f990a5b4f597322307e572a39...,05aa287e0a53f6a5c9b92a273ded0844,de,100091613,418899436d9314d6afe780b7fad289efed5115928e9329...,2016-06-02 17:53:50.0,1
93,7735de9a62b5bdd4307818b3fa7928ec226ac1f8607831...,9674edeb73f6e51ade9481ad452edcb4,de,142101,c26503aa822d9652cb0c2274a76467c5720e341e549d7c...,2016-06-02 17:54:47.0,1
107,2014a976a7e6775f784952118aba4cbdef790c5608bc53...,9157b97ae21f55ed3fe7e63e79c711c5,de,164401,b042951fdb45ddef8ba6075ced0e5885bc2fa4c4470bf7...,2016-06-02 17:55:45.0,1
...,...,...,...,...,...,...,...
15844610,fd82bceb96006341c7b56ca561d8496c624235d7f48f3f...,0f2fcf95319f5c1e5745371351f521e5,de,125801,a7b2f269064dbe77eb21b5a8b0f067d3f297a26aa185d3...,2016-06-14 20:08:53.0,1
15844613,2413e7ee24872c11cf72737e8abd99ae8b90e12f110e67...,ebb77a97cfdfd01c8b2f5cbffb1d5627,de,100020213,ac26975cf46eae9898b7d906bdfbbf99ce7813ffc3f9b7...,2016-06-14 20:09:19.0,1
15844616,860ca92277a6b738fa2642157cbaf26e8ad201249d17a7...,0f2fcf95319f5c1e5745371351f521e5,de,125801,a7b2f269064dbe77eb21b5a8b0f067d3f297a26aa185d3...,2016-06-14 20:09:33.0,1
15844617,001fecc308b147cbd9837051c62f035fd75ab42b3ef19c...,0f2fcf95319f5c1e5745371351f521e5,de,125801,a7b2f269064dbe77eb21b5a8b0f067d3f297a26aa185d3...,2016-06-14 20:09:52.0,1


## Data Preprocessing

The goal of data preprocessing is to make the dataset compatible with BERT4Rec. The preprocessing steps are based on the Pytorch implementation of BERT4Rec provided [here](https://github.com/jaywonchung/BERT4Rec-VAE-Pytorch).

The first step of preprocessing is to filter the data based on 
1. Rating values: removing the records with a rating lower than `args.min_rating`.
2. Number of ratings per movie: removing the movies that have been rated fewer times than `args.min_mc`.
3. Number of ratings per user: removing the users who have rated movies fewer times than `args.min_uc`.

In [6]:
def filter_min_mc(df):
    """removes the movie records that have been rated less frequent than minimun

    Args:
        df (pd.Dataframe): a pandas dataframe including uid, sid, rating, timestamp, 
        title, and genre)

    Returns:
        df (pd.Dataframe): the updated dataframe
    """
    if args.min_mc > 0:
            item_sizes = df.groupby('offerid').size()
            good_items = item_sizes.index[item_sizes >= args.min_mc]
            df = df[df['offerid'].isin(good_items)]
    return df

def filter_min_uc(df):
    """removes the user records that have rated less frequent than minimun

    Args:
        df (pd.Dataframe): a pandas dataframe including uid, sid, rating, timestamp, 
        title, and genre)

    Returns:
        df (pd.Dataframe): the updated dataframe
    """
    if args.min_uc > 0:
            user_sizes = df.groupby('userid').size()
            good_users = user_sizes.index[user_sizes >= args.min_uc]
            df = df[df['userid'].isin(good_users)]
    return df

def filter_max_uc(df):
    """removes the user records that have too many records

    Args:
        df (pd.Dataframe): a pandas dataframe including uid, sid, rating, timestamp, 
        title, and genre)

    Returns:
        df (pd.Dataframe): the updated dataframe
    """
    if args.max_uc > 0:
            user_sizes = df.groupby('userid').size()
            good_users = user_sizes.index[user_sizes <= args.max_uc]
            df = df[df['userid'].isin(good_users)]
    return df



In [7]:
df2 = df.copy()

df2 = filter_min_mc(df2)
df2 = filter_min_uc(df2)
df2 = filter_max_uc(df2)


In [8]:
df2

Unnamed: 0,userid,offerid,countrycode,category,merchant,utcdate,rating
0,fa937b779184527f12e2d71c711e6411236d1ab59f8597...,c5f63750c2b5b0166e55511ee878b7a3,de,100020213,f3c93baa0cf4430849611cedb3a40ec4094d1d370be841...,2016-06-14 17:28:47.0,0
1,f6c8958b9bc2d6033ff4c1cc0a03e9ab96df4bcc528913...,19754ec121b3a99fff3967646942de67,de,100020213,21a509189fb0875c3732590121ff3fc86da770b0628c18...,2016-06-14 17:28:48.0,0
2,02fe7ccf1de19a387afc8a11d08852ffd2b4dabaed4e2d...,5ac4398e4d8ad4167a57b43e9c724b18,de,125801,b042951fdb45ddef8ba6075ced0e5885bc2fa4c4470bf7...,2016-06-14 17:28:50.0,0
3,9de5c06d0a16256b13b8e7cdc50bf203ecef533eb5cbe1...,be83df9772ec47fd210b28091138ff11,de,125801,4740b6c83b6e12e423297493f234323ffd1c991f3d4496...,2016-06-14 17:29:19.0,0
4,8d26ade603ea5473c3844aebfcd9e96e6adc8ff411576e...,3735290a415dc236bacd7ed3aa03b2d5,de,125801,8bf8f87492a799528235c04bb18ff2d12db5058ff6e9a0...,2016-06-14 17:29:31.0,0
...,...,...,...,...,...,...,...
15844711,e4d50bd79fa33e4650e252137e14b9b9928cf60d65781d...,3735290a415dc236bacd7ed3aa03b2d5,de,125801,8bf8f87492a799528235c04bb18ff2d12db5058ff6e9a0...,2016-06-14 21:12:09.0,0
15844712,04ab4fa5e19f1a23c64461a8489c0729f6d4f6cee063f8...,19754ec121b3a99fff3967646942de67,de,100020213,21a509189fb0875c3732590121ff3fc86da770b0628c18...,2016-06-14 21:12:14.0,0
15844713,f01941434c90419b71592498e9578b63e9e8d2b46c4312...,e0a0fd363797131fead652ec2a2d6027,de,100020213,f3c93baa0cf4430849611cedb3a40ec4094d1d370be841...,2016-06-14 21:12:14.0,0
15844714,08a3b618d4a3107b0050b1104681b132a1cc5b44d75c73...,0f2fcf95319f5c1e5745371351f521e5,de,125801,a7b2f269064dbe77eb21b5a8b0f067d3f297a26aa185d3...,2016-06-14 21:12:19.0,0


After filtering some of the records in the dataset, the movie ids and the user ids are densified so that there is no missing id value in the sequence of all ids.

In [9]:
# reassign userid and offerid to int range

def densify_index(df):
    """reassigns the user and movie ids to remove the gaps caused by deletions

    Args:
        df (pd.Dataframe): a pandas dataframe including uid, sid, rating, timestamp, 
        title, and genre)

    Returns:
        df (pd.Dataframe): the updated dataframe
    """
    umap = {u: i for i, u in enumerate(set(df['userid']))}
    smap = {s: i for i, s in enumerate(set(df['offerid']))}
    df['userid'] = df['userid'].map(umap)
    df['offerid'] = df['offerid'].map(smap)
    return df, umap, smap

In [10]:
df2, umap, smap = densify_index(df2)

df2

Unnamed: 0,userid,offerid,countrycode,category,merchant,utcdate,rating
0,66249,46780,de,100020213,f3c93baa0cf4430849611cedb3a40ec4094d1d370be841...,2016-06-14 17:28:47.0,0
1,63525,488786,de,100020213,21a509189fb0875c3732590121ff3fc86da770b0628c18...,2016-06-14 17:28:48.0,0
2,71170,355663,de,125801,b042951fdb45ddef8ba6075ced0e5885bc2fa4c4470bf7...,2016-06-14 17:28:50.0,0
3,55812,244020,de,125801,4740b6c83b6e12e423297493f234323ffd1c991f3d4496...,2016-06-14 17:29:19.0,0
4,87593,529556,de,125801,8bf8f87492a799528235c04bb18ff2d12db5058ff6e9a0...,2016-06-14 17:29:31.0,0
...,...,...,...,...,...,...,...
15844711,31057,529556,de,125801,8bf8f87492a799528235c04bb18ff2d12db5058ff6e9a0...,2016-06-14 21:12:09.0,0
15844712,87512,488786,de,100020213,21a509189fb0875c3732590121ff3fc86da770b0628c18...,2016-06-14 21:12:14.0,0
15844713,29963,511537,de,100020213,f3c93baa0cf4430849611cedb3a40ec4094d1d370be841...,2016-06-14 21:12:14.0,0
15844714,90653,419990,de,125801,a7b2f269064dbe77eb21b5a8b0f067d3f297a26aa185d3...,2016-06-14 21:12:19.0,0


Finally the dataset is split into three subsets for training, validation, and testing. Since BERT4Rec adopts leave-one-out evaluation method, the dataset is split in a way that for each user, the last item of the rating sequence is held as the test data, the item just before the last is held as the validation set, and the remaining items are used for training.

In [23]:
def split_df(df, user_count):
    """splits dataset to train, validation, and test sets

    Args:
        df (pd.Dataframe): the preprocessed dataframe
        user_count (int): number of all users in the dataset

    Returns:
        Tuple: a tuple of data splits
    """
    user_group = df.groupby('userid')
    user2items = user_group.apply(lambda d: list(d.sort_values(by='utcdate')['offerid']))
    user2labels = user_group.apply(lambda d: list(d.sort_values(by='utcdate')['rating']))    
    
    train, val, test, val_label, test_label = {}, {}, {}, {}, {}
    for user in range(user_count):
        items = user2items[user]
        labels = user2labels[user]
        train[user], val[user], test[user] = items[:-2], items[-2:-1], items[-1:]
        val_label[user], test_label[user] = labels[-2:-1], labels[-1:]
    return train, val, test, val_label, test_label

def filter_train(df, train, user_count):
    """
    keep 1 labelled items from train dataset
    """
    train_filtered = {}
    df = df.sort_values('utcdate')
    
    for user in range(user_count):
        df_user_items = df[df['userid'] == user]
        df_user_items = df_user_items.iloc[:-2]
        df_user_items = df_user_items[df_user_items['rating'] == 1]
        train_filtered[user] = list(df_user_items['offerid'])

    return train_filtered

All the preprocessing functions are applied to the data as below, and the final dataframe and the data splits are stored in the `preprocessed` directory.

In [26]:
# df = filter_min_mc(df)
# df = filter_min_uc(df)
# df, umap, smap = densify_index(df)

user_count = len(umap)
item_count = len(smap)

train, val, test, val_label, test_label = split_df(df2, user_count)

dataset = {'train': train,
            'val': val,
            'test': test,
            'val_label': val_label,
            'test_label': test_label,
            'umap': umap,
            'smap': smap}

# temp_train = filter_train(df2, train, user_count)
# temp_train

dataset_path = args.preprocessed.joinpath('dataset.pkl')
with dataset_path.open('wb') as f:
    pickle.dump(dataset, f)

df_path = args.preprocessed.joinpath('preprocessed.csv')
df.to_csv(df_path, index=False)


In [27]:
len(train)

9381

To see how the data is split, let's take a look at the ratings by user 0 that are sorted based on the timestamp, and then see which partitions of the sequence are included in the train, validation, and test sets.

In [28]:
df2[df2['userid'].isin([0])]

Unnamed: 0,userid,offerid,countrycode,category,merchant,utcdate,rating
1623625,0,10726,de,100014313,3d491e93a91678c953bc1a2a60e67384fb2492a5142826...,2016-06-06 15:08:57.0,1
2892595,0,10944,de,108501,3d491e93a91678c953bc1a2a60e67384fb2492a5142826...,2016-06-06 15:08:57.0,1
4793651,0,7403,de,100332323,3d491e93a91678c953bc1a2a60e67384fb2492a5142826...,2016-06-06 15:14:57.0,1
4958179,0,7403,de,100332323,3d491e93a91678c953bc1a2a60e67384fb2492a5142826...,2016-06-07 12:25:05.0,1
6875972,0,10726,de,100014313,3d491e93a91678c953bc1a2a60e67384fb2492a5142826...,2016-06-07 12:25:05.0,1
7965389,0,10944,de,108501,3d491e93a91678c953bc1a2a60e67384fb2492a5142826...,2016-06-06 15:14:57.0,1
8141324,0,10944,de,108501,3d491e93a91678c953bc1a2a60e67384fb2492a5142826...,2016-06-07 12:25:05.0,1
15564534,0,7403,de,100332323,3d491e93a91678c953bc1a2a60e67384fb2492a5142826...,2016-06-06 15:08:57.0,1


The train data includes all the above ratings except the last two.

In [29]:
train[0]

[10726, 10944, 7403, 7403, 10944, 7403]

The last but one rating in used for validation.

In [30]:
val[0]

[10726]

The last rating is used as the test data.

In [31]:
test[0]

[10944]

## Negative Sampling

To evaluate BERT4Rec, each ground truth item in the validation and test sets is paired with a number of sampled negative items that the user has not interacted with. The evaluation mertrics are then calculated using these values.

Below two negative samplers are presented:
1. Negative sampling based on popularity: The popular items are those ones that are rated more frequently. In this case, the negative samples are selected from a sorted list of popular items that the user has not interacted with.
2. Random negative sampling: The items are randomly sampled.

In [33]:
def random_negative_samples(train, val, test, user_count, item_count):
    """generates negative samples randomly

    Args:
        train (dict): train dataset
        val (dict): validation dataset
        test (dict): test dataset
        user_count (int): total number of users
        item_count (int): total number of items

    Returns:
        dict: negatives samples for all users
    """
    np.random.seed(args.data_seed)
    negative_samples = {}
    print('Sampling negative items')
    for user in trange(user_count):
        seen = set(train[user])
        seen.update(val[user])
        seen.update(test[user])

        samples = []
        for _ in range(args.negative_sample_size):
            item = np.random.choice(item_count) + 1
            while item in seen or item in samples:
                item = np.random.choice(item_count) + 1
            samples.append(item)

        negative_samples[user] = samples

    negatives_path = args.preprocessed.joinpath('random_negatives.pkl')
    with negatives_path.open('wb') as f:
        pickle.dump(negative_samples, f)

    return negative_samples

In [34]:
neg_sample_test = random_negative_samples(train, val, test, user_count, item_count)

Sampling negative items


100%|██████████| 9381/9381 [00:04<00:00, 2173.03it/s]
