In [15]:
import os
import json
import pandas as pd
import random
from pathlib import Path
from dotmap import DotMap

# import methods which required modification for kasandr dataset
from utils import get_sampled_data
# negative_sampling_offline, create_item2cate

from recommenders.datasets.amazon_reviews import _reviews_preprocessing, _meta_preprocessing, _create_instance, _create_item2cate, _get_sampled_data, _data_processing, _data_generating, _create_vocab, _negative_sampling_offline, download_and_extract

In [16]:
# Directory holding the raw KASANDR train/test CSV files.
DATA_PATH = "/ssd003/projects/aieng/public/recsys_datasets/kasandr/de"

train_path = os.path.join(DATA_PATH, "train_de.csv")
test_path = os.path.join(DATA_PATH, "test_de.csv")

# Read the tab-separated training file and keep only the rows whose rating is 1.
df = pd.read_csv(train_path, sep="\t").loc[lambda frame: frame["rating"] == 1]

In [17]:
# Working directory for every file this notebook writes.
DATA_PATH = Path('./Data/kasandr')
# Create it up front so the later writes do not fail when the folder is missing.
DATA_PATH.mkdir(parents=True, exist_ok=True)


# File paths for the train, validation and test splits
train_path = os.path.join(DATA_PATH, 'train_data')
valid_path = os.path.join(DATA_PATH, 'valid_data')
test_path = os.path.join(DATA_PATH, 'test_data')

# File paths to store the list of existing ids for user, item and item category
user_vocab_path = os.path.join(DATA_PATH, 'user_vocab.pkl')
item_vocab_path = os.path.join(DATA_PATH, 'item_vocab.pkl')
cate_vocab_path = os.path.join(DATA_PATH, 'category_vocab.pkl')
output_file_path = os.path.join(DATA_PATH, 'output.txt')


valid_num_ngs = 4  # number of negative instances with a positive instance for validation
test_num_ngs = 9  # number of negative instances with a positive instance for testing


# Tunable preprocessing / training options, collected in one attribute-style container.
args = DotMap()

# Thresholds read by the filter_* helpers.
args.min_uc = 5              # minimum number of records per user
args.max_uc = 100            # maximum number of records per user
args.min_mc = 5              # minimum number of records per offer
args.min_us_prod_types = 2   # minimum number of distinct offers per user

args.eval_set_size = 500

args.train_batch_size = 128
args.val_batch_size = 128
args.test_batch_size = 128

args.negative_sample_size = 30

args.data_seed = 98765

args.user_sample = 0.2

# Seed Python's global RNG so steps that draw from the `random` module repeat
# across "Restart & Run All".
# NOTE(review): helpers imported from other modules may use a different RNG
# (e.g. numpy) -- confirm and seed those as well if so.
random.seed(args.data_seed)

In [18]:
def filter_min_mc(df, min_mc=None):
    """Drop the rows of offers that occur fewer than ``min_mc`` times.

    Args:
        df (pd.DataFrame): interaction log with an ``offerid`` column.
        min_mc (int, optional): minimum number of rows an offer needs in order
            to be kept. When omitted, the notebook-level ``args.min_mc`` is used.

    Returns:
        pd.DataFrame: the rows of ``df`` whose offer meets the threshold, or
        ``df`` itself when the threshold is not positive (filter disabled).
    """
    if min_mc is None:
        # Fall back to the notebook configuration, as the original helper did.
        min_mc = args.min_mc
    if not min_mc > 0:
        return df
    item_sizes = df.groupby('offerid').size()
    good_items = item_sizes.index[item_sizes >= min_mc]
    return df[df['offerid'].isin(good_items)]

def filter_min_uc(df, min_uc=None):
    """Drop the rows of users that have fewer than ``min_uc`` records.

    Args:
        df (pd.DataFrame): interaction log with a ``userid`` column.
        min_uc (int, optional): minimum number of rows a user needs in order
            to be kept. When omitted, the notebook-level ``args.min_uc`` is used.

    Returns:
        pd.DataFrame: the rows of ``df`` whose user meets the threshold, or
        ``df`` itself when the threshold is not positive (filter disabled).
    """
    if min_uc is None:
        # Fall back to the notebook configuration, as the original helper did.
        min_uc = args.min_uc
    if not min_uc > 0:
        return df
    user_sizes = df.groupby('userid').size()
    good_users = user_sizes.index[user_sizes >= min_uc]
    return df[df['userid'].isin(good_users)]

def filter_max_uc(df, max_uc=None):
    """Drop the rows of users that have more than ``max_uc`` records.

    Args:
        df (pd.DataFrame): interaction log with a ``userid`` column.
        max_uc (int, optional): maximum number of rows a user may have and
            still be kept. When omitted, the notebook-level ``args.max_uc`` is used.

    Returns:
        pd.DataFrame: the rows of ``df`` whose user stays within the limit, or
        ``df`` itself when the limit is not positive (filter disabled).
    """
    if max_uc is None:
        # Fall back to the notebook configuration, as the original helper did.
        max_uc = args.max_uc
    if not max_uc > 0:
        return df
    user_sizes = df.groupby('userid').size()
    good_users = user_sizes.index[user_sizes <= max_uc]
    return df[df['userid'].isin(good_users)]

def filter_min_user_product_types(df, min_us_prod_types=None):
    """Drop the rows of users that interacted with too few distinct offers.

    Args:
        df (pd.DataFrame): interaction log with ``userid`` and ``offerid`` columns.
        min_us_prod_types (int, optional): minimum number of distinct offers a
            user needs in order to be kept. When omitted, the notebook-level
            ``args.min_us_prod_types`` is used.

    Returns:
        pd.DataFrame: the rows of ``df`` whose user meets the threshold, or
        ``df`` itself when the threshold is not positive (filter disabled).
    """
    if min_us_prod_types is None:
        # Fall back to the notebook configuration, as the original helper did.
        min_us_prod_types = args.min_us_prod_types
    if not min_us_prod_types > 0:
        return df
    # Number of distinct offers per user, indexed by userid.
    offers_per_user = df.groupby('userid')['offerid'].nunique()
    good_users = offers_per_user.index[offers_per_user >= min_us_prod_types]
    return df[df['userid'].isin(good_users)]

# reassign userid and offerid to int range

def densify_index(df):
    """Re-encode ``userid`` and ``offerid`` as consecutive integers starting at 0.

    Ids are numbered in order of first appearance, so the mapping is the same
    on every run. The input frame is left untouched; a re-encoded copy is
    returned instead.

    Args:
        df (pd.DataFrame): interaction log with ``userid`` and ``offerid`` columns.

    Returns:
        tuple: ``(df, umap, smap)`` where ``df`` is the re-encoded copy and
        ``umap`` / ``smap`` map each original user / offer id to its new integer.
    """
    # .unique() keeps first-appearance order; iterating a set() would not be
    # stable across interpreter runs for string ids.
    umap = {u: i for i, u in enumerate(df['userid'].unique().tolist())}
    smap = {s: i for i, s in enumerate(df['offerid'].unique().tolist())}
    # assign() builds a new frame, so a filtered slice passed in is never
    # written to (no chained-assignment warning, no surprise for the caller).
    dense = df.assign(
        userid=df['userid'].map(umap),
        offerid=df['offerid'].map(smap),
    )
    return dense, umap, smap

In [19]:
# Filter the raw interactions and reshape them into the five-column layout
# (label, user_id, item_id, timestamp, cate_id) written out in the next cell.
# Everything is one method chain: each step returns a new frame, so no column
# is ever assigned on a filtered slice and the raw `df` is not modified.
# densify_index is deliberately not applied: ids keep their original values.
df2 = (
    df
    .pipe(filter_min_mc)
    .pipe(filter_min_uc)
    .pipe(filter_max_uc)
    .pipe(filter_min_user_product_types)
    .assign(
        # No category information is used here; every item gets the same placeholder.
        cate_id="Other",
        # Parse utcdate and convert it to whole seconds since the epoch.
        timestamp=lambda frame: pd.to_datetime(frame['utcdate']).astype('int64') // 10 ** 9,
        # The label column is the rating stored as a string.
        label=lambda frame: frame['rating'].astype(str),
    )
    .rename(columns={'userid': 'user_id', 'offerid': 'item_id'})
    .loc[:, ['label', 'user_id', 'item_id', 'timestamp', 'cate_id']]
    .sort_values(by=['user_id', 'timestamp'])
)
df2

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,label,user_id,item_id,timestamp,cate_id
13231264,1,000193911b6e17798ea648be1fadc686d8b0b16ff2a67a...,28935fd932d7d34307a12ee5a81fc13d,1464807760,Other
3750266,1,000193911b6e17798ea648be1fadc686d8b0b16ff2a67a...,d3b81f15eeafc2899600c788fe104e58,1464876602,Other
3352461,1,000193911b6e17798ea648be1fadc686d8b0b16ff2a67a...,21497c7349778ec30c7be546f19a382d,1465036280,Other
9638058,1,000193911b6e17798ea648be1fadc686d8b0b16ff2a67a...,d3b81f15eeafc2899600c788fe104e58,1465332950,Other
3329748,1,000193911b6e17798ea648be1fadc686d8b0b16ff2a67a...,86fed321c2e1ddd326c5de0bc33f6d2f,1465919640,Other
...,...,...,...,...,...
4026175,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,d3c31adc85f1adc25bf1c87ee1f6e139,1465456444,Other
4782868,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,e5efa2013bc258405428c5d40aab74c9,1465540772,Other
743215,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,2bfd670e616b8f088797487eb6b71515,1465626127,Other
10962254,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,77aa6e8d9605cdf02713251291d9c5b9,1465626137,Other


In [20]:
# Persist the prepared interactions as a tab-separated file without header or index.
df_path = DATA_PATH / 'preprocessed.csv'
df2.to_csv(df_path, sep='\t', header=False, index=False)

In [21]:
# Peek at the first two lines of the exported file to check its layout.
# The context manager closes the handle, and zip() stops after two lines
# instead of loading the whole file into a list.
with open(df_path, "r") as f_train:
    for _, line in zip(range(2), f_train):
        arr = line.strip("\n").split("\t")

arr

['1',
 '000193911b6e17798ea648be1fadc686d8b0b16ff2a67a5b0be8cd2a9ec5f13d',
 'd3b81f15eeafc2899600c788fe104e58',
 '1464876602',
 'Other']

In [22]:
# Build the global item-to-category dictionary from the preprocessed file
_create_item2cate(df_path)

# Sample a subset of the interactions; the helper returns the path of the sampled file
sampled_instance_path = get_sampled_data(df_path, sample_rate=0.1)

# Read the sampled file back into a dataframe to inspect it
ss_ns_df = pd.read_csv(
    sampled_instance_path,
    delimiter="\t",
    names=["label", "user_id", "item_id", "timestamp", "cate_id"],
)
ss_ns_df

Unnamed: 0,label,user_id,item_id,timestamp,cate_id
0,1,003970a8d5df526d80ded14535e15cf4ea04f277abe6cf...,95bdd77e179831bcb1bb9b8bf39a7f0d,1465066829,Other
1,1,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,b203435e0ca71f90508403e762d3a42f,1464850748,Other
2,1,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,a5e01c7b865e0ae92bd78ad6db1d1576,1464943154,Other
3,1,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,bf0ddbcb5fcb316d0a2f6f3655b27b91,1464943300,Other
4,1,0053b8d4cc3ffb98201781e8d2ac7f668513248f5d5d51...,cdca1333b6820ab29501f879d8dae7c9,1464807455,Other
...,...,...,...,...,...
32476,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,0e314aac8bb3e8b47cb665a4a62009d1,1464983618,Other
32477,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,131c583fddb7f37a3172f01867d8c48d,1465394490,Other
32478,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,dd07b38db5e79ae18d8dd8bfd143277e,1465394490,Other
32479,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,d37111749fe898b02f86688363a4598b,1465397402,Other


In [23]:
# Run the recommenders preprocessing step on the sampled file; the value it
# returns is the path read back below, now with a leading "set" column.
preprocessed_output = _data_processing(sampled_instance_path)

pp_df = pd.read_csv(
    preprocessed_output,
    delimiter="\t",
    names=["set", "label", "user_id", "item_id", "timestamp", "cate_id"],
)
pp_df


Unnamed: 0,set,label,user_id,item_id,timestamp,cate_id
0,test,1,003970a8d5df526d80ded14535e15cf4ea04f277abe6cf...,95bdd77e179831bcb1bb9b8bf39a7f0d,1465066829,Other
1,train,1,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,b203435e0ca71f90508403e762d3a42f,1464850748,Other
2,valid,1,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,a5e01c7b865e0ae92bd78ad6db1d1576,1464943154,Other
3,test,1,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,bf0ddbcb5fcb316d0a2f6f3655b27b91,1464943300,Other
4,train,1,0053b8d4cc3ffb98201781e8d2ac7f668513248f5d5d51...,cdca1333b6820ab29501f879d8dae7c9,1464807455,Other
...,...,...,...,...,...,...
32476,train,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,0e314aac8bb3e8b47cb665a4a62009d1,1464983618,Other
32477,train,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,131c583fddb7f37a3172f01867d8c48d,1465394490,Other
32478,train,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,dd07b38db5e79ae18d8dd8bfd143277e,1465394490,Other
32479,valid,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,d37111749fe898b02f86688363a4598b,1465397402,Other


In [24]:
# Run the recommenders data generator with the three split paths, then load
# the training file to inspect it.
_data_generating(preprocessed_output, train_path, valid_path, test_path)

train_df = pd.read_csv(
    train_path,
    delimiter="\t",
    index_col=False,
    names=[
        "label",
        "user_id",
        "item_id",
        "cate_id",
        "timestamp",
        "prev_item_ids",
        "prev_cate_ids",
        "prev_timestamps",
    ],
)
train_df

Unnamed: 0,label,user_id,item_id,cate_id,timestamp,prev_item_ids,prev_cate_ids,prev_timestamps
0,1,0053b8d4cc3ffb98201781e8d2ac7f668513248f5d5d51...,cdca1333b6820ab29501f879d8dae7c9,Other,1464867431,cdca1333b6820ab29501f879d8dae7c9,Other,1464807455
1,1,0053b8d4cc3ffb98201781e8d2ac7f668513248f5d5d51...,cdca1333b6820ab29501f879d8dae7c9,Other,1464867447,"cdca1333b6820ab29501f879d8dae7c9,cdca1333b6820...","Other,Other",14648074551464867431
2,1,0053b8d4cc3ffb98201781e8d2ac7f668513248f5d5d51...,cdca1333b6820ab29501f879d8dae7c9,Other,1464867528,"cdca1333b6820ab29501f879d8dae7c9,cdca1333b6820...","Other,Other,Other",146480745514648674311464867447
3,1,0053b8d4cc3ffb98201781e8d2ac7f668513248f5d5d51...,cdca1333b6820ab29501f879d8dae7c9,Other,1464867577,"cdca1333b6820ab29501f879d8dae7c9,cdca1333b6820...","Other,Other,Other,Other",1464807455146486743114648674471464867528
4,1,0053b8d4cc3ffb98201781e8d2ac7f668513248f5d5d51...,cdca1333b6820ab29501f879d8dae7c9,Other,1464867922,"cdca1333b6820ab29501f879d8dae7c9,cdca1333b6820...","Other,Other,Other,Other,Other","1464807455,1464867431,1464867447,1464867528,14..."
...,...,...,...,...,...,...,...,...
23822,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,c15ed4a6bcf39c880f56d03fe1aec56a,Other,1464876998,162decae9179e5b628166ffafca22ab1,Other,1464807808
23823,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,75f9aa1430b0e847acb3040e7207784f,Other,1464983603,"162decae9179e5b628166ffafca22ab1,c15ed4a6bcf39...","Other,Other",14648078081464876998
23824,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,0e314aac8bb3e8b47cb665a4a62009d1,Other,1464983618,"162decae9179e5b628166ffafca22ab1,c15ed4a6bcf39...","Other,Other,Other",146480780814648769981464983603
23825,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,131c583fddb7f37a3172f01867d8c48d,Other,1465394490,"162decae9179e5b628166ffafca22ab1,c15ed4a6bcf39...","Other,Other,Other,Other",1464807808146487699814649836031464983618


In [25]:
# Build the user, item and category vocabulary files from the training split
_create_vocab(
    train_path,
    user_vocab_path,
    item_vocab_path,
    cate_vocab_path,
)

The final step of the data preprocessing is to randomly draw negative samples and add them to the validation and test sets using the `_negative_sampling_offline` function. Negative samples are interactions between users and items that did not happen. It is important to have negative samples in order to evaluate the performance of recommender systems. Negative samples are denoted by a label of 0 and are only present in the validation and test sets. 

In [26]:
# Append negative samples to the validation and test files
# (valid_num_ngs / test_num_ngs negatives per positive instance).
_negative_sampling_offline(sampled_instance_path, valid_path, test_path, valid_num_ngs, test_num_ngs)

In [27]:
# Load the validation file into a dataframe to inspect it
valid_df = pd.read_csv(
    valid_path,
    delimiter="\t",
    index_col=False,
    names=[
        "label",
        "user_id",
        "item_id",
        "cate_id",
        "timestamp",
        "prev_item_ids",
        "prev_cate_ids",
        "prev_timestamps",
    ],
)
valid_df

Unnamed: 0,label,user_id,item_id,cate_id,timestamp,prev_item_ids,prev_cate_ids,prev_timestamps
0,1,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,a5e01c7b865e0ae92bd78ad6db1d1576,Other,1464943154,b203435e0ca71f90508403e762d3a42f,Other,1464850748
1,0,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,5a6a0ad75329c2721d7273cf8cbb7b63,Other,1464943154,b203435e0ca71f90508403e762d3a42f,Other,1464850748
2,0,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,d1b4365db02dd693661e0186dbfcc2e7,Other,1464943154,b203435e0ca71f90508403e762d3a42f,Other,1464850748
3,0,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,7b77553a80a31ccf5d0fe2a387748575,Other,1464943154,b203435e0ca71f90508403e762d3a42f,Other,1464850748
4,0,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,0f2fcf95319f5c1e5745371351f521e5,Other,1464943154,b203435e0ca71f90508403e762d3a42f,Other,1464850748
...,...,...,...,...,...,...,...,...
12065,1,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,d37111749fe898b02f86688363a4598b,Other,1465397402,"162decae9179e5b628166ffafca22ab1,c15ed4a6bcf39...","Other,Other,Other,Other,Other,Other","1464807808,1464876998,1464983603,1464983618,14..."
12066,0,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,0fc06555404935b0bc18407da4c143b5,Other,1465397402,"162decae9179e5b628166ffafca22ab1,c15ed4a6bcf39...","Other,Other,Other,Other,Other,Other","1464807808,1464876998,1464983603,1464983618,14..."
12067,0,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,a5fc37404646ac3d34118489cdbfb341,Other,1465397402,"162decae9179e5b628166ffafca22ab1,c15ed4a6bcf39...","Other,Other,Other,Other,Other,Other","1464807808,1464876998,1464983603,1464983618,14..."
12068,0,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,eb0389774fca117ee06c5c02a6ba76af,Other,1465397402,"162decae9179e5b628166ffafca22ab1,c15ed4a6bcf39...","Other,Other,Other,Other,Other,Other","1464807808,1464876998,1464983603,1464983618,14..."


In [28]:
# Load the test file into a dataframe to inspect it
test_df = pd.read_csv(
    test_path,
    delimiter="\t",
    index_col=False,
    names=[
        "label",
        "user_id",
        "item_id",
        "cate_id",
        "timestamp",
        "prev_item_ids",
        "prev_cate_ids",
        "prev_timestamps",
    ],
)
test_df

Unnamed: 0,label,user_id,item_id,cate_id,timestamp,prev_item_ids,prev_cate_ids,prev_timestamps
0,1,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,bf0ddbcb5fcb316d0a2f6f3655b27b91,Other,1464943300,"b203435e0ca71f90508403e762d3a42f,a5e01c7b865e0...","Other,Other",14648507481464943154
1,0,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,5a6a0ad75329c2721d7273cf8cbb7b63,Other,1464943300,"b203435e0ca71f90508403e762d3a42f,a5e01c7b865e0...","Other,Other",14648507481464943154
2,0,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,de5795e6274c9538b6afe3d98112611d,Other,1464943300,"b203435e0ca71f90508403e762d3a42f,a5e01c7b865e0...","Other,Other",14648507481464943154
3,0,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,e9d41bf5aeba0591ef080d3513375338,Other,1464943300,"b203435e0ca71f90508403e762d3a42f,a5e01c7b865e0...","Other,Other",14648507481464943154
4,0,0051becd5b9fed9a79ffa923e256e28f3eeadd80206cee...,eb0389774fca117ee06c5c02a6ba76af,Other,1464943300,"b203435e0ca71f90508403e762d3a42f,a5e01c7b865e0...","Other,Other",14648507481464943154
...,...,...,...,...,...,...,...,...
28125,0,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,0f2fcf95319f5c1e5745371351f521e5,Other,1465626127,"162decae9179e5b628166ffafca22ab1,c15ed4a6bcf39...","Other,Other,Other,Other,Other,Other,Other","1464807808,1464876998,1464983603,1464983618,14..."
28126,0,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,04ab0460f36b12d22fd4321eaa9b39cb,Other,1465626127,"162decae9179e5b628166ffafca22ab1,c15ed4a6bcf39...","Other,Other,Other,Other,Other,Other,Other","1464807808,1464876998,1464983603,1464983618,14..."
28127,0,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,162decae9179e5b628166ffafca22ab1,Other,1465626127,"162decae9179e5b628166ffafca22ab1,c15ed4a6bcf39...","Other,Other,Other,Other,Other,Other,Other","1464807808,1464876998,1464983603,1464983618,14..."
28128,0,fffea4925ff656615c4e6b690cf1cce459237f36b64de3...,2a21cb8865da59cc4845d822c6923ada,Other,1465626127,"162decae9179e5b628166ffafca22ab1,c15ed4a6bcf39...","Other,Other,Other,Other,Other,Other,Other","1464807808,1464876998,1464983603,1464983618,14..."
