# Load train/val/test splits

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import trange

In [2]:
def load_interactions(path, n_splits=5):
    """Load the pickled interaction splits and unpack them per (split, stage).

    Parameters
    ----------
    path : str
        Path of the pickled DataFrame, resolved against the current working
        directory. The frame must provide the columns ``user_id``,
        ``steam_id``, ``train``, ``val`` and ``test``. An ``interactions``
        column may be present but is not needed: it never reaches the result.
    n_splits : int, default 5
        Number of splits to unpack. Every cell of a stage column is indexed as
        ``cell[split, field]`` with ``split`` in ``range(n_splits)``.

    Returns
    -------
    dict
        Maps ``(split, stage)`` with ``stage`` in ``'train'``, ``'val'``,
        ``'test'`` to a DataFrame with the columns ``user_id``, ``steam_id``,
        ``item_id``, ``playtime_forever`` and ``playtime_2weeks``. The last
        three hold, per user, ``cell[split, 0]``, ``cell[split, 1]`` and
        ``cell[split, 2]`` of the int32-converted stage cell.

    Notes
    -----
    Each stage cell is presumably shaped ``(n_splits, 3, n_items)``; only the
    first two axes are relied upon here -- TODO confirm against the code that
    writes the pickle.
    """
    stages = ('train', 'val', 'test')
    # Output column name -> its position along the second axis of a stage cell.
    fields = ('item_id', 'playtime_forever', 'playtime_2weeks')

    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    df = pd.read_pickle(os.path.join(os.getcwd(), path))

    # Convert every stage column to int32 arrays once, up front. Series.apply
    # replaces DataFrame.applymap, which is deprecated since pandas 2.1.
    stage_arrays = {
        stage: df[stage].apply(lambda cell: np.array(cell, dtype=np.int32))
        for stage in stages
    }

    interactions_dict = {}
    for split in trange(n_splits):
        for stage in stages:
            columns = {'user_id': df['user_id'], 'steam_id': df['steam_id']}
            for position, field in enumerate(fields):
                # The lambda is evaluated immediately, so closing over the
                # loop variables is safe.
                columns[field] = stage_arrays[stage].apply(
                    lambda cell: cell[split, position])
            interactions_dict[split, stage] = pd.DataFrame(columns)
    return interactions_dict

In [3]:
# Unpack every (split, stage) frame from the pickled splits file, using the loader's default of 5 splits.
interactions = load_interactions('interactions_splits.pkl.gz')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  5.02it/s]


In [4]:
# Preview the first rows of the training frame of split 0.
interactions[0, 'train'].head()

Unnamed: 0,user_id,steam_id,item_id,playtime_forever,playtime_2weeks
0,76561197981203305,76561197981203305,"[1461, 1999, 1984, 761, 2820, 819, 187, 506, 3...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,bosslucek,76561198029968002,"[4014, 1018, 3632, 2843, 2755, 219, 6245, 2621...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,icantwait,76561197971666535,"[886, 2010, 419, 2217, 1293, 2809, 802, 155, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,76561198067911521,76561198067911521,"[1849, 1038, 229, 400, 1386, 1437, 1363, 515, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,kushziller,76561198021307778,"[2883, 401, 2243, 4408, 3966, 1487, 1888, 2708...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [5]:
# Preview the first rows of the validation frame of split 0.
interactions[0, 'val'].head()

Unnamed: 0,user_id,steam_id,item_id,playtime_forever,playtime_2weeks
0,76561197981203305,76561197981203305,"[181, 275, 3833, 1936, 1102, 1227, 1939, 2098,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,bosslucek,76561198029968002,"[1413, 1815, 2234, 294, 2707, 2410, 88, 2466, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,icantwait,76561197971666535,"[424, 1381, 1817, 479, 2692, 2099, 2668, 438, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,76561198067911521,76561198067911521,"[3146, 17, 695, 1794, 233, 1309, 1299, 3341, 8...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,kushziller,76561198021307778,"[843, 4681, 2342, 1101, 1754, 3507, 4578, 619,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### Debug: check that no user has duplicate items within any split/stage

In [6]:
# Sanity check: within one (split, stage) frame, no user should list the same
# item more than once. Iterate over the keys the loader actually produced
# instead of assuming five splits, so the check follows `n_splits`.
for (split, stage), split_df in interactions.items():
    # len(items) - len(unique items) is the number of repeated entries of one user.
    n_duplicates = sum(len(items) - len(np.unique(items)) for items in split_df['item_id'])
    print(f'split {split} {stage}: {n_duplicates} duplicates')

print('done')

split 0 train: 0 duplicates
split 0 val: 0 duplicates
split 0 test: 0 duplicates
split 1 train: 0 duplicates
split 1 val: 0 duplicates
split 1 test: 0 duplicates
split 2 train: 0 duplicates
split 2 val: 0 duplicates
split 2 test: 0 duplicates
split 3 train: 0 duplicates
split 3 val: 0 duplicates
split 3 test: 0 duplicates
split 4 train: 0 duplicates
split 4 val: 0 duplicates
split 4 test: 0 duplicates
done
