In [1]:
import pandas as pd
import numpy as np
from os.path import join as pjoin
import random

In [2]:
random.seed(1531513)

In [3]:
dtype_dict=str
all_sessions = pd.read_csv('data/train_sessions.csv', dtype=dtype_dict)
all_purchases = pd.read_csv('data/train_purchases.csv', dtype=dtype_dict)
all_features = pd.read_csv('data/item_features.csv', dtype=dtype_dict)

In [4]:
all_item_ids = set(all_sessions.item_id.tolist()) | set(all_purchases.item_id.tolist())

### Three validation targets

In [5]:
def get_filtered(all_sessions, all_purchases, all_features, ym):
    target_sessions = all_sessions[all_sessions.date.apply(lambda x: x[:7] <= ym)]
    target_purchases = all_purchases[all_purchases.date.apply(lambda x: x[:7] <= ym)]
    
    val_te_session_ids = list(set(all_sessions[all_sessions.date.apply(lambda x: x[:7] == ym)].session_id.unique().tolist()))
    print("before shuffle, first 3 session ids", val_te_session_ids[:3])
    random.shuffle(val_te_session_ids)
    print("after shuffle, first 3 session ids", val_te_session_ids[:3])

    unique_items = set(target_sessions.item_id.tolist()) | set(target_purchases.item_id.tolist())
    item_features = all_features[all_features.item_id.isin(unique_items)]
    
    div = (len(val_te_session_ids) // 2) - 1
    val_session_ids = set(val_te_session_ids[:div])
    te_session_ids = set(val_te_session_ids[div:])
    val_te_session_ids = set(val_te_session_ids)
    
    tr_sessions = target_sessions[~target_sessions.session_id.isin(val_te_session_ids)].reset_index(drop=True)
    val_sessions = target_sessions[target_sessions.session_id.isin(val_session_ids)].reset_index(drop=True)
    te_sessions = target_sessions[target_sessions.session_id.isin(te_session_ids)].reset_index(drop=True)
    
    tr_purchases = target_purchases[~target_purchases.session_id.isin(val_te_session_ids)].reset_index(drop=True)
    val_purchases = target_purchases[target_purchases.session_id.isin(val_session_ids)].reset_index(drop=True)
    te_purchases = target_purchases[target_purchases.session_id.isin(te_session_ids)].reset_index(drop=True)
    
    candidate_items = set()
    candidate_items |= set(te_purchases.item_id.tolist())
    print('# cand items added', len(candidate_items))
    candidate_items |= set(val_purchases.item_id.tolist())
    print('# cand items added', len(candidate_items))
    candidate_items |= set(val_sessions.item_id.tolist())
    print('# cand items added', len(candidate_items))
    candidate_items |= set(te_sessions.item_id.tolist())
    print('# cand items added', len(candidate_items))
    candidate_items = list(candidate_items)
    
    candidate_items = pd.DataFrame({"item_id": candidate_items})
    candidate_items.head(3)
    print(
        "# tr sessions", len(tr_purchases),
        "# val sessions", len(val_purchases),
        "# te sessions", len(te_purchases),
        )
    return (tr_sessions, val_sessions, te_sessions, 
            tr_purchases, val_purchases, te_purchases, 
            candidate_items, item_features)

In [6]:
def save(dataset_xx, dir_name):
    (tr_sessions, val_sessions, te_sessions, 
     tr_purchases, val_purchases, te_purchases, 
     candidate_items, item_features) = dataset_xx
    
    def tocsv(df, dir_name, fname):
        df.to_csv(pjoin(dir_name, fname), index=False)
    
    tocsv(candidate_items, dir_name, 'candidate_items.csv')
    tocsv(item_features, dir_name, 'item_features.csv')
    tocsv(tr_sessions, dir_name, 'train_sessions.csv')
    tocsv(tr_purchases, dir_name, 'train_purchases.csv')
    tocsv(val_sessions, dir_name, 'val_sessions.csv')
    tocsv(val_purchases, dir_name, 'val_purchases.csv')
    tocsv(te_sessions, dir_name, 'te_sessions.csv')
    tocsv(te_purchases, dir_name, 'te_purchases.csv')

In [7]:
dataset_2006 = get_filtered(all_sessions, all_purchases, all_features, ym='2020-06')

before shuffle, first 3 session ids ['1818893', '1831503', '2842564']
after shuffle, first 3 session ids ['1570552', '192406', '2013356']
# cand items added 4586
# cand items added 5326
# cand items added 6791
# cand items added 7307
# tr sessions 214401 # val sessions 30317 # te sessions 30319


In [8]:
dataset_2105 = get_filtered(all_sessions, all_purchases, all_features, ym='2021-05')

before shuffle, first 3 session ids ['1445558', '1256193', '3910290']
after shuffle, first 3 session ids ['2566286', '2208744', '437024']
# cand items added 4164
# cand items added 4769
# cand items added 5783
# cand items added 5943
# tr sessions 918382 # val sessions 40808 # te sessions 40810


In [9]:
dataset_2104 = get_filtered(all_sessions, all_purchases, all_features, ym='2021-04')

before shuffle, first 3 session ids ['758213', '576063', '670917']
after shuffle, first 3 session ids ['365857', '3671327', '142223']
# cand items added 4187
# cand items added 4858
# cand items added 6297
# cand items added 6735
# tr sessions 848637 # val sessions 34871 # te sessions 34874


In [10]:
!rm -rf /data/project/rw/recsys2022/vali_2006
!mkdir /data/project/rw/recsys2022/vali_2006

!rm -rf /data/project/rw/recsys2022/vali_2105
!mkdir /data/project/rw/recsys2022/vali_2105

!rm -rf /data/project/rw/recsys2022/vali_2104
!mkdir /data/project/rw/recsys2022/vali_2104

In [11]:
save(dataset_2006, "/data/project/rw/recsys2022/vali_2006")

In [None]:
save(dataset_2105, "/data/project/rw/recsys2022/vali_2105")

In [None]:
save(dataset_2104, "/data/project/rw/recsys2022/vali_2104")