In [1]:
import os
import sys

sys.path.append('../src')

from tqdm.notebook import tqdm

tqdm.pandas()

import collections

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from typing import List

from utils import load_pickle
from dataset import HMDataset
from torch.utils.data import DataLoader

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (e.g. the local `utils` / `dataset` modules added via sys.path) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Load the article-id lookup. Its keys are used below to filter transactions and
# its values replace the raw `article_id` column.
# NOTE(review): load_pickle presumably wraps pickle.load -- only load trusted files,
# since unpickling can execute arbitrary code.
articleIds_index = load_pickle("../data/working/article_id_map.pkl")

In [3]:
# Load the validation split and keep only transactions whose article appears in
# the article-id lookup.
data = pd.read_parquet("../data/split/0_valid.parquet.gzip")
# Take an explicit copy of the filtered rows: assigning a column on a boolean-mask
# slice of the original frame triggers SettingWithCopyWarning and makes it unclear
# which object is being modified.
data = data.loc[data["article_id"].isin(articleIds_index.keys())].copy()
# Replace each raw article id with the value stored for it in the lookup.
data["article_id"] = data["article_id"].map(articleIds_index)

# data["t_dat"] = data["t_dat"].dt.strftime('%Y-%m-%d')

In [4]:
# Preview the first rows of the filtered transactions with remapped article ids.
data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
29933918,2020-08-05,0011e0bd4c39195ff342c0ca0ac0601ce2b943a8265506...,36238,0.032746,2
29933919,2020-08-05,0011e0bd4c39195ff342c0ca0ac0601ce2b943a8265506...,54475,0.065525,2
29933920,2020-08-05,0011e0bd4c39195ff342c0ca0ac0601ce2b943a8265506...,54475,0.065525,2
29933921,2020-08-05,0011e0bd4c39195ff342c0ca0ac0601ce2b943a8265506...,70923,0.032746,2
29933922,2020-08-05,0013bde09d10db6b0a6a3b0987ac60b643013dfc6f924b...,42384,0.025407,2


In [546]:
# Build one row per customer holding parallel lists of article ids, sales channels,
# per-(day, customer, article, channel) purchase counts (clipped to 1..9) and dates.
out = (
    data
    .groupby(['t_dat', 'customer_id', 'article_id', 'sales_channel_id'])
    .size()
    # Name the count column directly instead of renaming the default `0` column.
    .reset_index(name='article_id_freq')
    .assign(article_id_freq=lambda counts: counts["article_id_freq"].clip(lower=1, upper=9))
    .groupby("customer_id")[["article_id", "sales_channel_id", "article_id_freq", "t_dat"]]
    .agg(list)
)

def serial_number(t_dat_seq, max_token=9):
    """Replace each date in a sequence with its 1-based rank among the distinct dates.

    The distinct values of ``t_dat_seq`` are sorted ascending and numbered from 1;
    ranks above ``max_token`` are capped at ``max_token``.

    Args:
        t_dat_seq: Iterable of hashable, mutually comparable values (e.g. dates).
        max_token: Largest rank to emit; higher ranks collapse onto this value.
            Defaults to 9, the cap that was previously hard-coded.

    Returns:
        A list, aligned with ``t_dat_seq``, of the capped rank of each element.
    """
    # enumerate(..., start=1) yields 1-based ranks over the sorted distinct dates.
    date_rank = {
        day: min(rank, max_token)
        for rank, day in enumerate(sorted(set(t_dat_seq)), start=1)
    }
    return [date_rank[day] for day in t_dat_seq]

# Turn each customer's list of purchase dates into capped per-date serial numbers,
# then discard the raw dates, which are no longer needed.
out["active_token_id"] = out["t_dat"].apply(serial_number)
out.drop(columns="t_dat", inplace=True)

In [547]:
# Preview the per-customer list columns built above.
out.head()

Unnamed: 0_level_0,article_id,sales_channel_id,article_id_freq,active_token_id
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,[7155],[1],[1],[1]
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,[46436],[2],[1],[1]
00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,"[30568, 45864, 68898, 71993]","[2, 2, 2, 2]","[1, 1, 1, 1]","[1, 1, 1, 1]"
0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d8cd0c725276a467a2a,"[1803, 28803]","[2, 2]","[1, 1]","[1, 1]"
00009d946eec3ea54add5ba56d5210ea898def4b46c68570cf0096d962cacc75,"[25734, 66506, 67015, 71563, 23161, 30591, 306...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, ...","[1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, ..."


In [566]:
# Inspect the row at position 4 as a raw object array (one list per column).
out.to_numpy()[4]

array([list([25734, 66506, 67015, 71563, 23161, 30591, 30604, 57065, 68265, 69784, 69815, 70019, 3808, 6481, 7126, 47392, 67899, 71566]),
       list([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
       list([1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1]),
       list([1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3])],
      dtype=object)

In [548]:
# Customer id picked for a spot check of the aggregated sequences against raw rows.
c_id = "00009d946eec3ea54add5ba56d5210ea898def4b46c68570cf0096d962cacc75"

In [549]:
# Show the raw transactions for the selected customer (`@c_id` refers to the
# Python variable defined above) to compare with the aggregated lists.
data.query("customer_id==@c_id")

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
30333500,2020-08-14,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,67015,0.022695,2
30333501,2020-08-14,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,71563,0.015136,2
30333502,2020-08-14,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,25734,0.030254,2
30333503,2020-08-14,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,66506,0.030254,2
30420845,2020-08-17,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,69784,0.025407,2
30420846,2020-08-17,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,69784,0.025407,2
30420847,2020-08-17,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,70019,0.033881,2
30420848,2020-08-17,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,70019,0.033881,2
30420849,2020-08-17,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,69815,0.025407,2
30420850,2020-08-17,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,69815,0.025407,2
