In [2]:
import pandas as pd
import numpy as np
import h5py
import os
import torch

In [11]:
class Amex_Dataset:
    """Map-style dataset that yields one row window of ``df_series`` per item.

    Each entry of ``uidxs`` is a triple ``(i1, i2, idx)``: rows ``i1..i2``
    (inclusive, positional) of ``df_series`` form the sample and ``idx`` is
    the sample id. ``idx`` is used to look up the label
    (``df_y.loc[idx, label_name]``) and the embedding file ``{idx}.h5``.

    Args:
        df_series: DataFrame with one row per time step. Column 0 is skipped
            (presumably the customer id -- confirm against the feather file)
            and the ``S_2`` column is returned separately as ``time_ref``.
        uidxs: Indexable collection of ``(i1, i2, idx)`` triples.
        df_y: Optional label frame. When given, the dataset is in "train"
            mode and every item also carries ``LABEL`` and ``emb_tensor``.
        label_name: Label column in ``df_y``.
        id_name: Stored for callers; not used by the methods of this class.
        emb_path: Directory holding the ``{idx}.h5`` embedding files. When
            ``None`` (default) the notebook-level global ``emb_path`` is used,
            which is what the original implementation always did.
        max_len: Number of time steps every sample is zero-padded to in
            ``collate_fn`` (default 13, the previously hard-coded value).
    """

    # Class-level defaults keep instances usable even when __init__ was bypassed.
    emb_path = None
    max_len = 13

    def __init__(self,df_series,uidxs,df_y=None,label_name = 'target',id_name = 'customer_ID',emb_path=None,max_len=13):
        self.df_series = df_series
        self.df_y = df_y
        self.uidxs = uidxs
        self.label_name = label_name
        self.id_name = id_name
        self.is_train = df_y is not None
        self.emb_path = emb_path
        self.max_len = max_len

    def __len__(self):
        """Number of samples, i.e. number of ``(i1, i2, idx)`` triples."""
        return len(self.uidxs)

    def _embedding_dir(self):
        """Return the directory that holds the per-sample ``{idx}.h5`` files."""
        if self.emb_path is not None:
            return self.emb_path
        # Legacy behaviour: resolve through the notebook-level global. Raises
        # NameError if that global was never defined, exactly as before.
        return emb_path

    def __getitem__(self, index):
        """Build the sample dict for position ``index``.

        Returns:
            dict with ``SERIES`` (2-D array, time steps x features),
            ``time_ref`` (the ``S_2`` values of the window, as a pandas
            Series) and ``idx``. In train mode it additionally contains
            ``LABEL`` (length-1 array) and ``emb_tensor`` (tensor read from
            the ``stacked_embeddings`` dataset of ``{idx}.h5``).
        """
        i1,i2,idx = self.uidxs[index]
        # Slice the row window once; column 0 is dropped by the `1:` selector.
        window = self.df_series.iloc[i1:i2+1,1:]
        time_ref = window['S_2']
        # DataFrame.values is always 2-D, so no reshape is needed even for a
        # single-row window.
        series = window.drop(['S_2'],axis=1).values

        if self.is_train:
            file_path = os.path.join(self._embedding_dir(), f"{idx}.h5")
            with h5py.File(file_path, 'r') as hf:
                # `[:]` copies the dataset into memory before the file closes.
                emb_data = hf['stacked_embeddings'][:]
            emb_tensor = torch.from_numpy(emb_data)

            label = self.df_y.loc[idx,[self.label_name]].values
            return {
                    'SERIES': series,
                    'LABEL': label,
                    'time_ref': time_ref,
                    'idx': idx,
                    'emb_tensor': emb_tensor,
                    }
        else:
            return {
                    'SERIES': series,
                    'time_ref': time_ref,
                    'idx': idx,
                    }

    @staticmethod
    def _check_emb_shapes(batch):
        """Raise a descriptive RuntimeError if embedding shapes differ in a batch."""
        shapes = [(item['idx'], tuple(item['emb_tensor'].shape)) for item in batch]
        if len({shape for _, shape in shapes}) > 1:
            listing = ", ".join(f"{idx}: {shape}" for idx, shape in shapes)
            # torch.stack would fail here as well, but without naming the sample.
            raise RuntimeError(
                f"emb_tensor shapes differ within the batch (idx: shape) -> {listing}"
            )

    def collate_fn(self, batch):
        """
        Pad every sample to ``max_len`` time steps and stack them into tensors.

        Args:
            batch: Non-empty list of dicts as produced by ``__getitem__``.

        Returns:
            dict with
            ``batch_series``  float tensor (batch, max_len, n_features), zero padded;
            ``batch_mask``    float tensor (batch, max_len), 1.0 on real time steps;
            ``batch_y``       float tensor (batch,), all zeros outside train mode;
            ``batch_idx``     numpy array of the sample ids;
            ``batch_emb_tensor`` stacked embeddings in train mode, else ``None``.

        Raises:
            RuntimeError: in train mode, if the samples' ``emb_tensor`` shapes
                are not all identical (the message lists every idx and shape).
        """
        batch_size = len(batch)
        n_features = batch[0]['SERIES'].shape[1]
        batch_series = torch.zeros((batch_size, self.max_len, n_features))
        batch_mask = torch.zeros((batch_size, self.max_len))
        batch_y = torch.zeros(batch_size)
        batch_idx = np.array([sample['idx'] for sample in batch])
        batch_emb_tensor = None

        for i, item in enumerate(batch):
            v = item['SERIES']
            batch_series[i, :v.shape[0], :] = torch.tensor(v).float()
            batch_mask[i,:v.shape[0]] = 1.0

            if self.is_train:
                v = item['LABEL'].astype(np.float32)
                batch_y[i] = torch.tensor(v).float()

        if self.is_train:
            # Stack once per batch (previously repeated for every sample).
            self._check_emb_shapes(batch)
            batch_emb_tensor = torch.stack([sample['emb_tensor'] for sample in batch], dim=0)

        return {'batch_series':batch_series
                ,'batch_mask':batch_mask
                ,'batch_y':batch_y
                ,'batch_idx':batch_idx
                ,'batch_emb_tensor':batch_emb_tensor
                }

In [4]:
# Sanity check: open one pre-computed embedding file (sample id 4) of the
# 13month/10pct train split and look at the shape of its array.
# NOTE(review): this rebinds the notebook-global `emb_path`, the same name
# Amex_Dataset.__getitem__ resolves embedding files through -- make sure the
# data-loading cell has set it again before iterating a DataLoader.
train_test = 'train'
emb_path = f"amex_emb/13month/10pct/{train_test}/"
file_path = os.path.join(emb_path, f"4.h5")
embeddings_stack = []

with h5py.File(file_path, 'r') as hf:
    # `[:]` reads the whole 'stacked_embeddings' dataset into a numpy array.
    data = hf['stacked_embeddings'][:]
    tensor = torch.from_numpy(data)
    # squeeze(0) only drops dim 0 when it has size 1.
    embeddings_stack.append(tensor.squeeze(0))

# Shape of the raw array as stored in the file.
data.shape

(223, 768)

In [5]:
# Same probe as for the 13month split, but against the original/10pct train
# embeddings, to compare the stored array shape for sample id 4.
# NOTE(review): rebinds the notebook-global `emb_path` that
# Amex_Dataset.__getitem__ resolves embedding files through.
train_test = 'train'
emb_path = f"amex_emb/original/10pct/{train_test}/"
file_path = os.path.join(emb_path, f"4.h5")
embeddings_stack = []

with h5py.File(file_path, 'r') as hf:
    # `[:]` reads the whole 'stacked_embeddings' dataset into a numpy array.
    data = hf['stacked_embeddings'][:]
    tensor = torch.from_numpy(data)
    # squeeze(0) only drops dim 0 when it has size 1.
    embeddings_stack.append(tensor.squeeze(0))

# Shape of the raw array as stored in the file.
data.shape

(223, 768)

In [7]:
# from tqdm import tqdm
# train_test = 'train'

# emb_path = f"/export/home2/zongqi001/000_data/amex/original_100pct/emb/{train_test}/"

# for index in tqdm(range(1000000)):
#     file_path = os.path.join(emb_path, f"{index}.h5")
#     embeddings_stack = []

#     with h5py.File(file_path, 'r') as hf:
#         data = hf['stacked_embeddings'][:]
#         tensor = torch.from_numpy(data)
#         embeddings_stack.append(tensor.squeeze(0))

#     if data.shape[0] !=223:
#         print(index)
#         break

In [8]:
# Load the full (original_100pct) training data used by the DataLoader below.
INPUT_PATH = '../../000_data/amex/original_100pct'
# Directory of per-sample `{idx}.h5` embedding files; Amex_Dataset.__getitem__
# resolves files through this notebook-global, so it must point at the
# embeddings that belong to the series loaded here.
emb_path = '../../000_data/amex/original_100pct/emb/train/'
input_path = INPUT_PATH
# Row-per-time-step feature table.
trainval_series     = pd.read_feather(f'{input_path}/df_nn_series_train.feather')
# `.values` turns the index table into a plain array; each row is unpacked by
# Amex_Dataset.__getitem__ as (i1, i2, idx).
trainval_series_idx = pd.read_feather(f'{input_path}/df_nn_series_idx_train.feather').values
# Label table; its 'target' column is used for stratification and as the label.
trainval_y = pd.read_csv(f'{input_path}/train_labels.csv')

In [12]:
from tqdm import tqdm
from torch.utils.data import DataLoader
from itertools import islice
from sklearn.model_selection import KFold, StratifiedKFold,GroupKFold

# Stratify the 5-fold split on the target column; only the first fold is
# taken (a single next() on the split generator).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
enum_obj = enumerate(skf.split(trainval_y,trainval_y['target']))
fold_index, (trn_index, val_index) = next(enum_obj)

# Train-mode dataset (labels passed) restricted to the training rows of the fold.
# Amex_Dataset reads the notebook-global `emb_path`, so that global must be
# pointing at the matching embedding directory when batches are fetched.
train_dataset = Amex_Dataset(trainval_series,[trainval_series_idx[i] for i in trn_index],trainval_y)
# NOTE(review): shuffle=True and no torch seed is set in the visible cells, so
# the batch fetched below is presumably different on every run -- confirm.
train_dataloader = DataLoader(train_dataset,batch_size=16,shuffle=True, drop_last=False, collate_fn=train_dataset.collate_fn,num_workers=16)

# Earlier attempts to reach one specific batch (index 22534), kept for reference.
# batches = list(train_dataloader)
# target_item = batches[22534]

# enum_obj2 = enumerate(tqdm(train_dataloader))

# target_item = next(islice(enum_obj2, 22534, 22535))
# Fetch only the first batch; target_item is the tuple (0, batch_dict).
target_item = next(enumerate(train_dataloader))
print(target_item)


(0, {'batch_series': tensor([[[0.9500, 0.3200, 0.0600,  ..., 0.0000, 0.0000, 0.0100],
         [0.8900, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0100],
         [0.9000, 0.0000, 0.0100,  ..., 0.0000, 0.0000, 0.0100],
         ...,
         [0.9700, 0.0900, 0.0200,  ..., 0.0000, 0.0000, 0.0100],
         [0.9700, 0.0000, 0.0200,  ..., 0.0000, 0.0000, 0.0100],
         [0.9500, 0.0000, 0.0200,  ..., 0.0000, 0.0000, 0.0100]],

        [[0.6800, 0.0000, 0.1000,  ..., 0.0000, 0.0000, 0.0000],
         [0.6800, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0100],
         [0.6900, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0100],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.3000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0100],
         [0.3200, 0.1500, 0.1000,  ..

In [25]:
# target_item is (batch_number, batch_dict); show the sample ids in that batch.
target_item[1]['batch_idx']

array([329461, 310005,  32408,  99533, 263379, 144548, 257785, 151262,
       248016, 163298, 339290, 342762,  93875, 322972,  15847,  38076])

In [None]:
# Probe one embedding file (id 4) of the ETTm1/24 train split. Unlike the Amex
# files, the array is stored under the dataset key 'embeddings'.
# NOTE(review): this rebinds the notebook-global `emb_path` to a non-Amex
# directory; Amex_Dataset.__getitem__ resolves files through that global, so
# reset it before using an Amex DataLoader again.
train_test = 'train'
emb_path = f"ETTm1/24/{train_test}"
file_path = os.path.join(emb_path, f"4.h5")
embeddings_stack = []

with h5py.File(file_path, 'r') as hf:
    # `[:]` reads the whole 'embeddings' dataset into a numpy array.
    data = hf['embeddings'][:]
    tensor = torch.from_numpy(data)
    # squeeze(0) only drops dim 0 when it has size 1.
    embeddings_stack.append(tensor.squeeze(0))

# Shape of the raw array as stored in the file.
data.shape

(7, 768)

In [4]:
# Switch to the small 13month_0.1pct sample and load the test index table.
# NOTE: rebinds `input_path`, which the following cells reuse.
input_path = '../../000_data/amex/13month_0.1pct'
# Kept as a DataFrame (no `.values`) so the column names stay visible.
test_series_idx = pd.read_feather(f'{input_path}/df_nn_series_idx_test.feather')

test_series_idx

Unnamed: 0,min,max,feature_idx
0,0,12,0
1,13,25,1
2,26,38,2
3,39,51,3
4,52,64,4
...,...,...,...
68,884,896,68
69,897,909,69
70,910,922,70
71,923,935,71


In [8]:
# Test labels of the dataset `input_path` currently points to (set in an
# earlier cell); display the 'target' column.
test_y = pd.read_csv(f'{input_path}/test_labels.csv')
test_y['target']

0     0
1     0
2     1
3     0
4     0
     ..
68    1
69    0
70    0
71    0
72    0
Name: target, Length: 73, dtype: int64

In [20]:
# Load the train split of the small 13month_0.1pct sample for inspection.
input_path = '../../000_data/amex/13month_0.1pct'
# Row-per-time-step feature table.
train_series     = pd.read_feather(f'{input_path}/df_nn_series_train.feather')
# Index table as a plain array (one row per sample, three columns).
train_series_idx = pd.read_feather(f'{input_path}/df_nn_series_idx_train.feather').values
# Label table.
train_y = pd.read_csv(f'{input_path}/train_labels.csv')

In [21]:
# (rows, columns) of the per-time-step feature table.
train_series.shape

(2834, 222)

In [22]:
# (samples, 3): one index row per sample.
train_series_idx.shape

(218, 3)

In [23]:
# Row count should match the number of rows in train_series_idx.
train_y.shape

(218, 2)

In [24]:
# Peek at the column layout of the feature table.
train_series.head(2)

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,oneHot_D_64_3,oneHot_D_66_0.0,oneHot_D_66_100.0,oneHot_D_68_0.0,oneHot_D_68_100.0,oneHot_D_68_200.0,oneHot_D_68_300.0,oneHot_D_68_400.0,oneHot_D_68_500.0,oneHot_D_68_600.0
0,000f8675ede66cc6affd4c048db11a00246d7ee623f453...,2017-03-20,0.71,0.0,0.0,0.81,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
1,000f8675ede66cc6affd4c048db11a00246d7ee623f453...,2017-04-19,0.71,0.0,0.01,0.81,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0


In [1]:
# train_series_idx

In [26]:
# Number of samples; this is what Amex_Dataset.__len__ returns for this table.
len(train_series_idx)

218

In [27]:
# S_2 values of rows 0..12 (inclusive), selected the same way
# Amex_Dataset.__getitem__ builds `time_ref` for a window with i1=0, i2=12.
train_series.iloc[0:12+1,1:]['S_2']

0     2017-03-20
1     2017-04-19
2     2017-05-20
3     2017-06-19
4     2017-07-20
5     2017-08-19
6     2017-09-19
7     2017-10-20
8     2017-11-18
9     2017-12-20
10    2018-01-20
11    2018-02-17
12    2018-03-20
Name: S_2, dtype: object