In [27]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset, SequentialSampler, BatchSampler, Dataset
from data_loading import utils
import numpy as np
import tiny_tf.tf as tf

In [39]:
class FinData(Dataset):
    def __init__(self, data, target, era, hidden=None, mode='train', transform=None, cache_dir=None):
        self.data = data
        self.target = target
        self.mode = mode
        self.transform = transform
        self.cache_dir = cache_dir
        self.era = era
        self.hidden = hidden

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index.to_list()
        if self.transform:
            return self.transform(self.data.iloc[index].values)
        else:
            if type(index) is list:
                sample = {
                    'target': torch.Tensor(self.target[index].values),
                    'data':   torch.LongTensor(self.data[index]),
                    'era':    torch.Tensor(self.era[index].values),
                }
            else:
                sample = {
                    'target': torch.Tensor([self.target[index]]),
                    'data':   torch.LongTensor([self.data[index]]),
                    'era':    torch.Tensor([self.era[index]]),
                }
            if self.hidden is not None:
                sample['hidden'] = torch.Tensor(self.hidden[index])
        return sample

    def __len__(self):
        return len(self.data)

In [40]:
def create_dataloaders(dataset: Dataset, indexes: dict, batch_size):
    train_idx = indexes.get('train', None)
    val_idx = indexes.get('val', None)
    test_idx = indexes.get('test', None)
    dataloaders = {}
    if train_idx:
        train_set = Subset(
            dataset, train_idx)
        train_sampler = BatchSampler(
            train_set.indices, batch_size=batch_size, drop_last=False)
        dataloaders['train'] = DataLoader(
            dataset, sampler=train_sampler, num_workers=10, pin_memory=True)
    if val_idx:
        val_set = Subset(dataset, val_idx)
        val_sampler = BatchSampler(val_set.indices, batch_size=batch_size, drop_last=False)
        dataloaders['val'] = DataLoader(
            dataset, sampler=val_sampler, num_workers=10, pin_memory=True)
    if test_idx:
        test_set = Subset(dataset, test_idx)
        test_sampler = BatchSampler(SequentialSampler(
            test_set), batch_size=batch_size, drop_last=False)
        dataloaders['test'] = DataLoader(
            test_set, sampler=test_sampler, num_workers=10, pin_memory=True)
    return dataloaders

In [30]:
df = utils.load_data(root_dir='./data', mode='train')

In [31]:
data,target,features,era = utils.preprocess_data(df,nn=True)

In [32]:
t_idx = np.where(era < 100)[0].tolist()
v_idx = np.where(era >=100)[0].tolist()

In [41]:
dataset = FinData(data=data,target=target,era=era)
dataloaders = create_dataloaders(dataset=dataset,indexes={'train':t_idx,'val':v_idx},batch_size=100)

In [42]:
val_loader = dataloaders['val']

In [43]:
for batch in val_loader:
    print(batch)

{'target': tensor([[0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.7500, 0.5000, 0.5000, 0.7500,
         0.5000, 0.2500, 0.2500, 0.2500, 0.5000, 0.7500, 0.5000, 0.5000, 0.0000,
         0.0000, 0.5000, 0.5000, 0.5000, 0.5000, 0.7500, 0.2500, 0.5000, 0.5000,
         0.2500, 0.2500, 0.5000, 0.5000, 0.2500, 0.2500, 0.5000, 0.7500, 0.5000,
         0.5000, 0.2500, 0.5000, 1.0000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.2500, 1.0000, 0.5000, 0.7500, 0.5000, 0.5000, 0.5000, 0.5000, 0.7500,
         0.2500, 0.0000, 0.5000, 0.0000, 0.5000, 0.5000, 0.2500, 0.7500, 0.5000,
         0.2500, 0.2500, 0.2500, 0.0000, 0.5000, 0.5000, 0.5000, 0.2500, 0.5000,
         0.5000, 0.7500, 0.2500, 0.2500, 0.5000, 0.5000, 0.5000, 0.5000, 0.2500,
         0.0000, 0.2500, 0.5000, 0.7500, 0.5000, 0.7500, 0.5000, 1.0000, 0.5000,
         0.2500, 0.2500, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.2500,
         0.7500]]), 'data': tensor([[[0, 0, 0,  ..., 0, 0, 1],
         [0, 0, 0,  ..., 0, 0, 0],


In [10]:
val_sub = Subset(dataset=dataset,indices=v_idx)


In [11]:
val_sub.dataset

<__main__.FinData at 0x7fd0e7d47f10>

In [12]:
next(iter(val_loader.sampler))

NameError: name 'val_loader' is not defined

In [13]:
val_sub.dataset

<__main__.FinData at 0x7fd0e7d47f10>

In [14]:
era[val_sub.indices]

406099    100
406100    100
406101    100
406102    100
406103    100
         ... 
501803    120
501804    120
501805    120
501806    120
501807    120
Name: era, Length: 95709, dtype: int64

In [15]:
dl = DataLoader(val_sub,batch_size=100)

In [16]:
next(iter(dl))

  'data':   torch.LongTensor([self.data[index]]),


{'target': tensor([[0.5000],
         [0.5000],
         [0.5000],
         [0.5000],
         [0.5000],
         [0.7500],
         [0.5000],
         [0.5000],
         [0.7500],
         [0.5000],
         [0.2500],
         [0.2500],
         [0.2500],
         [0.5000],
         [0.7500],
         [0.5000],
         [0.5000],
         [0.0000],
         [0.0000],
         [0.5000],
         [0.5000],
         [0.5000],
         [0.5000],
         [0.7500],
         [0.2500],
         [0.5000],
         [0.5000],
         [0.2500],
         [0.2500],
         [0.5000],
         [0.5000],
         [0.2500],
         [0.2500],
         [0.5000],
         [0.7500],
         [0.5000],
         [0.5000],
         [0.2500],
         [0.5000],
         [1.0000],
         [0.5000],
         [0.5000],
         [0.5000],
         [0.5000],
         [0.5000],
         [0.2500],
         [1.0000],
         [0.5000],
         [0.7500],
         [0.5000],
         [0.5000],
         [0.5000],
  

In [36]:
train_set = Subset(
    dataset, t_idx)
train_sampler = BatchSampler(SequentialSampler(
    train_set), batch_size=100, drop_last=False)
t_loader = DataLoader(
    dataset, sampler=train_sampler, num_workers=10, pin_memory=True)

val_set = Subset(dataset, v_idx)
val_sampler = BatchSampler(
    val_set.indices, batch_size=100, drop_last=False)
v_loader= DataLoader(
    val_set, sampler=val_sampler, num_workers=10, pin_memory=True)

In [38]:
next(iter(v_loader))

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/james/.virtualenvs/Kaggle/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/james/.virtualenvs/Kaggle/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/james/.virtualenvs/Kaggle/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/james/.virtualenvs/Kaggle/lib/python3.8/site-packages/torch/utils/data/dataset.py", line 330, in __getitem__
    return self.dataset[self.indices[idx]]
TypeError: list indices must be integers or slices, not list
