In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to local source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [15]:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
import pytorch_lightning as pl

import pandas as pd
import numpy as np
import joblib
from pathlib import Path

from sklearn.preprocessing import StandardScaler

In [4]:
class CTRPDataset(torch.utils.data.Dataset):
    """Map-style dataset holding feature rows, targets and fold ids as tensors.

    Parameters
    ----------
    data
        Column-indexable table (e.g. a pandas DataFrame) whose selections
        expose ``to_numpy()``. It must contain a ``'fold'`` column.
    features
        Column label(s) selected from ``data`` as the model inputs.
    target
        Column label(s) selected from ``data`` as the prediction target.

    Attributes
    ----------
    features, target : torch.FloatTensor
        Tensor copies of the selected columns.
    fold : torch.Tensor
        Fold id of every row; stored for splitting, not returned by indexing.
    """

    def __init__(self, data, features, target):
        feature_values = data[features].to_numpy()
        target_values = data[target].to_numpy()
        fold_values = data['fold'].to_numpy()

        self.features = torch.FloatTensor(feature_values)
        self.target = torch.FloatTensor(target_values)
        self.fold = torch.Tensor(fold_values)

    def __len__(self):
        """Number of samples, taken from the first dimension of the target."""
        return len(self.target)

    def __getitem__(self, i):
        """Return the ``(features, target)`` pair selected by ``i``."""
        sample = self.features[i]
        label = self.target[i]
        return sample, label

In [5]:
def create_dataloaders(dataset, fold):
    """Split ``dataset`` into training and validation loaders by fold id.

    Parameters
    ----------
    dataset
        Map-style dataset exposing a 1-D ``fold`` tensor with one fold id
        per sample.
    fold
        Fold id held out for validation; every other sample is used for
        training.

    Returns
    -------
    (train_loader, val_loader)
        Two ``DataLoader`` objects (default batch size, no shuffling) over
        the samples outside / inside the held-out fold respectively.
    """
    # Use a boolean mask so that `~` is a logical complement. Negating the
    # integer indices returned by torch.where would be a bitwise NOT and
    # yield negative indices rather than the remaining samples.
    val_mask = dataset.fold == fold
    val_idx = torch.where(val_mask)[0].tolist()
    train_idx = torch.where(~val_mask)[0].tolist()

    # Subset keeps the per-sample dataset interface. Indexing the dataset
    # directly would return a single (features, target) tuple, which a
    # DataLoader would treat as a two-element dataset.
    train_loader = torch.utils.data.DataLoader(torch.utils.data.Subset(dataset, train_idx))
    val_loader = torch.utils.data.DataLoader(torch.utils.data.Subset(dataset, val_idx))
    return train_loader, val_loader

In [6]:
# Directory holding the processed artifacts, relative to this notebook.
data_path = Path("../../film-gex-data/processed/")

# Objects saved with joblib; used below as column selectors for the dataset.
input_cols = joblib.load(data_path / "input_cols.pkl")
cond_cols = joblib.load(data_path / "cond_cols.pkl")

# Pickled (gzip-compressed) training table.
data = pd.read_pickle(data_path / "train_sub.pkl.gz")

In [7]:
# Two datasets over the same rows and target, differing only in which
# column selection is used as features.
input_dataset = CTRPDataset(data=data, features=input_cols, target='cpd_avg_pv')
cond_dataset = CTRPDataset(data=data, features=cond_cols, target='cpd_avg_pv')

In [8]:
# Try the fold split on the input dataset with fold 0 selected for validation.
foo, bar = create_dataloaders(dataset=input_dataset, fold=0)

In [10]:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
import pytorch_lightning as pl

import pandas as pd
import numpy as np
import joblib
from pathlib import Path

from sklearn.preprocessing import StandardScaler

In [11]:
class CTRPDataModule(pl.LightningDataModule):
    """Lightning data module that splits the CTRP table by its ``fold`` column.

    Parameters
    ----------
    path
        Directory containing ``train_sub.pkl.gz``, ``input_cols.pkl`` and
        ``cond_cols.pkl``.
    target
        Name of the target column in the loaded table.
    val_fold
        Value of the ``fold`` column held out for validation.
    test_fold
        Value of the ``fold`` column held out for testing.
    batch_size
        Batch size used by every dataloader (default 32).
    """

    def __init__(self, path, target, val_fold, test_fold, batch_size=32):
        super().__init__()
        self.batch_size = batch_size
        self.path = Path(path)
        self.target = target
        self.val_fold = val_fold
        self.test_fold = test_fold

    # When doing distributed training, Datamodules have two optional arguments for
    # granular control over download/prepare/splitting data:
    def prepare_data(self):
        """Nothing to download or pre-process; intentionally a no-op."""
        pass

    def _make_dataset(self, inputs, idx):
        """Build a CTRPDataset from scaled ``inputs`` and the rows at ``idx``."""
        rows = self.data.iloc[idx]
        # NOTE(review): CTRPDataset is invoked as (inputs, conds, target)
        # arrays here - confirm the CTRPDataset definition in scope accepts
        # that signature.
        return CTRPDataset(inputs,
                           rows[self.cond_cols].to_numpy(),
                           rows[self.target].to_numpy())

    # OPTIONAL, called for every GPU/machine (assigning state is OK)
    def setup(self, stage=None):
        """Load the data, compute the split indices and build the datasets.

        Parameters
        ----------
        stage
            ``'fit'`` or ``'validate'`` builds the train and validation
            datasets, ``'test'`` builds the test dataset, and ``None``
            builds all of them.

        Returns
        -------
        ``(train_dataset, val_dataset)`` for ``'fit'``, ``test_dataset`` for
        ``'test'``, otherwise ``None``.
        """
        # read data
        self.data = pd.read_pickle(self.path.joinpath("train_sub.pkl.gz"))
        self.input_cols = joblib.load(self.path.joinpath("input_cols.pkl"))
        self.cond_cols = joblib.load(self.path.joinpath("cond_cols.pkl"))

        # idx: compare the 'fold' column only. Comparing the whole frame to a
        # fold id would match any cell with that value and return wrong
        # (and duplicated) row positions.
        fold = self.data['fold'].to_numpy()
        self.train_idx = np.where((fold != self.val_fold) & (fold != self.test_fold))[0]
        self.val_idx = np.where(fold == self.val_fold)[0]
        self.test_idx = np.where(fold == self.test_fold)[0]

        # transform inputs: the scaler is always fitted on the training rows
        # only, so validation/test statistics never leak into it.
        self.scaler = StandardScaler()
        input_train_data = self.scaler.fit_transform(
            self.data.iloc[self.train_idx][self.input_cols])

        if stage in (None, 'fit', 'validate'):
            input_val_data = self.scaler.transform(
                self.data.iloc[self.val_idx][self.input_cols])
            self.train_dataset = self._make_dataset(input_train_data, self.train_idx)
            self.val_dataset = self._make_dataset(input_val_data, self.val_idx)

        if stage in (None, 'test'):
            input_test_data = self.scaler.transform(
                self.data.iloc[self.test_idx][self.input_cols])
            self.test_dataset = self._make_dataset(input_test_data, self.test_idx)

        # Return values are kept for interactive use in the notebook.
        if stage == 'fit':
            return self.train_dataset, self.val_dataset
        if stage == 'test':
            return self.test_dataset
        return None

    # return the dataloader for each split
    def train_dataloader(self):
        """DataLoader over the training dataset built by ``setup``."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size)

    def val_dataloader(self):
        """DataLoader over the validation dataset built by ``setup``."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """DataLoader over the test dataset built by ``setup``."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

In [12]:
# Data module over the processed directory, holding out fold 1 for
# validation and fold 2 for testing.
foo = CTRPDataModule(
    path="../../film-gex-data/processed/",
    target='cpd_avg_pv',
    val_fold=1,
    test_fold=2,
)

In [13]:
# Build the train/validation datasets; the returned pair is the cell output.
foo.setup('fit')

(<__main__.CTRPDataset at 0x7feab73e1730>,
 <__main__.CTRPDataset at 0x7feab73e1ac0>)

In [14]:
# Build the test dataset; the returned dataset is the cell output.
foo.setup('test')

<__main__.CTRPDataset at 0x7feab73e17c0>

In [None]:
# create dataset
# get train/val idx for gks
# create datamodule, transform (norm), to tensor
# ...