# Dataloading 01

In this notebook, we'll figure out how to use PyTorch's `DataLoader` class to load our massive files without reading them entirely into memory.

In [1]:
import csv
import linecache
import os

# comet_ml is kept ahead of torch / pytorch_lightning, as in the original ordering.
import comet_ml
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Seed PyTorch's global RNG so that later random operations are repeatable
# when the cells are run top to bottom on a fresh kernel.
torch.manual_seed(0)

<torch._C.Generator at 0x7fd692f4acf0>

In [2]:

class LightningNN(pl.LightningModule):
    """Small fully connected classifier: N_features -> 512 -> 64 -> N_labels.

    Optimised with Adam (lr=1e-3) on unweighted cross-entropy. The loss is
    logged per step and per epoch for both training and validation.
    """

    def __init__(self, N_features, N_labels):
        super().__init__()
        self.flatten = nn.Flatten()
        # Layers are created in the same order as they are applied.
        layers = [
            nn.Linear(N_features, 512),
            nn.ReLU(),
            nn.Linear(512, 64),
            nn.ReLU(),
            nn.Linear(64, N_labels),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten each sample and return the raw (un-normalised) class logits."""
        return self.linear_relu_stack(self.flatten(x))

    def configure_optimizers(self):
        """Return the Adam optimiser over all module parameters."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

    def training_step(self, batch, batch_idx):
        """Compute, log and return the cross-entropy loss for one training batch."""
        features, targets = batch
        train_loss = F.cross_entropy(self(features), targets)
        self.log("train_loss", train_loss, on_step=True, on_epoch=True, logger=True)
        return train_loss

    def validation_step(self, batch, batch_idx):
        """Compute, log and return the cross-entropy loss for one validation batch."""
        features, targets = batch
        val_loss = F.cross_entropy(self(features), targets)
        self.log("val_loss", val_loss, on_step=True, on_epoch=True, logger=True)
        return val_loss


model = LightningNN(3, 3)

We'll first design a custom dataset to use with PyTorch's `DataLoader` class

In [3]:
class GeneExpressionData(Dataset):
    """Map-style dataset that parses one CSV row per sample on demand.

    Both files are expected to start with a single header row; sample ``i``
    is read from line ``i + 2`` (1-based) of the data file and of the label
    file.

    Args:
        filename: path to the feature CSV (one sample per row, numeric fields).
        labelname: path to the label CSV; the first field of each row is the
            class label. ``num_labels`` additionally requires a column named
            ``'# label'``.

    NOTE: rows are fetched with ``linecache``, which keeps the lines of every
    file it has served cached in memory. Random access is therefore fast, but
    the file still ends up resident after the first read.
    """

    def __init__(self, filename, labelname):
        self._filename = filename
        self._labelname = labelname

        # Stream the file to count rows instead of materialising every line
        # with readlines(). The header row is not a sample, and an empty file
        # must yield length 0 rather than -1.
        with open(filename, "r") as f:
            n_lines = sum(1 for _ in f)
        self._total_data = max(n_lines - 1, 0)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``.

        Returns:
            A ``torch.float32`` tensor holding the row's fields and the label
            as a Python ``int``.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
            ValueError: if the label file has no row for this sample.
        """
        idx = int(idx)
        if idx < 0:
            # Support Python-style negative indexing; without this, idx == -1
            # would map to line 1, i.e. the header row.
            idx += self._total_data
        if not 0 <= idx < self._total_data:
            raise IndexError(
                f"index {idx} is out of range for dataset of length {self._total_data}"
            )

        # +2: linecache line numbers are 1-based and line 1 is the header.
        lineno = idx + 2
        data_line = linecache.getline(self._filename, lineno)
        label_line = linecache.getline(self._labelname, lineno)

        data_fields = next(csv.reader([data_line]))
        label_fields = next(csv.reader([label_line]))
        if not label_fields:
            raise ValueError(f"no label found on line {lineno} of {self._labelname!r}")

        features = torch.from_numpy(np.array([float(x) for x in data_fields])).float()
        # Labels may be serialised as floats (e.g. "3.0"), hence int(float(...)).
        label = int(float(label_fields[0]))
        return features, label

    def __len__(self):
        """Number of samples, i.e. data rows excluding the header."""
        return self._total_data

    def num_labels(self):
        """Number of distinct values in the ``'# label'`` column of the label file."""
        return pd.read_csv(self._labelname)['# label'].nunique()

    def num_features(self):
        """Number of fields in the first sample's feature row."""
        return len(self[0][0])

Since PyTorch's classification losses require class indices in $[0, C-1]$, we'll first add $1$ to the labels and write them back out so we can use them for training.

In [4]:
def fix_labels(file):
    """Shift every label up by one and write the result to a new CSV.

    Reads ``file``, casts its ``'# label'`` column to ``int``, adds 1 and
    writes the frame (without the index) to ``'fixed_<basename of file>'`` in
    the current working directory. The input file is not modified.

    Args:
        file: path to a CSV containing a ``'# label'`` column.

    Returns:
        The relative path of the CSV that was written.
    """
    labels = pd.read_csv(file)
    labels['# label'] = labels['# label'].astype(int) + 1
    # os.path.basename handles the platform's path separator, unlike split('/').
    out_path = 'fixed_' + os.path.basename(file)
    labels.to_csv(out_path, index=False)
    return out_path

# Writes 'fixed_primary_labels_neighbors_50_components_50_clust_size_100.csv'
# into the current working directory; later cells read the labels from there.
fix_labels('../data/processed/labels/primary_labels_neighbors_50_components_50_clust_size_100.csv')

Great, we now continue as normal

In [20]:
# Build the on-demand dataset: features from the UMAP reduction CSV, labels
# from the shifted 'fixed_' label file in the working directory.
# NOTE(review): the two filenames carry different neighbors/components
# settings — confirm that row i of both files refers to the same sample.
t = GeneExpressionData(
    filename='../data/processed/umap/primary_reduction_neighbors_100_components_3.csv',
    labelname='fixed_primary_labels_neighbors_50_components_50_clust_size_100.csv'
)

Let's see how long it takes to load a minibatch of 64 samples

In [21]:
%%time 

for i in range(64):
    t.__getitem__(i)

CPU times: user 4.78 ms, sys: 1.77 ms, total: 6.55 ms
Wall time: 4.65 ms


Before we train our model, we need to split our data into training and testing sets, in order to get an unbiased evaluation of our model's performance. Likely, we will initially overfit the training set since we provide no regularization.

In [22]:
# 80/20 split; the test size is derived by subtraction so the two parts
# always add up to len(t) despite the int() truncation.
train_size = int(0.8 * len(t))
test_size = len(t) - train_size

# Use a dedicated, explicitly seeded generator so the partition does not
# depend on how much of the global RNG stream earlier cells have consumed
# (e.g. whether a model was instantiated before this cell ran).
train, test = torch.utils.data.random_split(
    t,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(0),
)

In [23]:
# Reshuffle the training samples every epoch; without shuffle=True the
# batches are visited in the same order on every pass. Validation order is
# irrelevant to the metrics, so it stays fixed.
traindata = DataLoader(train, batch_size=8, shuffle=True, num_workers=0)
valdata = DataLoader(test, batch_size=8, shuffle=False, num_workers=0)

Now that we've defined our `DataLoader`, let's test it when training a simple Neural Network

## Using PyTorch Lightning

PyTorch Lightning seems nicer than Ignite, especially for GPU training. Let's test it out.

In [31]:
from torchmetrics import Accuracy
from pytorch_lightning.loggers import CometLogger

class NN(pl.LightningModule):
    """Fully connected classifier trained with class-weighted cross-entropy.

    Architecture: N_features -> 512 -> 64 -> N_labels with ReLU activations.
    Loss and accuracy are logged per step and per epoch for both the training
    and the validation loop.

    Args:
        N_features: number of values in each (flattened) input sample.
        N_labels: number of classes, i.e. the size of the logit vector.
        weights: per-class weights passed to ``F.cross_entropy`` as ``weight``,
            or ``None`` for an unweighted loss.
    """

    def __init__(self, N_features, N_labels, weights):
        super().__init__()

        # Register the class weights as a buffer so they follow the module
        # across devices; a plain attribute would stay on the CPU and break
        # the loss on GPU. persistent=False keeps state_dict unchanged.
        if weights is not None:
            weights = torch.as_tensor(weights, dtype=torch.float)
        self.register_buffer("weights", weights, persistent=False)

        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(N_features, 512),
            nn.ReLU(),
            nn.Linear(512, 64),
            nn.ReLU(),
            nn.Linear(64, N_labels),
        )

        self.accuracy = Accuracy()

    def forward(self, x):
        """Flatten each sample and return the raw (un-normalised) class logits."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

    def configure_optimizers(self):
        """Return the Adam optimiser (lr=1e-3) over all module parameters."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

    def _shared_step(self, batch):
        """Return ``(weighted cross-entropy loss, batch accuracy)`` for one batch."""
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y, weight=self.weights)
        acc = self.accuracy(y_hat.softmax(dim=-1), y)
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Log training loss/accuracy for one batch and return the loss."""
        loss, acc = self._shared_step(batch)

        # Per-sample predictions and targets are intentionally not passed to
        # self.log: it is meant for scalar metrics, not vectors with one
        # entry per sample.
        self.log("train_loss", loss, on_step=True, on_epoch=True, logger=True)
        self.log("train_accuracy", acc, on_step=True, on_epoch=True, logger=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss/accuracy for one batch and return the loss."""
        val_loss, acc = self._shared_step(batch)

        self.log("val_loss", val_loss, on_step=True, on_epoch=True, logger=True)
        # Logged under its own key; it was previously written to
        # "train_accuracy", mixing validation values into the training metric.
        self.log("val_accuracy", acc, on_step=True, on_epoch=True, logger=True)
        return val_loss
    

In [25]:
from sklearn.utils.class_weight import compute_class_weight

def class_weights(label_df):
    """Compute 'balanced' class weights from a label CSV.

    Args:
        label_df: path to the label CSV (despite the name, this is what gets
            passed to ``pd.read_csv``). Every value in the file is treated as
            a label.

    Returns:
        A float32 tensor of weights, one per distinct label value found by
        ``np.unique``.
    """
    labels = pd.read_csv(label_df)
    flat_labels = labels.values.reshape(-1)

    balanced = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(labels),
        y=flat_labels,
    )

    return torch.from_numpy(balanced).float()


weights = class_weights('fixed_primary_labels_neighbors_50_components_50_clust_size_100.csv')

In [26]:
# Display the per-class weights returned by class_weights().
weights

tensor([  0.6428,   2.0281,  43.2046,   1.8374,   0.8580,  33.3467, 102.0523,
        100.3226,   0.5029,   0.6207,   2.0534,   0.4522,   0.3983,  13.3462,
          1.3319,   0.3946])

In [32]:
# Size the network from the dataset: input width from the first sample,
# output width from the number of distinct labels. `weights` feeds the
# class-weighted loss. This rebinds `model`, replacing any earlier instance.
model = NN(t.num_features(), t.num_labels(), weights)
model

NN(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=3, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
  (accuracy): Accuracy()
)

In [33]:
# Read the Comet API key from the environment instead of hard-coding it in
# the notebook; this raises KeyError if COMET_API_KEY is not set.
# The key that was previously committed here should be treated as leaked
# and revoked.
comet_logger = pl.loggers.CometLogger(
    api_key=os.environ["COMET_API_KEY"],
    project_name="gene-expression-classification",  # Optional
    experiment_name="test_01",
)

# NOTE(review): per the Lightning 1.x Trainer docs, auto_lr_find only acts
# when trainer.tune(model) is called, so it appears to have no effect on this
# fit() — confirm, then either call tune() or drop the flag.
trainer = pl.Trainer(auto_lr_find=True, max_epochs=10, logger=comet_logger)
trainer.fit(model, traindata, valdata)


CometLogger will be initialized in online mode
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/jlehrer1/gene-expression-classification/757e5a60efba47729d1c6821733d8dd0
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     loss [2928]               : (0.009379980154335499, 13.915987968444824)
COMET INFO:     train_accuracy_step [133] : (0.5, 1.0)
COMET INFO:     train_loss_step [133]     : (0.016776276752352715, 1.841537356376648)
COMET INFO:   Others:
COMET INFO:     Name : test2
COMET INFO:   Uploads:
COMET INFO:     confusion-matrix         : 29276
COMET INFO:     environment details      : 1
COMET INFO:     filename                 : 1
COMET INFO:     git metadata             : 

In [14]:
# Hard-coded 8 x 16 float array and a matching vector of 8 integer labels.
# NOTE(review): presumably logits and targets copied from one training batch
# (batch size 8, 16 classes) for debugging — confirm or remove.
a= np.array([[-0.0956,  0.0667,  0.2978,  0.3059, -0.4875, -0.6018,  0.9924,  0.3331,
          0.6128, -0.5120, -0.5063,  0.3345, -0.0863,  0.1947,  0.1109,  0.3478],
        [-0.2569,  0.1289,  0.4715,  0.3449, -0.2374, -0.3352,  0.7315,  0.4310,
          0.4933, -0.4394, -0.5972,  0.2676,  0.0477,  0.5278,  0.0189,  0.0890],
        [-0.1431,  0.2215,  0.3246,  0.3507, -0.2735, -0.3416,  0.5464,  0.1399,
          0.2135, -0.1563, -0.5456,  0.2188,  0.0482,  0.3996,  0.0487,  0.0625],
        [-0.1184,  0.0485,  0.3864,  0.3085, -0.3412, -0.6601,  0.8256,  0.3796,
          0.6838, -0.5963, -0.4536,  0.2136, -0.1142,  0.4071, -0.0383,  0.2752],
        [-0.2536,  0.0690,  0.4268,  0.2535, -0.1399, -0.2384,  0.5493,  0.3850,
          0.4556, -0.4013, -0.4894,  0.1892,  0.0616,  0.4456,  0.0194,  0.0671],
        [-0.2009,  0.1465,  0.3602,  0.2529, -0.1755, -0.2467,  0.5161,  0.2884,
          0.3654, -0.3046, -0.4859,  0.1878,  0.0387,  0.4268,  0.0325,  0.0650],
        [-0.3138,  0.1538,  0.5169,  0.4463, -0.3781, -0.3656,  0.8290,  0.3274,
          0.3683, -0.3494, -0.7921,  0.4668,  0.0602,  0.5689, -0.0036,  0.0318],
        [-0.0835,  0.1875,  0.3068,  0.4515, -0.4492, -0.5668,  1.1131,  0.5104,
          0.5162, -0.4346, -0.6959,  0.3610,  0.0562,  0.4453,  0.0424,  0.2393]])

l = np.array([ 0,  0,  4, 11, 15, 11, 15,  8])

In [15]:
# Inspect the first (features, label) sample of the dataset `t`.
t[0]

(tensor([-2.6639,  2.5859,  6.3119]), 3)

In [16]:
# NOTE: this rebinds `t` (until now the GeneExpressionData dataset) and `l`;
# earlier cells that use `t` as a dataset must not be re-run after this one
# without first re-creating the dataset.
# torch.as_tensor accepts both ndarrays and tensors, so re-running this cell
# no longer fails once `l` has already been converted.
t = torch.as_tensor(a)
l = torch.as_tensor(l)
t.softmax(dim=-1)

tensor([[0.0477, 0.0561, 0.0707, 0.0713, 0.0322, 0.0288, 0.1416, 0.0733, 0.0969,
         0.0315, 0.0316, 0.0734, 0.0482, 0.0638, 0.0587, 0.0743],
        [0.0406, 0.0597, 0.0842, 0.0741, 0.0414, 0.0376, 0.1091, 0.0808, 0.0860,
         0.0338, 0.0289, 0.0686, 0.0551, 0.0890, 0.0535, 0.0574],
        [0.0486, 0.0700, 0.0776, 0.0796, 0.0426, 0.0398, 0.0968, 0.0645, 0.0694,
         0.0479, 0.0325, 0.0698, 0.0588, 0.0836, 0.0589, 0.0597],
        [0.0472, 0.0558, 0.0782, 0.0723, 0.0378, 0.0275, 0.1213, 0.0777, 0.1053,
         0.0293, 0.0338, 0.0658, 0.0474, 0.0798, 0.0511, 0.0700],
        [0.0424, 0.0585, 0.0837, 0.0704, 0.0475, 0.0430, 0.0946, 0.0803, 0.0861,
         0.0366, 0.0335, 0.0660, 0.0581, 0.0853, 0.0557, 0.0584],
        [0.0454, 0.0643, 0.0796, 0.0715, 0.0466, 0.0434, 0.0931, 0.0741, 0.0801,
         0.0410, 0.0342, 0.0670, 0.0577, 0.0851, 0.0574, 0.0593],
        [0.0380, 0.0606, 0.0871, 0.0812, 0.0356, 0.0360, 0.1190, 0.0721, 0.0751,
         0.0366, 0.0235, 0.0829, 0.05

In [17]:
# Same expression as the last line of the previous cell: row-wise softmax of
# `t`. Requires `t` to be the tensor bound there, not the dataset.
t.softmax(dim=-1)

tensor([[0.0477, 0.0561, 0.0707, 0.0713, 0.0322, 0.0288, 0.1416, 0.0733, 0.0969,
         0.0315, 0.0316, 0.0734, 0.0482, 0.0638, 0.0587, 0.0743],
        [0.0406, 0.0597, 0.0842, 0.0741, 0.0414, 0.0376, 0.1091, 0.0808, 0.0860,
         0.0338, 0.0289, 0.0686, 0.0551, 0.0890, 0.0535, 0.0574],
        [0.0486, 0.0700, 0.0776, 0.0796, 0.0426, 0.0398, 0.0968, 0.0645, 0.0694,
         0.0479, 0.0325, 0.0698, 0.0588, 0.0836, 0.0589, 0.0597],
        [0.0472, 0.0558, 0.0782, 0.0723, 0.0378, 0.0275, 0.1213, 0.0777, 0.1053,
         0.0293, 0.0338, 0.0658, 0.0474, 0.0798, 0.0511, 0.0700],
        [0.0424, 0.0585, 0.0837, 0.0704, 0.0475, 0.0430, 0.0946, 0.0803, 0.0861,
         0.0366, 0.0335, 0.0660, 0.0581, 0.0853, 0.0557, 0.0584],
        [0.0454, 0.0643, 0.0796, 0.0715, 0.0466, 0.0434, 0.0931, 0.0741, 0.0801,
         0.0410, 0.0342, 0.0670, 0.0577, 0.0851, 0.0574, 0.0593],
        [0.0380, 0.0606, 0.0871, 0.0812, 0.0356, 0.0360, 0.1190, 0.0721, 0.0751,
         0.0366, 0.0235, 0.0829, 0.05