# Dataloading 01

In this notebook, we'll figure out how to use PyTorch's DataLoader class to load our massive files without reading the entirety of them into memory

In [26]:
import comet_ml
import dask.dataframe as dd
import pandas as pd 
import torch
import linecache 
import csv
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
import torch.nn.functional as F
import sys, os


We'll first design a custom dataset to use with PyTorch's `DataLoader` class

In [5]:
class GeneExpressionData(Dataset):
    def __init__(self, filename, labelname):
        self._filename = filename
        self._labelname = labelname
        self._total_data = 0
        
        with open(filename, "r") as f:
            self._total_data = len(f.readlines()) - 1
    
    def __getitem__(self, idx):        
        line = linecache.getline(self._filename, idx + 2)
        label = linecache.getline(self._labelname, idx + 2)
        
        csv_data = csv.reader([line])
        csv_label = csv.reader([label])
        
        data = [x for x in csv_data][0]
        label = [x for x in csv_label][0]
        return torch.from_numpy(np.array([float(x) for x in data])).float(), [int(float(x)) for x in label][0]
    
    def __len__(self):
        return self._total_data
    
    def num_labels(self):
        return pd.read_csv(self._labelname)['# label'].nunique()
    
    def num_features(self):
        return len(self.__getitem__(0)[0])

Since PyTorch loss functions require classes in $[0, C]$, we'll first add $1$ to the labels and re-write it out so we can use it for training

In [11]:
def fix_labels(file):
    labels = pd.read_csv(file)
    labels['# label'] = labels['# label'].astype(int) + 1
    labels.to_csv('fixed_' + file.split('/')[-1], index=False)

fix_labels('../data/processed/labels/primary_labels_neighbors_50_components_100_clust_size_100.csv')

Great, we now continue as normal

In [12]:
t = GeneExpressionData(
    filename='../data/processed/umap/primary_reduction_neighbors_100_components_3.csv',
    labelname='fixed_primary_labels_neighbors_50_components_50_clust_size_100.csv'
)
t.num_labels()

16

Let's see how fast it takes to load a minibatch of data

In [13]:
%%time 

for i in range(64):
    t.__getitem__(i)

CPU times: user 2.82 ms, sys: 811 µs, total: 3.63 ms
Wall time: 5.44 ms


Before we train our model, we need to split our data into training and testing sets, in order to get an unbiased evaluation of our model's performance. Likely, we will initially overfit the training set since we provide no regularization.

In [14]:
train_size = int(0.8 * len(t))
test_size = len(t) - train_size

train, test = torch.utils.data.random_split(t, [train_size, test_size])

In [15]:
traindata = DataLoader(train, batch_size = 8, num_workers = 0)
valdata = DataLoader(test, batch_size = 8, num_workers = 0)

Now that we've defined our `DataLoader`, let's test it when training a simple Neural Network

## Using PyTorch Lightning

PyTorch lightning seems nicer than Ignite, especially for GPU training. Let's test it out 

In [16]:
class GeneClassifier(pl.LightningModule):
    def __init__(self, N_features, N_labels, weights, layers):
        """
        Initialize the gene classifier neural network

        Parameters:
        N_features: Number of features in the inpute matrix 
        N_labels: Number of classes 
        """

        super(GeneClassifier, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(N_features, 512),
            nn.ReLU(),
            nn.Linear(512, WIDTH),
            nn.ReLU(),
            nn.Linear(WIDTH, 512),
            nn.ReLU(),
            nn.Linear(512, 64),
            nn.ReLU(),
            nn.Linear(64, N_labels),
        )
        
        self.accuracy = Accuracy()
        self.weights = weights

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=1e-3, momentum=0.8)
        return optimizer

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y, weight=self.weights)
        acc = self.accuracy(y_hat.softmax(dim=-1), y)

        self.log("train_loss", loss, on_step=False, on_epoch=True, logger=True)
        self.log("train_accuracy", acc, on_step=False, on_epoch=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        val_loss = F.cross_entropy(y_hat, y, weight=self.weights)
        acc = self.accuracy(y_hat.softmax(dim=-1), y)

        self.log("val_loss", val_loss, on_step=False, on_epoch=True, logger=True)
        self.log("val_accuracy", acc, on_step=False, on_epoch=True, logger=True)
        return val_loss

In [17]:
from sklearn.utils.class_weight import compute_class_weight

def class_weights(label_df):
    label_df = pd.read_csv(label_df)
    
    weights = compute_class_weight(
        class_weight='balanced', 
        classes=np.unique(label_df), 
        y=label_df.values.reshape(-1)
    ) 

    weights = torch.from_numpy(weights)
    return weights.float()

weights = class_weights('fixed_primary_labels_neighbors_50_components_50_clust_size_100.csv')

In [18]:
weights

tensor([  0.6428,   2.0281,  43.2046,   1.8374,   0.8580,  33.3467, 102.0523,
        100.3226,   0.5029,   0.6207,   2.0534,   0.4522,   0.3983,  13.3462,
          1.3319,   0.3946])

In [19]:
model = NN(t.num_features(), t.num_labels(), weights)
model

NN(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=3, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=16, bias=True)
  )
  (accuracy): Accuracy()
)

In [28]:
comet_logger = pl.loggers.CometLogger(
    api_key="neMNyjJuhw25ao48JEWlJpKRR",
    project_name="gene-expression-classification",  # Optional
    experiment_name="test_01"
)


checkpointcallback = pl.callbacks.ModelCheckpoint(
    dirpath='checkpoints',
    filename='classifier-checkpoint-{epoch}-{self.width}-{self.layers}',
    every_n_epochs=1,
)

class UploadCallback(pl.callbacks.Callback):
    def __init__(self, path, WIDTH, LAYERS) -> None:
        super().__init__()
        self.path = path 
        self.width = WIDTH
        self.layers = LAYERS

    def on_train_epoch_end(self, trainer, pl_module):
        epoch = trainer.current_epoch
        print ('Uploading file...')

CometLogger will be initialized in online mode


In [29]:
trainer = pl.Trainer(auto_lr_find=True, max_epochs=10, logger=comet_logger)
trainer.fit(model, traindata, valdata)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/jlehrer1/gene-expression-classification/abcf0e8a7670451e83f0f54799432cf9


  | Name              | Type       | Params
-------------------------------------------------
0 | flatten           | Flatten    | 0     
1 | linear_relu_stack | Sequential | 35.9 K
2 | accuracy          | Accuracy   | 0     
-------------------------------------------------
35.9 K    Trainable params
0         Non-trainable params
35.9 K    Total params
0.144     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]



MisconfigurationException: The metric `tensor([ 8., 12.,  9.,  3., 15.,  8., 12.,  8.])` does not contain a single element, thus it cannot be converted to a scalar.

In [15]:
t[0]

(tensor([-2.6639,  2.5859,  6.3119]), 3)

In [16]:
t = torch.from_numpy(a)
l = torch.from_numpy(l)
t.softmax(dim=-1)

tensor([[0.0477, 0.0561, 0.0707, 0.0713, 0.0322, 0.0288, 0.1416, 0.0733, 0.0969,
         0.0315, 0.0316, 0.0734, 0.0482, 0.0638, 0.0587, 0.0743],
        [0.0406, 0.0597, 0.0842, 0.0741, 0.0414, 0.0376, 0.1091, 0.0808, 0.0860,
         0.0338, 0.0289, 0.0686, 0.0551, 0.0890, 0.0535, 0.0574],
        [0.0486, 0.0700, 0.0776, 0.0796, 0.0426, 0.0398, 0.0968, 0.0645, 0.0694,
         0.0479, 0.0325, 0.0698, 0.0588, 0.0836, 0.0589, 0.0597],
        [0.0472, 0.0558, 0.0782, 0.0723, 0.0378, 0.0275, 0.1213, 0.0777, 0.1053,
         0.0293, 0.0338, 0.0658, 0.0474, 0.0798, 0.0511, 0.0700],
        [0.0424, 0.0585, 0.0837, 0.0704, 0.0475, 0.0430, 0.0946, 0.0803, 0.0861,
         0.0366, 0.0335, 0.0660, 0.0581, 0.0853, 0.0557, 0.0584],
        [0.0454, 0.0643, 0.0796, 0.0715, 0.0466, 0.0434, 0.0931, 0.0741, 0.0801,
         0.0410, 0.0342, 0.0670, 0.0577, 0.0851, 0.0574, 0.0593],
        [0.0380, 0.0606, 0.0871, 0.0812, 0.0356, 0.0360, 0.1190, 0.0721, 0.0751,
         0.0366, 0.0235, 0.0829, 0.05

In [17]:
t.softmax(dim=-1)

tensor([[0.0477, 0.0561, 0.0707, 0.0713, 0.0322, 0.0288, 0.1416, 0.0733, 0.0969,
         0.0315, 0.0316, 0.0734, 0.0482, 0.0638, 0.0587, 0.0743],
        [0.0406, 0.0597, 0.0842, 0.0741, 0.0414, 0.0376, 0.1091, 0.0808, 0.0860,
         0.0338, 0.0289, 0.0686, 0.0551, 0.0890, 0.0535, 0.0574],
        [0.0486, 0.0700, 0.0776, 0.0796, 0.0426, 0.0398, 0.0968, 0.0645, 0.0694,
         0.0479, 0.0325, 0.0698, 0.0588, 0.0836, 0.0589, 0.0597],
        [0.0472, 0.0558, 0.0782, 0.0723, 0.0378, 0.0275, 0.1213, 0.0777, 0.1053,
         0.0293, 0.0338, 0.0658, 0.0474, 0.0798, 0.0511, 0.0700],
        [0.0424, 0.0585, 0.0837, 0.0704, 0.0475, 0.0430, 0.0946, 0.0803, 0.0861,
         0.0366, 0.0335, 0.0660, 0.0581, 0.0853, 0.0557, 0.0584],
        [0.0454, 0.0643, 0.0796, 0.0715, 0.0466, 0.0434, 0.0931, 0.0741, 0.0801,
         0.0410, 0.0342, 0.0670, 0.0577, 0.0851, 0.0574, 0.0593],
        [0.0380, 0.0606, 0.0871, 0.0812, 0.0356, 0.0360, 0.1190, 0.0721, 0.0751,
         0.0366, 0.0235, 0.0829, 0.05