# Dataloading 01

In this notebook, we'll figure out how to use PyTorch's DataLoader class to load our massive files without reading the entirety of them into memory

In [1]:
import dask.dataframe as dd
import pandas as pd 
import torch
import linecache 
import csv
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
torch.manual_seed(0)

<torch._C.Generator at 0x7fe0e153de50>

We'll first design a custom dataset to use with PyTorch's `DataLoader` class

In [70]:
class GeneExpressionData(Dataset):
    def __init__(self, filename, labelname):
        self._filename = filename
        self._labelname = labelname
        self._total_data = 0
        
        with open(filename, "r") as f:
            self._total_data = len(f.readlines()) - 1
    
    def __getitem__(self, idx):
        line = linecache.getline(self._filename, idx + 2)
        label = linecache.getline(self._labelname, idx + 2)
        
        csv_data = csv.reader([line])
        csv_label = csv.reader([label])
        
        data = [x for x in csv_data][0]
        label = [x for x in csv_label][0]
        
        return torch.from_numpy(np.array([float(x) for x in data[1:]])).float(), [int(float(x)) for x in label][0]
    
    def __len__(self):
        return self._total_data
    
    def num_labels(self):
        return pd.read_csv(self._labelname)['# label'].nunique()
    
    def num_features(self):
        return len(self.__getitem__(0)[0])
    
class CustomDataset2(Dataset):
    def __init__(self, data, labels):
        self.data = pd.read_csv(data)
        self.labels = pd.read_csv(labels)
        
    def __getitem__(self, i):
        return torch.from_numpy(self.data.iloc[i, :].values).float(), self.labels.iloc[i, :].values[0]
    
    def __len__(self):
        return self.data.shape[0]
    
    def num_labels(self):
        return self.labels['# label'].nunique()
    
    def num_features(self):
        return len(self.__getitem__(0)[0])

Since PyTorch loss functions require classes in $[0, C]$, we'll first add $1$ to the labels and re-write it out so we can use it for training

In [71]:
def fix_labels(file):
    labels = pd.read_csv(file)
    labels['# label'] = labels['# label'].astype(int) + 1
    labels.to_csv('fixed_' + file.split('/')[-1], index=False)

fix_labels('../data/processed/labels/primary_labels_neighbors_50_components_50_clust_size_100.csv')

Great, we now continue as normal

In [72]:
t = GeneExpressionData(
    filename='../data/processed/pca/pca_components_100_primary.csv',
    labelname='fixed_primary_labels_neighbors_50_components_50_clust_size_100.csv'
)
t2 = CustomDataset2(
    '../data/processed/pca/pca_components_50_primary.csv',
    'fixed_primary_labels_neighbors_50_components_50_clust_size_100.csv'
)

In [74]:
t2.num_labels()

16

In [75]:
t.__getitem__(3)

(tensor([-1.3597e+00,  2.1595e-01,  2.9131e+00, -1.5263e+00, -1.3147e+00,
         -8.7690e-01, -7.5645e-01,  2.6089e+00,  6.7652e-01,  6.7589e-01,
          1.9877e-01, -6.4176e-01,  1.9006e+00, -5.0790e-01,  8.5522e-01,
          4.5851e-01, -2.0374e-02, -8.0659e-01, -4.3794e-01, -7.1086e-01,
         -1.5406e+00,  6.7969e-01, -2.2015e+00, -5.4034e-01,  2.3422e-01,
          3.8402e-01, -9.5410e-01, -1.8976e-01,  7.4636e-01,  1.4714e+00,
         -3.6774e-01, -1.0464e+00, -5.8961e-01,  2.1272e+00,  1.3821e-02,
         -2.2644e-01, -2.0342e-01,  1.6462e+00, -1.5513e-01,  7.7032e-01,
         -1.5469e+00,  5.9110e-01, -3.3212e-01,  3.3323e-01, -6.0824e-01,
         -1.1824e-01, -7.9207e-01,  3.8812e-02, -9.4262e-01,  1.3539e+00,
         -2.1229e-02,  7.1252e-01, -9.6122e-01,  1.5301e+00, -6.1528e-01,
          2.8482e-01, -3.5894e-01, -5.0734e-02,  1.7886e-01, -1.8986e-03,
          5.3039e-01, -7.2092e-01,  7.6054e-01, -1.8609e-01, -3.6199e-01,
         -7.0708e-01, -2.3196e-01,  1.

Let's see how fast it takes to load a minibatch of data

In [76]:
%%time 

for i in range(64):
    t.__getitem__(i)

CPU times: user 13.2 ms, sys: 2.69 ms, total: 15.9 ms
Wall time: 15.4 ms


Before we train our model, we need to split our data into training and testing sets, in order to get an unbiased evaluation of our model's performance. Likely, we will initially overfit the training set since we provide no regularization.

In [78]:
train_size = int(0.8 * len(t))
test_size = len(t) - train_size

train, test = torch.utils.data.random_split(t2, [train_size, test_size])

In [88]:
traindata = DataLoader(train, batch_size = 64, num_workers = 0)
valdata = DataLoader(test, batch_size = 64, num_workers = 0)

Now that we've defined our `DataLoader`, let's test it when training a simple Neural Network

In [89]:
class NN(nn.Module):
    def __init__(self, N_features, N_labels):
        super().__init__()
        
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(N_features, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, N_labels),
        )
        
    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [90]:
model = NN(
    N_features=t2.num_features(),
    N_labels=t2.num_labels()
)

Now we can define our criterion, optimization method and train our model on our dataset

In [91]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)
loss_arr = []

And finally train our model

In [92]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Precision, Recall, Loss

# model = NN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

trainer = create_supervised_trainer(model, optimizer, criterion)

val_metrics = {
    "precision": Precision(),
    "recall": Recall(),
    "crossentropy": Loss(criterion)
}

evaluator = create_supervised_evaluator(model, metrics=val_metrics)

log_interval = 1

@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(trainer):
    evaluator.run(traindata)
    metrics = evaluator.state.metrics
    print(
        f"Training Results - Epoch: {trainer.state.epoch}\
        Avg recall: {metrics['recall']} \
        Avg precision: {metrics['precision']} \
        Avg loss: {metrics['crossentropy']}"
    )

In [None]:
trainer.run(traindata, max_epochs=10)

Training Results - Epoch: 1        Avg recall: tensor([0.1636, 0.9152, 0.5818, 0.5511, 0.5631, 0.0000, 0.0000, 0.0000, 0.4562,
        0.1930, 0.1135, 0.4007, 0.6225, 0.0000, 0.0558, 0.3065],
       dtype=torch.float64)         Avg precision: tensor([0.4611, 0.8030, 0.8533, 0.5541, 0.3985, 0.0000, 0.0000, 0.0000, 0.4155,
        0.2575, 0.2576, 0.3353, 0.3574, 0.0000, 0.6388, 0.3890],
       dtype=torch.float64)         Avg loss: 1.6443173081365037
Training Results - Epoch: 2        Avg recall: tensor([0.1375, 0.9206, 0.4864, 0.5456, 0.5334, 0.0000, 0.0000, 0.0000, 0.4143,
        0.2775, 0.1424, 0.3932, 0.6597, 0.0043, 0.1739, 0.3389],
       dtype=torch.float64)         Avg precision: tensor([0.4699, 0.7940, 0.9554, 0.5507, 0.4284, 0.0000, 0.0000, 0.0000, 0.4336,
        0.2483, 0.2588, 0.3575, 0.3957, 0.6000, 0.5028, 0.4044],
       dtype=torch.float64)         Avg loss: 1.6164747041781333
Training Results - Epoch: 3        Avg recall: tensor([0.1826, 0.9251, 0.7091, 0.5515, 0.5354,