# Dataloading 01

In this notebook, we'll figure out how to use PyTorch's DataLoader class to load our massive files without reading the entirety of them into memory

In [1]:
import dask.dataframe as dd
import pandas as pd 
import torch
import linecache 
import csv
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
torch.manual_seed(0)

<torch._C.Generator at 0x7fe1304dde70>

We'll first design a custom dataset to use with PyTorch's `DataLoader` class

In [2]:
class GeneExpressionData(Dataset):
    def __init__(self, filename, labelname):
        self._filename = filename
        self._labelname = labelname
        self._total_data = 0
        
        with open(filename, "r") as f:
            self._total_data = len(f.readlines()) - 1
    
    def __getitem__(self, idx):
        line = linecache.getline(self._filename, idx + 2)
        label = linecache.getline(self._labelname, idx + 2)
        
        csv_data = csv.reader([line])
        csv_label = csv.reader([label])
        
        data = [x for x in csv_data][0]
        label = [x for x in csv_label][0]
        
        return torch.from_numpy(np.array([float(x) for x in data[1:]])).float(), [int(float(x)) for x in label][0]
    
    def __len__(self):
        return self._total_data
    
    def num_labels(self):
        return pd.read_csv(self._labelname)['# label'].nunique()
    
    def num_features(self):
        return len(self.__getitem__(0)[0])

Since PyTorch loss functions require classes in $[0, C]$, we'll first add $1$ to the labels and re-write it out so we can use it for training

In [3]:
def fix_labels(file):
    labels = pd.read_csv(file)
    labels['# label'] = labels['# label'].astype(int) + 1
    labels.to_csv('fixed_' + file.split('/')[-1], index=False)

fix_labels('../data/processed/labels/primary_labels_neighbors_50_components_50_clust_size_100.csv')

Great, we now continue as normal

In [4]:
t = GeneExpressionData(
    filename='../data/processed/pca/pca_components_50_primary.csv',
    labelname='fixed_primary_labels_neighbors_50_components_50_clust_size_100.csv'
)

In [5]:
t.__getitem__(3)

(tensor([-0.7190,  3.1106, -2.7917, -0.5684,  0.8418, -3.6876,  0.8462,  0.8769,
          0.8501, -1.2519,  0.2484, -0.1826, -1.0845,  1.9183, -0.0116,  0.9930,
         -0.2720, -0.1639, -0.1635,  0.4015, -0.3665,  0.4014, -0.6133, -0.0702,
          0.3934,  1.2825, -0.5827,  0.1928, -0.3193, -0.6941,  0.0094,  0.6268,
         -0.4087,  0.7147,  1.3372,  0.7491,  0.0941,  0.9887, -0.8340, -1.0065,
         -0.1222, -1.0664, -0.1622, -0.3615,  0.7603, -0.9312, -0.1044, -0.4603,
          0.1306]),
 3)

Let's see how fast it takes to load a minibatch of data

In [6]:
%%time 

for i in range(64):
    t.__getitem__(i)

CPU times: user 3.35 ms, sys: 1.18 ms, total: 4.53 ms
Wall time: 3.52 ms


Before we train our model, we need to split our data into training and testing sets, in order to get an unbiased evaluation of our model's performance. Likely, we will initially overfit the training set since we provide no regularization.

In [7]:
train_size = int(0.8 * len(t))
test_size = len(t) - train_size

train, test = torch.utils.data.random_split(t, [train_size, test_size])

In [8]:
traindata = DataLoader(train, batch_size = 64, num_workers = 0)
valdata = DataLoader(test, batch_size = 64, num_workers = 0)

Now that we've defined our `DataLoader`, let's test it when training a simple Neural Network

In [9]:
class NN(nn.Module):
    def __init__(self, N_features, N_labels):
        super(NN, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(N_features, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 64),
            nn.ReLU(),
            nn.Linear(64, N_labels),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [10]:
model = NN(
    N_features=t.num_features(),
    N_labels=t.num_labels()
)

Now we can define our criterion, optimization method and train our model on our dataset

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)
loss_arr = []

And finally train our model

In [12]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Precision, Recall, Loss

# model = NN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

trainer = create_supervised_trainer(model, optimizer, criterion)

val_metrics = {
    "precision": Precision(),
    "recall": Recall(),
    "crossentropy": Loss(criterion)
}

evaluator = create_supervised_evaluator(model, metrics=val_metrics)

log_interval = 1

@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(trainer):
    evaluator.run(traindata)
    metrics = evaluator.state.metrics
    
    print(
        f"Training Results - Epoch: {trainer.state.epoch}\
        Avg loss: {metrics['crossentropy']}"
    )

In [13]:
trainer.run(traindata, max_epochs=50)

Training Results - Epoch: 1        Avg loss: 2.2940958954509756
Training Results - Epoch: 2        Avg loss: 2.293602996495674
Training Results - Epoch: 3        Avg loss: 2.2933361298646444
Training Results - Epoch: 4        Avg loss: 2.293051939588324
Training Results - Epoch: 5        Avg loss: 2.2925969876655645
Training Results - Epoch: 6        Avg loss: 2.2925751268750783
Training Results - Epoch: 7        Avg loss: 2.2925759518105684
Training Results - Epoch: 8        Avg loss: 2.2925759518105684
Training Results - Epoch: 9        Avg loss: 2.2925759518105684
Training Results - Epoch: 10        Avg loss: 2.2925759518105684
Training Results - Epoch: 11        Avg loss: 2.2925759518105684
Training Results - Epoch: 12        Avg loss: 2.2925759518105684
Training Results - Epoch: 13        Avg loss: 2.2925759518105684
Training Results - Epoch: 14        Avg loss: 2.2925759518105684
Training Results - Epoch: 15        Avg loss: 2.2925759518105684
Training Results - Epoch: 16        

Engine run is terminating due to exception: 


KeyboardInterrupt: 