# Dataloading 01

In this notebook, we'll figure out how to use PyTorch's DataLoader class to load our massive files without reading the entirety of them into memory

In [1]:
import dask.dataframe as dd
import pandas as pd 
import torch
import linecache 
import csv
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# Seed PyTorch's global RNG so the random split and weight initialisation below are repeatable.
torch.manual_seed(0)

<torch._C.Generator at 0x7fbbb093be30>

We'll first design a custom dataset to use with PyTorch's `DataLoader` class

In [2]:
class GeneExpressionData(Dataset):
    """Map-style dataset that reads one CSV row and its label per lookup.

    Both files are expected to start with a single header line, followed by
    one line per sample; sample ``idx`` of the data file is paired with
    sample ``idx`` of the label file.

    Parameters
    ----------
    filename : str
        Path to the CSV file holding the numeric features.
    labelname : str
        Path to the CSV file holding the labels (first field of each row),
        with a ``# label`` header column.

    NOTE(review): ``linecache`` keeps the lines of every file it has served in
    an in-process cache, so after the first lookup the file contents are
    likely held in memory -- confirm this meets the notebook's memory goal.
    """

    def __init__(self, filename, labelname):
        self._filename = filename
        self._labelname = labelname

        # Stream the file to count its lines instead of materialising them
        # all with readlines(); subtract one for the header row.
        with open(filename, "r") as f:
            self._total_data = max(sum(1 for _ in f) - 1, 0)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``.

        ``features`` is a 1-D float32 tensor built from every field of the
        data row; ``label`` is the first field of the label row as an ``int``.

        Raises
        ------
        IndexError
            If ``idx`` is outside ``[-len(self), len(self))``.
        """
        idx = int(idx)
        if idx < 0:
            idx += self._total_data
        if not 0 <= idx < self._total_data:
            raise IndexError(
                f"index {idx} is out of range for dataset of size {self._total_data}"
            )

        # linecache line numbers are 1-based and line 1 is the header, so
        # sample 0 lives on line 2. (Using idx + 1 would return the header for
        # idx 0 and never reach the final row.)
        lineno = idx + 2
        line = linecache.getline(self._filename, lineno)
        label = linecache.getline(self._labelname, lineno)

        # Let the csv module do the field splitting / newline handling.
        data = list(csv.reader([line]))[0]
        label = list(csv.reader([label]))[0]

        features = torch.from_numpy(np.array([float(x) for x in data])).float()
        # Labels may be serialised as floats ("1.0"), hence int(float(...)).
        return features, int(float(label[0]))

    def __len__(self):
        """Number of samples (data lines, excluding the header)."""
        return self._total_data

    def num_labels(self):
        """Number of distinct values in the ``# label`` column of the label file."""
        return pd.read_csv(self._labelname)['# label'].nunique()

    def num_features(self):
        """Length of the feature vector, measured on the first sample."""
        return len(self[0][0])

Since PyTorch's classification losses (e.g. `nn.CrossEntropyLoss`) require class indices in $[0, C-1]$, we'll first add $1$ to the labels (which start at $-1$) and write them back out so we can use them for training

In [3]:
def fix_labels(file):
    """Shift every value in the ``# label`` column up by one and save a copy.

    Reads the CSV at ``file``, casts its ``# label`` column to ``int``, adds 1,
    and writes the result (without the index column) to ``fixed_<name>`` in
    the current working directory, where ``<name>`` is the part of ``file``
    after the last ``/``.
    """
    shifted = pd.read_csv(file)
    shifted['# label'] = shifted['# label'].astype(int).add(1)

    out_name = 'fixed_' + file.rsplit('/', 1)[-1]
    shifted.to_csv(out_name, index=False)

# Writes 'fixed_primary_labels_neighbors_500_components_100_clust_size_100.csv' into the current working directory.
fix_labels('../data/processed/primary_labels_neighbors_500_components_100_clust_size_100.csv')

Let's test this quickly and then continue

In [4]:
# Sanity check: reload the shifted label file and count how many samples fall in each class.
fixed_labels = pd.read_csv('fixed_primary_labels_neighbors_500_components_100_clust_size_100.csv')
fixed_labels['# label'].value_counts()

1    123188
0     64253
2      1968
Name: # label, dtype: int64

Great, we now continue as normal

In [22]:
# Pair the reduced expression matrix with the shifted labels written by
# fix_labels above. The label file must be the `clust_size_100` one: it is the
# only fixed label file this notebook generates, so pointing at any other
# variant only works when a stale file happens to be left on disk.
t = GeneExpressionData(
    filename='../data/processed/primary_reduction_neighbors_500_components_100.csv',
    labelname='fixed_primary_labels_neighbors_500_components_100_clust_size_100.csv'
)

In [23]:
# Eagerly load the whole feature CSV into a DataFrame for a quick visual inspection of its columns.
df = pd.read_csv('../data/processed/primary_reduction_neighbors_500_components_100.csv')
df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,0,1.174407,4.605017,5.700520,4.349964,0.009240,4.443735,4.977513,1.097707,4.923816,...,1.977319,6.758616,3.999133,4.044898,4.987543,1.790490,4.197807,6.984423,9.260715,7.174916
1,1,1.187639,4.608121,5.704485,4.363834,0.024695,4.443859,4.973516,1.105194,4.919911,...,1.981181,6.762886,3.999778,4.040472,4.999935,1.792208,4.192843,6.986716,9.258625,7.172100
2,2,1.179722,4.631567,5.731539,4.270429,0.007747,4.434293,4.971396,1.129670,4.909443,...,2.006164,6.765180,4.005325,4.039588,5.014343,1.786922,4.206350,6.986751,9.256733,7.179493
3,3,1.233760,4.637250,5.735640,4.311743,0.078151,4.433172,4.963012,1.207390,4.900370,...,2.000760,6.769579,3.998335,4.026979,4.999375,1.817481,4.194461,6.987063,9.243600,7.164564
4,4,1.188722,4.624152,5.721036,4.324148,0.035837,4.437555,4.966181,1.132194,4.908939,...,1.998834,6.764676,4.001324,4.034964,5.004535,1.799192,4.201400,6.986776,9.252455,7.172283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189404,189404,1.181309,4.649603,5.754858,4.248226,-0.014871,4.430346,4.971585,1.140923,4.899366,...,2.013938,6.766751,4.011445,4.043420,5.018497,1.773362,4.211319,6.986798,9.257227,7.180955
189405,189405,1.181091,4.658228,5.758984,4.250365,-0.020029,4.426039,4.967492,1.133520,4.895007,...,2.023807,6.771064,4.014005,4.036175,5.034298,1.776111,4.210604,6.986619,9.255919,7.182779
189406,189406,1.176980,4.635231,5.733450,4.225388,-0.008851,4.433665,4.974230,1.149803,4.905263,...,2.008648,6.768187,4.008596,4.042135,5.014659,1.782834,4.212437,6.986119,9.254880,7.180373
189407,189407,1.175157,4.637198,5.734905,4.307137,-0.013657,4.434302,4.975091,1.114408,4.909545,...,1.996098,6.758251,4.008240,4.046164,4.986329,1.776909,4.210703,6.980882,9.256554,7.178543


Let's see how long it takes to load a minibatch of data

In [7]:
%%time 

for i in range(64):
    t.__getitem__(i)

CPU times: user 110 ms, sys: 49.7 ms, total: 159 ms
Wall time: 158 ms


In [13]:
# Number of distinct classes in the label file; used below as the network's output size.
t.num_labels()

3

Before we train our model, we need to split our data into training and testing sets, in order to get an unbiased evaluation of our model's performance. Likely, we will initially overfit the training set since we provide no regularization.

In [9]:
# 80/20 split: the training share is truncated to an int and the test set
# takes whatever remains, so the two sizes always sum to len(t).
train_size = int(len(t) * 0.8)
test_size = len(t) - train_size

train, test = torch.utils.data.random_split(
    dataset=t,
    lengths=[train_size, test_size],
)

In [30]:
# Batches of 8 samples, loaded in the main process (no worker subprocesses).
traindata = DataLoader(dataset=train, batch_size=8, num_workers=0)
valdata = DataLoader(dataset=test, batch_size=8, num_workers=0)

Now that we've defined our `DataLoader`, let's test it when training a simple Neural Network

In [24]:
class NN(nn.Module):
    """Fully connected classifier: BatchNorm -> 16 -> 32 -> 64 -> ``N_labels``.

    Parameters
    ----------
    N_features : int
        Size of each input feature vector.
    N_labels : int
        Number of output logits (one per class).
    """

    def __init__(self, N_features, N_labels):
        super().__init__()

        # Normalise the inputs first, then stack Linear + ReLU blocks of
        # increasing width.
        layers = [nn.BatchNorm1d(num_features=N_features)]
        in_width = N_features
        for out_width in (16, 32, 64):
            layers.append(nn.Linear(in_features=in_width, out_features=out_width))
            layers.append(nn.ReLU())
            in_width = out_width

        # Final projection to one value per class; no activation is applied here.
        layers.append(nn.Linear(in_features=in_width, out_features=N_labels))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return its output."""
        return self.network(x)

In [25]:
# Size the network from the dataset: one input per feature, one output per class.
model = NN(N_features=t.num_features(), N_labels=t.num_labels())

# Scratch shape check: a random (10, n_features, 1) tensor gains a trailing axis.
m = torch.randn((10, t.num_features(), 1))

m.unsqueeze(3).shape

torch.Size([10, 101, 1, 1])

Now we can define our criterion, optimization method and train our model on our dataset

In [26]:
# Per-batch training losses are appended to this list by the training loop.
loss_arr = []

# Cross-entropy over the model's outputs, optimised with plain SGD.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)

And finally train our model

In [27]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss, ClassificationReport

# Build an ignite trainer from the model, optimizer and loss defined above.
# NOTE(review): `trainer` is not referenced by the manual training loop in the
# next cell -- confirm whether it is still needed.
trainer = create_supervised_trainer(model, optimizer, criterion)

In [28]:
epochs = 100000

for i in range(epochs):
    model.train()

    # loss_arr accumulates one entry per *batch* across all epochs, so remember
    # where this epoch's entries begin in order to summarise them afterwards.
    epoch_start = len(loss_arr)

    for X, y in traindata:
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        loss_arr.append(loss.item())

    # Report the mean batch loss of this epoch. Indexing loss_arr by the epoch
    # number would instead print the i-th batch loss of the whole run.
    epoch_losses = loss_arr[epoch_start:]
    epoch_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else float('nan')
    print(f'Epoch {i} is {epoch_loss}')

NameError: name 'traindata' is not defined

In [None]:
# 1.0587053998627307
# 1.058727260653217
# 1.0583432531826011