# Dataloading 01

In this notebook, we'll figure out how to use PyTorch's DataLoader class to load our massive files without reading the entirety of them into memory

In [29]:
import dask.dataframe as dd
import pandas as pd 
import torch
import linecache 
import csv
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
torch.manual_seed(0)

<torch._C.Generator at 0x7fbbb093be30>

We'll first design a custom dataset to use with PyTorch's `DataLoader` class

In [42]:
class GeneExpressionData(Dataset):
    def __init__(self, filename, labelname):
        self._filename = filename
        self._labelname = labelname
        self._total_data = 0
        
        with open(filename, "r") as f:
            self._total_data = len(f.readlines()) - 1
    
    def __getitem__(self, idx):
        if idx == 0:
            return self.__getitem__(1)
        
        line = linecache.getline(self._filename, idx + 1)
        label = linecache.getline(self._labelname, idx + 1)
        
        csv_data = csv.reader([line])
        csv_label = csv.reader([label])
        
        data = [x for x in csv_data][0]
        label = [x for x in csv_label][0]
        
        return torch.from_numpy(np.array([float(x) for x in data])).float(), [int(float(x)) for x in label][0]
    
    def __len__(self):
        return self._total_data
    
    def num_labels(self):
        return pd.read_csv(self._labelname)['# label'].nunique()
    
    def num_features(self):
        return len(self.__getitem__(0)[0])

Since PyTorch loss functions require classes in $[0, C]$, we'll first add $1$ to the labels and re-write it out so we can use it for training

In [43]:
def fix_labels(file):
    labels = pd.read_csv(file)
    labels['# label'] = labels['# label'].astype(int) + 1
    labels.to_csv('fixed_' + file.split('/')[-1], index=False)

fix_labels('../data/processed/primary_labels_neighbors_50_components_50_clust_size_100.csv')

Let's test this quickly and then continue

In [44]:
test = pd.read_csv('../data/processed/primary_labels_neighbors_50_components_50_clust_size_100.csv')

Great, we now continue as normal

In [45]:
t = GeneExpressionData(
    filename='../data/processed/primary_reduction_neighbors_50_components_50.csv',
    labelname='fixed_primary_labels_neighbors_50_components_50_clust_size_100.csv'
)

In [46]:
test = pd.read_csv('../data/processed/primary_reduction_neighbors_50_components_50.csv')
test.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
0,0,1.744161,4.117685,4.33174,4.484131,6.329494,4.800045,8.771153,5.297653,1.556207,...,5.40235,3.36863,2.418743,6.432809,0.835661,1.250952,6.484134,6.12385,5.155292,5.048383
1,1,1.736738,4.116271,4.351516,4.516574,6.31279,4.82383,8.744577,5.311924,1.578302,...,5.4268,3.394699,2.405094,6.430271,0.767878,1.236046,6.4678,6.092812,5.154266,5.026052
2,2,1.746748,4.142232,4.338512,4.473803,6.406012,4.792516,8.739078,5.297864,1.562608,...,5.414892,3.362663,2.415097,6.435054,0.771711,1.220557,6.511899,6.157069,5.161437,5.075896
3,3,1.700582,4.098061,4.376571,4.506793,6.336909,4.819511,8.761378,5.286463,1.478201,...,5.396242,3.379358,2.37604,6.44328,0.805606,1.196619,6.480151,6.122002,5.148429,5.038345
4,4,1.747905,4.11343,4.337235,4.481512,6.325781,4.797932,8.767736,5.293727,1.562555,...,5.405585,3.369891,2.417424,6.433032,0.827843,1.240131,6.488463,6.131742,5.153129,5.050637


Let's see how fast it takes to load a minibatch of data

In [48]:
%%time 

for i in range(64):
    t.__getitem__(i)

CPU times: user 5.34 ms, sys: 330 µs, total: 5.67 ms
Wall time: 10.6 ms


In [49]:
t.num_labels()

16

Before we train our model, we need to split our data into training and testing sets, in order to get an unbiased evaluation of our model's performance. Likely, we will initially overfit the training set since we provide no regularization.

In [50]:
train_size = int(0.8 * len(t))
test_size = len(t) - train_size

train, test = torch.utils.data.random_split(t, [train_size, test_size])

In [51]:
traindata = DataLoader(train, batch_size = 8, num_workers = 0)
valdata = DataLoader(test, batch_size = 8, num_workers = 0)

Now that we've defined our `DataLoader`, let's test it when training a simple Neural Network

In [52]:
class NN(nn.Module):
    def __init__(self, N_features, N_labels):
        super().__init__()
        
        self.network = nn.Sequential(
            nn.BatchNorm1d(num_features=N_features),
            nn.Linear(in_features=N_features, out_features=16),
            nn.ReLU(),
            nn.Linear(in_features=16, out_features=32),
            nn.ReLU(),
            nn.Linear(in_features=32, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=N_labels),
        )
        
    def forward(self, x):
        return self.network(x)

In [53]:
model = NN(
    N_features=t.num_features(),
    N_labels=t.num_labels()
)

m = torch.randn(10, t.num_features(), 1)

m.unsqueeze(dim=3).shape

torch.Size([10, 51, 1, 1])

Now we can define our criterion, optimization method and train our model on our dataset

In [54]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)
loss_arr = []

And finally train our model

In [58]:
epochs = 100000

for i in range(epochs):
    model.train()

    for X, y in traindata:
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        
        loss_arr.append(loss.item())
        
    print(f'Epoch {i} is {loss_arr[i]}')

Epoch 0 is 2.791111469268799
Epoch 1 is 2.796858310699463
Epoch 2 is 2.7463631629943848
Epoch 3 is 2.812925338745117
Epoch 4 is 2.726506233215332
Epoch 5 is 2.7621710300445557
Epoch 6 is 2.7859160900115967
Epoch 7 is 2.7637641429901123
Epoch 8 is 2.7586119174957275


KeyboardInterrupt: 

In [None]:
# 1.0587053998627307
# 1.058727260653217
# 1.0583432531826011