# Modeling Sanity Check: Making sure everything is ok

In this notebook, we'll test our entire network pipeline because SURELY there are bugs.

In [1]:
import dask.dataframe as dd
import pandas as pd 
import torch
import linecache 
import csv
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import sys
import torch
import pytorch_lightning as pl
sys.path.append('../src/')

Let's define our custom data class and make sure everything is being streamed in correctly

In [2]:
from models.train_neural_network import GeneExpressionData, GeneClassifier

In [3]:
data = GeneExpressionData(
    filename='../data/processed/primary.csv',
    labelname='../data/processed/meta_primary_labels.csv',
    class_label='Subtype'
)

In [4]:
model = GeneClassifier(
    N_features = data.num_features(),
    N_labels = data.num_labels(),
    weights=data.compute_class_weights(),
    params={
        'width' : 2,
        'layers': 2,
        'epochs': 10,
        'lr': 3e-5,
        'momentum': 1e-4,
        'weight_decay': 1e-4
    }
)

Now that we have our dataset, at least make sure a forward pass is computing correctly, and that our model can at least overfit on a small subset of the training set. Therefore, we'll subset our dataset and create the train and val loaders this way.

In [5]:
from torch.utils.data import Subset

tr_10k = Subset(data, range(10))

In [6]:
def train_test(data):
    train_size = int(0.80 * len(data))
    test_size = len(data) - train_size

    train, test = torch.utils.data.random_split(data, [train_size, test_size])

    traindata = DataLoader(train, batch_size=2, num_workers=0)
    valdata = DataLoader(test, batch_size=2, num_workers=0)
    
    return traindata, valdata

train, test = train_test(tr_10k)

In [7]:
len(train), len(test)

(4, 1)

Even though we'll ultimately be using PyTorch Lightning for GPU training, let's try writing the training loop here so we can debug each step. To do this, we'll need to redefine the optimizer and loss

In [8]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [9]:
for epoch in range(1000):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, sample in enumerate(train, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = sample

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 100 == 0: # print every 2000 mini-batches
            print(epoch, running_loss / 100)
            running_loss = 0.0

print('Finished Training')

0 0.03588733196258545
1 0.040311880111694336
2 0.039396278858184815
3 0.033199169635772706
4 0.040228414535522464
5 0.03930528879165649
6 0.03773817777633667
7 0.033140125274658205
8 0.0401544189453125
9 0.034819321632385256
10 0.035713632106781
11 0.03307077407836914
12 0.040081353187561036
13 0.034741759300231934
14 0.03302297115325928
15 0.035627782344818115
16 0.035610709190368656
17 0.03996336221694946
18 0.034917333126068116
19 0.03991274356842041
20 0.034650473594665526
21 0.03462893962860107
22 0.034602615833282474
23 0.035491628646850584
24 0.03741095781326294
25 0.03480499267578125
26 0.03544099330902099
27 0.03542392253875733
28 0.03884379386901855
29 0.038814001083374024
30 0.03537285804748535
31 0.035355889797210695
32 0.035339035987854005
33 0.03532217025756836
34 0.03962238550186157
35 0.03461549758911133
36 0.03864071607589722
37 0.0395417046546936
38 0.03718757152557373
39 0.03522205829620361
40 0.035205457210540775
41 0.03518890380859375
42 0.0351727819442749
43 0.035

344 0.03384944677352905
345 0.030599570274353026
346 0.030584135055541993
347 0.03057019233703613
348 0.030555658340454102
349 0.033781270980834964
350 0.030527102947235107
351 0.030512573719024657
352 0.03373018264770508
353 0.03370457172393799
354 0.03047107219696045
355 0.02841529846191406
356 0.02839277982711792
357 0.028379065990447996
358 0.03363891124725342
359 0.03361757040023804
360 0.030382068157196046
361 0.03359306812286377
362 0.03357574939727783
363 0.030337235927581786
364 0.030323643684387207
365 0.03030982494354248
366 0.030295767784118653
367 0.030281267166137694
368 0.03346804618835449
369 0.030254485607147216
370 0.030239095687866212
371 0.03342144966125488
372 0.02817605495452881
373 0.030195670127868654
374 0.030180959701538085
375 0.03338095188140869
376 0.033363678455352784
377 0.030133934020996095
378 0.030119221210479736
379 0.028103785514831545
380 0.030090384483337403
381 0.030078508853912354
382 0.03326024532318115
383 0.028025383949279784
384 0.03004942655

681 0.026484751701354982
682 0.02944788694381714
683 0.026459064483642578
684 0.026447012424468994
685 0.026436057090759277
686 0.026423485279083253
687 0.02449915647506714
688 0.0293823504447937
689 0.02446373224258423
690 0.02637923717498779
691 0.02934474468231201
692 0.029277186393737793
693 0.02441908121109009
694 0.026331264972686768
695 0.02631979703903198
696 0.024380736351013184
697 0.026298072338104248
698 0.029274818897247316
699 0.029262685775756837
700 0.024331471920013427
701 0.02624964475631714
702 0.029238741397857666
703 0.026224629878997804
704 0.024343953132629395
705 0.026206533908843994
706 0.02619590997695923
707 0.029181954860687257
708 0.026177446842193603
709 0.02916750431060791
710 0.026153521537780763
711 0.02421332597732544
712 0.02613011121749878
713 0.02901627779006958
714 0.029108819961547853
715 0.02875765085220337
716 0.029048700332641602
717 0.02903396129608154
718 0.0290187931060791
719 0.02602915287017822
720 0.026018753051757812
721 0.02600675582885

In [14]:
from typing import *
from torchmetrics import Accuracy
import torch.nn.functional as F

class TEST(pl.LightningModule):
    def __init__(self, 
        N_features: int, 
        N_labels: int, 
        weights: List[torch.Tensor], 
        params: Dict[str, float],
    ):
        super(TEST, self).__init__()

        # Set hyperparameters
        self.width = params['width']
        self.layers = params['layers']
        self.lr = params['lr']
        self.momentum = params['momentum']
        self.weight_decay = params['weight_decay']

        layers = self.layers*[
            nn.Linear(self.width, self.width),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.BatchNorm1d(self.width),
        ]

        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(N_features, self.width),
            *layers,
            nn.Linear(self.width, N_labels),
        )

        self.accuracy = Accuracy(average='weighted', num_classes=N_labels)
        self.weights = weights

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(
            self.parameters(),
            lr=self.lr, 
            momentum=self.momentum, 
            weight_decay=self.weight_decay,
        )

        return optimizer

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y, weight=self.weights)
        acc = self.accuracy(y_hat.softmax(dim=-1), y)

        self.log("train_loss", loss, logger=True, on_epoch=True)
        self.log("train_accuracy", acc, logger=True, on_epoch=True)

        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        val_loss = F.cross_entropy(y_hat, y, weight=self.weights)
        acc = self.accuracy(y_hat.softmax(dim=-1), y)

        self.log("val_loss", val_loss, logger=True, on_epoch=True)
        self.log("val_accuracy", acc, logger=True, on_epoch=True)

        return val_loss
    
model = TEST(
    N_features = data.num_features(),
    N_labels = data.num_labels(),
    weights=data.compute_class_weights(),
    params={
        'width' : 2,
        'layers': 2,
        'epochs': 10,
        'lr': 0.001,
        'momentum': 0,
        'weight_decay':0,
    }
)

Our cost function is converging on a small subset of the data, which is good! Now let's try this same training routine with PyTorch Lightning to make sure nothing is going awry there.

In [None]:
from pytorch_lightning import Trainer

run = Trainer()
run.fit(model, train, test)