# Dataloading 01

In this notebook, we'll figure out how to use PyTorch's DataLoader class to load our massive files without reading the entirety of them into memory

In [17]:
import comet_ml
import dask.dataframe as dd
import pandas as pd 
import torch
import linecache 
import csv
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.loggers import CometLogger
import torch.nn.functional as F
import sys, os
from pathlib import Path
import plotly.express as px 
from sklearn.utils.class_weight import compute_class_weight
import torch

sys.path.append('../src')
sys.path.append('../src/models/lib')
here = Path().cwd()

We'll first design a custom dataset to use with PyTorch's `DataLoader` class

In [18]:
from models.lib.neural import *
from models.lib.data import *

In [19]:
datafiles = ['../data/processed/primary.csv']
labelfiles = ['../data/processed/meta_primary_labels.csv']

train, test, insize, outsize, weights = \
    generate_datasets(datafiles, labelfiles, class_label='Type', stratified=True)

In [20]:
len(train), len(test)

(151527, 37882)

In [21]:
weights

tensor([69.8926,  0.1540,  4.9032,  0.9191,  4.1998, 52.1788, 10.0323,  6.4579,
         0.6407,  7.7278])

In [22]:
model = GeneClassifier(
    N_features=insize, 
    N_labels=outsize,
    params={
            'width': 1024,
            'layers': 2,
            'lr': 0.001,
            'momentum': 0,
            'weight_decay': 0.001
        },
    weighted_metrics=False
)

train = DataLoader(train, num_workers=0, batch_size=4)
val = DataLoader(test, num_workers=0, batch_size=4)

In [23]:
X, y = next(iter(train))

In [24]:
model(X)

tensor([[ 1.1333, -0.3828,  0.3608, -0.2938,  0.0883, -0.2186, -0.9397,  0.4611,
         -0.5906, -1.0151],
        [-0.6193, -0.0089,  0.2923, -0.4239,  0.0032, -0.7708, -0.0876,  0.2213,
         -0.2629,  0.1882],
        [ 0.0157, -0.5066,  0.1053,  0.8745, -0.7001,  0.6816,  1.2412, -0.6100,
          0.6397,  0.5132],
        [-0.4325,  0.8261, -0.6382, -0.1817,  0.5566,  0.1977, -0.1216, -0.0543,
          0.2404,  0.2682]], grad_fn=<AddmmBackward0>)

In [None]:
comet_logger = CometLogger(
    api_key="neMNyjJuhw25ao48JEWlJpKRR",
    project_name=f"cell-classifier-test",  # Optional
    workspace="jlehrer1",
)

trainer = pl.Trainer(logger=comet_logger)

trainer.fit(model, train, val)

CometLogger will be initialized in online mode
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/jlehrer1/cell-classifier-test/34583fb80d68490bbf89204ecd2d6b3f
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     loss [7897]                  : (0.00540797458961606, 6.022818565368652)
COMET INFO:     train_loss_epoch [2]         : (0.3755848705768585, 0.5826876759529114)
COMET INFO:     train_loss_step [1579]       : (0.009096709080040455, 6.052515029907227)
COMET INFO:     val_loss_epoch [2]           : (2.7649288177490234, 7.088824272155762)
COMET INFO:     val_loss_step [18942]        : (0.2742463946342468, 16.88737678527832)
COMET INFO:     weighted_train_accuracy [2]  : (0.83327

Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [10]:
model(X)

tensor([[-1.3049,  8.9003, -1.7906,  2.1041,  1.8243, -1.7744, -0.8112, -1.2819,
         -3.3153,  0.5295],
        [-2.4705,  8.2906, -4.4748,  3.8690, -0.3931, -1.9528, -3.9667, -1.1073,
         -1.6483,  0.9394],
        [-2.1421,  5.5030, -4.0730,  2.5152,  1.1927, -1.7682, -1.0582, -0.1848,
         -1.1301, -0.0446],
        [-0.0874, -0.7812,  5.4952, -2.9307, -3.8575, -0.1813,  0.5236, -1.7776,
          8.6499, -4.1772]], grad_fn=<AddmmBackward0>)

In [11]:
%%time
calc_accuracy(model, X, y)

NameError: name 'calc_accuracy' is not defined

In [12]:
from torchmetrics import Accuracy

In [13]:
acc = Accuracy()


In [14]:
%%time
acc(model(X), y)

CPU times: user 11.5 ms, sys: 1.45 ms, total: 12.9 ms
Wall time: 11.5 ms


tensor(1.)

In [15]:
acc(model(X).softmax(), y)

TypeError: softmax() received an invalid combination of arguments - got (), but expected one of:
 * (int dim, torch.dtype dtype)
 * (name dim, *, torch.dtype dtype)


In [None]:
y_hat = model(X)
y_hat = y_hat.softmax(dim=-1)

In [None]:
acc(y_hat, y)

In [None]:
calc_accuracy(model, X, y)

In [None]:
X2, y2 = (next(iter(data)))

In [None]:
data = iter(data)

In [None]:
data

In [None]:
from torchmetrics import Accuracy, Precision, Recall

acc = Accuracy()
prec = Precision()
rec = Recall()

In [None]:
acc(model(X), y)

In [None]:
prec(model(X), y)

In [None]:
rec(model(X), y)