# Dataloading 01

In this notebook, we'll figure out how to use PyTorch's DataLoader class to load our massive files without reading the entirety of them into memory

In [1]:
import comet_ml
import dask.dataframe as dd
import pandas as pd 
import torch
import linecache 
import csv
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
import torch.nn.functional as F
import sys, os
from pathlib import Path
import plotly.express as px 
from sklearn.utils.class_weight import compute_class_weight
import torch

sys.path.append('../src')
sys.path.append('../src/models/lib')
here = Path().cwd()

We'll first design a custom dataset to use with PyTorch's `DataLoader` class

In [2]:
from models.lib.neural import *
from models.lib.data import *

In [3]:
datafiles = ['../data/processed/primary.csv']
labelfiles = ['../data/processed/meta_primary_labels.csv']

train, test, insize, outsize, weights = \
    generate_datasets(datafiles, labelfiles, class_label='Type', stratified=True)

In [4]:
len(train), len(test)

(151527, 37882)

In [5]:
weights

tensor([69.8926,  0.1540,  4.9032,  0.9191,  4.1998, 52.1788, 10.0323,  6.4579,
         0.6407,  7.7278])

In [6]:
model = GeneClassifier(N_features=insize, N_labels=outsize)

In [9]:
train = DataLoader(train, num_workers=0, batch_size=4)
val = DataLoader(test, num_workers=0, batch_size=4)

In [8]:
generate_trainer()

In [59]:
model(X)

tensor([[ 0.4000, -0.9368,  0.1138, -0.1338, -0.1483, -1.5089, -0.3255, -0.2571,
         -0.5768,  0.4814],
        [-0.7349, -0.1435,  0.1228, -0.7582, -0.1529,  1.5237,  0.3549, -0.2442,
         -0.0928,  0.9806],
        [ 1.0578,  0.7611, -0.0709,  0.1246,  0.5997,  0.7226,  0.8388,  0.0865,
         -0.0727, -0.4233],
        [-0.6517,  0.3108, -0.2082,  0.8627, -0.1989, -0.8510, -0.8709,  0.3801,
          0.6751, -1.0508]], grad_fn=<AddmmBackward0>)

In [60]:
%%time
calc_accuracy(model, X, y)

CPU times: user 34.5 ms, sys: 4.03 ms, total: 38.5 ms
Wall time: 36.2 ms


0.0

In [61]:
from torchmetrics import Accuracy

In [62]:
acc = Accuracy()


In [63]:
%%time
acc(model(X), y)

CPU times: user 21.2 ms, sys: 3.23 ms, total: 24.4 ms
Wall time: 22.7 ms


tensor(0.)

In [64]:
acc(model(X).softmax(), y)

TypeError: softmax() received an invalid combination of arguments - got (), but expected one of:
 * (int dim, torch.dtype dtype)
 * (name dim, *, torch.dtype dtype)


In [65]:
y_hat = model(X)
y_hat = y_hat.softmax(dim=-1)

In [66]:
acc(y_hat, y)

tensor(0.)

In [67]:
calc_accuracy(model, X, y)

0.0

In [68]:
X2, y2 = (next(iter(data)))

In [69]:
data = iter(data)

In [70]:
data

<torch.utils.data.dataloader._SingleProcessDataLoaderIter at 0x7fab5232c100>

In [71]:
from torchmetrics import Accuracy, Precision, Recall

acc = Accuracy()
prec = Precision()
rec = Recall()

In [72]:
acc(model(X), y)

tensor(0.)

In [73]:
prec(model(X), y)

tensor(0.)

In [74]:
rec(model(X), y)

tensor(0.)