# Dataloading 01

In this notebook, we'll work out how to use PyTorch's `DataLoader` class to load our very large data files without reading them entirely into memory.

In [31]:
import comet_ml
import dask.dataframe as dd
import pandas as pd 
import torch
import linecache 
import csv
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.loggers import CometLogger
import torch.nn.functional as F
import sys, os
from pathlib import Path
import plotly.express as px 
from sklearn.utils.class_weight import compute_class_weight
import torch

sys.path.append('../src')
sys.path.append('../src/models/lib')
here = Path().cwd()

We'll first design a custom dataset to use with PyTorch's `DataLoader` class.

In [32]:
from models.lib.neural import *
from models.lib.data import *

In [33]:
# Paths to the processed expression matrix and its matching label file
# (relative to the notebook's working directory).
datafiles = ['../data/processed/primary.csv']
labelfiles = ['../data/processed/meta_primary_labels.csv']

# Build the train/test splits on the 'Type' column with stratification on;
# the call also returns the input size, output size, and a weights tensor.
train, test, insize, outsize, weights = generate_datasets(
    datafiles,
    labelfiles,
    class_label='Type',
    stratified=True,
)

In [34]:
# Sizes of the train and test splits returned by generate_datasets
len(train), len(test)

(151527, 37882)

In [35]:
# Weights tensor returned by generate_datasets
# (presumably one weight per class label — TODO confirm)
weights

tensor([69.8926,  0.1540,  4.9032,  0.9191,  4.1998, 52.1788, 10.0323,  6.4579,
         0.6407,  7.7278])

In [36]:
# Wrap both splits in DataLoaders: tiny batches, loading in the main process.
# NOTE(review): `train` is rebound from the dataset to its DataLoader here, so
# re-running this cell without re-running the dataset cell wraps a DataLoader
# in another DataLoader. No `shuffle` argument is passed either — confirm that
# iterating the training data in a fixed order is intended.
train = DataLoader(train, batch_size=4, num_workers=0)
val = DataLoader(test, batch_size=4, num_workers=0)

# Classifier sized from the input/output dimensions reported by generate_datasets.
model = GeneClassifier(N_features=insize, N_labels=outsize)

In [37]:
# Pull the first batch from the training DataLoader and unpack it as (X, y);
# X is fed to the model below and y is used as the target for the metrics.
X, y = next(iter(train))

In [38]:
# Forward pass on the sample batch before trainer.fit is called;
# displays the raw model outputs.
model(X)

tensor([[-0.6798,  0.3413, -0.3399,  1.3097,  0.2905,  0.0935, -0.0018, -0.8892,
          0.8596, -0.2144],
        [ 0.5010, -1.2067,  0.3861, -0.0619, -1.3026, -0.9374, -0.3402, -0.1631,
          0.2862, -0.1166],
        [ 0.4340,  0.5259, -0.2140, -0.5265, -0.2789,  0.4569,  0.8034,  0.8394,
         -0.7086,  0.1767],
        [-0.1738,  0.3566,  0.1026, -0.6650,  1.2814,  0.4737, -0.4216,  0.1999,
         -0.3543,  0.1748]], grad_fn=<AddmmBackward0>)

In [None]:
# Read the Comet API key from the environment instead of embedding it in the
# notebook, where anyone with read access to the file could use it.
# Set COMET_API_KEY before starting the kernel; a missing variable fails fast
# with a KeyError rather than starting an unauthenticated run.
comet_logger = CometLogger(
    api_key=os.environ["COMET_API_KEY"],
    project_name="cell-classifier-test",  # Optional
    workspace="jlehrer1",
)

# Trainer with default settings, logging to Comet.
trainer = pl.Trainer(logger=comet_logger)

# Fit on the training loader, validating on the held-out loader.
trainer.fit(model, train, val)

CometLogger will be initialized in online mode
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/jlehrer1/cell-classifier-test/6ef0cb33786a4a78bcdb252b2ca35526


  | Name              | Type       | Params
-------------------------------------------------
0 | flatten           | Flatten    | 0     
1 | linear_relu_stack | Sequential | 17.8 M
-------------------------------------------------
17.8 M    Trainable params
0         Non-trainable params
17.8 M    Total params
71.094    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

In [None]:
# Forward pass on the same sample batch, placed after the training cell,
# for comparison with the outputs shown before fitting.
model(X)

In [None]:
%%time
# Time the project's calc_accuracy helper on the sample batch
calc_accuracy(model, X, y)

In [None]:
from torchmetrics import Accuracy

In [None]:
# Instantiate the torchmetrics Accuracy metric with its default arguments
acc = Accuracy()


In [None]:
%%time
# Time the torchmetrics Accuracy metric on the raw model outputs for the sample batch
acc(model(X), y)

In [None]:
# Accuracy on softmax-normalised outputs. Tensor.softmax requires an explicit
# `dim`; normalise over the last (class) dimension of the model output.
acc(model(X).softmax(dim=-1), y)

In [None]:
# Model outputs for the sample batch, softmax-normalised over the last dimension.
y_hat = model(X).softmax(dim=-1)

In [None]:
# Accuracy metric evaluated on the softmax-normalised outputs
acc(y_hat, y)

In [None]:
# Same batch through the project's calc_accuracy helper, for comparison
# with the torchmetrics result.
calc_accuracy(model, X, y)

In [None]:
# Draw a batch from the training DataLoader. This previously read from `data`,
# which no earlier cell defines, so it failed on a fresh kernel.
# NOTE(review): `train` is assumed to be the intended source — confirm.
X2, y2 = next(iter(train))

In [None]:
# Build the batch iterator from the training DataLoader so this cell neither
# depends on a pre-existing `data` variable nor re-wraps it on every re-run.
# NOTE(review): `train` is assumed to be the intended source — confirm.
data = iter(train)

In [None]:
# Display the repr of `data`
data

In [None]:
from torchmetrics import Accuracy, Precision, Recall

# One instance of each classification metric, all with default arguments.
# NOTE(review): recent torchmetrics releases require a `task` argument for
# these classes — confirm the installed version accepts the no-argument form.
acc, prec, rec = Accuracy(), Precision(), Recall()

In [None]:
# Accuracy of the model's outputs on the sample batch
acc(model(X), y)

In [None]:
# Precision of the model's outputs on the sample batch
prec(model(X), y)

In [None]:
# Recall of the model's outputs on the sample batch
rec(model(X), y)