In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to local .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Imports

In [None]:
import numpy as np
import pandas as pd
from datasets import Dataset
from pytorch_lightning import LightningModule, Trainer
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import torch.nn as nn
import torchmetrics

## Load data

In [None]:
# Location of the NumPy archive holding the feature matrix and labels.
# NOTE(review): the "?dl=1" suffix looks like a leftover download-URL query
# string -- confirm the file on disk really carries this name.
EVENTS_PATH = "data/QG_nsubs.npz?dl=1"

# Pull both arrays out while the archive is open; the context manager closes
# the underlying file handle afterwards.
with np.load(EVENTS_PATH) as archive:
    X = archive['X']
    y = archive['y']

In [None]:
# Sanity check: the feature matrix and the label vector should have the same number of rows.
X.shape, y.shape

((100000, 45), (100000,))

In [None]:
# Shape of a single feature column (first column, all rows).
X[:,0].shape

(100000,)

In [None]:
# Label of the first sample.
y[0]

0.0

In [None]:
# Full feature vector of the first sample.
X[0]

array([0.58288529, 0.35983362, 0.15946587, 0.28643438, 0.10931263,
       0.03136041, 0.29199412, 0.10462717, 0.02529765, 0.26962497,
       0.08968088, 0.0185029 , 0.26170934, 0.08504337, 0.01718934,
       0.23999388, 0.07369913, 0.01286971, 0.24165983, 0.07294354,
       0.01229564, 0.21727208, 0.06340214, 0.01102354, 0.20602818,
       0.05551519, 0.00822112, 0.21182996, 0.05651417, 0.00794706,
       0.20925511, 0.05459273, 0.00716666, 0.19743325, 0.04912426,
       0.00563262, 0.19182676, 0.0455615 , 0.00474475, 0.18342361,
       0.04259337, 0.00448976, 0.17466835, 0.03901473, 0.00406034])

In [None]:
# Distinct label values present in y (determines the number of output classes).
np.unique(y)

array([0., 1.])

In [None]:
# One entry per feature column, keyed feature_0 ... feature_{n-1}. The column
# count is read from X itself instead of being hard-coded, so the cell keeps
# working if the number of features in the archive changes.
data = {f"feature_{idx}": X[:, idx] for idx in range(X.shape[1])}

In [None]:
# Attach the target vector under the "label" key, next to the feature columns.
data.update(label=y)

In [None]:
# Assemble the per-column arrays into one table (dict keys become column
# names) and preview the first rows.
df = pd.DataFrame.from_dict(data)
df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,label
0,0.582885,0.359834,0.159466,0.286434,0.109313,0.03136,0.291994,0.104627,0.025298,0.269625,...,0.191827,0.045562,0.004745,0.183424,0.042593,0.00449,0.174668,0.039015,0.00406,0.0
1,0.260756,0.084642,0.019812,0.240676,0.073122,0.015786,0.225725,0.063897,0.012387,0.215447,...,0.126111,0.021134,0.002316,0.122458,0.018581,0.001079,0.117416,0.017345,0.000992,0.0
2,0.564012,0.330684,0.127177,0.29271,0.110119,0.024787,0.258846,0.086578,0.017215,0.240176,...,0.150572,0.034919,0.003328,0.147431,0.032971,0.003017,0.143079,0.031147,0.002829,0.0
3,0.395935,0.170801,0.042085,0.344588,0.130606,0.026208,0.271026,0.090233,0.017891,0.243731,...,0.164546,0.03485,0.004036,0.151774,0.029703,0.002897,0.14963,0.028986,0.00285,0.0
4,0.196304,0.051556,0.011701,0.192528,0.048644,0.010743,0.178389,0.043404,0.00906,0.172218,...,0.0844,0.012952,0.001005,0.078391,0.011012,0.000815,0.076849,0.010029,0.000572,0.0


In [None]:
# Wrap the DataFrame in a `datasets.Dataset`.
# Original note: "Can't use categorical" -- presumably this conversion does
# not accept pandas categorical columns, which would be why `label` is left
# as a plain float column; TODO confirm.
# NOTE(review): `dset` is not referenced again in the cells visible here.
dset = Dataset.from_pandas(df)
dset

Dataset({
    features: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'label'],
    num_rows: 100000
})

## Define model

In [None]:
class DNNModel(LightningModule):
    """Fully connected classifier with two hidden ReLU layers.

    The network maps a flat feature vector to one raw logit per class and is
    trained with cross-entropy. Accuracy and ROC AUC are tracked on the
    validation set.

    Args:
        n_features: Width of the input feature vector.
        hidden_dim: Number of units in each of the two hidden layers.
        n_classes: Number of output classes (logits produced per sample).
        lr: Learning rate handed to the Adam optimizer.

    The defaults reproduce the original hard-coded configuration
    (45 -> 100 -> 100 -> 2, lr=0.02), so ``DNNModel()`` behaves as before.
    """

    def __init__(self, n_features=45, hidden_dim=100, n_classes=2, lr=0.02):
        super().__init__()
        self.lr = lr
        self.accuracy = torchmetrics.Accuracy()
        self.roc_auc = torchmetrics.AUROC(num_classes=n_classes)
        self.layers = nn.Sequential(
            nn.Linear(n_features, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_classes),
        )

    def forward(self, x):
        """Return raw class logits of shape (batch, n_classes).

        The input is flattened to (batch, -1) before entering the first layer.
        """
        return self.layers(x.view(x.size(0), -1))

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss for one (features, labels) batch."""
        x, y = batch
        loss = F.cross_entropy(self(x), y)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss, accuracy and ROC AUC for one batch."""
        x, y = batch
        logits = self(x)
        # cross_entropy applies log-softmax itself, so it takes the raw logits.
        val_loss = F.cross_entropy(logits, y)
        # The metrics are documented for probabilities. Raw logits would make
        # the per-class ROC curves depend on unnormalised column scores, so
        # normalise first. Accuracy is unaffected: the argmax is the same.
        probs = F.softmax(logits, dim=1)
        self.accuracy(probs, y)
        self.roc_auc(probs, y)
        # Passing the metric objects lets Lightning handle the epoch-level
        # aggregation and the reset between epochs.
        self.log("val_accuracy", self.accuracy, on_step=True, on_epoch=True, prog_bar=True)
        self.log("val_roc_auc", self.roc_auc, on_step=True, on_epoch=True, prog_bar=True)
        self.log("val_loss", val_loss, on_step=True, on_epoch=True, prog_bar=True)

    def configure_optimizers(self):
        """Optimise all parameters with Adam at the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [None]:
# Instantiate the classifier defined in the previous cell.
model = DNNModel()

In [None]:
# Print a table of the module's children and their parameter counts.
# NOTE(review): `LightningModule.summarize()` is believed to be deprecated in
# newer pytorch_lightning releases -- confirm against the installed version
# before upgrading.
model.summarize()

  | Name     | Type       | Params
----------------------------------------
0 | accuracy | Accuracy   | 0     
1 | roc_auc  | AUROC      | 0     
2 | layers   | Sequential | 14.9 K
----------------------------------------
14.9 K    Trainable params
0         Non-trainable params
14.9 K    Total params
0.060     Total estimated model params size (MB)

In [None]:
# Display the module structure. The original cell held a dangling `model.`
# (an unfinished tab-completion), which is a SyntaxError and stops
# Restart & Run All.
model

In [None]:
# Re-check the feature array's shape before converting it to a tensor.
X.shape

(100000, 45)

In [None]:
# Re-check the label array's shape before converting it to a tensor.
y.shape

(100000,)

In [None]:
# Convert the NumPy arrays to tensors: features as float32 (the dtype of the
# nn.Linear weights), labels as int64 (the class-index dtype that
# F.cross_entropy requires).
X_pt = torch.from_numpy(X).to(dtype=torch.float32)
y_pt = torch.from_numpy(y).to(dtype=torch.int64)

In [None]:
# Confirm the first label is an integer tensor element after the cast above.
y_pt[0]

tensor(0)

In [None]:
# Positional hold-out split: the first 90,000 rows are used for training and
# every row after that for validation.
n_train = 90_000
train_ds = TensorDataset(X_pt[:n_train], y_pt[:n_train])
val_ds = TensorDataset(X_pt[n_train:], y_pt[n_train:])

In [None]:
# Mini-batch size shared by both loaders.
BATCH_SIZE = 100

# Reshuffle the training samples every epoch so the optimizer does not see the
# batches in the same fixed order each time. Validation order does not affect
# the aggregated metrics, so that loader stays sequential.
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE)

In [None]:
# Grab a single batch from the training loader for inspection. The original
# cell iterated over `dl`, a name that is never defined in this notebook, so
# it raised NameError on a fresh kernel.
batch = next(iter(train_dl))

In [None]:
# Show the batch captured in the previous cell (a [features, labels] pair).
batch

[tensor([[5.8289e-01, 3.5983e-01, 1.5947e-01,  ..., 1.7467e-01, 3.9015e-02,
          4.0603e-03],
         [2.6076e-01, 8.4642e-02, 1.9812e-02,  ..., 1.1742e-01, 1.7345e-02,
          9.9190e-04],
         [5.6401e-01, 3.3068e-01, 1.2718e-01,  ..., 1.4308e-01, 3.1147e-02,
          2.8288e-03],
         ...,
         [4.4769e-01, 2.0842e-01, 5.1291e-02,  ..., 7.2588e-02, 1.1365e-02,
          9.3176e-04],
         [1.3482e-01, 3.9163e-02, 1.5009e-02,  ..., 2.1892e-02, 1.7439e-03,
          8.1177e-05],
         [3.6589e-01, 1.4499e-01, 3.0012e-02,  ..., 1.1757e-01, 2.1363e-02,
          1.5061e-03]]),
 tensor([0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
         0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
         1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0,
         0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0,
         0, 1, 1, 1])]

In [None]:
# Train for 10 epochs and refresh the progress bar every 20 batches.
trainer = Trainer(
    # Use one GPU when CUDA is present; otherwise fall back to CPU (gpus=0)
    # instead of crashing on machines without a GPU.
    gpus=1 if torch.cuda.is_available() else 0,
    max_epochs=10,
    progress_bar_refresh_rate=20,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [None]:
# Fit on the training loader, validating on val_dl after each epoch.
# `train_dataloaders` (plural) is the current keyword. The singular
# `train_dataloader` spelling was deprecated in the same release that added
# the `dataloaders=` keyword this notebook already passes to
# `trainer.validate`.
trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=val_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type       | Params
----------------------------------------
0 | accuracy | Accuracy   | 0     
1 | roc_auc  | AUROC      | 0     
2 | layers   | Sequential | 14.9 K
----------------------------------------
14.9 K    Trainable params
0         Non-trainable params
14.9 K    Total params
0.060     Total estimated model params size (MB)


Epoch 2:  74%|███████▍  | 740/1000 [00:18<00:06, 40.64it/s, loss=0.47, v_num=13] 
Epoch 0:  90%|█████████ | 900/1000 [00:03<00:00, 244.53it/s, loss=0.462, v_num=14]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/100 [00:00<?, ?it/s][A
Epoch 0:  94%|█████████▍| 940/1000 [00:03<00:00, 244.53it/s, loss=0.462, v_num=14]
Validating:  40%|████      | 40/100 [00:00<00:00, 122.39it/s][A
Validating:  60%|██████    | 60/100 [00:00<00:00, 124.38it/s][A
Epoch 0: 100%|██████████| 1000/1000 [00:04<00:00, 229.91it/s, loss=0.462, v_num=14]
Epoch 0: 100%|██████████| 1000/1000 [00:04<00:00, 220.06it/s, loss=0.462, v_num=14, val_accuracy_step=0.780, val_roc_auc_step=0.867, val_loss_step=0.489, val_accuracy_epoch=0.771, val_roc_auc_epoch=0.869, val_loss_epoch=0.495]
Epoch 1:  90%|█████████ | 900/1000 [00:04<00:00, 224.24it/s, loss=0.448, v_num=14, val_accuracy_step=0.780, val_roc_auc_step=0.867, val_loss_step=0.489, val_accuracy_epoch=0.771, val_roc_auc_epoch=0.869, val_loss_epoch=0.4

In [None]:
# Run a standalone validation pass over val_dl with the trained model and
# report the epoch-aggregated metrics logged in validation_step.
trainer.validate(model, dataloaders=val_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validating: 100%|██████████| 100/100 [00:00<00:00, 110.16it/s]--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_accuracy': 0.7940000295639038,
 'val_accuracy_epoch': 0.7940000295639038,
 'val_loss': 0.4502171277999878,
 'val_loss_epoch': 0.4502171277999878,
 'val_roc_auc': 0.87178635597229,
 'val_roc_auc_epoch': 0.87178635597229}
--------------------------------------------------------------------------------
Validating: 100%|██████████| 100/100 [00:00<00:00, 111.28it/s]


[{'val_accuracy': 0.7940000295639038,
  'val_accuracy_epoch': 0.7940000295639038,
  'val_roc_auc': 0.87178635597229,
  'val_roc_auc_epoch': 0.87178635597229,
  'val_loss': 0.4502171277999878,
  'val_loss_epoch': 0.4502171277999878}]