In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Imports

In [None]:
import numpy as np
import pandas as pd
from datasets import Dataset
from pytorch_lightning import LightningModule, Trainer
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import torch.nn as nn

## Load data

In [None]:
# Location of the NumPy .npz archive that holds the events.
EVENTS_PATH = "data/QG_nsubs.npz?dl=1"

# Pull both arrays out while the archive is open; the context manager
# closes the file once they are in memory.
with np.load(EVENTS_PATH) as archive:
    X = archive['X']
    y = archive['y']

In [None]:
# Sanity check: the feature matrix and label vector must have the same number of rows.
X.shape, y.shape

((100000, 45), (100000,))

In [None]:
# Slicing one column yields a 1-D array with one value per event.
X[:,0].shape

(100000,)

In [None]:
# Label of the first event (stored as a float in the archive).
y[0]

0.0

In [None]:
# Full feature vector of the first event.
X[0]

array([0.58288529, 0.35983362, 0.15946587, 0.28643438, 0.10931263,
       0.03136041, 0.29199412, 0.10462717, 0.02529765, 0.26962497,
       0.08968088, 0.0185029 , 0.26170934, 0.08504337, 0.01718934,
       0.23999388, 0.07369913, 0.01286971, 0.24165983, 0.07294354,
       0.01229564, 0.21727208, 0.06340214, 0.01102354, 0.20602818,
       0.05551519, 0.00822112, 0.21182996, 0.05651417, 0.00794706,
       0.20925511, 0.05459273, 0.00716666, 0.19743325, 0.04912426,
       0.00563262, 0.19182676, 0.0455615 , 0.00474475, 0.18342361,
       0.04259337, 0.00448976, 0.17466835, 0.03901473, 0.00406034])

In [None]:
# Distinct label values present in the data set.
np.unique(y)

array([0., 1.])

In [None]:
# One entry per feature column. The column count is read from X itself
# rather than hard-coded, so this keeps working if the feature set changes.
data = {f"feature_{idx}": X[:, idx] for idx in range(X.shape[1])}

In [None]:
# Attach the target vector alongside the feature columns.
data.update({"label": y})

In [None]:
# Assemble the per-column arrays into one table and preview the top rows.
df = pd.DataFrame.from_dict(data)
df.head(5)

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,label
0,0.582885,0.359834,0.159466,0.286434,0.109313,0.03136,0.291994,0.104627,0.025298,0.269625,...,0.191827,0.045562,0.004745,0.183424,0.042593,0.00449,0.174668,0.039015,0.00406,0.0
1,0.260756,0.084642,0.019812,0.240676,0.073122,0.015786,0.225725,0.063897,0.012387,0.215447,...,0.126111,0.021134,0.002316,0.122458,0.018581,0.001079,0.117416,0.017345,0.000992,0.0
2,0.564012,0.330684,0.127177,0.29271,0.110119,0.024787,0.258846,0.086578,0.017215,0.240176,...,0.150572,0.034919,0.003328,0.147431,0.032971,0.003017,0.143079,0.031147,0.002829,0.0
3,0.395935,0.170801,0.042085,0.344588,0.130606,0.026208,0.271026,0.090233,0.017891,0.243731,...,0.164546,0.03485,0.004036,0.151774,0.029703,0.002897,0.14963,0.028986,0.00285,0.0
4,0.196304,0.051556,0.011701,0.192528,0.048644,0.010743,0.178389,0.043404,0.00906,0.172218,...,0.0844,0.012952,0.001005,0.078391,0.011012,0.000815,0.076849,0.010029,0.000572,0.0


In [None]:
# Can't use categorical
# NOTE(review): the remark above is unexplained -- presumably a categorical
# dtype for `label` was not accepted by `Dataset.from_pandas`, so the column
# is left as float; confirm. `dset` is not referenced again in the visible cells.
dset = Dataset.from_pandas(df)
dset

Dataset({
    features: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'label'],
    num_rows: 100000
})

## Define model

In [None]:
class DNNModel(LightningModule):
    """Fully connected classifier trained with cross-entropy loss.

    The network is ``Linear -> ReLU -> Linear -> ReLU -> Linear`` and returns
    raw (un-normalised) logits, one per class.

    Args:
        n_features: Size of each flattened input sample. The default of 45
            reproduces the original hard-coded input width.
        hidden_dim: Width of the two hidden layers.
        n_classes: Number of output logits.
        lr: Learning rate passed to the Adam optimizer.
    """

    def __init__(self, n_features=45, hidden_dim=64, n_classes=2, lr=0.02):
        super().__init__()
        self.lr = lr
        self.layers = nn.Sequential(
            nn.Linear(n_features, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_classes),
        )

    def forward(self, x):
        """Return the logits for a batch of inputs.

        Every dimension after the first is flattened before the linear stack.
        ``reshape`` is used instead of ``view`` so that non-contiguous inputs
        (e.g. slices or transposes) are accepted as well.
        """
        return self.layers(x.reshape(x.size(0), -1))

    def training_step(self, batch, batch_idx=None):
        """Compute the cross-entropy loss for one ``(inputs, targets)`` batch.

        ``batch_idx`` is part of Lightning's documented hook signature; it is
        accepted but unused, and defaults to ``None`` so the step can still be
        called by hand with only a batch.
        """
        x, y = batch
        return F.cross_entropy(self(x), y)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [None]:
# Build a fresh, untrained model instance.
model = DNNModel()

In [None]:
# Re-check the feature matrix shape before converting it to a tensor.
X.shape

(100000, 45)

In [None]:
# Re-check the label vector shape before converting it to a tensor.
y.shape

(100000,)

In [None]:
# Cast to the dtypes used downstream: float32 inputs for the linear layers
# and int64 class indices for the cross-entropy target.
X_pt = torch.from_numpy(X).to(torch.float32)
y_pt = torch.from_numpy(y).to(torch.int64)

In [None]:
# Confirm the first label is now an integer tensor rather than a float.
y_pt[0]

tensor(0)

In [None]:
# Pair each feature row with its label so both are indexed together.
train_ds = TensorDataset(X_pt, y_pt)

In [None]:
# Reshuffle the training set every epoch; the default (shuffle=False) would
# replay the samples in their stored order on every pass.
dl = DataLoader(train_ds, batch_size=100, shuffle=True)

In [None]:
# Pull a single mini-batch for inspection and unpack it. The cells below
# refer to `xb` / `yb`, which would otherwise never be assigned and would
# raise NameError on a fresh kernel.
batch = next(iter(dl))
xb, yb = batch

In [None]:
# Feature batch shape; expected to be (batch_size, 45).
xb.size()

torch.Size([4, 45])

In [None]:
# Label batch shape; expected to be (batch_size,).
yb.size()

torch.Size([4])

Epoch 0:  46%|████▋     | 11580/25000 [01:00<01:09, 193.00it/s, loss=0.393, v_num=0]

In [None]:
# First sample of the inspected batch.
xb[0]

tensor([0.5829, 0.3598, 0.1595, 0.2864, 0.1093, 0.0314, 0.2920, 0.1046, 0.0253,
        0.2696, 0.0897, 0.0185, 0.2617, 0.0850, 0.0172, 0.2400, 0.0737, 0.0129,
        0.2417, 0.0729, 0.0123, 0.2173, 0.0634, 0.0110, 0.2060, 0.0555, 0.0082,
        0.2118, 0.0565, 0.0079, 0.2093, 0.0546, 0.0072, 0.1974, 0.0491, 0.0056,
        0.1918, 0.0456, 0.0047, 0.1834, 0.0426, 0.0045, 0.1747, 0.0390, 0.0041],
       dtype=torch.float64)

In [None]:
# Smoke test: call training_step directly, outside the Trainer, to check
# that a loss is produced for one batch.
model.training_step(batch)

tensor(0.6900, grad_fn=<NllLossBackward0>)

In [None]:
# Train for three epochs on one GPU when CUDA is present; otherwise fall back
# to the CPU instead of failing when the Trainer is constructed.
# NOTE(review): `gpus` and `progress_bar_refresh_rate` belong to the older
# Lightning 1.x Trainer API; newer releases replace them with
# `accelerator`/`devices` and a progress-bar callback -- confirm against the
# installed version before upgrading.
trainer = Trainer(
    gpus=1 if torch.cuda.is_available() else 0,
    max_epochs=3,
    progress_bar_refresh_rate=20,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [None]:
# Run the training loop over the training DataLoader; no validation loader is supplied.
trainer.fit(model, dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type       | Params
--------------------------------------
0 | layers | Sequential | 7.2 K 
--------------------------------------
7.2 K     Trainable params
0         Non-trainable params
7.2 K     Total params
0.029     Total estimated model params size (MB)


Epoch 0:  46%|████▋     | 11580/25000 [01:08<01:19, 169.65it/s, loss=0.393, v_num=0]
Epoch 2: 100%|██████████| 1000/1000 [00:04<00:00, 227.52it/s, loss=0.459, v_num=1]
