In [94]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm, trange
from pytorch_revgrad import RevGrad

import os
import sys
import random
import pickle
from functools import reduce
from typing import Any, List, Dict, Tuple, Callable, Iterable, Optional
from dataclasses import dataclass

In [2]:
if torch.cuda.is_available():
    dev = torch.device('cuda')
else:
    dev = torch.device('cpu')
dev

device(type='cuda')

In [3]:
seed = int(np.pi * 100_000_000)
torch.random.manual_seed(seed + 1)
np.random.seed(seed + 2)
random.seed(seed + 3)

In [4]:
skewed_data_dir = 'data/skewed'

In [5]:
z_syst = torch.tensor(np.sort(np.concatenate([
    np.arange(88, 97) / 100,
    np.arange(970, 1030) / 1000,
    np.arange(103, 107) / 100,
    np.arange(1070, 1130) / 1000,
    np.arange(113, 115) / 100,
], axis=-1)))
z_syst_up = 1.1 
z_syst_down = 0.8 
z_nominal = 1.0

z_syst_train = torch.tensor([
    0.7,  0.74, 0.78, 0.8,  0.84, 0.88, 0.9,  0.92,
    0.94, 0.96, 0.98, 0.99, 1.0,  1.01, 1.02, 1.04,
    1.06, 1.08, 1.09, 1.10, 1.11, 1.12, 1.13, 1.14,
])

In [6]:
@dataclass
class Dataset:
    x: torch.Tensor
    y: torch.Tensor
    z: torch.Tensor
    weights: torch.Tensor
        
    def __getitem__(self, index) -> 'Dataset':
        return Dataset(
            x=self.x[index],
            y=self.y[index],
            z=self.z[index],
            weights=self.weights[index],
        )
        
    def __setitem__(self, index, item: 'Dataset') -> None:
        self.x[index] = item.x
        self.y[index] = item.y
        self.z[index] = item.z
        self.weights[index] = item.weights
        
    def __len__(self) -> int:
        return len(self.x)
    
    def sizeof(self) -> int:
        return sum(
            reduce(lambda a, b: a * b, tensor.shape) * tensor.element_size()
            for tensor in [
                self.x,
                self.y,
                self.z,
                self.weights,
            ]
        )
    
    def to(self, device) -> 'Dataset':
        return Dataset(
            x=self.x.to(device),
            y=self.y.to(device),
            z=self.z.to(device),
            weights=self.weights.to(device),
        )

In [7]:
def to_tensor(x, device=None) -> torch.Tensor:
    if device is None:
        device = dev
    if isinstance(x, (pd.DataFrame, pd.Series)):
        return torch.tensor(x.values.astype(np.float32)).to(device)
    if isinstance(x, (np.ndarray, list)):
        return torch.tensor(x).to(device)
    raise TypeError(f'Unknown type {type(x)}')

In [8]:
def make_dataset(x, y, z, weights) -> Dataset:
    return Dataset(
        x=to_tensor(x, device='cpu'),
        y=to_tensor(y, device='cpu'),
        z=to_tensor(z, device='cpu'),
        weights=to_tensor(weights, device='cpu'),
    )

In [9]:
def load_training_data_for(z: float) -> Tuple[Dataset, Dataset]:
    path = os.path.join(skewed_data_dir, f'HiggsML_TES_{round(z, 2)}.h5')
    # Read and shuffle.
    df = pd.read_hdf(path, 'data_syst').sample(frac=1).reset_index()
    
    target = df['Label'] == 'b'
    weights = df['Weight']
    z = df['Z']
    assert (z == z[0]).all()
    indices = df['index']
    df.drop(['Label', 'Z', 'Weight', 'index', 'KaggleSet'], axis=1, inplace=True)
    
    train_indices = indices % 2 == 0
    train_set = make_dataset(
        x=df[train_indices],
        y=target[train_indices],
        z=z[train_indices],
        weights=weights[train_indices],
    )
    
    test_indices = ~train_indices
    test_set = make_dataset(
        x=df[test_indices],
        y=target[test_indices],
        z=z[test_indices],
        weights=weights[test_indices],
    )
    
    scale_up = 1.0
    class_weights = (
        weights[target == 0].sum(),
        weights[target == 1].sum(),
    )
    test_class_weights = (
        test_set.weights[test_set.y == 0].sum(),
        test_set.weights[test_set.y == 1].sum(),
    )
    
    for label in (0, 1):
        factor_train = scale_up * max(class_weights) / class_weights[label]
        train_set.weights[train_set.y == label] *= factor_train
        factor_test = class_weights[label] / test_class_weights[label]
        test_set.weights[test_set.y == label] *= factor_test
    
    return train_set, test_set

In [10]:
def concat_datasets(datasets: List[Dataset]) -> Dataset:
    x = [data.x for data in datasets]
    y = [data.y for data in datasets]
    z = [data.z for data in datasets]
    weights = [data.weights for data in datasets]
    return Dataset(
        x=torch.cat(x),
        y=torch.cat(y),
        z=torch.cat(z),
        weights=torch.cat(weights),
    )

In [11]:
def shuffle(collection):
    return collection[torch.randperm(len(collection))]

In [12]:
def load_training_data(
    z_values: List[float],
) -> Tuple[Dataset, Dataset, StandardScaler]:
    train_datasets: List[Dataset] = []
    test_datasets: List[Dataset] = []
    
    total_size = 0
    for z in tqdm(z_values):
        train_dataset, test_dataset = load_training_data_for(float(z))
        train_datasets.append(train_dataset)
        current_size = train_dataset.sizeof() + test_dataset.sizeof()
        total_size += current_size
        print(f'z = {z:.2f}, size = {current_size >> 20}M, total size = {total_size >> 20}M')
        test_datasets.append(test_dataset)
    
    train_cat = concat_datasets(train_datasets)
    test_cat = concat_datasets(test_datasets)
    train_cat = shuffle(train_cat)
    test_cat = shuffle(test_cat)
    
    scaler = StandardScaler()
    train_cat.x = torch.tensor(scaler.fit_transform(train_cat.x), dtype=torch.float32)
    test_cat.x = torch.tensor(scaler.transform(test_cat.x), dtype=torch.float32)
    
    return train_cat, test_cat, scaler

In [13]:
data_train, data_test, scaler = load_training_data(z_syst_train)

  0%|          | 0/24 [00:00<?, ?it/s]

z = 0.70, size = 58M, total size = 58M
z = 0.74, size = 64M, total size = 123M
z = 0.78, size = 70M, total size = 193M
z = 0.80, size = 73M, total size = 267M
z = 0.84, size = 78M, total size = 345M
z = 0.88, size = 84M, total size = 430M
z = 0.90, size = 87M, total size = 517M
z = 0.92, size = 90M, total size = 608M
z = 0.94, size = 93M, total size = 701M
z = 0.96, size = 95M, total size = 797M
z = 0.98, size = 98M, total size = 895M
z = 0.99, size = 100M, total size = 995M
z = 1.00, size = 101M, total size = 1097M
z = 1.01, size = 102M, total size = 1200M
z = 1.02, size = 104M, total size = 1304M
z = 1.04, size = 107M, total size = 1411M
z = 1.06, size = 109M, total size = 1521M
z = 1.08, size = 112M, total size = 1634M
z = 1.09, size = 114M, total size = 1748M
z = 1.10, size = 115M, total size = 1863M
z = 1.11, size = 115M, total size = 1979M
z = 1.12, size = 115M, total size = 2094M
z = 1.13, size = 115M, total size = 2210M
z = 1.14, size = 115M, total size = 2325M


In [14]:
data_train = data_train.to(dev)
data_test = data_test.to(dev)

In [15]:
data_train.x.shape

torch.Size([8238657, 34])

## Training

In [113]:
def train(
    model,
    num_epochs: int,
    batch_size: int,
    dataset_filter: Optional[torch.Tensor] = None,
    Optimizer = torch.optim.RMSprop,
    include_z: bool = False,
    weight_decay: float = 0.0001,
):
    dataset = data_train
    if dataset_filter is not None:
        dataset = data_train[dataset_filter]
        
    opt = Optimizer(model.parameters(), weight_decay=weight_decay)
    train_dataset = torch.utils.data.TensorDataset(
        dataset.x,
        dataset.y,
        dataset.z,
        dataset.weights,
    )
    loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
    for epoch in trange(num_epochs):
        total_loss_value = 0.0
        num_correct = 0
        num_total = 0
        for x, y, z, weights in tqdm(loader, leave=False):
            opt.zero_grad()
            inputs = (x, z) if include_z else x
            output = model(inputs)
            if isinstance(output, tuple):
                y_pred = output[0]
            else:
                y_pred = output
            y_pred = y_pred.flatten()
            
            assert len(y_pred) == len(x)
            
            if hasattr(model, 'compute_loss'):
                loss_value = model.compute_loss(output, y, weights, z)
            else:
                loss_value = nn.BCELoss(weight=weights)(y_pred, y)
            loss_value.backward()
            with torch.no_grad():
                total_loss_value += float(loss_value) * len(x)
                this_num_correct = int(((y_pred > 0.5) == (y > 0.5)).sum())
                assert this_num_correct <= len(y_pred)
                num_total += len(y_pred)
                num_correct += this_num_correct
            opt.step()
        assert num_total == len(dataset)
        accuracy = num_correct / len(dataset)
        total_loss_value /= len(dataset)
        print(f'Epoch {epoch+1}: loss = {total_loss_value:.5f}, accuracy = {accuracy:.5f}')
    return model

In [114]:
def train_or_load(*args, path, **kwargs):
    if os.path.exists(path):
        print(f'Loading trained model from {path}')
        with open(path, 'rb') as f:
            return torch.load(f)
    return train(*args, **kwargs)

In [123]:
@dataclass
class Evaluation:
    accuracy: float
    loss: float
        
def evaluate(
    model,
    dataset,
    dataset_filter: Optional[torch.Tensor] = None,
    z = None,
    temp_gpu: bool = True,
    leave_progress_bar: bool = True,
):
    if temp_gpu:
        model.to(dev)
    try:
        with torch.no_grad():
            if dataset_filter is not None:
                dataset = dataset[dataset_filter]
            if z is None:
                test_dataset = torch.utils.data.TensorDataset(
                    dataset.x,
                    dataset.y,
                    dataset.z,
                    dataset.weights,
                )
            else:
                test_dataset = torch.utils.data.TensorDataset(
                    dataset.x,
                    dataset.y,
                    z,
                    dataset.weights,
                )
            loader = torch.utils.data.DataLoader(test_dataset, batch_size=4096, shuffle=True)
            loss_value = 0.0
            num_correct = 0
            for x, y, zvalue, weights in tqdm(loader, leave=leave_progress_bar):
                inputs = x if z is None else (x, zvalue)
                
                output = model(inputs)
                if isinstance(output, tuple):
                    y_pred = output[0]
                else:
                    y_pred = output
                y_pred = y_pred.flatten()

                assert len(y_pred) == len(x)

                if hasattr(model, 'compute_loss'):
                    loss_value += float(model.compute_loss(output, y, weights, zvalue))
                else:
                    loss_value += float(nn.BCELoss(weight=weights)(y_pred, y)) * len(x)
                num_correct += int(((y_pred > 0.5) == (y > 0.5)).sum())

            accuracy = num_correct / len(dataset)
            return Evaluation(accuracy=accuracy, loss=loss_value/len(dataset))
    finally:
        if temp_gpu:
            model.cpu()

### Baseline model (nominal z)

In [78]:
class BaselineModel(nn.Module):
    def __init__(
        self,
        num_hidden_layers: int,
        num_hidden_nodes: int,
        num_inputs = None,
        final_activation: bool = True,
    ):
        if num_inputs is None:
            num_inputs = data_train.x.shape[1]
        super().__init__()
        input_layer = nn.Sequential(
            nn.Linear(num_inputs, num_hidden_nodes),
            nn.ReLU(),
        )
        hidden_layers = [
            nn.Sequential(
                nn.Linear(num_hidden_nodes, num_hidden_nodes),
                nn.ReLU(),
            )
            for _ in range(num_hidden_layers - 1)
        ]
        output_layer = nn.Sequential(
            nn.Linear(num_hidden_nodes, 1),
            nn.Sigmoid() if final_activation else nn.Identity(),
        )
        self._layers = nn.Sequential(input_layer, *hidden_layers, output_layer)
        
    def forward(self, x):
        return self._layers(x)

In [26]:
baseline_nominal_model = train_or_load(
    BaselineModel(num_hidden_layers=10, num_hidden_nodes=512).to(dev),
    num_epochs=50,
    batch_size=2048,
    Optimizer=lambda *a, **k: torch.optim.RMSprop(*a, **k, lr=0.001, alpha=0.9),
    dataset_filter=torch.isclose(data_train.z, torch.tensor(z_nominal, dtype=torch.float32)),
    path='data/model.baseline-nominal.pt',
).cpu()

Loading trained model from data/model.baseline-nominal.pt


In [121]:
evaluate(
    baseline_nominal_model,
    data_test,
    torch.isclose(data_test.z, torch.tensor(z_nominal, dtype=torch.float32)),
)

  0%|          | 0/88 [00:00<?, ?it/s]

Evaluation(accuracy=0.9262328359537064, loss=0.019609023755159338)

In [28]:
with open('data/model.baseline-nominal.pt', 'wb') as f:
    torch.save(baseline_nominal_model, f)

### Baseline model (low z)

In [30]:
baseline_down_model = train_or_load(
    BaselineModel(num_hidden_layers=10, num_hidden_nodes=64).to(dev),
    num_epochs=50,
    batch_size=2048,
    Optimizer=lambda *a, **k: torch.optim.RMSprop(*a, **k, lr=0.001, alpha=0.9),
    dataset_filter=torch.isclose(data_train.z, torch.tensor(z_syst_down, dtype=torch.float32)),
    path='data/model.baseline-down.pt',
).cpu()

Loading trained model from data/model.baseline-down.pt


In [31]:
evaluate(
    baseline_down_model,
    data_test,
    torch.isclose(data_test.z, torch.tensor(z_syst_down, dtype=torch.float32)),
)

  0%|          | 0/64 [00:00<?, ?it/s]

Evaluation(accuracy=0.9598042049681063, loss=0.006369620191470305)

In [32]:
with open('data/model.baseline-down.pt', 'wb') as f:
    torch.save(baseline_down_model, f)

### Baseline model (high z)

In [33]:
baseline_up_model = train_or_load(
    BaselineModel(num_hidden_layers=10, num_hidden_nodes=512).to(dev),
    num_epochs=50,
    batch_size=2048,
    Optimizer=lambda *a, **k: torch.optim.RMSprop(*a, **k, lr=0.001, alpha=0.9),
    dataset_filter=torch.isclose(data_train.z, torch.tensor(z_syst_up, dtype=torch.float32)),
    path='data/model.baseline-up.pt',
).cpu()

Loading trained model from data/model.baseline-up.pt


In [34]:
evaluate(
    baseline_up_model,
    data_test,
    torch.isclose(data_test.z, torch.tensor(z_syst_up, dtype=torch.float32)),
)

  0%|          | 0/100 [00:00<?, ?it/s]

Evaluation(accuracy=0.9172103437571041, loss=0.024045526460747226)

In [35]:
with open('data/model.baseline-up.pt', 'wb') as f:
    torch.save(baseline_up_model, f)

### Data augmentation model

In [36]:
aug_model = train_or_load(
    BaselineModel(num_hidden_layers=10, num_hidden_nodes=512).to(dev),
    num_epochs=10,
    batch_size=2048,
    Optimizer=lambda *a, **k: torch.optim.Adam(*a, **k, lr=0.001),
    path='data/model.aug.pt',
).cpu()

Loading trained model from data/model.aug.pt


In [37]:
evaluate(aug_model, data_test)

  0%|          | 0/2012 [00:00<?, ?it/s]

Evaluation(accuracy=0.9626696201063024, loss=0.005541750624016032)

In [38]:
with open('data/model.aug.pt', 'wb') as f:
    torch.save(aug_model, f)

### Uncertainty aware model

In [40]:
class UncertaintyAwareModel(nn.Module):
    def __init__(self, num_hidden_layers: int, num_hidden_nodes: int):
        super().__init__()
        self._layers_a = UncertaintyAwareModel.create_layers(num_hidden_layers, num_hidden_nodes)
        self._layers_b = UncertaintyAwareModel.create_layers(num_hidden_layers, num_hidden_nodes)
    
    @staticmethod
    def create_layers(num_hidden_layers: int, num_hidden_nodes: int):
        input_layer = nn.Sequential(
            nn.Linear(data_train.x.shape[1] + 1, num_hidden_nodes),
            nn.ReLU(),
        )
        hidden_layers = [
            nn.Sequential(
                nn.Linear(num_hidden_nodes, num_hidden_nodes),
                nn.ReLU(),
            )
            for _ in range(num_hidden_layers - 1)
        ]
        output_layer = nn.Sequential(
            nn.Linear(num_hidden_nodes, 1),
            nn.Sigmoid(),
        )
        return nn.Sequential(input_layer, *hidden_layers, output_layer)
    
    def forward(self, inputs):
        x, z = inputs
        z = z.view((len(z), 1))
        input_tensor = torch.cat([x, z], dim=1)
        
        a = self._layers_a(input_tensor)
        b = self._layers_b(input_tensor)
        return torch.where(z < 1, a, b)

In [55]:
aware_model = train_or_load(
    UncertaintyAwareModel(num_hidden_layers=10, num_hidden_nodes=64).to(dev),
    num_epochs=10,
    batch_size=2048,
    include_z=True,
    Optimizer=lambda *a, **k: torch.optim.RMSprop(*a, **k, lr=0.001, alpha=0.9),
    weight_decay=0.00003,
    path='data/model.aware.pt',
).cpu()

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 1: loss = 0.03593, accuracy = 0.91485


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 2: loss = 0.01208, accuracy = 0.93731


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 3: loss = 0.01044, accuracy = 0.94103


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 4: loss = 0.00944, accuracy = 0.94361


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 5: loss = 0.00883, accuracy = 0.94493


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 6: loss = 0.00847, accuracy = 0.94608


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 7: loss = 0.00809, accuracy = 0.94732


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 8: loss = 0.00772, accuracy = 0.94860


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 9: loss = 0.00746, accuracy = 0.94980


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 10: loss = 0.00729, accuracy = 0.95015


In [56]:
evaluate(aware_model, data_test, z=data_test.z)

  0%|          | 0/2012 [00:00<?, ?it/s]

Evaluation(accuracy=0.9466954874607458, loss=0.010065191383443146)

In [57]:
with open('data/model.aware.pt', 'wb') as f:
    torch.save(aware_model, f)

### Adversarial model

In [116]:
class AdversarialModel(nn.Module):
    def __init__(self, lambda_weight: float):
        super().__init__()
        self._lambda_weight = lambda_weight
        self._classifier = BaselineModel(
            num_hidden_layers=10, 
            num_hidden_nodes=64
        )
        self._discriminator = BaselineModel(
            num_inputs=1,
            num_hidden_layers=10,
            num_hidden_nodes=64,
            final_activation=False,
        )
        self._grad_reversal = RevGrad()
    
    def forward(self, x):
        y_pred = self._classifier(x)
        z_pred = self._discriminator(self._grad_reversal(y_pred))
        return y_pred, z_pred
    
    def compute_loss(self, output, y, weights, z):
        y_pred, z_pred = output
        bce = nn.BCELoss(weight=weights)(y_pred.flatten(), y)
        mse = ((z_pred.flatten() - z)**2 * weights).sum() / len(weights)
        return bce + self._lambda_weight * mse

In [118]:
adv_model = train_or_load(
    AdversarialModel(lambda_weight=1).to(dev),
    num_epochs=15,
    batch_size=2048,
    #include_z=True,
    Optimizer=lambda *a, **k: torch.optim.RMSprop(*a, **k, lr=0.001, alpha=0.9),
    path='data/model.adv.pt',
).cpu()

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 1: loss = 0.04527, accuracy = 0.91872


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 2: loss = 0.02266, accuracy = 0.93965


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 3: loss = 0.02111, accuracy = 0.94312


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 4: loss = 0.02020, accuracy = 0.94561


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 5: loss = 0.01957, accuracy = 0.94729


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 6: loss = 0.01897, accuracy = 0.94894


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 7: loss = 0.01858, accuracy = 0.95019


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 8: loss = 0.01837, accuracy = 0.95072


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 9: loss = 0.01812, accuracy = 0.95171


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 10: loss = 0.01784, accuracy = 0.95254


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 11: loss = 0.01771, accuracy = 0.95310


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 12: loss = 0.01750, accuracy = 0.95400


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 13: loss = 0.01738, accuracy = 0.95425


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 14: loss = 0.01724, accuracy = 0.95482


  0%|          | 0/4023 [00:00<?, ?it/s]

Epoch 15: loss = 0.01719, accuracy = 0.95501


In [124]:
evaluate(adv_model, data_test)

  0%|          | 0/2012 [00:00<?, ?it/s]

Evaluation(accuracy=0.9587570650266436, loss=4.258731494854617e-06)

In [119]:
with open('data/model.adv.pt', 'wb') as f:
    torch.save(adv_model, f)