# Sheet 9

In [None]:
import numpy as np
import torch
from torch import nn
import matplotlib.pyplot as plt
import math
from torch.utils.data import TensorDataset, DataLoader

# 2) Classifier Reweighting 

(a)

In [None]:
# Read both event samples from disk and report their array shapes.
data_LO, data_NLO = map(np.load, ("data09/ttbarj_LO.npy", "data09/ttbarj_NLO.npy"))
print(data_LO.shape, data_NLO.shape)

In [None]:
# Stack the LO events on top of the NLO events and build the matching
# label vector: 0.0 for every LO row, 1.0 for every NLO row.
data = np.concatenate([data_LO, data_NLO], axis=0)
labels = np.repeat([0.0, 1.0], [data_LO.shape[0], data_NLO.shape[0]])

# Draw one random permutation and apply it to events and labels alike, so
# each row keeps its own label after the shuffle.
idx = np.random.permutation(len(data))
data = data[idx]
labels = labels[idx][:, np.newaxis]  # column vector: one label per event

In [None]:
def preprocess(event, mean=None, std=None):
    """Standardize ``event`` by subtracting a mean and dividing by a std.

    Args:
        event: Array of events; statistics are taken along axis 0
            (assumed layout: one row per event, one column per feature).
        mean: Mean to subtract. Recomputed from ``event`` when either
            ``mean`` or ``std`` is ``None``.
        std: Standard deviation to divide by. Recomputed from ``event``
            when either ``mean`` or ``std`` is ``None``.

    Returns:
        Tuple ``(standardized, mean, std)``. ``mean`` and ``std`` are the
        statistics that were used, so they can be passed back in to apply
        the same transformation to another data set.
    """
    if mean is None or std is None:
        mean = event.mean(axis=0, keepdims=True)
        std = event.std(axis=0, keepdims=True)
    # A constant feature has std == 0; dividing by it would fill the column
    # with NaN/inf. Divide by 1 there instead, which maps the column to 0.
    # The returned std is left untouched so that ``x * std + mean`` still
    # inverts the transformation.
    scale = np.where(std == 0, 1.0, std)
    event = (event - mean) / scale
    return event, mean, std

def create_dataloader(data, labels, batchsize, shuffle, mean=None, std=None):
    """Standardize ``data`` and wrap it with ``labels`` in a ``DataLoader``.

    Args:
        data: Array of input features, passed through ``preprocess``.
        labels: Array of targets with one entry per row of ``data``.
        batchsize: Batch size handed to the ``DataLoader``.
        shuffle: Whether the ``DataLoader`` reshuffles the data each epoch.
        mean: Optional mean forwarded to ``preprocess``.
        std: Optional standard deviation forwarded to ``preprocess``.

    Returns:
        Tuple ``(loader, mean, std)`` where ``mean`` and ``std`` are the
        statistics ``preprocess`` used, so later calls can reuse them.
    """
    data, mean, std = preprocess(data, mean, std)
    data = torch.tensor(data).float()
    labels = torch.tensor(labels)
    # NumPy float labels arrive as float64, which clashes with the float32
    # features/model output in losses such as BCE. Cast floating-point
    # labels to float32; integer labels (class indices) are left as they are.
    if labels.is_floating_point():
        labels = labels.float()
    loader = DataLoader(TensorDataset(data, labels), batch_size=batchsize, shuffle=shuffle)
    return loader, mean, std

In [None]:
# Partition the shuffled sample by row index: rows [0, n1) for training,
# rows [n1, n2) for validation and everything from n2 onwards for testing.
n1, n2 = 50000, 60000
batchsize = 512
data_trn, labels_trn = data[:n1], labels[:n1]
data_val, labels_val = data[n1:n2], labels[n1:n2]
data_tst, labels_tst = data[n2:], labels[n2:]

# The first call starts from mean = std = None, so the statistics are computed
# on the training set; the calls after it receive and reuse those statistics.
# Only the training loader shuffles.
mean = std = None
loader_trn, mean, std = create_dataloader(
    data_trn, labels_trn, batchsize, True, mean=mean, std=std
)
loader_tst, mean, std = create_dataloader(
    data_tst, labels_tst, batchsize, False, mean=mean, std=std
)
loader_val, mean, std = create_dataloader(
    data_val, labels_val, batchsize, False, mean=mean, std=std
)

In [None]:
# TODO: Implement and train classifier

In [None]:
# TODO: Evaluate classifier on test set. 
# Visualize the classifier score. Compute ROC curve and AUC.

(c)

In [None]:
# TODO: Compute weights of events in the test set. 

In [None]:
def get_pt(particle):
    """Return sqrt(col1**2 + col2**2) for every row of ``particle``.

    Columns 1 and 2 are treated as the transverse momentum components
    (px, py), so the result is the transverse momentum per row.
    """
    px, py = particle[:, 1], particle[:, 2]
    return np.sqrt(np.square(px) + np.square(py))
def get_mass(particle):
    """Return the invariant mass sqrt(E**2 - |p|**2) for every row.

    Column 0 is used as the energy and all remaining columns as momentum
    components. Rounding can push E**2 - |p|**2 slightly below zero, which
    would make the square root NaN, so the squared mass is clipped at zero
    first.

    Args:
        particle: 2-D array with one four-vector per row.

    Returns:
        1-D array of non-negative masses, one per row.
    """
    mass_squared = particle[:, 0]**2 - np.sum(particle[:, 1:]**2, axis=-1)
    return np.sqrt(np.clip(mass_squared, 0, None))
def get_top1(event):
    """Return the element-wise sum of column groups 0-3, 4-7 and 8-11.

    Each group of four columns is added component by component, giving one
    four-column row per event.
    """
    first, second, third = event[:, 0:4], event[:, 4:8], event[:, 8:12]
    return np.add(np.add(first, second), third)

In [None]:
# TODO: Visualize transverse momentum (pt) and mass of the leading 
# top quark, using the functions defined above.

# 3) Event Generation with a Variational Autoencoder

In [None]:
# download data to folder data09
# this might take some time (50MB)
# you can also do this manually (download + unpack zip)
import os, sys
import wget
from zipfile import ZipFile
from tqdm import tqdm

DESTINATION = "data09"
url = "https://www.thphys.uni-heidelberg.de/~plehn/pics/"
filename = "tutorial-11-data.zip"
url = url + filename

# Use explicit paths instead of os.chdir(): if the download or the extraction
# raised between the two chdir calls, the kernel would stay inside DESTINATION
# and every later relative path (e.g. "data09/...") would break.
os.makedirs(DESTINATION, exist_ok=True)
archive_path = os.path.join(DESTINATION, filename)

# Skip the download when the archive is already there, so re-running the cell
# does not fetch the ~50MB file again.
if not os.path.exists(archive_path):
    wget.download(url, archive_path)

# Extract member by member so tqdm can show progress.
with ZipFile(archive_path, "r") as zip_ref:
    members = zip_ref.namelist()
    for file in tqdm(iterable=members, total=len(members)):
        zip_ref.extract(member=file, path=DESTINATION)
%ls data09

In [3]:
# Load the train / test / validation splits (in that order), then print
# the shape of each array.
data_trn, data_tst, data_val = map(
    np.load,
    (
        "data09/tutorial-11-data/dy_trn_data.npy",
        "data09/tutorial-11-data/dy_tst_data.npy",
        "data09/tutorial-11-data/dy_val_data.npy",
    ),
)
print(data_trn.shape, data_tst.shape, data_val.shape)

(1384081, 8) (296588, 8) (296588, 8)


In [27]:
# helper functions for collider kinematics
# used for plotting and the extended preprocessing in part (d)
def get_mass(particle):
    """Return the invariant mass of each row, with negative m**2 clipped to 0.

    Column 0 is used as the energy and all remaining columns as momentum
    components; the squared mass E**2 - |p|**2 is clipped at zero before the
    square root is taken.
    """
    energy_squared = particle[:, 0]**2
    momentum_squared = (particle[:, 1:]**2).sum(axis=-1)
    mass_squared = np.clip(energy_squared - momentum_squared, 0, None)
    return np.sqrt(mass_squared)

def get_pt(particle):
    """Return the transverse momentum sqrt(px**2 + py**2) of each row.

    px and py are read from columns 1 and 2 of ``particle``.
    """
    px = particle[:, 1]
    py = particle[:, 2]
    pt_squared = np.square(px) + np.square(py)
    return np.sqrt(pt_squared)

def get_eta(particle):
    """Return arctanh(pz / |p|) for each row (the pseudorapidity).

    |p| is the Euclidean norm of columns 1 onwards and pz is column 3.
    """
    momentum = particle[:, 1:]
    p_norm = np.sqrt((momentum**2).sum(axis=-1))
    pz_fraction = particle[:, 3] / p_norm
    return np.arctanh(pz_fraction)

def get_phi(particle):
    """Return the azimuthal angle arctan2(py, px) of each row.

    px and py are read from columns 1 and 2 of ``particle``.
    """
    px = particle[:, 1]
    py = particle[:, 2]
    return np.arctan2(py, px)

def get_pt_phi_eta_mass(particle):
    """Return (pt, phi, eta, mass) for each row, stacked along the last axis.

    Each quantity comes from the matching ``get_*`` helper; the output has
    one row per input row and four columns in the order pt, phi, eta, mass.
    """
    converters = (get_pt, get_phi, get_eta, get_mass)
    return np.stack([convert(particle) for convert in converters], axis=-1)

(b)

In [None]:
# TODO: Minimal preprocessing

In [None]:
# TODO: Construct and train VAE
# Test different bottleneck sizes

In [None]:
# helper function for plotting
# takes two sets of events (truth/train, generated)
# of shape (n, 8) as input: (E, px, py, pz) for each of the two particles
# Axis labels for the two coordinate systems shown by plot().
components_Eppp = ["E [GeV]", "px [GeV]", "py [GeV]", "pz [GeV]"]
components_jetcoordinates = ["pt [GeV]", "phi", "eta", "mass [GeV]"]
def plot(truth, generated, bins=20):
    """Overlay truth and model histograms for both particles on a 4x4 grid.

    Rows 0-1 show (E, px, py, pz) of particle 1 and particle 2; rows 2-3 show
    (pt, phi, eta, mass) of particle 1 and particle 2, derived with
    ``get_pt_phi_eta_mass``.

    Args:
        truth: Reference events with 8 columns: (E, px, py, pz) of particle 1
            followed by (E, px, py, pz) of particle 2.
        generated: Model events in the same layout as ``truth``.
        bins: Bin specification passed to the truth histogram; the resulting
            bin edges are reused for the model histogram.
    """
    fig, axs = plt.subplots(4,4, figsize=(15,15))

    def draw_two_rows(first_row, truth_columns, model_columns, names):
        # Fill two consecutive grid rows: one row per particle, one panel per
        # component. Column i of the inputs belongs to particle i // 4.
        for i in range(8):
            iparticle, icomponent = divmod(i, 4)
            ax = axs[first_row + iparticle, icomponent]

            # Histogram the truth first and reuse its bin edges, so both
            # distributions share exactly the same binning.
            _, edges, _ = ax.hist(truth_columns[:,i], bins=bins, alpha=.5, label="truth", density=True)
            ax.hist(model_columns[:,i], bins=edges, alpha=.5, label="model", density=True)
            ax.legend()
            ax.set_xlabel(f"{names[icomponent]} of particle {iparticle+1}")

    # plot (E, px, py, pz) for both particles
    draw_two_rows(0, truth, generated, components_Eppp)

    # plot (pt phi eta mass) for both particles
    truth_jet = np.concatenate((get_pt_phi_eta_mass(truth[:,:4]),
                                get_pt_phi_eta_mass(truth[:,4:])), axis=-1)
    generated_jet = np.concatenate((get_pt_phi_eta_mass(generated[:,:4]),
                                    get_pt_phi_eta_mass(generated[:,4:])), axis=-1)
    draw_two_rows(2, truth_jet, generated_jet, components_jetcoordinates)

(c)

In [None]:
# TODO: Test mass and momentum conservation

(e)

In [None]:
# cuts in the dataset
# The two tuples below read off the smallest pt of each particle and the eta
# range of the first particle in the test set.
# NOTE(review): neither tuple is the last expression of this cell, so a
# notebook will presumably not display them; print them or run them in a
# cell of their own to see the values.
get_pt(data_tst[:,:4]).min(), get_pt(data_tst[:,4:]).min() # pt cuts
get_eta(data_tst[:,:4]).min(), get_eta(data_tst[:,:4]).max() # eta range (first particle only)

# Cut values used by preprocess2 / undo_preprocess2.
eta_cut = 2.5 + 1e-5 # nominal cut is 2.5; presumably one event sits marginally at/outside it, hence the small margin
pt_cut = 10.

In [None]:
def preprocess2(events, mean=None, std=None):
    """Reduce events to three standardized features (pt1, eta1, eta2).

    The first four columns of ``events`` are used as particle 1 and the
    remaining columns as particle 2. The bounded quantities are mapped to
    unbounded ones before standardization:

    * ``log(pt1 - pt_cut)`` for the transverse momentum of particle 1,
    * ``arctanh(eta / eta_cut)`` for the pseudorapidity of each particle.

    Args:
        events: 2-D array of events in (E, px, py, pz) components.
        mean: Per-feature mean to subtract. Recomputed from the transformed
            features when either ``mean`` or ``std`` is ``None``.
        std: Per-feature standard deviation to divide by. Recomputed when
            either ``mean`` or ``std`` is ``None``.

    Returns:
        Tuple ``(events_reduced, mean, std)`` where ``events_reduced`` has
        three columns and ``mean``/``std`` are the statistics that were used.

    Raises:
        AssertionError: If any transformed value is not finite, e.g. when an
            event lies on or outside the pt or eta cut.
    """
    particle1, particle2 = events[:,:4], events[:,4:]
    # Only pt of particle 1 and eta of both particles are kept, so compute
    # just those instead of all eight jet coordinates.
    events_reduced = np.stack((get_pt(particle1), get_eta(particle1), get_eta(particle2)), axis=-1)

    events_reduced[:,0] = np.log(events_reduced[:,0] - pt_cut)
    events_reduced[:,1:] = np.arctanh(events_reduced[:,1:] / eta_cut)

    if mean is None or std is None:
        mean = events_reduced.mean(axis=0)
        std = events_reduced.std(axis=0)
    events_reduced = (events_reduced - mean) / std

    assert np.isfinite(events_reduced).all(), "preprocess2 produced non-finite values"
    return events_reduced, mean, std
    
def undo_preprocess2(events_reduced, mean, std):
    """Turn reduced features (pt1, eta1, eta2) back into two four-vectors.

    Inverts the standardization and the log / arctanh maps of ``preprocess2``
    and then rebuilds (E, px, py, pz) for both particles:

    * the azimuthal angle of particle 1 is drawn uniformly from [0, 2*pi),
    * particle 2 gets the opposite transverse momentum of particle 1,
    * both particles get the fixed mass 0.105 (presumably the muon mass in
      GeV — confirm).

    Args:
        events_reduced: Array with exactly three columns as produced by
            ``preprocess2``.
        mean: Mean used in ``preprocess2``.
        std: Standard deviation used in ``preprocess2``.

    Returns:
        Array with eight columns: (E, px, py, pz) of particle 1 followed by
        (E, px, py, pz) of particle 2.
    """
    # Undo the standardization; the arithmetic allocates a new array, so the
    # in-place updates below never touch the caller's input.
    unscaled = events_reduced * std + mean
    unscaled[:, 0] = pt_cut + np.exp(unscaled[:, 0])
    unscaled[:, 1:3] = eta_cut * np.tanh(unscaled[:, 1:3])
    pt1, eta1, eta2 = unscaled.T

    n_events = unscaled.shape[0]
    phi1 = np.random.uniform(0, 2*np.pi, n_events)
    mass1 = np.full(n_events, 0.105)
    mass2 = np.full(n_events, 0.105)

    # Particle 1: transverse components from (pt, phi), longitudinal from eta.
    px1, py1 = pt1 * np.cos(phi1), pt1 * np.sin(phi1)
    pz1 = pt1 * np.sinh(eta1)
    e1 = np.sqrt(mass1**2 + px1**2 + py1**2 + pz1**2)

    # Particle 2: opposite transverse momentum, hence the same pt as particle 1.
    px2, py2 = -px1, -py1
    pz2 = pt1 * np.sinh(eta2)
    e2 = np.sqrt(mass2**2 + px2**2 + py2**2 + pz2**2)

    return np.stack([e1, px1, py1, pz1, e2, px2, py2, pz2], axis=-1)

In [None]:
# TODO: Train VAE on dataset with extended preprocessing