# Preparation

Install the necessary packages:

In [None]:
! pip install numpy awkward vector uproot lz4 xxhash pandas matplotlib tqdm


In [None]:
import os
import numpy as np
import awkward as ak
import uproot
import matplotlib.pyplot as plt
import vector
vector.register_awkward()


Here we define some helper functions to visualize a jet:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Labels for the eight particle categories. The order matches the order of the
# boolean masks stacked into `part_type` in visualize() (charged hadron +/-,
# neutral hadron, photon, electron +/-, muon +/-), i.e. the integer ids used by
# the plotting helpers. The list is not referenced elsewhere in the visible code.
typelist = ['ch+', 'ch-', 'nh', 'ph', 'el+', 'el-', 'mu+', 'mu-']


def make_subplot(ax, data, force_xylim=None):
    """Draw the constituents of one jet on ``ax`` as patches in the (eta, phi) plane.

    Parameters
    ----------
    ax : matplotlib axes
        Axes that receive one or two patches per particle.
    data : dict
        Per-particle sequences. ``'eta'``, ``'phi'``, ``'pt'`` and ``'d3d'`` are
        required. ``'id'`` is optional (integer category as produced by
        ``visualize``); when it is missing every particle is drawn as a circle.
        ``'e'`` is optional; when present, each particle is drawn as two
        half-disc wedges (one sized by ``pt``, one by ``e``) instead of a
        category-dependent marker. The dictionary is not modified.
    force_xylim : float, optional
        Half-width of the square plotting window. When falsy, the largest
        ``|eta|`` or ``|phi|`` found in ``data`` is used instead.

    Returns
    -------
    The half-width that was applied to both axes.
    """
    # Defined once up front: both drawing modes below share the same transparency.
    # (Previously it was only assigned inside the first loop, so the 'e' branch
    # failed with UnboundLocalError.)
    alpha = 0.3

    ids = data.get('id')
    if ids is None:
        # Placeholder that matches none of the special-cased categories below.
        ids = ['default'] * len(data['pt'])

    if data.get('e') is None:
        for eta, phi, pt, pid, d3d in zip(data['eta'], data['phi'], data['pt'], ids, data['d3d']):
            center = (eta, phi)
            radius = np.sqrt(pt) / 200  # marker size grows with sqrt(pt)
            colors = make_color_args(pid, d3d)
            if pid in (4, 5):
                # ids 4/5 (el+/el- in visualize's ordering): black-edged triangle
                patch = mpl.patches.RegularPolygon(center, 3, radius=radius, clip_on=True,
                                                   alpha=alpha, edgecolor='black', **colors)
            elif pid in (6, 7):
                # ids 6/7 (mu+/mu-): same triangle rotated by pi
                patch = mpl.patches.RegularPolygon(center, 3, radius=radius, orientation=np.pi,
                                                   clip_on=True, alpha=alpha, edgecolor='black', **colors)
            elif pid == 3:
                # id 3 (photon): pentagon
                patch = mpl.patches.RegularPolygon(center, 5, radius=radius,
                                                   clip_on=True, alpha=alpha, **colors)
            else:
                # everything else (hadrons, or the 'default' placeholder): circle
                patch = plt.Circle(center, radius, clip_on=True, alpha=alpha, **colors)
            ax.add_patch(patch)
    else:
        for eta, phi, pt, e, pid, d3d in zip(data['eta'], data['phi'], data['pt'], data['e'], ids, data['d3d']):
            center = (eta, phi)
            colors = make_color_args(pid, d3d)
            # Wedge from 90 to 270 degrees sized by pt, wedge from 270 to 90 degrees sized by e.
            ax.add_patch(mpl.patches.Wedge(center, pt / 600., 90, 270,
                                           clip_on=True, alpha=alpha, **colors))
            ax.add_patch(mpl.patches.Wedge(center, e / 600., 270, 90,
                                           clip_on=True, alpha=alpha, **colors))

    max_ang = force_xylim if force_xylim else max(max(abs(data['eta'])), max(abs(data['phi'])))
    # make square plot centered at (0,0)
    ax.set_xlim(-max_ang, max_ang)
    ax.set_ylim(-max_ang, max_ang)
    ax.set_xlabel(r'$\Delta\eta$')
    ax.set_ylabel(r'$\Delta\phi$')
    ax.set_aspect('equal')
    return max_ang


def make_color_args(id, d3d):
    """Return the colour-related patch keyword arguments for one particle.

    The colour is obtained from ``color_fader`` by fading from '#74c476' to
    '#081d58' according to ``d3d``. Categories 2 and 3 (neutral hadron and
    photon in ``visualize``'s ordering) are drawn as unfilled outlines in that
    colour; every other category is filled with it.
    """
    shade = color_fader('#74c476', '#081d58', d3d)
    outlined = id in [2, 3]
    if not outlined:
        return {'facecolor': shade}
    return {'edgecolor': shade, 'linewidth': 1, 'fill': False}


def color_fader(c1, c2, mix=0):
    """Linearly interpolate between two colours and return the result as a hex string.

    Parameters
    ----------
    c1, c2 : matplotlib colour specifications
        End points of the fade: ``c1`` is returned at ``mix=0``, ``c2`` at ``mix=1``.
    mix : float
        Interpolation weight; values outside [0, 1] are clamped to that range.
    """
    # Clamp on both sides so the result always lies between c1 and c2. The upper
    # clamp is applied first, exactly as before; only negative values, which used
    # to extrapolate beyond c1, are newly limited.
    mix = max(0., min(1., mix))
    start = np.array(mpl.colors.to_rgb(c1))
    end = np.array(mpl.colors.to_rgb(c2))
    return mpl.colors.to_hex((1 - mix) * start + mix * end)


def visualize(arrays, idx=0, title=None, ax=None):
    """Plot the constituents of jet number ``idx`` from ``arrays``.

    Parameters
    ----------
    arrays : table of jets
        Indexing with ``idx`` must yield a record exposing the per-particle
        fields ``part_px``, ``part_py``, ``part_deta``, ``part_dphi``,
        ``part_d0val``, ``part_dzval``, ``part_charge`` and the boolean flags
        ``part_isChargedHadron``, ``part_isNeutralHadron``, ``part_isPhoton``,
        ``part_isElectron``, ``part_isMuon``.
    idx : int
        Index of the jet to draw.
    title : str, optional
        Axes title; nothing is set when falsy.
    ax : matplotlib axes, optional
        Axes to draw on; a new 5x5 inch figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    jet = arrays[idx]
    charge = jet.part_charge

    data = {}
    data['pt'] = np.hypot(jet.part_px, jet.part_py)
    data['eta'] = jet.part_deta
    data['phi'] = jet.part_dphi
    # Displacement squashed into [0, 1); used by the helpers to pick the colour.
    data['d3d'] = np.tanh(np.hypot(jet.part_d0val, jet.part_dzval))

    # One boolean mask per category, in the same order as `typelist`.
    masks = [
        (jet.part_isChargedHadron) & (charge == 1),
        (jet.part_isChargedHadron) & (charge == -1),
        jet.part_isNeutralHadron,
        jet.part_isPhoton,
        (jet.part_isElectron) & (charge == 1),
        (jet.part_isElectron) & (charge == -1),
        (jet.part_isMuon) & (charge == 1),
        (jet.part_isMuon) & (charge == -1),
    ]
    part_type = np.concatenate([[mask] for mask in masks], axis=0)
    # Index of the first matching category per particle.
    # NOTE: argmax returns 0 for a particle that matches no category at all.
    data['id'] = np.argmax(part_type.T, axis=1)

    # Sanity check: one id per particle. (The previous form took len() of the
    # element-wise comparison, which never compared lengths and failed on an
    # empty jet.)
    assert len(data['eta']) == len(data['id']), 'number of particle ids does not match number of particles'
    if ax is None:
        _, ax = plt.subplots(figsize=(5, 5))
    # Fixed +-0.5 window so that different jets are drawn on the same scale.
    make_subplot(ax, data, force_xylim=0.5)
    if title:
        ax.set_title(title)
    return ax


# Download the dataset

In [None]:
def download(url, fname, chunk_size=1024):
    '''Stream ``url`` into the local file ``fname`` while showing a tqdm progress bar.

    Adapted from https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51

    The parent directory of ``fname`` is created if needed. Data is first
    written to ``fname + '.part'`` and only renamed to ``fname`` after the whole
    body has been received, so an interrupted or failed download never leaves a
    truncated file at ``fname``.

    Parameters
    ----------
    url : str
        Address to fetch.
    fname : str
        Destination path.
    chunk_size : int
        Number of bytes requested per iteration of the streamed response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx / 5xx).
    '''
    import requests
    from tqdm import tqdm

    if os.path.dirname(fname):
        os.makedirs(os.path.dirname(fname), exist_ok=True)

    tmp_name = fname + '.part'
    try:
        # The context manager releases the connection once streaming is done.
        with requests.get(url, stream=True) as resp:
            # Fail loudly instead of saving an HTML error page as if it were data.
            resp.raise_for_status()
            total = int(resp.headers.get('content-length', 0))
            with open(tmp_name, 'wb') as file, tqdm(
                desc=fname,
                total=total,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for data in resp.iter_content(chunk_size=chunk_size):
                    size = file.write(data)
                    bar.update(size)
        os.replace(tmp_name, fname)
    finally:
        # After a successful rename the temporary file is gone; otherwise clean it up.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)


In [None]:
# Local paths of the two sample files used throughout the notebook.
signal_file = './JetClassMini/TTBar_000.root'
background_file = './JetClassMini/ZJetsToNuNu_000.root'

# Fetch each file only if it is not already present on disk.
for url, path in (
    ('https://hqu.web.cern.ch/datasets/JetClassMini/TTBar_000.root', signal_file),
    ('https://hqu.web.cern.ch/datasets/JetClassMini/ZJetsToNuNu_000.root', background_file),
):
    if not os.path.exists(path):
        download(url, path)


# Explore the dataset


In [None]:
# Open the signal ROOT file with uproot and pick the object stored under the key 'tree'
signal_tree = uproot.open(signal_file)['tree']


In [None]:
# Display the content of the "tree" (uproot's show() prints one line per branch)
signal_tree.show()


In [None]:
# Read all arrays of the tree into memory as a single table.
# Each array becomes one column (field) of `signal_table`.
signal_table = signal_tree.arrays()


In [None]:
# Bare expression at the end of the cell: the notebook displays the table
signal_table


In [None]:
# Same steps as for the signal sample, in one expression: open the file,
# take the 'tree' object and read all of its arrays. The bare name on the
# last line makes the notebook display the resulting table.
background_table = uproot.open(background_file)['tree'].arrays()
background_table


In [None]:
# Draw the first ten signal jets on a 2x5 grid. The grid is filled column by
# column: jets 0 and 1 share the first column, jets 2 and 3 the second, etc.
fig, axes = plt.subplots(2, 5, figsize=(25, 10), dpi=300)
for idx, ax in enumerate(axes.T.flat):
    visualize(signal_table, idx, title=f'Top quark jet {idx}', ax=ax)


In [None]:
# Draw the first ten background jets on a 2x5 grid. The grid is filled column
# by column: jets 0 and 1 share the first column, jets 2 and 3 the second, etc.
fig, axes = plt.subplots(2, 5, figsize=(25, 10), dpi=300)
for idx, ax in enumerate(axes.T.flat):
    visualize(background_table, idx, title=f'q/g jet {idx}', ax=ax)


# Jet properties


In [None]:
# Names of all columns of the signal table that start with 'jet_';
# the bare name on the last line makes the notebook display the list.
jet_features = [k for k in signal_table.fields if k.startswith('jet_')]
jet_features


In [None]:
# Convert the jet-level columns of the signal sample to a pandas DataFrame
# and show its first 10 rows.
df_jet_signal = ak.to_dataframe(signal_table[jet_features])
df_jet_signal.head(10)


In [None]:
# Convert the jet-level columns of the background sample to a pandas DataFrame
# and show its first 10 rows.
df_jet_background = ak.to_dataframe(background_table[jet_features])
df_jet_background.head(10)


In [None]:
# One figure per jet-level feature, overlaying the normalised (density=True)
# distributions of the signal and background samples as step histograms.
for feature in jet_features:
    fig, ax = plt.subplots(figsize=(5, 5), dpi=150)
    ax.hist([df_jet_signal[feature], df_jet_background[feature]],
            bins=50, label=['Top', 'q/g'], histtype='step', density=True)
    ax.set_xlabel(feature)
    ax.legend()


# Jet constituent properties


In [None]:
# Names of all columns of the signal table that start with 'part_';
# the bare name on the last line makes the notebook display the list.
particle_features = [k for k in signal_table.fields if k.startswith('part_')]
particle_features


In [None]:
def plot_particle_features(name):
    """Compare one per-particle feature between the signal and background samples.

    Creates a figure with two panels showing the same normalised step
    histograms of ``signal_table[name]`` and ``background_table[name]``
    (flattened over all jets): the first panel with a linear y axis, the
    second with a logarithmic one.
    """
    fig, (ax_linear, ax_log) = plt.subplots(1, 2, figsize=(10, 5), dpi=300)
    # Flatten once; both panels histogram exactly the same values.
    samples = [ak.flatten(signal_table[name]), ak.flatten(background_table[name])]
    for panel, log_scale in ((ax_linear, False), (ax_log, True)):
        panel.hist(samples, bins=50, label=['Top', 'q/g'], histtype='step', density=True)
        panel.set_xlabel(name)
        if log_scale:
            panel.set_yscale('log')
        panel.legend()


In [None]:
# Produce one two-panel comparison figure for every per-particle column
for name in particle_features:
    plot_particle_features(name)


In [None]:
def add_features(table):
    """Add derived per-particle columns to ``table`` in place.

    Builds a Lorentz 4-vector from the (px, py, pz, energy) arrays and stores it
    as ``part_p4``; its transverse momentum is stored as ``part_pt`` and the
    natural logarithm of that as ``part_pt_log``. Nothing is returned.
    """
    components = {
        'px': table['part_px'],
        'py': table['part_py'],
        'pz': table['part_pz'],
        'energy': table['part_energy'],
    }
    p4 = vector.zip(components)
    table['part_p4'] = p4

    pt = p4.pt
    table['part_pt'] = pt
    table['part_pt_log'] = np.log(pt)


In [None]:
# Attach the derived columns to both samples ...
for table in (signal_table, background_table):
    add_features(table)

# ... then compare the transverse momentum before and after the log transformation.
for feature in ('part_pt', 'part_pt_log'):
    plot_particle_features(feature)


In addition to the log transformation, another useful transformation is tanh(...), or tanh(const * ...):

In [None]:
# Store tanh(part_d0val) as a new column in both samples ...
for table in (signal_table, background_table):
    table['part_d0val_tanh'] = np.tanh(table['part_d0val'])

# ... and compare the feature before and after the transformation.
for feature in ('part_d0val', 'part_d0val_tanh'):
    plot_particle_features(feature)
