In [1]:
import warnings
warnings.simplefilter('ignore')

import gc

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
pd.set_option('max_columns', 500)
pd.set_option('max_rows', 200)
pd.set_option('float_format', lambda x: '%.6f' % x)

from tqdm import tqdm
tqdm.pandas()

In [2]:
train = pd.read_csv('./jet_complex_data/complex_train_R04_particle.csv')
test = pd.read_csv('./jet_complex_data/complex_test_R04_particle.csv')

df = pd.concat([train, test], axis=0)

train_length = len(train)

del train, test
gc.collect()

11

In [3]:
df.head()

Unnamed: 0,particle_category,particle_px,particle_py,particle_pz,particle_energy,particle_mass,jet_id
0,-211,-1.29973,0.368729,-0.057008,1.3594,0.13957,01291194f90c44c7bd79d9dbd50abd93
1,-2212,1.30542,0.418302,-0.426083,1.71493,0.93827,7df78235aa464796bbb386765d4784b5
2,22,-0.611265,0.111135,-0.225794,0.661044,0.0,9aeaad70b98f4bf0890734137f88c7ac
3,211,3.77857,0.205956,-1.08938,3.94033,0.13957,4261ccf933b64ce6ba83c623e9a5bf06
4,22,0.461909,0.330447,-0.16864,0.592448,0.0,e79cf654192e4cab8b83f50554fededc


In [4]:
df[['particle_px', 'particle_py', 'particle_pz', 'particle_energy', 'particle_mass']].describe()

Unnamed: 0,particle_px,particle_py,particle_pz,particle_energy,particle_mass
count,35791152.0,35791152.0,35791152.0,35791152.0,35791152.0
mean,-0.000353,7e-05,0.000218,7.457329,0.150251
std,15.901808,2.342705,2.300309,14.426177,0.237859
min,-453.928,-346.806,-347.04,1.5e-05,0.0
25%,-2.44307,-0.266066,-0.251261,1.0836,0.0
50%,-0.002857,-5e-06,9.9e-05,2.82665,0.13957
75%,2.4388,0.265616,0.251596,7.43467,0.13957
max,471.061,301.234,376.087,477.471,0.93957


In [5]:
def l2_dict(x, y, z): 
    return np.sqrt(np.square(x)+np.square(y)+np.square(z))

In [6]:
%%time

do_in_vec = np.vectorize(l2_dict, otypes=[np.float])
vec = do_in_vec(df.particle_px, df.particle_py, df.particle_pz)
df['particle_distance'] = vec

CPU times: user 4min 25s, sys: 1.9 s, total: 4min 27s
Wall time: 4min 26s


In [6]:
# vec = do_in_vec(df.particle_px, df.particle_py, 0)
# df['xy_dis'] = vec

# vec = do_in_vec(0, df.particle_py, df.particle_pz)
# df['yz_dis'] = vec

# vec = do_in_vec(df.particle_px, 0, df.particle_pz)
# df['zx_dis'] = vec

In [7]:
df['x_div_dist'] = df['particle_px'] / df['particle_distance']
# df['y_div_dist'] = df['particle_py'] / df['particle_distance']
# df['z_div_dist'] = df['particle_pz'] / df['particle_distance']
# df['xy_div_dist'] = df['xy_dis'] / df['particle_distance']
# df['yz_div_dist'] = df['yz_dis'] / df['particle_distance']
# df['zx_div_dist'] = df['zx_dis'] / df['particle_distance']

In [8]:
df['energy_x'] = df['particle_energy'] * df['x_div_dist']
# df['energy_y'] = df['particle_energy'] * df['y_div_dist']
# df['energy_z'] = df['particle_energy'] * df['z_div_dist']
# df['energy_xy'] = df['particle_energy'] * df['xy_div_dist']
# df['energy_yz'] = df['particle_energy'] * df['yz_div_dist']
# df['energy_zx'] = df['particle_energy'] * df['zx_div_dist']

In [9]:
df['mass_x'] = df['particle_mass'] * df['x_div_dist']
# df['mass_y'] = df['particle_mass'] * df['y_div_dist']
# df['mass_z'] = df['particle_mass'] * df['z_div_dist']
# df['mass_xy'] = df['particle_mass'] * df['xy_div_dist']
# df['mass_yz'] = df['particle_mass'] * df['yz_div_dist']
# df['mass_zx'] = df['particle_mass'] * df['zx_div_dist']

In [10]:
def angle(x, y):
    return np.degrees(np.math.atan(x / y)) if y != 0 else 0

do_in_vec = np.vectorize(angle, otypes=[np.float])

vec = do_in_vec(df.particle_px, df.particle_py)
df['angle_xy'] = vec
# vec = do_in_vec(df.particle_py, df.particle_px)
# df['angle_yx'] = vec
# vec = do_in_vec(df.particle_py, df.particle_pz)
# df['angle_yz'] = vec
# vec = do_in_vec(df.particle_pz, df.particle_py)
# df['angle_zy'] = vec
# vec = do_in_vec(df.particle_pz, df.particle_px)
# df['angle_zx'] = vec
vec = do_in_vec(df.particle_px, df.particle_pz)
df['angle_xz'] = vec

In [11]:
def calculate_speed(e, m):
    return np.sqrt(2*e/m) if m > 0 else 0

In [12]:
%%time

do_in_vec = np.vectorize(calculate_speed, otypes=[np.float])
vec = do_in_vec(df.particle_energy, df.particle_mass)
df['particle_speed'] = vec

CPU times: user 37.8 s, sys: 1.53 s, total: 39.3 s
Wall time: 38.9 s


In [13]:
df['speed_x'] = df['particle_speed'] * df['x_div_dist']
# df['speed_y'] = df['particle_speed'] * df['y_div_dist']
# df['speed_z'] = df['particle_speed'] * df['z_div_dist']
# df['speed_xy'] = df['particle_speed'] * df['xy_div_dist']
# df['speed_yz'] = df['particle_speed'] * df['yz_div_dist']
# df['speed_zx'] = df['particle_speed'] * df['zx_div_dist']

In [14]:
def calculate_travel_time(d, v):
    return np.abs(d) / v if v > 0 else 0

do_in_vec = np.vectorize(calculate_travel_time, otypes=[np.float])

# vec = do_in_vec(df.particle_distance, df.particle_speed)
# df['time_dis'] = vec

vec = do_in_vec(df.particle_px, df.speed_x)
df['time_x'] = vec

# vec = do_in_vec(df.particle_py, df.speed_y)
# df['time_y'] = vec

# vec = do_in_vec(df.particle_pz, df.speed_z)
# df['time_z'] = vec

# vec = do_in_vec(df.xy_dis, df.speed_xy)
# df['time_xy'] = vec

# vec = do_in_vec(df.yz_dis, df.speed_yz)
# df['time_yz'] = vec

# vec = do_in_vec(df.zx_dis, df.speed_zx)
# df['time_zx'] = vec

In [14]:
# df['particle_mv'] = df['particle_mass'] * df['particle_speed']
# df['mv_x'] = df['particle_mv'] * df['x_div_dist']
# df['mv_y'] = df['particle_mv'] * df['y_div_dist']
# df['mv_z'] = df['particle_mv'] * df['z_div_dist']
# df['mv_xy'] = df['particle_mv'] * df['xy_div_dist']
# df['mv_yz'] = df['particle_mv'] * df['yz_div_dist']
# df['mv_zx'] = df['particle_mv'] * df['zx_div_dist']

In [15]:
def brute_force(df):
    
#     df['jet_id_mass_max'] = df.groupby('jet_id')['particle_mass'].transform('max')
#     df['jet_id_mass_mean'] = df.groupby('jet_id')['particle_mass'].transform('mean')
#     df['jet_id_mass_min'] = df.groupby('jet_id')['particle_mass'].transform('min')
    df['jet_id_mass_std'] = df.groupby('jet_id')['particle_mass'].transform('std')
    
#     df['jet_id_energy_max'] = df.groupby('jet_id')['particle_energy'].transform('max')
#     df['jet_id_energy_mean'] = df.groupby('jet_id')['particle_energy'].transform('mean')
#     df['jet_id_energy_min'] = df.groupby('jet_id')['particle_energy'].transform('min')
    df['jet_id_energy_std'] = df.groupby('jet_id')['particle_energy'].transform('std')
    
#     df['jet_id_mass_x_max'] = df.groupby('jet_id')['mass_x'].transform('max')
#     df['jet_id_mass_x_mean'] = df.groupby('jet_id')['mass_x'].transform('mean')
#     df['jet_id_mass_x_min'] = df.groupby('jet_id')['mass_x'].transform('min')
    df['jet_id_mass_x_std'] = df.groupby('jet_id')['mass_x'].transform('std')
    
#     df['jet_id_mass_y_max'] = df.groupby('jet_id')['mass_y'].transform('max')
#     df['jet_id_mass_y_mean'] = df.groupby('jet_id')['mass_y'].transform('mean')
#     df['jet_id_mass_y_min'] = df.groupby('jet_id')['mass_y'].transform('min')
#     df['jet_id_mass_y_std'] = df.groupby('jet_id')['mass_y'].transform('std')
    
#     df['jet_id_mass_z_max'] = df.groupby('jet_id')['mass_z'].transform('max')
#     df['jet_id_mass_z_mean'] = df.groupby('jet_id')['mass_z'].transform('mean')
#     df['jet_id_mass_z_min'] = df.groupby('jet_id')['mass_z'].transform('min')
#     df['jet_id_mass_z_std'] = df.groupby('jet_id')['mass_z'].transform('std')
    
#     df['jet_id_mass_xy_max'] = df.groupby('jet_id')['mass_xy'].transform('max')
#     df['jet_id_mass_xy_mean'] = df.groupby('jet_id')['mass_xy'].transform('mean')
#     df['jet_id_mass_xy_min'] = df.groupby('jet_id')['mass_xy'].transform('min')
#     df['jet_id_mass_xy_std'] = df.groupby('jet_id')['mass_xy'].transform('std')
    
#     df['jet_id_mass_yz_max'] = df.groupby('jet_id')['mass_yz'].transform('max')
#     df['jet_id_mass_yz_mean'] = df.groupby('jet_id')['mass_yz'].transform('mean')
#     df['jet_id_mass_yz_min'] = df.groupby('jet_id')['mass_yz'].transform('min')
#     df['jet_id_mass_yz_std'] = df.groupby('jet_id')['mass_yz'].transform('std')
    
#     df['jet_id_mass_zx_max'] = df.groupby('jet_id')['mass_zx'].transform('max')
#     df['jet_id_mass_zx_mean'] = df.groupby('jet_id')['mass_zx'].transform('mean')
#     df['jet_id_mass_zx_min'] = df.groupby('jet_id')['mass_zx'].transform('min')
#     df['jet_id_mass_zx_std'] = df.groupby('jet_id')['mass_zx'].transform('std')
    
#     df['jet_id_energy_x_max'] = df.groupby('jet_id')['energy_x'].transform('max')
#     df['jet_id_energy_x_mean'] = df.groupby('jet_id')['energy_x'].transform('mean')
#     df['jet_id_energy_x_min'] = df.groupby('jet_id')['energy_x'].transform('min')
    df['jet_id_energy_x_std'] = df.groupby('jet_id')['energy_x'].transform('std')
    
#     df['jet_id_energy_y_max'] = df.groupby('jet_id')['energy_y'].transform('max')
#     df['jet_id_energy_y_mean'] = df.groupby('jet_id')['energy_y'].transform('mean')
#     df['jet_id_energy_y_min'] = df.groupby('jet_id')['energy_y'].transform('min')
#     df['jet_id_energy_y_std'] = df.groupby('jet_id')['energy_y'].transform('std')
    
#     df['jet_id_energy_z_max'] = df.groupby('jet_id')['energy_z'].transform('max')
#     df['jet_id_energy_z_mean'] = df.groupby('jet_id')['energy_z'].transform('mean')
#     df['jet_id_energy_z_min'] = df.groupby('jet_id')['energy_z'].transform('min')
#     df['jet_id_energy_z_std'] = df.groupby('jet_id')['energy_z'].transform('std')
    
#     df['jet_id_energy_xy_max'] = df.groupby('jet_id')['energy_xy'].transform('max')
#     df['jet_id_energy_xy_mean'] = df.groupby('jet_id')['energy_xy'].transform('mean')
#     df['jet_id_energy_xy_min'] = df.groupby('jet_id')['energy_xy'].transform('min')
#     df['jet_id_energy_xy_std'] = df.groupby('jet_id')['energy_xy'].transform('std')
    
#     df['jet_id_energy_yz_max'] = df.groupby('jet_id')['energy_yz'].transform('max')
#     df['jet_id_energy_yz_mean'] = df.groupby('jet_id')['energy_yz'].transform('mean')
#     df['jet_id_energy_yz_min'] = df.groupby('jet_id')['energy_yz'].transform('min')
#     df['jet_id_energy_yz_std'] = df.groupby('jet_id')['energy_yz'].transform('std')
    
#     df['jet_id_energy_zx_max'] = df.groupby('jet_id')['energy_zx'].transform('max')
#     df['jet_id_energy_zx_mean'] = df.groupby('jet_id')['energy_zx'].transform('mean')
#     df['jet_id_energy_zx_min'] = df.groupby('jet_id')['energy_zx'].transform('min')
#     df['jet_id_energy_zx_std'] = df.groupby('jet_id')['energy_zx'].transform('std')
    
#     df['jet_id_distance_max'] = df.groupby('jet_id')['particle_distance'].transform('max')
#     df['jet_id_distance_mean'] = df.groupby('jet_id')['particle_distance'].transform('mean')
#     df['jet_id_distance_min'] = df.groupby('jet_id')['particle_distance'].transform('min')
#     df['jet_id_distance_std'] = df.groupby('jet_id')['particle_distance'].transform('std')
    
#     df['jet_id_xy_dis_max'] = df.groupby('jet_id')['xy_dis'].transform('max')
#     df['jet_id_xy_dis_mean'] = df.groupby('jet_id')['xy_dis'].transform('mean')
#     df['jet_id_xy_dis_min'] = df.groupby('jet_id')['xy_dis'].transform('min')
#     df['jet_id_xy_dis_std'] = df.groupby('jet_id')['xy_dis'].transform('std')
    
#     df['jet_id_yz_dis_max'] = df.groupby('jet_id')['yz_dis'].transform('max')
#     df['jet_id_yz_dis_mean'] = df.groupby('jet_id')['yz_dis'].transform('mean')
#     df['jet_id_yz_dis_min'] = df.groupby('jet_id')['yz_dis'].transform('min')
#     df['jet_id_yz_dis_std'] = df.groupby('jet_id')['yz_dis'].transform('std')
    
#     df['jet_id_zx_dis_max'] = df.groupby('jet_id')['zx_dis'].transform('max')
#     df['jet_id_zx_dis_mean'] = df.groupby('jet_id')['zx_dis'].transform('mean')
#     df['jet_id_zx_dis_min'] = df.groupby('jet_id')['zx_dis'].transform('min')
#     df['jet_id_zx_dis_std'] = df.groupby('jet_id')['zx_dis'].transform('std')
    
#     df['jet_id_x_div_dist_max'] = df.groupby('jet_id')['x_div_dist'].transform('max')
#     df['jet_id_x_div_dist_mean'] = df.groupby('jet_id')['x_div_dist'].transform('mean')
#     df['jet_id_x_div_dist_min'] = df.groupby('jet_id')['x_div_dist'].transform('min')
    df['jet_id_x_div_dist_std'] = df.groupby('jet_id')['x_div_dist'].transform('std')
    
#     df['jet_id_y_div_dist_max'] = df.groupby('jet_id')['y_div_dist'].transform('max')
#     df['jet_id_y_div_dist_mean'] = df.groupby('jet_id')['y_div_dist'].transform('mean')
#     df['jet_id_y_div_dist_min'] = df.groupby('jet_id')['y_div_dist'].transform('min')
#     df['jet_id_y_div_dist_std'] = df.groupby('jet_id')['y_div_dist'].transform('std')
    
#     df['jet_id_z_div_dist_max'] = df.groupby('jet_id')['z_div_dist'].transform('max')
#     df['jet_id_z_div_dist_mean'] = df.groupby('jet_id')['z_div_dist'].transform('mean')
#     df['jet_id_z_div_dist_min'] = df.groupby('jet_id')['z_div_dist'].transform('min')
#     df['jet_id_z_div_dist_std'] = df.groupby('jet_id')['z_div_dist'].transform('std')
    
#     df['jet_id_xy_div_dist_max'] = df.groupby('jet_id')['xy_div_dist'].transform('max')
#     df['jet_id_xy_div_dist_mean'] = df.groupby('jet_id')['xy_div_dist'].transform('mean')
#     df['jet_id_xy_div_dist_min'] = df.groupby('jet_id')['xy_div_dist'].transform('min')
#     df['jet_id_xy_div_dist_std'] = df.groupby('jet_id')['xy_div_dist'].transform('std')
    
#     df['jet_id_yz_div_dist_max'] = df.groupby('jet_id')['yz_div_dist'].transform('max')
#     df['jet_id_yz_div_dist_mean'] = df.groupby('jet_id')['yz_div_dist'].transform('mean')
#     df['jet_id_yz_div_dist_min'] = df.groupby('jet_id')['yz_div_dist'].transform('min')
#     df['jet_id_yz_div_dist_std'] = df.groupby('jet_id')['yz_div_dist'].transform('std')
    
#     df['jet_id_zx_div_dist_max'] = df.groupby('jet_id')['zx_div_dist'].transform('max')
#     df['jet_id_zx_div_dist_mean'] = df.groupby('jet_id')['zx_div_dist'].transform('mean')
#     df['jet_id_zx_div_dist_min'] = df.groupby('jet_id')['zx_div_dist'].transform('min')
#     df['jet_id_zx_div_dist_std'] = df.groupby('jet_id')['zx_div_dist'].transform('std')
    
#     df['jet_id_speed_max'] = df.groupby('jet_id')['particle_speed'].transform('max')
#     df['jet_id_speed_mean'] = df.groupby('jet_id')['particle_speed'].transform('mean')
#     df['jet_id_speed_min'] = df.groupby('jet_id')['particle_speed'].transform('min')
    df['jet_id_speed_std'] = df.groupby('jet_id')['particle_speed'].transform('std')
    
#     df['jet_id_speed_x_max'] = df.groupby('jet_id')['speed_x'].transform('max')
#     df['jet_id_speed_x_mean'] = df.groupby('jet_id')['speed_x'].transform('mean')
#     df['jet_id_speed_x_min'] = df.groupby('jet_id')['speed_x'].transform('min')
    df['jet_id_speed_x_std'] = df.groupby('jet_id')['speed_x'].transform('std')
    
#     df['jet_id_speed_y_max'] = df.groupby('jet_id')['speed_y'].transform('max')
#     df['jet_id_speed_y_mean'] = df.groupby('jet_id')['speed_y'].transform('mean')
#     df['jet_id_speed_y_min'] = df.groupby('jet_id')['speed_y'].transform('min')
#     df['jet_id_speed_y_std'] = df.groupby('jet_id')['speed_y'].transform('std')
    
#     df['jet_id_speed_z_max'] = df.groupby('jet_id')['speed_z'].transform('max')
#     df['jet_id_speed_z_mean'] = df.groupby('jet_id')['speed_z'].transform('mean')
#     df['jet_id_speed_z_min'] = df.groupby('jet_id')['speed_z'].transform('min')
#     df['jet_id_speed_z_std'] = df.groupby('jet_id')['speed_z'].transform('std')
    
#     df['jet_id_speed_xy_max'] = df.groupby('jet_id')['speed_xy'].transform('max')
#     df['jet_id_speed_xy_mean'] = df.groupby('jet_id')['speed_xy'].transform('mean')
#     df['jet_id_speed_xy_min'] = df.groupby('jet_id')['speed_xy'].transform('min')
#     df['jet_id_speed_xy_std'] = df.groupby('jet_id')['speed_xy'].transform('std')
    
#     df['jet_id_speed_yz_max'] = df.groupby('jet_id')['speed_yz'].transform('max')
#     df['jet_id_speed_yz_mean'] = df.groupby('jet_id')['speed_yz'].transform('mean')
#     df['jet_id_speed_yz_min'] = df.groupby('jet_id')['speed_yz'].transform('min')
#     df['jet_id_speed_yz_std'] = df.groupby('jet_id')['speed_yz'].transform('std')
    
#     df['jet_id_speed_zx_max'] = df.groupby('jet_id')['speed_zx'].transform('max')
#     df['jet_id_speed_zx_mean'] = df.groupby('jet_id')['speed_zx'].transform('mean')
#     df['jet_id_speed_zx_min'] = df.groupby('jet_id')['speed_zx'].transform('min')
#     df['jet_id_speed_zx_std'] = df.groupby('jet_id')['speed_zx'].transform('std')    
    
#     df['jet_id_px_max'] = df.groupby('jet_id')['particle_px'].transform('max')
#     df['jet_id_px_mean'] = df.groupby('jet_id')['particle_px'].transform('mean')
#     df['jet_id_px_min'] = df.groupby('jet_id')['particle_px'].transform('min')
    df['jet_id_px_std'] = df.groupby('jet_id')['particle_px'].transform('std')
    
#     df['jet_id_py_max'] = df.groupby('jet_id')['particle_py'].transform('max')
#     df['jet_id_py_mean'] = df.groupby('jet_id')['particle_py'].transform('mean')
#     df['jet_id_py_min'] = df.groupby('jet_id')['particle_py'].transform('min')
#     df['jet_id_py_std'] = df.groupby('jet_id')['particle_py'].transform('std')
    
#     df['jet_id_pz_max'] = df.groupby('jet_id')['particle_pz'].transform('max')
#     df['jet_id_pz_mean'] = df.groupby('jet_id')['particle_pz'].transform('mean')
#     df['jet_id_pz_min'] = df.groupby('jet_id')['particle_pz'].transform('min')
#     df['jet_id_pz_std'] = df.groupby('jet_id')['particle_pz'].transform('std')
    
#     df['jet_id_angle_xy_max'] = df.groupby('jet_id')['angle_xy'].transform('max')
#     df['jet_id_angle_xy_mean'] = df.groupby('jet_id')['angle_xy'].transform('mean')
#     df['jet_id_angle_xy_min'] = df.groupby('jet_id')['angle_xy'].transform('min')
    df['jet_id_angle_xy_std'] = df.groupby('jet_id')['angle_xy'].transform('std')
    
#     df['jet_id_angle_xz_max'] = df.groupby('jet_id')['angle_xz'].transform('max')
#     df['jet_id_angle_xz_mean'] = df.groupby('jet_id')['angle_xz'].transform('mean')
#     df['jet_id_angle_xz_min'] = df.groupby('jet_id')['angle_xz'].transform('min')
    df['jet_id_angle_xz_std'] = df.groupby('jet_id')['angle_xz'].transform('std')
    
#     df['jet_id_angle_yx_max'] = df.groupby('jet_id')['angle_yx'].transform('max')
#     df['jet_id_angle_yx_mean'] = df.groupby('jet_id')['angle_yx'].transform('mean')
#     df['jet_id_angle_yx_min'] = df.groupby('jet_id')['angle_yx'].transform('min')
#     df['jet_id_angle_yx_std'] = df.groupby('jet_id')['angle_yx'].transform('std')
    
#     df['jet_id_angle_yz_max'] = df.groupby('jet_id')['angle_yz'].transform('max')
#     df['jet_id_angle_yz_mean'] = df.groupby('jet_id')['angle_yz'].transform('mean')
#     df['jet_id_angle_yz_min'] = df.groupby('jet_id')['angle_yz'].transform('min')
#     df['jet_id_angle_yz_std'] = df.groupby('jet_id')['angle_yz'].transform('std')
    
#     df['jet_id_angle_zy_max'] = df.groupby('jet_id')['angle_zy'].transform('max')
#     df['jet_id_angle_zy_mean'] = df.groupby('jet_id')['angle_zy'].transform('mean')
#     df['jet_id_angle_zy_min'] = df.groupby('jet_id')['angle_zy'].transform('min')
#     df['jet_id_angle_zy_std'] = df.groupby('jet_id')['angle_zy'].transform('std')
    
#     df['jet_id_angle_zx_max'] = df.groupby('jet_id')['angle_zx'].transform('max')
#     df['jet_id_angle_zx_mean'] = df.groupby('jet_id')['angle_zx'].transform('mean')
#     df['jet_id_angle_zx_min'] = df.groupby('jet_id')['angle_zx'].transform('min')
#     df['jet_id_angle_zx_std'] = df.groupby('jet_id')['angle_zx'].transform('std')
    
#     df['jet_id_time_dis_max'] = df.groupby('jet_id')['time_dis'].transform('max')
#     df['jet_id_time_dis_mean'] = df.groupby('jet_id')['time_dis'].transform('mean')
#     df['jet_id_time_dis_min'] = df.groupby('jet_id')['time_dis'].transform('min')
#     df['jet_id_time_dis_std'] = df.groupby('jet_id')['time_dis'].transform('std')
    
#     df['jet_id_time_x_max'] = df.groupby('jet_id')['time_x'].transform('max')
#     df['jet_id_time_x_mean'] = df.groupby('jet_id')['time_x'].transform('mean')
#     df['jet_id_time_x_min'] = df.groupby('jet_id')['time_x'].transform('min')
    df['jet_id_time_x_std'] = df.groupby('jet_id')['time_x'].transform('std')
    
#     df['jet_id_time_y_max'] = df.groupby('jet_id')['time_y'].transform('max')
#     df['jet_id_time_y_mean'] = df.groupby('jet_id')['time_y'].transform('mean')
#     df['jet_id_time_y_min'] = df.groupby('jet_id')['time_y'].transform('min')
#     df['jet_id_time_y_std'] = df.groupby('jet_id')['time_y'].transform('std')
    
#     df['jet_id_time_z_max'] = df.groupby('jet_id')['time_z'].transform('max')
#     df['jet_id_time_z_mean'] = df.groupby('jet_id')['time_z'].transform('mean')
#     df['jet_id_time_z_min'] = df.groupby('jet_id')['time_z'].transform('min')
#     df['jet_id_time_z_std'] = df.groupby('jet_id')['time_z'].transform('std')
    
#     df['jet_id_time_xy_max'] = df.groupby('jet_id')['time_xy'].transform('max')
#     df['jet_id_time_xy_mean'] = df.groupby('jet_id')['time_xy'].transform('mean')
#     df['jet_id_time_xy_min'] = df.groupby('jet_id')['time_xy'].transform('min')
#     df['jet_id_time_xy_std'] = df.groupby('jet_id')['time_xy'].transform('std')
    
#     df['jet_id_time_yz_max'] = df.groupby('jet_id')['time_yz'].transform('max')
#     df['jet_id_time_yz_mean'] = df.groupby('jet_id')['time_yz'].transform('mean')
#     df['jet_id_time_yz_min'] = df.groupby('jet_id')['time_yz'].transform('min')
#     df['jet_id_time_yz_std'] = df.groupby('jet_id')['time_yz'].transform('std')
    
#     df['jet_id_time_zx_max'] = df.groupby('jet_id')['time_zx'].transform('max')
#     df['jet_id_time_zx_mean'] = df.groupby('jet_id')['time_zx'].transform('mean')
#     df['jet_id_time_zx_min'] = df.groupby('jet_id')['time_zx'].transform('min')
#     df['jet_id_time_zx_std'] = df.groupby('jet_id')['time_zx'].transform('std')
    
#     df['jet_id_mv_max'] = df.groupby('jet_id')['jet_mv'].transform('max')
#     df['jet_id_mv_mean'] = df.groupby('jet_id')['jet_mv'].transform('mean')
#     df['jet_id_mv_min'] = df.groupby('jet_id')['jet_mv'].transform('min')
#     df['jet_id_mv_std'] = df.groupby('jet_id')['jet_mv'].transform('std')
    
#     df['jet_id_mv_x_max'] = df.groupby('jet_id')['mv_x'].transform('max')
#     df['jet_id_mv_x_mean'] = df.groupby('jet_id')['mv_x'].transform('mean')
#     df['jet_id_mv_x_min'] = df.groupby('jet_id')['mv_x'].transform('min')
#     df['jet_id_mv_x_std'] = df.groupby('jet_id')['mv_x'].transform('std')
    
#     df['jet_id_mv_y_max'] = df.groupby('jet_id')['mv_y'].transform('max')
#     df['jet_id_mv_y_mean'] = df.groupby('jet_id')['mv_y'].transform('mean')
#     df['jet_id_mv_y_min'] = df.groupby('jet_id')['mv_y'].transform('min')
#     df['jet_id_mv_y_std'] = df.groupby('jet_id')['mv_y'].transform('std')
    
#     df['jet_id_mv_z_max'] = df.groupby('jet_id')['mv_z'].transform('max')
#     df['jet_id_mv_z_mean'] = df.groupby('jet_id')['mv_z'].transform('mean')
#     df['jet_id_mv_z_min'] = df.groupby('jet_id')['mv_z'].transform('min')
#     df['jet_id_mv_z_std'] = df.groupby('jet_id')['mv_z'].transform('std')
    
#     df['jet_id_mv_xy_max'] = df.groupby('jet_id')['mv_xy'].transform('max')
#     df['jet_id_mv_xy_mean'] = df.groupby('jet_id')['mv_xy'].transform('mean')
#     df['jet_id_mv_xy_min'] = df.groupby('jet_id')['mv_xy'].transform('min')
#     df['jet_id_mv_xy_std'] = df.groupby('jet_id')['mv_xy'].transform('std')
    
#     df['jet_id_mv_yz_max'] = df.groupby('jet_id')['mv_yz'].transform('max')
#     df['jet_id_mv_yz_mean'] = df.groupby('jet_id')['mv_yz'].transform('mean')
#     df['jet_id_mv_yz_min'] = df.groupby('jet_id')['mv_yz'].transform('min')
#     df['jet_id_mv_yz_std'] = df.groupby('jet_id')['mv_yz'].transform('std')
    
#     df['jet_id_mv_zx_max'] = df.groupby('jet_id')['mv_zx'].transform('max')
#     df['jet_id_mv_zx_mean'] = df.groupby('jet_id')['mv_zx'].transform('mean')
#     df['jet_id_mv_zx_min'] = df.groupby('jet_id')['mv_zx'].transform('min')
#     df['jet_id_mv_zx_std'] = df.groupby('jet_id')['mv_zx'].transform('std')

    df['particle_category_abs'] = np.abs(df['particle_category'])

    df['particle_category_unique_len'] = df.groupby(['jet_id'])['particle_category'].transform('unique').apply(len)
    df['particle_category_unique_len_abs'] = df.groupby(['jet_id'])['particle_category_abs'].transform('unique').apply(len)
    
    return df

In [16]:
df = brute_force(df)

In [17]:
df['jet_id_energy_sum'] = df.groupby('jet_id')['particle_energy'].transform('sum')
df['jet_id_mass_sum'] = df.groupby('jet_id')['particle_mass'].transform('sum')

In [18]:
df.columns

Index(['particle_category', 'particle_px', 'particle_py', 'particle_pz',
       'particle_energy', 'particle_mass', 'jet_id', 'particle_distance',
       'x_div_dist', 'energy_x', 'mass_x', 'angle_xy', 'angle_xz',
       'particle_speed', 'speed_x', 'time_x', 'jet_id_mass_std',
       'jet_id_energy_std', 'jet_id_mass_x_std', 'jet_id_energy_x_std',
       'jet_id_x_div_dist_std', 'jet_id_speed_std', 'jet_id_speed_x_std',
       'jet_id_px_std', 'jet_id_angle_xy_std', 'jet_id_angle_xz_std',
       'jet_id_time_x_std', 'particle_category_abs',
       'particle_category_unique_len', 'particle_category_unique_len_abs',
       'jet_id_energy_sum', 'jet_id_mass_sum'],
      dtype='object')

In [18]:
# do_in_vec = np.vectorize(calculate_speed, otypes=[np.float])
# vec = do_in_vec(df.jet_id_energy_sum, df.jet_id_mass_sum)
# df['jet_id_speed'] = vec

In [16]:
# to_drop_cols = [
#     'particle_distance', 'speed_x', 'energy_x',
# ]

# df = df.drop(to_drop_cols, axis=1)

In [19]:
df.shape

(35791152, 32)

In [20]:
df.head()

Unnamed: 0,particle_category,particle_px,particle_py,particle_pz,particle_energy,particle_mass,jet_id,particle_distance,x_div_dist,energy_x,mass_x,angle_xy,angle_xz,particle_speed,speed_x,time_x,jet_id_mass_std,jet_id_energy_std,jet_id_mass_x_std,jet_id_energy_x_std,jet_id_x_div_dist_std,jet_id_speed_std,jet_id_speed_x_std,jet_id_px_std,jet_id_angle_xy_std,jet_id_angle_xz_std,jet_id_time_x_std,particle_category_abs,particle_category_unique_len,particle_category_unique_len_abs,jet_id_energy_sum,jet_id_mass_sum
0,-211,-1.29973,0.368729,-0.057008,1.3594,0.13957,01291194f90c44c7bd79d9dbd50abd93,1.352224,-0.96118,-1.306628,-0.134152,-74.16154,87.488531,4.413596,-4.242259,0.0,0.277446,5.890592,0.274888,5.835939,0.031791,3.420882,3.398918,5.837704,85.439604,80.349951,0.0,211,8,6,105.426401,5.61416
1,-2212,1.30542,0.418302,-0.426083,1.71493,0.93827,7df78235aa464796bbb386765d4784b5,1.435495,0.909387,1.559535,0.85325,72.232733,-71.923559,1.91194,1.738693,0.750805,0.243374,16.306425,0.235951,16.079667,0.019349,7.231486,7.137482,16.08209,87.112019,74.097341,0.991281,2212,8,5,481.065425,7.30309
2,22,-0.611265,0.111135,-0.225794,0.661044,0.0,9aeaad70b98f4bf0890734137f88c7ac,0.661044,-0.924697,-0.611265,-0.0,-79.695525,69.726371,0.0,-0.0,0.0,0.103278,14.061791,0.100559,13.685861,0.019484,6.237393,6.076387,13.686267,49.158702,50.032284,0.0,22,4,3,241.553304,2.58723
3,211,3.77857,0.205956,-1.08938,3.94033,0.13957,4261ccf933b64ce6ba83c623e9a5bf06,3.937862,0.959549,3.780938,0.133924,86.880104,-73.917473,7.514243,7.210282,0.524053,0.239237,8.948237,0.233264,8.889257,0.019339,5.30026,5.232419,8.892414,85.593128,59.668327,0.891626,211,9,6,358.764924,8.35898
4,22,0.461909,0.330447,-0.16864,0.592448,0.0,e79cf654192e4cab8b83f50554fededc,0.592448,0.779662,0.461909,0.0,54.420291,-69.943188,0.0,0.0,0.0,0.14239,6.828113,0.113489,5.390166,0.047242,15.91554,12.716738,5.391547,5.156662,5.314014,0.439389,22,7,4,199.864176,2.877762


In [21]:
df['particle_category'] = df['particle_category'].astype('int16')
df['particle_category_abs'] = df['particle_category_abs'].astype('int16')
df['particle_category_unique_len'] = df['particle_category_unique_len'].astype('int16')
df['particle_category_unique_len_abs'] = df['particle_category_unique_len_abs'].astype('int16')

for col in tqdm([i for i in df.columns.tolist() if i not in ['particle_category', 'particle_category_abs', 'particle_category_unique_len', 'particle_category_unique_len_abs', 'jet_id']]):
    df[col] = df[col].astype('float16')

100%|██████████| 27/27 [00:20<00:00,  1.34it/s]


In [22]:
df = df.reindex(columns=[i for i in df.columns.tolist() if i != 'jet_id'] + ['jet_id'])
df.head()

Unnamed: 0,particle_category,particle_px,particle_py,particle_pz,particle_energy,particle_mass,particle_distance,x_div_dist,energy_x,mass_x,angle_xy,angle_xz,particle_speed,speed_x,time_x,jet_id_mass_std,jet_id_energy_std,jet_id_mass_x_std,jet_id_energy_x_std,jet_id_x_div_dist_std,jet_id_speed_std,jet_id_speed_x_std,jet_id_px_std,jet_id_angle_xy_std,jet_id_angle_xz_std,jet_id_time_x_std,particle_category_abs,particle_category_unique_len,particle_category_unique_len_abs,jet_id_energy_sum,jet_id_mass_sum,jet_id
0,-211,-1.299805,0.368652,-0.057007,1.359375,0.139526,1.352539,-0.960938,-1.306641,-0.134155,-74.1875,87.5,4.414062,-4.242188,0.0,0.277344,5.890625,0.274902,5.835938,0.031799,3.419922,3.398438,5.835938,85.4375,80.375,0.0,211,8,6,105.4375,5.613281,01291194f90c44c7bd79d9dbd50abd93
1,-2212,1.305664,0.418213,-0.426025,1.714844,0.938477,1.435547,0.90918,1.55957,0.853027,72.25,-71.9375,1.912109,1.738281,0.750977,0.243408,16.3125,0.235962,16.078125,0.019348,7.230469,7.136719,16.078125,87.125,74.125,0.991211,2212,8,5,481.0,7.304688,7df78235aa464796bbb386765d4784b5
2,22,-0.611328,0.111145,-0.22583,0.661133,0.0,0.661133,-0.924805,-0.611328,-0.0,-79.6875,69.75,0.0,-0.0,0.0,0.103271,14.0625,0.100586,13.6875,0.019485,6.238281,6.078125,13.6875,49.15625,50.03125,0.0,22,4,3,241.5,2.587891,9aeaad70b98f4bf0890734137f88c7ac
3,211,3.779297,0.205933,-1.089844,3.939453,0.139526,3.9375,0.959473,3.78125,0.133911,86.875,-73.9375,7.515625,7.210938,0.523926,0.239258,8.945312,0.233276,8.890625,0.019333,5.300781,5.230469,8.890625,85.5625,59.65625,0.891602,211,9,6,358.75,8.359375,4261ccf933b64ce6ba83c623e9a5bf06
4,22,0.461914,0.330566,-0.168579,0.592285,0.0,0.592285,0.779785,0.461914,0.0,54.40625,-69.9375,0.0,0.0,0.0,0.142334,6.828125,0.113464,5.390625,0.047241,15.914062,12.71875,5.390625,5.15625,5.3125,0.439453,22,7,4,199.875,2.876953,e79cf654192e4cab8b83f50554fededc


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35791152 entries, 0 to 11493799
Data columns (total 32 columns):
particle_category                   int16
particle_px                         float16
particle_py                         float16
particle_pz                         float16
particle_energy                     float16
particle_mass                       float16
particle_distance                   float16
x_div_dist                          float16
energy_x                            float16
mass_x                              float16
angle_xy                            float16
angle_xz                            float16
particle_speed                      float16
speed_x                             float16
time_x                              float16
jet_id_mass_std                     float16
jet_id_energy_std                   float16
jet_id_mass_x_std                   float16
jet_id_energy_x_std                 float16
jet_id_x_div_dist_std               float16
jet_id_

In [24]:
train = df[:train_length]
test = df[train_length:]

In [25]:
del df
gc.collect()

80

In [26]:
train.shape, test.shape

((24297352, 32), (11493800, 32))

In [27]:
train.head()

Unnamed: 0,particle_category,particle_px,particle_py,particle_pz,particle_energy,particle_mass,particle_distance,x_div_dist,energy_x,mass_x,angle_xy,angle_xz,particle_speed,speed_x,time_x,jet_id_mass_std,jet_id_energy_std,jet_id_mass_x_std,jet_id_energy_x_std,jet_id_x_div_dist_std,jet_id_speed_std,jet_id_speed_x_std,jet_id_px_std,jet_id_angle_xy_std,jet_id_angle_xz_std,jet_id_time_x_std,particle_category_abs,particle_category_unique_len,particle_category_unique_len_abs,jet_id_energy_sum,jet_id_mass_sum,jet_id
0,-211,-1.299805,0.368652,-0.057007,1.359375,0.139526,1.352539,-0.960938,-1.306641,-0.134155,-74.1875,87.5,4.414062,-4.242188,0.0,0.277344,5.890625,0.274902,5.835938,0.031799,3.419922,3.398438,5.835938,85.4375,80.375,0.0,211,8,6,105.4375,5.613281,01291194f90c44c7bd79d9dbd50abd93
1,-2212,1.305664,0.418213,-0.426025,1.714844,0.938477,1.435547,0.90918,1.55957,0.853027,72.25,-71.9375,1.912109,1.738281,0.750977,0.243408,16.3125,0.235962,16.078125,0.019348,7.230469,7.136719,16.078125,87.125,74.125,0.991211,2212,8,5,481.0,7.304688,7df78235aa464796bbb386765d4784b5
2,22,-0.611328,0.111145,-0.22583,0.661133,0.0,0.661133,-0.924805,-0.611328,-0.0,-79.6875,69.75,0.0,-0.0,0.0,0.103271,14.0625,0.100586,13.6875,0.019485,6.238281,6.078125,13.6875,49.15625,50.03125,0.0,22,4,3,241.5,2.587891,9aeaad70b98f4bf0890734137f88c7ac
3,211,3.779297,0.205933,-1.089844,3.939453,0.139526,3.9375,0.959473,3.78125,0.133911,86.875,-73.9375,7.515625,7.210938,0.523926,0.239258,8.945312,0.233276,8.890625,0.019333,5.300781,5.230469,8.890625,85.5625,59.65625,0.891602,211,9,6,358.75,8.359375,4261ccf933b64ce6ba83c623e9a5bf06
4,22,0.461914,0.330566,-0.168579,0.592285,0.0,0.592285,0.779785,0.461914,0.0,54.40625,-69.9375,0.0,0.0,0.0,0.142334,6.828125,0.113464,5.390625,0.047241,15.914062,12.71875,5.390625,5.15625,5.3125,0.439453,22,7,4,199.875,2.876953,e79cf654192e4cab8b83f50554fededc


In [28]:
for i, col in enumerate(train.columns.tolist()):
    print(i, col)

0 particle_category
1 particle_px
2 particle_py
3 particle_pz
4 particle_energy
5 particle_mass
6 particle_distance
7 x_div_dist
8 energy_x
9 mass_x
10 angle_xy
11 angle_xz
12 particle_speed
13 speed_x
14 time_x
15 jet_id_mass_std
16 jet_id_energy_std
17 jet_id_mass_x_std
18 jet_id_energy_x_std
19 jet_id_x_div_dist_std
20 jet_id_speed_std
21 jet_id_speed_x_std
22 jet_id_px_std
23 jet_id_angle_xy_std
24 jet_id_angle_xz_std
25 jet_id_time_x_std
26 particle_category_abs
27 particle_category_unique_len
28 particle_category_unique_len_abs
29 jet_id_energy_sum
30 jet_id_mass_sum
31 jet_id


In [29]:
train.to_pickle('train_particle_32cols.pickle')
test.to_pickle('test_particle_32cols.pickle')