# Import Libraries

In [3]:
import uproot
import time
import infofile

import numpy as np
import pandas as pd
import mplhep as hep
import uproot_methods.classes.TLorentzVector as LVepm
import matplotlib.pyplot as plt

from numpy import pi, sqrt, sin, cos, linspace, zeros, arctan, exp, tan, arccos, log, sinh, cosh
from scipy import stats

from numpy import array as arr
from numpy import append as app
from numpy import random as rd
from pandas import DataFrame

print('import success')

import success


# File Names and Directories

In [4]:
# monte-carlo simulation data
mc_files = ['mc_361106.Zee.4lep.root',
            'mc_361107.Zmumu.4lep.root',
            'mc_410000.ttbar_lep.4lep.root',
            'mc_363490.llll.4lep.root',
            'mc_363492.llvv.4lep.root',
            'mc_363356.ZqqZll.4lep.root',    
            'mc_345060.ggH125_ZZ4lep.4lep.root',
            'mc_341964.WH125_ZZ4lep.4lep.root',
            'mc_344235.VBFH125_ZZ4lep.4lep.root',
            'mc_341947.ZH125_ZZ4lep.4lep.root']

# actual data
data_files = ['data_A.4lep.root',
              'data_B.4lep.root',
              'data_C.4lep.root',
              'data_D.4lep.root']

# add directory to file names
mc_files = ['4lep/MC/' + file_name for file_name in mc_files]
data_files = ['4lep/Data/' + file_name for file_name in data_files]

# bring 'em all together
all_files = mc_files + data_files

# Global Variables

In [5]:
# luminosity (fb) and bin number
lumi = 10
nBins = 34

# mass range, in GeV
h_min, h_max = 80, 250

btagWP77 = 0.6459
zPDG = 91188

# Functions to Read Files into DataFrame

In [39]:
# determine net simulation weight
def get_xsec_weight(sample):
    info = infofile.infos[sample]
    xsec_weight = (lumi*1000*info['xsec'])/(info['sumw']*info['red_eff'])
    return xsec_weight

# read single file
def read_file(file: str=None, fam: int=None):
    btagWP77 = 0.6459
    
    name = file.split('.')[1] 
    # sample_names.append(name)
    Type = file.split('/')[1]
    tree = uproot.open(file)['mini']
   
    names = ['mcWeight', 'SumWeights', 'XSection', 'trigM', 'trigE', 'scaleFactor_PILEUP', 
             'scaleFactor_ELE', 'scaleFactor_MUON', 'scaleFactor_LepTRIGGER', 'lep_type', 
             'lep_pt', 'lep_eta', 'lep_phi', 'lep_E', 'lep_charge', 'lep_etcone20', 
             'lep_ptcone30', 'jet_n', 'jet_pt', 'jet_eta', 'jet_phi', 'jet_E', 'jet_MV2c10']
    
    # get info from tree
    (mcWeight, SumWeights, XSection, trigM, trigE, scaleFactor_PILEUP, scaleFactor_ELE, 
    scaleFactor_MUON,scaleFactor_LepTRIGGER, lep_type, lep_pt, lep_eta, lep_phi, lep_E, 
    lep_charge, lep_etcone20, lep_ptcone30, jet_n, jet_pt, jet_eta, jet_phi, jet_E, 
    jet_MV2c10) = tree.arrays(names, outputtype=tuple)
    
    
    leplv = LVepm.TLorentzVectorArray.from_ptetaphi(lep_pt, lep_eta, lep_phi, lep_E)

    lep_reliso_pt = (lep_ptcone30 / lep_pt)
    lep_reliso_et = (lep_etcone20 / lep_pt)
    sum_lep_type = lep_type.sum()
    
    jetlv = LVepm.TLorentzVectorArray.from_ptetaphi(jet_pt, jet_eta, jet_phi, jet_E)
    jetlv = jetlv[jet_MV2c10.argsort()]    
    tags = jet_pt[jet_MV2c10 > btagWP77]
    
    cuts = {'trig_cut': ( (trigM == 1) | (trigE == 1) ), 
            'lep_pt_cut': ( (lep_pt.max() > 20000) & (lep_pt.min() > 7000) ), 
            'lep_eta_cut': ( (lep_eta.min() >- 2.5) & (lep_eta.max() < 2.5) ), 
            'lep_type_cut': ( (sum_lep_type == 44) | (sum_lep_type == 48) | (sum_lep_type == 52) ),
            'lep_iso_cut': ( (lep_reliso_pt.max() < 0.3) & (lep_reliso_pt.max() < 0.3) ),
            'lept_count_cut': ( leplv.counts == 4 ),
            'lept_charge_cut': ( lep_charge.sum()== 0 )}
    
    cut = True
    for boolean in cuts.values():
        cut &= boolean
    
    if True:
        (mcWeight, SumWeights, XSection, trigM, trigE, scaleFactor_PILEUP, scaleFactor_ELE, 
        scaleFactor_MUON,scaleFactor_LepTRIGGER, lep_type, lep_pt, lep_eta, lep_phi, lep_E, 
        lep_charge, lep_etcone20, lep_ptcone30, jet_n, jet_pt, jet_eta, jet_phi, jet_E, 
        jet_MV2c10) = (mcWeight[cut], SumWeights[cut], XSection[cut], trigM[cut], trigE[cut], 
                       scaleFactor_PILEUP[cut], scaleFactor_ELE[cut], scaleFactor_MUON[cut], 
                       scaleFactor_LepTRIGGER[cut], lep_type[cut], lep_pt[cut], lep_eta[cut], 
                       lep_phi[cut], lep_E[cut], lep_charge[cut], lep_etcone20[cut], 
                       lep_ptcone30[cut], jet_n[cut], jet_pt[cut], jet_eta[cut], jet_phi[cut], 
                       jet_E[cut], jet_MV2c10[cut])
    
    if(Type == 'MC'):
        a1 = get_xsec_weight(name)*(mcWeight)*(scaleFactor_PILEUP)
        a2 = (scaleFactor_ELE)*(scaleFactor_MUON)*(scaleFactor_LepTRIGGER)
        finalWeight = a1*a2
    else:
        finalWeight = np.ones(len(lep_type))
        
    df = (mcWeight, SumWeights, XSection, trigM, trigE, scaleFactor_PILEUP, scaleFactor_ELE, 
          scaleFactor_MUON,scaleFactor_LepTRIGGER, lep_type, lep_pt, lep_eta, lep_phi, lep_E, 
          lep_charge, lep_etcone20, lep_ptcone30, jet_n, jet_pt, jet_eta, jet_phi, jet_E, 
          jet_MV2c10, lep_type.sum(), finalWeight)
    
    names = ['mcWeight', 'SumWeights', 'XSec', 'trigM', 'trigE', 'SF_PILEUP', 'SF_ELE', 'SF_MUON', 
             'SF_LepTRIGGER', 'lep_type', 'lep_pt', 'lep_eta', 'lep_phi', 'lep_E', 'lep_charge', 
             'lep_etcone20', 'lep_ptcone30', 'jet_n', 'jet_pt', 'jet_eta', 'jet_phi', 'jet_E', 
             'jet_MV2c10', 'sum_lep_type', 'w']

    df = pd.DataFrame( dict( zip(names, df) ) )
    
    if 'H125' in file:
        df['sig?'] = 1
    else:
        df['sig?'] = 0
        
    df['fam'] = fam
    
    if True:
        df.drop(labels=['mcWeight', 'SumWeights', 'XSec', 'trigM', 'trigE', 'SF_PILEUP', 'SF_ELE', 
                       'SF_MUON', 'SF_LepTRIGGER', 'jet_n', 'jet_pt', 'jet_eta', 'jet_phi', 
                        'jet_E', 'jet_MV2c10'], axis=1, inplace=True)
    
    return df

def read_files(files: list=None, export: bool=True):
    frames = [ read_file(file=file, fam=i) for i, file in enumerate(files) ]
    out = pd.concat(frames)

    i = linspace(0, len(out)-1, len(out))
    out.set_index(keys=i, inplace=True)
    out['id'] = i

    d = {'lep_type': 'type', 'lep_pt': 'pt', 'lep_eta': 'eta', 'lep_phi': 'phi', 'lep_E': 'E',
        'lep_charge': 'Q', 'lep_etcone20': 'etc20', 'lep_ptcone30': 'ptc30'}

    out = out.rename(mapper=d, axis=1, inplace=False)
    out.drop(labels=['etc20', 'ptc30'], axis=1, inplace=True)
    
    if export:
        out.to_csv('DataFrame_Initial.csv', sep='&')
    return out

In [40]:
# only run this cell if changes to the above cell have been made, as it takes a while
DF = read_files(all_files, export=False)
DF

Unnamed: 0,type,pt,eta,phi,E,Q,sum_lep_type,w,sig?,fam,id
0.0,"[11, 11, 11, 11]","[47592.39, 43458.098, 10026.11, 7162.6626]","[-0.8090483, -0.6145289, 2.0875025, 0.6384636]","[-2.2826285, 0.6541073, 0.18135798, 0.8925741]","[64036.78, 51925.496, 41050.63, 8672.812]","[-1, 1, -1, 1]",44,0.169451,0,0,0.0
1.0,"[11, 11, 13, 13]","[51710.316, 30980.805, 11872.697, 7892.9395]","[2.211758, 1.0357504, -1.5491847, -0.49692452]","[3.0596266, -0.21164392, -0.018451305, 2.6533346]","[238934.31, 49138.336, 29207.332, 8888.304]","[-1, 1, -1, 1]",48,0.174886,0,0,1.0
2.0,"[11, 11, 11, 11]","[65519.844, 17793.098, 14570.528, 7871.123]","[-1.8400015, 0.014194325, 0.10441768, 0.54491776]","[0.4868397, -2.3558335, -1.6149002, -2.8749647]","[211477.25, 17794.89, 14650.032, 9068.935]","[-1, -1, 1, 1]",44,0.381443,0,0,2.0
3.0,"[11, 11, 13, 13]","[39427.957, 32687.89, 8821.469, 8123.534]","[0.6319882, -0.72217596, 0.5735928, -0.06887347]","[2.225151, -1.306289, 1.8235722, -1.0468655]","[47567.492, 41588.848, 10313.406, 8143.4946]","[-1, 1, 1, -1]",48,0.166324,0,0,3.0
4.0,"[11, 11, 13, 13]","[49097.43, 18737.934, 11769.221, 7947.696]","[-1.0622535, 0.8695862, -1.5491946, -0.4963608]","[2.2721536, -1.0716466, -0.018504394, 2.653314]","[79502.45, 26280.404, 28953.041, 8947.64]","[1, -1, -1, 1]",48,0.156882,0,0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
818086.0,"[11, 11, 11, 11]","[51773.375, 51708.45, 51148.297, 30737.582]","[-1.28635, -1.0202633, -0.80568916, 0.30312154]","[1.1772112, -1.047273, -1.4184802, 2.3813634]","[100850.09, 81038.13, 68667.06, 32160.553]","[-1, 1, 1, -1]",44,1.000000,0,13,818086.0
818087.0,"[13, 13, 13, 13]","[61561.816, 25231.277, 21982.31, 14411.031]","[-0.6561884, -0.043175437, 0.49340436, -0.2145...","[3.0332172, 1.3050667, 0.06840056, 2.7004883]","[75298.086, 25255.02, 24713.037, 14744.3125]","[1, -1, -1, 1]",52,1.000000,0,13,818087.0
818088.0,"[13, 13, 13, 13]","[53048.145, 39838.145, 23835.697, 7428.501]","[1.6669953, 1.2820495, 0.009336205, 1.5172257]","[1.6207898, -1.2548615, -2.9835243, -0.25539052]","[145485.73, 77315.69, 23836.97, 17750.264]","[-1, 1, 1, -1]",52,1.000000,0,13,818088.0
818089.0,"[11, 11, 11, 11]","[114072.234, 113360.6, 64472.6, 26180.354]","[1.6677951, 1.1956723, 1.5933089, 1.2851827]","[0.0551456, 3.0626664, -0.9325686, -1.999637]","[313078.7, 204518.39, 165154.75, 50946.03]","[1, 1, -1, -1]",44,1.000000,0,13,818089.0


# Functions For Processing

In [41]:
# function to split dataframe into 4, only for eeuu
def partner_eeuu(df: DataFrame=None):
    cols = [col for col in df.columns]

    shape = ( len(df), len(cols) )
    (e_neg, e_pos, u_neg, u_pos) = ( zeros( shape=shape ), zeros( shape=shape ), 
                                     zeros( shape=shape ), zeros( shape=shape ) )

    for i, row in enumerate(df.iloc):
        x = np.stack( row[:-4].to_numpy() )
        
        x_add = row[-4:].to_numpy()
        
        # ab: electrons, cd: muons
        e_neg_loc, e_pos_loc = tuple( np.where( x[cols.index('type')] == 11 )[0] )
        u_neg_loc, u_pos_loc = tuple( np.where( x[cols.index('type')] == 13 )[0] )

        x = np.transpose(x)

        if  x[e_neg_loc][cols.index('Q')] == 1:
            e_neg_loc, e_pos_loc = e_pos_loc, e_neg_loc
        if x[u_neg_loc][cols.index('Q')] == 1:
            u_neg_loc, u_pos_loc = u_pos_loc, u_neg_loc
        
        
        e_neg[i], e_pos[i] = app(x[e_neg_loc], x_add), app(x[e_pos_loc], x_add)
        u_neg[i], u_pos[i] = app(x[u_neg_loc], x_add), app(x[u_pos_loc], x_add)
    
    e2u2 = [ cart( pd.DataFrame(array, columns=cols) ) for array in [e_neg, e_pos, u_neg, u_pos] ]
    cols = e2u2[0].columns
    sorter = mass_compare_eeuu(frames=e2u2)
    
     # add sorter to each array
    for df in e2u2:
        df['s'] = sorter
        
    # get lepton dataframes such that AB real and CD virtual
    real_neg = pd.concat( [ e2u2[0][sorter == True], e2u2[2][sorter == False]  ] )
    real_pos = pd.concat( [ e2u2[1][sorter == True], e2u2[3][sorter == False]  ] )
    virt_neg = pd.concat( [ e2u2[2][sorter == True], e2u2[0][sorter == False]  ] )
    virt_pos = pd.concat( [ e2u2[3][sorter == True], e2u2[1][sorter == False]  ] )
    
    out = [real_neg, real_pos, virt_neg, virt_pos]
    
    # order dataframes, such that they align, and drop sorter column
    for df in out:
        df.sort_values(by='id', axis=0, inplace=True)
        df.drop(labels=['s'], axis=1, inplace=True)
    
    return out

# a function to compare eeuu set, and return sorter such that AB is real and CD virtual
def mass_compare_eeuu(frames: list=None, zPDG: float=91188):
    frames = [ df[ ['E', 'px', 'py', 'pz'] ] for df in frames ]
    
    names = ['elec', 'muon']
    frames = [ frames[0] + frames[1],
               frames[2] + frames[3] ]
    dic = {}
    for i, df in enumerate(frames):
        dic[names[i]] = abs( sqrt( df['E']**2 - df['px']**2 - df['py']**2 - df['pz']**2 ) - zPDG )
    df = pd.DataFrame(dic)
    
    # true means that the electrons come from the real-Z, false means muons
    return df['elec'] <= df['muon']

# split function for eeee or uuuu, partners lepton's to prioritize Z-mass for single pair
def partner_llll(df: DataFrame=None):
    cols = [col for col in df.columns]

    # create containers for initial split, lower-case to avoid confusion with final split
    shape = ( len(df), len(cols) )
    (neg_1, neg_2, pos_1, pos_2) = ( zeros( shape=shape ), zeros( shape=shape ), 
                                     zeros( shape=shape ), zeros( shape=shape ) )

    for i, row in enumerate(df.iloc):
        x = row[:-4].to_numpy()
        x = np.stack(x)
    
        x_add = row[-4:].to_numpy()
        
        # ab: electrons (muons), cd: positrons (antimuons)
        neg_1_loc, neg_2_loc = tuple( np.where( x[cols.index('Q')] == -1 )[0] )
        pos_1_loc, pos_2_loc = tuple( np.where( x[cols.index('Q')] == 1 )[0] )
        
        x = np.transpose(x)
        neg_1[i], neg_2[i] = app(x[neg_1_loc], x_add), app(x[neg_2_loc], x_add)
        pos_1[i], pos_2[i] = app(x[pos_1_loc], x_add), app(x[pos_2_loc], x_add)
    
    # pass into mass_compare function to get a boolean sorter
    L4 = [ cart( pd.DataFrame(array, columns=cols) ) for array in [neg_1, neg_2, pos_1, pos_2] ]
    sorter = mass_compare_llll(frames=L4)
    
    # add sorter to each array
    for df in L4:
        df['s'] = sorter
        
    # get lepton dataframes such that A partners with B, C partners with D
    real_neg = pd.concat( [ L4[0][sorter == True], L4[1][sorter == False]  ] )
    real_pos = pd.concat( [ L4[2][sorter == True], L4[3][sorter == False]  ] ) 
    virt_neg = pd.concat( [ L4[1][sorter == True], L4[0][sorter == False]  ] )
    virt_pos = pd.concat( [ L4[3][sorter == True], L4[2][sorter == False]  ] )
    
    out = [real_neg, real_pos, virt_neg, virt_pos]
    
    # order dataframes, such that they align, and drop sorter column
    for df in out:
        df.sort_values(by='id', axis=0, inplace=True)
        df.drop(labels=['s'], axis=1, inplace=True)
    
    return out

# a function to compare 4 similar leptons and return a series of booleans
def mass_compare_llll(frames: list=None, zPDG: float=91188):
    frames = [ df[ ['E', 'px', 'py', 'pz'] ] for df in frames ]
    
    # assume input is [neg_1, neg_2, pos_1, pos_2], then this list gives the matching which we test
    names = ['n1p1', 'n1p2', 'n2p1', 'n2p2']
    frames = [ frames[0] + frames[2], 
               frames[0] + frames[3], 
               frames[1] + frames[2], 
               frames[1] + frames[3] ]
    
    # calculate Z-mass diff for each of the above combinations
    dic = {}
    for i, df in enumerate(frames):
        dic[names[i]] = abs( sqrt( df['E']**2 - df['px']**2 - df['py']**2 - df['pz']**2 ) - zPDG )
    df = pd.DataFrame(dic)
    
    # if n1p1 then n2p2, and we only need 1 Z-boson close to 91 GeV
    df['n1p1'] = df[ ['n1p1', 'n2p2'] ].min(axis=1)
    df['n1p2'] = df[ ['n1p2', 'n2p1'] ].min(axis=1)
    df.drop(labels=['n2p2', 'n2p1'], axis=1, inplace=True)
    
    # return sorted array, True -> (n1p1, n2p2), False -> (n1p2, n2p1)
    return df['n1p1'] <= df['n1p2']

# replace ptetaphi with cartesian coordinates
def cart(df: DataFrame=None):
    theta = 2*arctan( exp( -df['eta'] ) )
    df['px'] = df['pt'] * sin( df['phi'] )
    df['py'] = df['pt'] * cos( df['phi'] )
    df['pz'] = df['pt'] / tan(theta)
    df['p'] = sqrt( df['px']**2 + df['py']**2 + df['pz']**2 )
    df.drop(labels=['pt', 'eta', 'phi'], axis=1, inplace=True)
    return df

# Merge and Mass, a generalized function, either LLLL -> ZZ, or ZZ -> H 
def MnM(FRAMES: list=None):
    # if input is LLLL, return ZZ
    if len(FRAMES) == 4:
        frames = [df.drop(labels=['p'], axis=1, inplace=False) for df in FRAMES]
        real, virt = frames[0] + frames[1], frames[2] + frames[3]
        
        # these are columns which detail the event and do not have physical meaning, thus we half
        # them to counteract the doubling which took place above
        real['id'], virt['id'] = real['id']/2, virt['id']/2
        real['w'], virt['w'] = real['w']/2, virt['w']/2
        real['fam'], virt['fam'] = real['fam']/2, virt['fam']/2
        real['sig?'], virt['sig?'] = real['sig?']/2, virt['sig?']/2

        out = []
        for i, df in enumerate( [real, virt] ):
            df['p'] = sqrt( df['px']**2 + df['py']**2 + df['pz']**2 )
            df['m'] = sqrt( df['E']**2 - df['p']**2 )
            out.append( df )

        return out
    
    # if input is ZZ, return H
    elif len(FRAMES) == 2:
        h = FRAMES[0] + FRAMES[1]
        
        # counteract doubling as above
        h['id'], h['w'], h['fam'], h['sig?'] = h['id']/2, h['w']/2, h['fam']/2, h['sig?']/2
        
        h.drop(labels=['p'], axis=1, inplace=True)
        if 'm' in h.columns:
            h.drop(labels=['m'], axis=1, inplace=True)
        h['p'] = sqrt( h['px']**2 + h['py']**2 + h['pz']**2 )
        h['m'] = sqrt( h['E']**2 - h['p']**2 )
        
        return h
    
 # momentum dot product between 2 particles
def dot(d1: DataFrame=None, d2: DataFrame=None):
    return d1['px']*d2['px'] + d1['py']*d2['py'] + d1['pz']*d2['pz']

# get angles between lepton and Z-boson, as well as angle between Z and higgs
def get_angles(L4: list=None, Z2: list=None, H: DataFrame=None):
    # dict for mapping lep to z
    L2Z = {0: 0, 1: 0, 2: 1, 3: 1}
    
    # 'a' for angle :)
    for i, L in enumerate(L4):
        Z = Z2[ L2Z[i] ]
        L['a'] = arccos( dot(L, Z) / (L['p']*Z['p']) )
    for i, Z in enumerate(Z2):
        Z['a'] = arccos( dot(Z, H) / (H['p']*Z['p']) )
    
    return L4, Z2, H

# merge all the levels into a single dataframe
def merge(L4: list=None, Z2: list=None, H:list=None):
    
    # rename and merge lepton list into big lepton dataframe
    for i, df in enumerate(L4):
        df.sort_values(by='id', axis=0, inplace=True)
        
        # drop these columns, as they will come from h-data
        df.drop(labels=['id', 'w', 'sig?', 'fam'], axis=1, inplace=True)
        # change column names to specify lepton
        dic = [f'L{i} '+col for col in df.columns]
        dic = dict( zip(df.columns, dic) )
        df.rename(mapper=dic, axis=1, inplace=True)
    
    # single lepton dataframe
    L4 = pd.concat(L4, axis=1)
    
    # rename and merge Z's
    for i, df in enumerate(Z2):
        df.sort_values(by='id', axis=0, inplace=True)
        
        # drop these columns, as they will come from h-data
        df.drop(labels=['w', 'sig?', 'id', 'fam'], axis=1, inplace=True)
        
        # change column names to specify boson
        dic = [f'Z{i} '+col for col in df.columns]
        dic = dict( zip(df.columns, dic) )
        df.rename(mapper=dic, axis=1, inplace=True)
    
    # single boson dataframe
    Z2 = pd.concat(Z2, axis=1)
    
    H.sort_values(by='id', axis=0, inplace=True)
    out = pd.concat([L4, Z2, H], axis=1)
    return out

# a combination of all above process, returns informative dataframe if export=False, else exports 
def pipeline(df: DataFrame=None, subset: str='all', export: bool=True):
    
    typesum = {'all': [52, 44, 48], 'eeuu': [48], 'eeee': [44], 'uuuu': [52], 'llll': [52, 44]}
    
    lep_groups = [df[df['sum_lep_type']==ts].drop(labels=['sum_lep_type'], 
                                                  axis=1, 
                                                  inplace=False) for ts in typesum[subset]]

    frames = []
    for i, lep_group in enumerate(lep_groups):

        # split and match leptons
        if typesum[subset][i] == 48:
            L4 = partner_eeuu(lep_group)
        else:
            L4 = partner_llll(lep_group)

        # get Z's and higgs
        Z2 = MnM(L4)
        H = MnM(Z2)

        # append to list of dataframes
        frames.append( merge( *get_angles(L4, Z2, H) ) )

    out = pd.concat(frames, axis=0)
    out.sort_values(by='id', axis=0, inplace=True)
    out.set_index(keys='id', drop=True, inplace=True)
    
    if export:
        out.to_csv('DataFrame_Final.csv')
    else:
        return out

In [42]:
# export data to csv, only run this cell to update processed dataframe, as it takes a while
pipeline(DF, subset='all')

In [43]:
# read in data and check
V = pd.read_csv('DataFrame_Final.csv')
V

Unnamed: 0,id,L0 type,L0 E,L0 Q,L0 px,L0 py,L0 pz,L0 p,L0 a,L1 type,...,E,Q,w,sig?,fam,px,py,pz,p,m
0,0.0,11.0,64036.781250,-1.0,-36035.354687,-31088.403914,-42844.764528,64036.782343,0.775018,11.0,...,165685.717773,0.0,0.169451,0.0,0.0,-2207.377847,17755.172571,-26566.351879,32029.513299,162560.349876
1,1.0,11.0,238934.312500,-1.0,4233.747224,-51536.707377,233271.623877,238934.315930,0.141727,11.0,...,326168.284180,0.0,0.174886,0.0,0.0,1209.119766,-16347.164128,240642.264655,241199.899013,219564.018730
2,2.0,11.0,211477.250000,-1.0,30652.483728,57907.470730,-201071.564622,211477.242332,0.068085,11.0,...,252991.107422,0.0,0.381443,0.0,0.0,1436.079667,37094.988386,-194790.254596,198296.101257,157108.741516
3,3.0,11.0,47567.492188,-1.0,31283.751030,-23997.723166,26610.193752,47567.491075,1.018484,11.0,...,107613.240723,0.0,0.166324,0.0,0.0,1240.001057,-13594.084004,5679.720076,14784.990452,106592.746640
4,4.0,11.0,26280.404297,-1.0,-16451.715946,8969.459171,18426.869741,26280.404940,2.865924,11.0,...,143683.538086,0.0,0.156882,0.0,0.0,24567.648931,-17962.648813,-74665.543324,80629.830003,118927.665539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
818086,818086.0,11.0,100850.093750,-1.0,47814.786986,19855.188350,-86546.282613,100850.093669,0.479502,11.0,...,282715.841797,0.0,1.000000,0.0,13.0,-26343.225786,31191.804669,-185297.981470,189742.552318,209585.331173
818087,818087.0,13.0,24713.037109,-1.0,1502.430197,21930.906969,11291.635205,24712.810495,0.562848,13.0,...,140010.455078,0.0,1.000000,0.0,13.0,38659.478189,-45675.281627,-36271.672918,69974.430057,121270.386614
818088,818088.0,13.0,145485.734375,-1.0,52981.865557,-2650.954608,135469.488805,145485.697013,0.305213,13.0,...,264388.656250,0.0,1.000000,0.0,13.0,9486.845750,-6624.023097,218074.564557,218381.303300,149033.445651
818089,818089.0,11.0,50946.031250,-1.0,-23809.681271,-10886.229282,43704.545288,50946.032123,0.503345,11.0,...,733697.859375,0.0,1.000000,0.0,13.0,-60365.804616,28415.929673,657539.382458,660915.679122,318595.376523


# Checking The Pipeline
Here we just do a few sanity checks to make sure that the pairing is done correct.

In [44]:
# check that we have paired leptons of same type
miss = V[ (V['L0 type'] != V['L1 type']) | (V['L2 type'] != V['L3 type']) ]
if len(miss) != 0:
    print('Check 1')
    
# check that the leptons are actually paired to the Z's we think they are:
miss = V[ (V['L0 Q'] + V['L1 Q'] != V['Z0 Q']) | (V['L2 Q'] + V['L3 Q'] != V['Z1 Q']) ]
if len(miss) != 0:
    print('Check 2')
    
# check that we have paired leptons of opposite charge
miss = V[ (V['Z0 Q'] != 0) | (V['Z1 Q'] != 0) ]
if len(miss) != 0:
    print('Check 3')

Check 3


In [45]:
# look a bit deeper into check 3:
miss = V[ (V['Z0 Q'] != 0) | (V['Z1 Q'] != 0) ]
miss

Unnamed: 0,id,L0 type,L0 E,L0 Q,L0 px,L0 py,L0 pz,L0 p,L0 a,L1 type,...,E,Q,w,sig?,fam,px,py,pz,p,m
385,385.0,13.0,61069.015625,-1.0,-60338.186190,-2659.078382,9035.809989,61068.922315,0.370369,13.0,...,129077.330078,0.0,5.073128e-02,0.0,2.0,-53243.431862,-16763.367778,59005.806369,81225.357623,100316.491262
387,387.0,13.0,18553.455078,1.0,-13968.186310,-9358.485708,7843.979735,18553.153414,1.013562,13.0,...,239397.943359,0.0,6.683526e-02,0.0,2.0,28759.870248,-11178.113837,221461.735453,223600.940596,85521.895730
393,393.0,11.0,102972.882812,-1.0,14804.589109,90984.248835,-45892.319995,102972.882009,0.861902,11.0,...,206976.646484,0.0,1.039828e-01,0.0,2.0,-43485.401486,59564.118217,16478.756835,75567.279614,192688.656754
395,395.0,13.0,183429.906250,-1.0,-50416.697218,-81477.056136,-156416.645725,183429.880594,0.155583,13.0,...,263029.529297,0.0,4.728251e-02,0.0,2.0,-28266.723940,-102877.670206,-149258.414537,183469.062840,188477.150505
401,401.0,13.0,47554.742188,-1.0,-22350.349747,16836.110816,-38450.613149,47554.625566,0.721048,13.0,...,160853.549805,0.0,5.084822e-02,0.0,2.0,-56779.039059,70044.492512,-31059.665412,95366.624263,129534.055219
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
636286,636286.0,13.0,141101.750000,-1.0,-33543.584990,-12933.920427,-136444.988049,141101.712092,0.183204,13.0,...,196857.514648,0.0,2.667217e-05,1.0,7.0,-30301.190874,-16278.018488,-141552.426244,145671.635639,132410.934759
638131,638131.0,11.0,19040.498047,1.0,-8342.953009,-15460.110119,7343.073482,19040.498889,1.289702,11.0,...,276803.611328,0.0,2.994898e-05,1.0,7.0,-95313.585024,-90697.586456,142738.049322,194125.944694,197320.442025
715555,715555.0,11.0,38976.324219,1.0,-7938.508295,-24950.655308,28872.109512,38976.323863,1.518894,11.0,...,120278.044922,0.0,2.530513e-06,1.0,8.0,38904.984966,12044.356595,-4601.503250,40985.829419,113079.484775
811863,811863.0,11.0,56744.609375,-1.0,-45256.629265,-11084.691112,32386.998690,56744.608146,0.411528,11.0,...,129238.103516,0.0,6.805841e-08,1.0,9.0,-51723.518926,9025.069821,59589.589412,79420.988798,101954.862261


In [46]:
set( miss['L0 type'] + miss['L1 type'] + miss['L2 type'] + miss['L3 type'] )

{48.0}

### Comments:
So all the faults came from EEUU events. I'm not exactly sure the reason of this, and it would be very interesting to find out, but it seems there are a few cases where we have leptons of same charge and same type (i.e., 2 electrons and 2 anti-muons, or 2 positrons and 2 muons). As far as I'm aware, this cannot be a valid signal, yet some of these cases are labelled as signal events.

### NB:
Moving forward, I will drop these events and not include them in the analysis and plots in the other notebook, but I decided not to rather drop them seperately (in the following cell) than to modify the selection criteria previously, just in case these events deserve some attention.

Just be careful if running these notebooks locally, to make sure you run the following cell to correct the csv file.

In [47]:
V.drop(V[V['Z0 Q'] != 0].index, axis=0, inplace=True)
V.to_csv('DataFrame_Final.csv')

# Test Accuracy of Lepton Pairing
As we know how leptons must be paired in EEUU events, we can pair them by using the z-mass algorithm (used in EEEE and UUUU cases) and compare the results.

This code just does the pairing and exports to csv, the actual analysis and testing is done in the other notebook.

In [48]:
# function to perform llll pairing on eeuu events
def pair_test(df: DataFrame=None, export: bool=True):
    # select eeuu events
    eeuu = df[df['sum_lep_type'] == 48].drop(labels='sum_lep_type', axis=1)
    
    # sort the leptons according to possible z-mass
    llll = partner_llll(eeuu)

    # get Z's and higgs
    zz = MnM(llll)
    h = MnM(zz)

    # merge to single df
    out = merge( *get_angles(llll, zz, h) )

    out.sort_values(by='id', axis=0, inplace=True)
    out.set_index(keys='id', drop=True, inplace=True)
    if export:
        out.to_csv('eeuu.csv')
    else:
        return out

In [49]:
# only run this cell if changes have been made to the previous code, as it takes a while
pair_test(DF)