In [1]:
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import uproot 
import awkward as ak
from pathlib import Path

from typing import Dict, List 
import re
import pickle
from tqdm import tqdm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  

def safe_array_divide(numerator, denominator):
    """Divide ``numerator`` by ``denominator`` element-wise, mapping bad results to 0.

    Any element whose quotient is NaN, +inf or -inf (for example x/0 or 0/0)
    is replaced by 0. NumPy's divide-by-zero and invalid-value warnings are
    silenced for the duration of the computation.

    Parameters
    ----------
    numerator, denominator : array_like
        Operands passed to ``np.true_divide``.

    Returns
    -------
    The quotient with every non-finite entry replaced by 0.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        quotient = np.true_divide(numerator, denominator)
        return np.nan_to_num(quotient, nan=0, posinf=0, neginf=0)


In [2]:
# --- Persisted artefacts -----------------------------------------------------
# Both files are read with pickle.load in a later cell.
filename = './MLP_classifier.model'   # loaded as `clf`
scaler_filename = './scaler.pkl'      # loaded as `scaler`

# --- Input sample ------------------------------------------------------------
# Both names refer to the same pickle file; only `sample_path` is read in the
# visible cells.
sample_all_jets_path = '../../../samples/sample_allpt_all_jets.pkl'
sample_path = sample_all_jets_path

# --- Feature / binning configuration -----------------------------------------
# Columns handed to the scaler and classifier, in this order.
training_vars = ['jet_pt', 'jet_eta', 'jet_nTracks', 'jet_trackWidth', 'jet_trackC1']
# Bin edges; Calculate_reweight_factor iterates over len(label_pt_bin) - 1
# slices, selected through the sample's `pt_idx` column.
label_pt_bin = [500, 600, 800, 1000, 1200, 1500, 2000]

# NOTE(review): the two names below are not referenced by any visible cell —
# presumably left over from the training notebook; confirm before removing.
training_weight = ['equal_weight']
layers = (10, 5, 4)


In [3]:
# Load the pickled sample from `sample_path`; later cells index it by column
# name and call .head()/.to_pickle() on it, i.e. it is used as a DataFrame.
# pd.read_pickle executes pickle under the hood — only point it at trusted files.
sample = pd.read_pickle(sample_path)

In [4]:
def _unpickle(path):
    """Open ``path`` in binary mode and return the object pickled inside it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising;
# these two files must come from a trusted source.
scaler = _unpickle(scaler_filename)
clf = _unpickle(filename)

In [5]:
# Bare expression: shows the repr of the unpickled classifier as the cell output.
clf

In [6]:
# Step 1: apply the loaded scaler to the training feature columns.
scaled_features = scaler.transform(sample[training_vars])
# Step 2: per-class probabilities from the loaded classifier.
class_probabilities = clf.predict_proba(scaled_features)
# Step 3: keep column 1 of the probability matrix.
# NOTE(review): presumably this is the probability of target == 1 — confirm
# against clf.classes_.
y_sample = class_probabilities[:, 1]
# Store the score on the sample under the name the reweighting step expects.
sample['new_MLPprob'] = y_sample

In [7]:
# Preview the first rows, now including the freshly added 'new_MLPprob' column.
sample.head()

Unnamed: 0,jet_pt,jet_eta,jet_nTracks,jet_trackWidth,jet_trackC1,jet_trackBDT,jet_PartonTruthLabelID,equal_weight,event_weight,flatpt_weight,is_forward,pt_idx,target,new_MLPprob
0,766.747925,1.298662,2.0,0.318292,0.215923,-0.192873,21.0,1.0,17.609434,2.70319e-05,1.0,1,1.0,0.131943
0,680.507996,0.447689,24.0,0.064352,0.208339,0.066904,2.0,1.0,0.331535,2.858433e-07,0.0,1,0.0,0.551975
1,619.839111,0.588775,34.0,0.115656,0.298206,0.232268,21.0,1.0,0.331535,1.655248e-07,1.0,1,1.0,0.867591
2,721.692932,-0.716669,12.0,0.049718,0.220595,-0.169604,2.0,1.0,0.377624,3.935379e-07,0.0,1,0.0,0.223149
3,697.332947,-1.042831,22.0,0.091199,0.278667,0.029374,21.0,1.0,0.377624,3.255802e-07,1.0,1,1.0,0.509022


In [8]:
# Persist the sample (with the 'new_MLPprob' column) to the current working directory.
sample.to_pickle('pred_sample.pkl')

In [9]:

def _histogram_bin_index(values, edges):
    """Return ``(bin_index, in_range)`` for ``values`` using np.histogram's binning rule."""
    bin_index = np.searchsorted(edges, values, side='right') - 1
    # np.histogram closes the last bin on the right, so a value equal to the
    # final edge belongs to the last bin instead of falling out of range.
    bin_index[values == edges[-1]] = len(edges) - 2
    in_range = (bin_index >= 0) & (bin_index <= len(edges) - 2)
    return bin_index, in_range


def Calculate_reweight_factor(sample, clf_type):
    """Attach forward/central reweighting weights to every jet, per pt slice.

    For each pt slice (rows whose ``pt_idx`` equals 0 .. len(label_pt_bin) - 2)
    and each of the variables ``jet_nTracks``, ``jet_trackBDT`` and
    ``clf_type``, the rows are split into four groups by ``is_forward`` (1 or 0)
    and ``target`` (0 is called "quark", 1 "gluon" in this code). Weighted
    histograms (weights = ``event_weight``) are built for each group, and the
    bin-wise ratios forward/central are computed separately for quark and
    gluon. Bins with an empty denominator get a ratio of 0
    (see ``safe_array_divide``).

    Every row then receives two new columns per variable,
    ``{var}_quark_reweighting_weights`` and ``{var}_gluon_reweighting_weights``,
    equal to ``event_weight`` multiplied by the ratio of the bin the row's
    value falls into. Rows whose value lies outside the histogram range keep
    the plain ``event_weight``.

    Parameters
    ----------
    sample : pandas.DataFrame
        Must contain the columns ``pt_idx``, ``is_forward``, ``target``,
        ``event_weight``, ``jet_nTracks``, ``jet_trackBDT`` and ``clf_type``.
        It is not modified; the new columns exist only on the returned frame.
    clf_type : {'new_MLPprob', 'new_GBDTscore'}
        Name of the classifier-output column. It selects the histogram range:
        (0, 1) for ``new_MLPprob`` and (-4.0, 4.0) for ``new_GBDTscore``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the processed pt slices in ``pt_idx`` order, keeping
        the original index labels. Rows whose ``pt_idx`` matches no slice are
        not included.

    Raises
    ------
    AssertionError
        If ``clf_type`` is not one of the two supported names.
    """
    assert clf_type in ['new_MLPprob', 'new_GBDTscore']

    clf_ranges = {'new_MLPprob': (0, 1), 'new_GBDTscore': (-4.0, 4.0)}
    clf_low, clf_high = clf_ranges[clf_type]

    # Bin edges for the variables that are actually reweighted. The lower
    # classifier edge is used as-is: negating it collapsed the GBDT range
    # (-4, 4) into the zero-width interval (4, 4).
    hist_bins = {
        'jet_nTracks': np.linspace(0, 60, 61),
        'jet_trackBDT': np.linspace(-1.0, 1.0, 51),
        clf_type: np.linspace(clf_low, clf_high, 51),
    }
    label_vars = ['jet_nTracks', 'jet_trackBDT', clf_type]

    reweighted_sample = []
    for pt_idx, _ in enumerate(tqdm(label_pt_bin[:-1])):
        # Explicit copy: the caller's frame stays untouched and the column
        # assignments below do not write into a view of it.
        pt_slice = sample[sample['pt_idx'] == pt_idx].copy()

        # Every weight column starts out as the plain event weight.
        for var in label_vars:
            pt_slice[f'{var}_quark_reweighting_weights'] = pt_slice['event_weight']
            pt_slice[f'{var}_gluon_reweighting_weights'] = pt_slice['event_weight']

        event_weight = pt_slice['event_weight'].to_numpy()
        is_forward = pt_slice['is_forward'].to_numpy()
        target = pt_slice['target'].to_numpy()
        group_masks = {
            'forward_quark': (is_forward == 1) & (target == 0),
            'forward_gluon': (is_forward == 1) & (target == 1),
            'central_quark': (is_forward == 0) & (target == 0),
            'central_gluon': (is_forward == 0) & (target == 1),
        }

        for var in label_vars:
            edges = hist_bins[var]
            values = pt_slice[var].to_numpy()

            hists = {
                group: np.histogram(values[mask], bins=edges, weights=event_weight[mask])[0]
                for group, mask in group_masks.items()
            }
            quark_factor = safe_array_divide(numerator=hists['forward_quark'],
                                             denominator=hists['central_quark'])
            gluon_factor = safe_array_divide(numerator=hists['forward_gluon'],
                                             denominator=hists['central_gluon'])

            # Look up each row's factor in one vectorised step. Rows outside
            # the histogram range keep a scale of 1, i.e. their event weight.
            bin_index, in_range = _histogram_bin_index(values, edges)
            quark_scale = np.ones(len(values))
            gluon_scale = np.ones(len(values))
            quark_scale[in_range] = quark_factor[bin_index[in_range]]
            gluon_scale[in_range] = gluon_factor[bin_index[in_range]]

            pt_slice[f'{var}_quark_reweighting_weights'] *= quark_scale
            pt_slice[f'{var}_gluon_reweighting_weights'] *= gluon_scale

        reweighted_sample.append(pt_slice)

    return pd.concat(reweighted_sample)


# Compute the per-jet reweighting weights, using the MLP probability column
# ('new_MLPprob') as the classifier variable alongside jet_nTracks and jet_trackBDT.
reweighted_sample = Calculate_reweight_factor(sample=sample, clf_type='new_MLPprob')

100%|██████████| 6/6 [01:28<00:00, 14.82s/it]


In [10]:
# Preview the first rows of the result, including the six *_reweighting_weights columns.
reweighted_sample.head()

Unnamed: 0,jet_pt,jet_eta,jet_nTracks,jet_trackWidth,jet_trackC1,jet_trackBDT,jet_PartonTruthLabelID,equal_weight,event_weight,flatpt_weight,is_forward,pt_idx,target,new_MLPprob,jet_nTracks_quark_reweighting_weights,jet_nTracks_gluon_reweighting_weights,jet_trackBDT_quark_reweighting_weights,jet_trackBDT_gluon_reweighting_weights,new_MLPprob_quark_reweighting_weights,new_MLPprob_gluon_reweighting_weights
7,525.046448,-1.230899,21.0,0.026432,0.188511,0.136256,1.0,1.0,0.502558,1.737371e-07,1.0,0,0.0,0.668663,0.736131,0.377296,0.69928,0.378954,0.473112,0.423288
11,519.972595,-1.157604,17.0,0.120245,0.257995,-0.025059,21.0,1.0,1.053274,3.573635e-07,1.0,0,1.0,0.462145,1.476486,0.749412,1.340951,0.754682,1.280776,1.16257
15,533.982361,-1.150206,11.0,0.015249,0.185861,-0.2747,2.0,1.0,0.698572,2.415003e-07,1.0,0,0.0,0.140065,0.852193,0.469778,0.886139,0.506668,0.869151,0.857833
17,534.412231,-0.183188,40.0,0.198334,0.333095,0.310029,21.0,1.0,0.394101,1.362431e-07,0.0,0,1.0,0.892167,0.632219,0.347629,0.593279,0.312994,0.220145,0.196692
18,503.886597,0.486479,40.0,0.086191,0.283057,0.412483,21.0,1.0,1.292122,4.384019e-07,1.0,0,1.0,0.93842,2.072829,1.139754,1.93426,1.01452,0.517195,0.449123


In [11]:
# Persist the reweighted sample to the current working directory.
reweighted_sample.to_pickle('reweighted_pred_sample.pkl')