In [1]:
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import uproot 
import awkward as ak
from pathlib import Path

from typing import Dict, List 
import re
import pickle
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score, make_scorer
from sklearn import model_selection 



# Columns handed to the classifier as input features.
training_vars = [
    'jet_pt',
    'jet_eta',
    'jet_nTracks',
    'jet_trackWidth',
    'jet_trackC1',
]
# Feature columns plus the two weight columns.
all_vars = [*training_vars, 'total_weight', 'flatpt_weight']
n_jets = 2_000_000
sample_size = 100000

# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
sample_alljets_path = '../../samples/BDT_training/sample_2M_w_flatpt.pkl'
with open(sample_alljets_path, 'rb') as f:
    sample_2Mjets = pd.read_pickle(f)

# Keep only the rows whose jet_pt is at least 1500.
sample_2Mjets = sample_2Mjets.loc[sample_2Mjets['jet_pt'].ge(1500)]

# Disabled alternative: select `sample_size` rows from the start of the frame and
# another `sample_size` rows starting at its midpoint (n_jets//2).
# sel_idx = np.append(np.arange(0,sample_size), np.arange(n_jets//2, n_jets//2+sample_size))
# X = sample_2Mjets.iloc[sel_idx, :-1]
# y = sample_2Mjets.iloc[sel_idx, -1]

# Every column except the last goes into X (this keeps the weight columns next to
# the features); the last column is used as the target.
X, y = sample_2Mjets.iloc[:, :-1], sample_2Mjets.iloc[:, -1]

# First split: hold out 10% of all rows as the evaluation set.
X_dev, X_eval, y_dev, y_eval = train_test_split(
    X, y,
    test_size=0.1,
    random_state=456,
)
# Second split: 0.1/0.9 of the development set, i.e. roughly another 10% of the
# full sample, becomes the test set; the remainder is the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X_dev, y_dev,
    test_size=0.1/0.9,
    random_state=789,
)



In [2]:
# Number of rows left in the sample after the jet_pt selection.
sample_2Mjets.shape[0]

176587

In [3]:
# Weak learner for the boosted ensemble.  max_features="log2" makes every split
# look at a random subset of the features, so a fixed random_state is needed for
# the grid search to give the same result on every run.
dt = DecisionTreeClassifier(max_depth=3,
                            min_samples_leaf=0.001,
                            max_features="log2",
                            random_state=2023)
# n_estimators / learning_rate / the tree's max_depth set here are only starting
# values: the grid search below overrides them for every candidate it fits.
bdt = AdaBoostClassifier(dt,
                        algorithm="SAMME",
                        n_estimators=800,
                        learning_rate=0.001,
                        random_state=2023)

# scikit-learn 1.2 renamed AdaBoost's `base_estimator` parameter to `estimator`
# (the old name was removed in 1.4).  Use whichever name this installation
# exposes so the nested `<name>__max_depth` grid key resolves on both.
tree_param = "estimator" if "estimator" in bdt.get_params(deep=False) else "base_estimator"

# 4 depths x 3 ensemble sizes x 3 learning rates = 36 candidates.
param_grid = {f"{tree_param}__max_depth": [3, 5, 7, 9],
              "n_estimators": [500, 750, 1000],
              'learning_rate': [0.1, 0.5, 1.]}


# Disabled alternative: score with the physical event weights instead of the
# unweighted ROC AUC.  NOTE(review): as written the scorer would receive the
# weights of the whole development set rather than of each validation fold, so
# it needs per-fold weights before it can be enabled -- confirm before reviving.
# physical_weight_score = make_scorer(roc_auc_score, sample_weight = X_dev['total_weight'])
# clf = model_selection.GridSearchCV(bdt,
#                                param_grid,
#                                cv=3,
#                                scoring=physical_weight_score,
#                                n_jobs=8,
#                                verbose=3)

# 3-fold cross-validated search over param_grid, ranked by (unweighted) ROC AUC.
clf = model_selection.GridSearchCV(bdt,
                               param_grid,
                               cv=3,
                               scoring='roc_auc',
                               n_jobs=12,
                               verbose=3)

# Train on the feature columns only; the flat-pt weights are forwarded to each
# AdaBoost fit as per-sample training weights.
_ = clf.fit(X_dev[training_vars],y_dev, sample_weight = X_dev['flatpt_weight'] )

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [None]:
# Summarise the finished grid search: first the winning estimator, then the
# cross-validated score of every candidate.
print("Best parameter set found on development set:\n")
print(clf.best_estimator_)
print("Grid scores on a subset of the development set:\n")

# The heading above was previously followed by nothing.  List, for each
# candidate, the mean test-fold score (ROC AUC, per the search's `scoring`)
# with a +/- 2 standard-deviation band across the CV folds.
cv_results = clf.cv_results_
for mean_score, std_score, candidate in zip(cv_results['mean_test_score'],
                                            cv_results['std_test_score'],
                                            cv_results['params']):
    print(f"{mean_score:.3f} (+/-{2 * std_score:.3f}) for {candidate!r}")

# Optional: persist the full results table for later inspection.
# import pickle
# with open('cv_results_flat_pt.pkl', 'wb') as f:
#     pickle.dump(clf.cv_results_, f)