In [1]:
# just trainig for all the jets in the pt range (500, 2000) GeV
# Grid Search for best parameters 
import random
from re import X

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import roc_curve, auc


import uproot 
import awkward as ak

In [2]:
file = "/global/cfs/projectdirs/atlas/hrzhao/qgcal/BDT_EB3/pkls/small_sample_periodA.pkl"

small_sample = pd.read_pickle(file)
small_sample = small_sample.drop(columns=['jet_eta'])
small_sample_array = small_sample.to_numpy()

assert small_sample_array.shape == (200000, 10 -1)

In [3]:
columns = [*small_sample.columns]
n_features = 4
flat_pt_weight_idx = columns.index('flat_weight')
phys_weight_idx = columns.index('total_weight')


In [4]:
X = small_sample_array[:, :-1]
y = small_sample_array[:, -1]

In [5]:
print(f"X Shape: {X.shape}")
print(f"y Shape: {y.shape}")

X Shape: (200000, 8)
y Shape: (200000,)


In [6]:
from sklearn.model_selection import train_test_split

X_dev,X_eval, y_dev,y_eval = train_test_split(X, y, test_size=0.1, random_state=456)
X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=0.1/0.9, random_state=789)

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score

dt = DecisionTreeClassifier(max_depth=3,
                            min_samples_leaf=0.001,
                            max_features="log2")
bdt = AdaBoostClassifier(dt,
                        algorithm="SAMME",
                        n_estimators=800,
                        learning_rate=0.001)

scores = cross_val_score(bdt,
                        X_dev[:,0:n_features], y_dev,
                        scoring="roc_auc",
                        n_jobs=6,
                        cv=3)

print("Accuracy: %0.5f (+/- %0.5f)"%(scores.mean(), scores.std()))

Accuracy: 0.80979 (+/- 0.00105)


In [7]:
from sklearn import model_selection 
param_grid = {"base_estimator__max_depth": [3, 5, 7, 9],
              "n_estimators": [500, 750, 1000],
              'learning_rate': [0.1, 0.5, 1.]}


clf = model_selection.GridSearchCV(bdt,
                               param_grid,
                               cv=3,
                               scoring='roc_auc', # FIXME should we pass physics weight to scoring? 
                               n_jobs=8,
                               verbose=3)
_ = clf.fit(X_dev[:,:n_features],y_dev, sample_weight = X_dev[:,flat_pt_weight_idx] )

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV 1/3] END base_estimator__max_depth=3, learning_rate=0.1, n_estimators=500;, score=0.826 total time= 1.1min
[CV 2/3] END base_estimator__max_depth=3, learning_rate=0.1, n_estimators=500;, score=0.828 total time= 1.1min
[CV 3/3] END base_estimator__max_depth=3, learning_rate=0.1, n_estimators=500;, score=0.829 total time= 1.1min
[CV 1/3] END base_estimator__max_depth=3, learning_rate=0.1, n_estimators=750;, score=0.827 total time= 1.6min
[CV 3/3] END base_estimator__max_depth=3, learning_rate=0.1, n_estimators=750;, score=0.830 total time= 1.6min
[CV 2/3] END base_estimator__max_depth=3, learning_rate=0.1, n_estimators=750;, score=0.828 total time= 1.6min
[CV 2/3] END base_estimator__max_depth=3, learning_rate=0.1, n_estimators=1000;, score=0.828 total time= 2.1min
[CV 1/3] END base_estimator__max_depth=3, learning_rate=0.1, n_estimators=1000;, score=0.827 total time= 2.1min
[CV 1/3] END base_estimator__max_depth=3, learni

In [8]:
print("Best parameter set found on development set:\n")
print(clf.best_estimator_)
print("Grid scores on a subset of the development set:\n")
print(clf.best_estimator_)


Best parameter set found on development set:

AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=DecisionTreeClassifier(max_depth=5,
                                                         max_features='log2',
                                                         min_samples_leaf=0.001),
                   learning_rate=0.1, n_estimators=750)
Grid scores on a subset of the development set:



In [9]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs

In [10]:
import pickle
with open('./models/cv_results_flat_pt.pkl', 'wb') as f:
    pickle.dump(clf.cv_results_, f)