## Core Functions
The cells below import the necessary libraries, prepare the feature arrays for machine learning, and run the machine-learning training and testing.

In [2]:
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from Scripts_Python.ML_Python_Build_FeatureArrays_FromROOT import Build_FeatureArrays_FromROOT
from Scripts_Python.ML_Python_TrainTest import (
    Build_FeatureArrays_FromCSV,
    Write_MLResults_ToCSV,
    Write_MLCoefficients_ToCSV,
    Train_All_Estimators,
    Train_LinearRegression,
    Train_RandomForestRegression,
    Train_MLPRegression,
    Test_Estimator,
    Test_All_Estimators,
    Full_TrainTest)
from datetime import datetime



###########################################
#                                         #
#     DATA PREPARATION - CHANGE BELOW     #
#                                         #
###########################################

# File Directories
# Inputs and results share one folder; CSV backups live in a sub-folder of it.
file_directory   = "../../Files/Test/"
csv_directory    = f"{file_directory}CSV_Backup/"
output_directory = file_directory

# Training Data Sources
# One 5-item tuple per source:
#   [0] "File_Name.root"   [1] "Tree_Name"
#   [2] "Base_Name" (also names the CSV backup: ML_CSV_<Base_Name>.csv)
#   [3] "Bias"             [4] (pt_min, pt_max)
train_file_prep_arr = [
    (
        "Full_Train_B8_10_90_N500000_ML_Prep.root",
        "ML_Train_B8_10_90_N500000_Flat",
        "Train_B8_Flat_10_90",
        "B8_Flat",
        (10., 90.),
    ),
    # Alternative training sources -- uncomment to enable:
    # ("Full_Train_B8_10_90_N500000_ML_Prep.root", "ML_Train_B8_10_90_N500000",
    #  "Train_B8_10_90",      "B8",   (10., 90.)),
    # ("Full_Train_B4_10_90_N500000_ML_Prep.root", "ML_Train_B4_10_90_N500000",
    #  "Train_B4_10_90",      "B4",   (10., 90.)),
    # NOTE(review): the B0 row below reuses base name "Train_B8_10_90", which
    # looks like a copy-paste slip (its CSV backup path would collide with the
    # B8 row) -- confirm before enabling.
    # ("Full_Train_B0_10_90_N500000_ML_Prep.root", "ML_Train_B0_10_90_N500000",
    #  "Train_B8_10_90",      "B0",   (10., 90.))
]

# Testing Data Sources
# Same 5-item tuple layout as the training sources:
#   [0] "File_Name.root"   [1] "Tree_Name"
#   [2] "Base_Name"        [3] "Bias"        [4] (pt_min, pt_max)
test_file_prep_arr = [
    (
        "Full_Test_B8_10_90_N500000_ML_Prep.root",
        "ML_Test_B8_10_90_N500000_Flat",
        "Test_Flat_10_90",
        "B8_Flat",
        (10., 90.),
    ),
]

# Testing pT Bins
# One tuple per test set:
#   [0] "Test Label / Folder Name"
#   [1] training bins: ((min, max), (min, max), ...)
#   [2] optional testing bin (min, max)
test_bin_array = [
    # ("Test_4GeV_Bins",
    #  ((18,22), (28,32), (38,42), (48,52), (58,62), (68,72), (78,82))),
    # ("Test_Centered_Wide_Bins",
    #  ((40,60), (30,70), (20,80), (10,90)))
    (
        "Train_Centered_Test_40_60",
        ((40, 60), (30, 70), (20, 80), (10, 90)),
        (40, 60),  # <- the optional item at index 2
    ),
]

# Training and Testing pT Bins
# Same tuple layout as test_bin_array; currently empty (all entries disabled).
traintest_bin_array = [
    # ("Train_20GeV_Bins",
    #  ((10,30), (20,40), (30,50), (40,60), (50,70), (60,80), (70,90))),
    # ("Train_30GeV_Bins",
    #  ((10,40), (20,50), (30,60), (40,70), (50,80), (60,90))),
]



########## ANYTHING BELOW THIS SHOULDN'T NEED TO CHANGE ##########



# Builds output directories.
# Each entry: (path to create, message on success, message when not created).
# Creation is best-effort: a failure is reported and the script carries on,
# but only filesystem errors are caught so Ctrl-C and programming errors
# are no longer silently swallowed.
for _dir_path, _made_msg, _not_made_msg in (
    (output_directory, "made output directory",
     "Output directory already exists or not made"),
    (csv_directory, "made 'CSV_Backup' directory",
     "'CSV_Backup/' already exists or not made"),
):
    try:
        os.mkdir(_dir_path)
    except FileExistsError:
        # Expected on every re-run; nothing to do.
        print(_not_made_msg)
    except OSError as _mkdir_err:
        # e.g. missing parent directory or insufficient permissions:
        # include the reason so the failure can actually be diagnosed.
        print(_not_made_msg + ": " + str(_mkdir_err))
    else:
        print(_made_msg)

# Collects (csv_path, base_name, bias) for every configured source and converts
# a ROOT file into its CSV backup only when that backup is not on disk yet.

# Training data
train_file_bundle = []
for train_file_info in train_file_prep_arr:
    train_file_path = f"{file_directory}{train_file_info[0]}"
    train_csv_path = f"{csv_directory}ML_CSV_{train_file_info[2]}.csv"
    train_file_bundle.append(
        (train_csv_path, train_file_info[2], train_file_info[3]))
    if os.path.exists(train_csv_path):
        continue  # CSV backup already present, skip the ROOT conversion
    X_train, y_train, sc_train = Build_FeatureArrays_FromROOT(
        train_file_path,        # source ROOT file
        train_file_info[1],     # tree name
        train_csv_path,         # CSV backup path
        train_file_info[4][0],  # pt_min
        train_file_info[4][1])  # pt_max

# Testing data
test_file_bundle = []
for test_file_info in test_file_prep_arr:
    test_file_path = f"{file_directory}{test_file_info[0]}"
    test_csv_path = f"{csv_directory}ML_CSV_{test_file_info[2]}.csv"
    test_file_bundle.append(
        (test_csv_path, test_file_info[2], test_file_info[3]))
    if os.path.exists(test_csv_path):
        continue  # CSV backup already present, skip the ROOT conversion
    X_test, y_test, sc_test = Build_FeatureArrays_FromROOT(
        test_file_path,         # source ROOT file
        test_file_info[1],      # tree name
        test_csv_path,          # CSV backup path
        test_file_info[4][0],   # pt_min
        test_file_info[4][1])   # pt_max

# Set Features to train with
# Names of the X_values columns, listed in column order: a name's position in
# this tuple is its column index.
x_value_names = (
    "jet_pt_raw",    "jet_pt_corr",   "jet_mass",       "jet_area",         # 0-3
    "jet_area_err",  "jet_const_n",   "const_pt_mean",  "const_pt_median",  # 4-7
    "const_1_pt",    "const_2_pt",    "const_3_pt",     "const_4_pt",       # 8-11
    "const_5_pt",    "const_6_pt",    "const_7_pt",     "const_8_pt",       # 12-15
    "const_9_pt",    "const_10_pt",   "jet_y",          "jet_phi",          # 16-19
    "jet_rho",                                                              # 20
)

# Each feature set is a list of column indices plus the matching label list,
# looked up from the table above so labels and indices always line up.

# Training with 1 feature
feature_index_1feat = [0]
feature_label_1feat = [x_value_names[col] for col in feature_index_1feat]

# Training with 3 features
feature_index_3feat = [0, 3, 20]
feature_label_3feat = [x_value_names[col] for col in feature_index_3feat]

# Training with 11 features (removes jet_pt_corr)
feature_index_11feat = [0, 2, 3, 5, 6, 8, 9, 10, 11, 18, 20]
feature_label_11feat = [x_value_names[col] for col in feature_index_11feat]

# Training with 12 features
feature_index_12feat = [0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 18, 20]
feature_label_12feat = [x_value_names[col] for col in feature_index_12feat]

# CONSIDERATIONS:
# get rid of jet_pt_corr - why not try to correct without it?


# Report that setup finished, stamped with the current local date and time.
now = datetime.now()
dt_string = "{:%Y/%m/%d %H:%M:%S}".format(now)
print("\nReady!", dt_string)

'ML_Results/' already exists or not made
'CSV_Backup/' already exists or not made

Ready! 2023/02/16 13:14:15


In [3]:
# Pair each label list with its column-index list: one (labels, indices) tuple
# per feature set, in the order 1, 3, 12, 11 features.
feature_bundle = list(zip(
    (feature_label_1feat, feature_label_3feat,
     feature_label_12feat, feature_label_11feat),
    (feature_index_1feat, feature_index_3feat,
     feature_index_12feat, feature_index_11feat),
))

# Run the complete train/test pass over every source, feature set and pT bin
# configured above, with all three estimator switches turned on.
Full_TrainTest(
    train_file_bundle,
    test_file_bundle,
    feature_bundle,
    test_bin_array,
    traintest_bin_array,
    output_directory,
    10.,  # train_pt_min
    90.,  # train_pt_max
    use_lr=True,
    use_rf=True,
    use_mlp=True,
)

Directory already exists: ../../Files/Test/
Preparing to collect data from csv backup file...
Jet: 10000 | pTraw: 67.802 | pTcorr:  9.889 | pTtrue:  22.714
Jet: 20000 | pTraw: 113.494 | pTcorr:  68.569 | pTtrue:  73.197
Jet: 30000 | pTraw: 128.162 | pTcorr:  83.969 | pTtrue:  78.818
Jet: 40000 | pTraw: 53.972 | pTcorr:  6.135 | pTtrue:  13.920
Jet: 50000 | pTraw: 74.984 | pTcorr:  17.824 | pTtrue:  23.420
Jet: 60000 | pTraw: 133.699 | pTcorr:  77.283 | pTtrue:  89.206
Jet: 70000 | pTraw: 135.533 | pTcorr:  82.953 | pTtrue:  89.507
Jet: 80000 | pTraw: 116.430 | pTcorr:  68.602 | pTtrue:  71.660
Jet: 90000 | pTraw: 106.142 | pTcorr:  55.696 | pTtrue:  58.960
Jet: 100000 | pTraw: 90.687 | pTcorr:  25.634 | pTtrue:  24.841
Jet: 110000 | pTraw: 106.689 | pTcorr:  60.016 | pTtrue:  40.791
Jet: 120000 | pTraw: 76.343 | pTcorr:  25.970 | pTtrue:  31.077
Jet: 130000 | pTraw: 84.739 | pTcorr:  33.524 | pTtrue:  30.955
Jet: 140000 | pTraw: 77.394 | pTcorr:  32.290 | pTtrue:  28.719
Jet: 150000 | 

ML results .csv file closed.
Test and save complete!


Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 262176 

Selecting data from master array...
Data ready. Feature array length: 86514 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
49.984597313975904
[13.89696238]
[13.89696238 49.98459731]
Regression Coefficients:
jet_pt_raw 13.89696238448797
lr_intercept 49.984597313975904
<class 'sklearn.pipeline.Pipeline'>
<class 'sklearn.pipeline.Pipeline'>
Writing coefficients to CSV...
Coefficient CSV file complete.

Testing 1 features on 40_60 GeV...
ML results .csv file closed.
Test and save complete!


Building training and testing selected feature arrays...
Selecting d




Multilayer Perceptron Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpregressor', MLPRegressor(max_iter=100))])
<class 'sklearn.pipeline.Pipeline'>
made directory

Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 87252 

Selecting data from master array...
Data ready. Feature array length: 86514 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
49.997940298386574
[ 3.1078148  -0.94388597 -1.14457185]
[ 3.1078148  -0.94388597 -1.14457185 49.9979403 ]
Regression Coefficients:
jet_pt_raw 3.1078148021337153
jet_area -0.9438859734610536
jet_rho -1.1445718517083319
lr_intercept 49.997940298386574
<class 'sklearn.pipeline.Pipeline'




Multilayer Perceptron Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpregressor', MLPRegressor(max_iter=100))])
<class 'sklearn.pipeline.Pipeline'>
made directory

Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 87252 

Selecting data from master array...
Data ready. Feature array length: 86514 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
49.997940298386574
[ 0.38032502  1.89781431 -1.39416531  0.44952562  0.70378393  0.70389997
  1.21646651  0.62783988  0.42449509  1.11595741  0.02243733  0.4472568 ]
[ 3.80325021e-01  1.89781431e+00 -1.39416531e+00  4.49525616e-01
  7.03783933e-01  7.03899970e-01  1.21646651e+00  6.27




Multilayer Perceptron Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpregressor', MLPRegressor(max_iter=100))])
<class 'sklearn.pipeline.Pipeline'>
made directory

Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 87252 

Selecting data from master array...
Data ready. Feature array length: 86514 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
49.997940298386574
[ 2.62969207 -1.39064765 -0.38669086  0.61819412  0.64546025  1.21601726
  0.62749355  0.42494801  1.11521288  0.02226766 -0.48662591]
[ 2.62969207e+00 -1.39064765e+00 -3.86690864e-01  6.18194121e-01
  6.45460249e-01  1.21601726e+00  6.27493552e-01  4.24948015e-01
 

Jet: 320000 | pTraw: 99.787 | pTcorr:  41.591 | pTtrue:  39.929
Jet: 330000 | pTraw: 71.682 | pTcorr:  14.959 | pTtrue:  14.693
Jet: 340000 | pTraw: 100.084 | pTcorr:  39.902 | pTtrue:  48.623
Jet: 350000 | pTraw: 72.465 | pTcorr:  13.904 | pTtrue:  26.976
Jet: 360000 | pTraw: 131.157 | pTcorr:  83.618 | pTtrue:  62.055
Jet: 370000 | pTraw: 104.449 | pTcorr:  49.863 | pTtrue:  56.937
Jet: 380000 | pTraw: 84.693 | pTcorr:  38.324 | pTtrue:  29.811
Jet: 390000 | pTraw: 98.544 | pTcorr:  47.749 | pTtrue:  33.386
Jet: 400000 | pTraw: 10.760 | pTcorr:  3.191 | pTtrue:  16.660
Jet: 410000 | pTraw: 83.914 | pTcorr:  23.916 | pTtrue:  28.244
Jet: 420000 | pTraw: 49.416 | pTcorr:  5.587 | pTtrue:  23.649
Jet: 430000 | pTraw: 93.354 | pTcorr:  39.321 | pTtrue:  29.593
Jet: 440000 | pTraw: 88.619 | pTcorr:  35.148 | pTtrue:  31.754
Jet: 450000 | pTraw: 94.736 | pTcorr:  39.934 | pTtrue:  27.414
Jet: 460000 | pTraw: 42.444 | pTcorr: -0.234 | pTtrue:  11.945
Jet: 470000 | pTraw: 84.913 | pTcorr:  2

ML results .csv file closed.
Test and save complete!


Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 392700 

Selecting data from master array...
Data ready. Feature array length: 86514 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
44.23144448997472
[14.33669254]
[14.33669254 44.23144449]
Regression Coefficients:
jet_pt_raw 14.336692540084277
lr_intercept 44.23144448997472
<class 'sklearn.pipeline.Pipeline'>
<class 'sklearn.pipeline.Pipeline'>
Writing coefficients to CSV...
Coefficient CSV file complete.

Testing 1 features on 40_60 GeV...
ML results .csv file closed.
Test and save complete!


Building training and testing selected feature arrays...
Selecting da


Random Tree Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor', RandomForestRegressor())])
Feature Importance:
jet_pt_raw 0.013610127625299716
jet_pt_corr 0.8468343382590209
jet_mass 0.006603486650936075
jet_area 0.00344853512434212
jet_const_n 0.002452494672047505
const_pt_mean 0.0393520348956616
const_1_pt 0.018084354761896064
const_2_pt 0.02744873541532526
const_3_pt 0.01218534212418874
const_4_pt 0.01544369260643458
jet_y 0.009802338209176725
jet_rho 0.004734519655670848
<class 'sklearn.pipeline.Pipeline'>
Writing coefficients to CSV...
Coefficient CSV file complete.

Training multilayer perceptron (neural net) regression estimator...

----- Fitting Neural Network Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.






Multilayer Perceptron Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpregressor', MLPRegressor(max_iter=100))])
<class 'sklearn.pipeline.Pipeline'>
made directory

Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 110078 

Selecting data from master array...
Data ready. Feature array length: 86514 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
49.530866297969624
[ 0.6436735   1.69113654 -1.46033964  0.36315247  0.70720284  0.67318118
  1.23113807  0.62824889  0.39390857  1.14376213  0.00897486  0.34533448]
[ 6.43673504e-01  1.69113654e+00 -1.46033964e+00  3.63152471e-01
  7.07202838e-01  6.73181177e-01  1.23113807e+00  6.2




Multilayer Perceptron Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpregressor', MLPRegressor(max_iter=100))])
<class 'sklearn.pipeline.Pipeline'>
made directory

Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 110078 

Selecting data from master array...
Data ready. Feature array length: 86514 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
49.530866297969624
[ 2.65189026 -1.45680145 -0.38233984  0.62764772  0.61853543  1.23077411
  0.62797468  0.39399214  1.14319138  0.00890502 -0.48541555]
[ 2.65189026e+00 -1.45680145e+00 -3.82339839e-01  6.27647719e-01
  6.18535429e-01  1.23077411e+00  6.27974675e-01  3.93992137e-01


Jet: 340000 | pTraw: 90.546 | pTcorr:  29.082 | pTtrue:  22.914
Jet: 350000 | pTraw: 49.950 | pTcorr:  5.784 | pTtrue:  16.411
Jet: 360000 | pTraw: 122.188 | pTcorr:  66.395 | pTtrue:  72.610
Jet: 370000 | pTraw: 58.720 | pTcorr:  10.820 | pTtrue:  11.590
Jet: 380000 | pTraw: 88.082 | pTcorr:  26.026 | pTtrue:  21.689
Jet: 390000 | pTraw: 73.409 | pTcorr:  32.013 | pTtrue:  16.243
Jet: 400000 | pTraw: 84.979 | pTcorr:  30.218 | pTtrue:  24.153
Jet: 410000 | pTraw: 55.423 | pTcorr:  4.442 | pTtrue:  12.450
Jet: 420000 | pTraw: 72.110 | pTcorr:  21.566 | pTtrue:  10.494
Jet: 430000 | pTraw: 87.773 | pTcorr:  30.601 | pTtrue:  27.341
Jet: 440000 | pTraw: 129.844 | pTcorr:  71.192 | pTtrue:  76.652
Jet: 450000 | pTraw: 93.670 | pTcorr:  35.533 | pTtrue:  23.622
Jet: 460000 | pTraw: 54.216 | pTcorr:  0.096 | pTtrue:  12.090
Jet: 470000 | pTraw: 98.066 | pTcorr:  44.776 | pTtrue:  42.222
Jet: 480000 | pTraw: 113.780 | pTcorr:  76.748 | pTtrue:  78.121
Jet: 490000 | pTraw: 85.929 | pTcorr:  2

ML results .csv file closed.
Test and save complete!


Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 600838 

Selecting data from master array...
Data ready. Feature array length: 86514 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
29.83765509000171
[16.26520069]
[16.26520069 29.83765509]
Regression Coefficients:
jet_pt_raw 16.265200691104486
lr_intercept 29.83765509000171
<class 'sklearn.pipeline.Pipeline'>
<class 'sklearn.pipeline.Pipeline'>
Writing coefficients to CSV...
Coefficient CSV file complete.

Testing 1 features on 40_60 GeV...
ML results .csv file closed.
Test and save complete!

Directory already exists: ../../Files/Test/Train_B4/

Building trainin

KeyboardInterrupt: 