## Core Functions
The cells below import the necessary libraries, prepare the feature arrays for machine learning, and run the machine-learning training and testing.

In [11]:
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from Scripts_Python.ML_Python_Build_FeatureArrays_FromROOT import Build_FeatureArrays_FromROOT
from Scripts_Python.ML_Python_TrainTest import (
    Build_FeatureArrays_FromCSV,
    Write_MLResults_ToCSV,
    Write_MLCoefficients_ToCSV,
    Train_All_Estimators,
    Train_LinearRegression,
    Train_RandomForestRegression,
    Train_MLPRegression,
    Test_Estimator,
    Test_All_Estimators,
    Full_TrainTest)
from datetime import datetime



###########################################
#                                         #
#     DATA PREPARATION - CHANGE BELOW     #
#                                         #
###########################################

# File Directories
# Every directory is a plain string ending in "/", because file names are
# appended to it by string concatenation further down.
# Alternative data location:
#   file_directory = "../../Files/Thesis_Data/LR_Coeff_Test/"
file_directory = "../../Files/Thesis_Data/"
# Output goes to the same folder the input files are read from.
output_directory = file_directory
# The CSV backups sit in a sub-folder of the data directory.
csv_directory = "{}CSV_Backup/".format(file_directory)

# Training Data Sources
# One tuple per source, laid out as:
#   (0: "File_Name.root",  1: "Tree_Name",  2: "Base_Name",  3: "Bias",
#    4: (pt_min, pt_max))
# Entries that are commented out are alternative samples; uncomment to use them.
train_file_prep_arr = [
    # ("Full_Train_T2_B8_10_90_N500000_ML_Prep.root", "ML_Train_T2_B8_10_90_N500000_Flat",
    #  "Train_B8_Flat_T2_10_90", "B8_Flat_T2", (10., 90.)),
    (
        "Full_Train_B8_10_90_N500000_ML_Prep.root",   # 0: file name
        "ML_Train_B8_10_90_N500000_Flat",             # 1: tree name
        "Train_B8_Flat_10_90",                        # 2: base name
        "B8_Flat",                                    # 3: bias
        (10., 90.),                                   # 4: (pt_min, pt_max)
    ),
    # ("Full_Train_B8_10_90_N500000_ML_Prep.root", "ML_Train_B8_10_90_N500000",
    #  "Train_B8_10_90",      "B8",   (10., 90.)),
    # ("Full_Train_B4_10_90_N500000_ML_Prep.root", "ML_Train_B4_10_90_N500000",
    #  "Train_B4_10_90",      "B4",   (10., 90.)),
    # ("Full_Train_B0_10_90_N500000_ML_Prep.root", "ML_Train_B0_10_90_N500000",
    #  "Train_B0_10_90",      "B0",   (10., 90.)),
]

# Testing Data Sources
# One tuple per source, laid out as:
#   (0: "File_Name.root",  1: "Tree_Name",  2: "Base_Name",  3: "Bias",
#    4: (pt_min, pt_max))
# NOTE(review): both active entries read the B4 / B0 *Train* files and trees
# as test samples -- confirm this is intended.
test_file_prep_arr = [
    # ("Full_Test_B8_10_90_N500000_ML_Prep.root", "ML_Test_B8_10_90_N500000_Flat",
    #  "Test_B8_Flat_10_90", "B8_Flat", (10., 90.)),
    (
        "Full_Train_B4_10_90_N500000_ML_Prep.root",   # 0: file name
        "ML_Train_B4_10_90_N500000",                  # 1: tree name
        "Train_B4_10_90",                             # 2: base name
        "B4",                                         # 3: bias
        (10., 90.),                                   # 4: (pt_min, pt_max)
    ),
    (
        "Full_Train_B0_10_90_N500000_ML_Prep.root",
        "ML_Train_B0_10_90_N500000",
        "Train_B0_10_90",
        "B0",
        (10., 90.),
    ),
]

# Testing pT Bins
# One tuple per test, laid out as:
#   (0: "Test Label / Folder Name",
#    1: pT bins as ((min, max), (min, max), ...),
#    2: optional testing bin (min, max))
# None of the entries below supplies the optional third element.
test_bin_array = [
    (
        "Test_4GeV_Bins",
        ((18, 22), (28, 32), (38, 42), (48, 52), (58, 62), (68, 72), (78, 82)),
    ),
    # ("Test_10GeV_Bins",
    #  ((10,20), (20,30), (30,40), (40,50), (50,60), (60,70), (70,80), (80,90))),
    (
        "Test_Centered_Wide_Bins",
        ((40, 60), (30, 70), (20, 80), (10, 90)),
    ),
]

# Training and Testing pT Bins
# One tuple per run, laid out as:
#   (0: "Test Label / Folder Name",
#    1: training bins as ((min, max), (min, max), ...),
#    2: optional testing bin (min, max))
# The list is currently empty; uncomment an entry below to enable it.
traintest_bin_array = [
    # ("Train_20GeV_Bins",
    #  ((10,30), (20,40), (30,50), (40,60), (50,70), (60,80), (70,90))),
    # ("Train_30GeV_Bins",
    #  ((10,40), (20,50), (30,60), (40,70), (50,80), (60,90))),
]



########## ANYTHING BELOW THIS SHOULDN'T NEED TO CHANGE ##########



# Builds output directories
# Creates the output folder and the CSV backup folder when they are missing.
# os.makedirs is used so that missing parent folders are created as well, and
# only OSError is caught: a bare "except:" would also hide typos (NameError)
# and keyboard interrupts behind the "already exists" message.
for new_dir, made_msg, not_made_msg in (
    (output_directory,
     "made output directory",
     "Output directory already exists or not made"),
    (csv_directory,
     "made 'CSV_Backup' directory",
     "'CSV_Backup/' already exists or not made"),
):
    if os.path.isdir(new_dir):
        # Nothing to do; keep the familiar message.
        print(not_made_msg)
        continue
    try:
        os.makedirs(new_dir)
        print(made_msg)
    except OSError as mkdir_error:
        # Still best-effort (the cell keeps running), but the reason is now shown.
        print(not_made_msg + ":", mkdir_error)

# Builds feature and target arrays from root file, or skips them if csv already exists

# Training data
def _prepare_file_bundle(file_prep_arr):
    """Build the (csv_path, base_name, bias) bundle for a list of data sources.

    Parameters
    ----------
    file_prep_arr : list of tuple
        Entries laid out as (0: file name, 1: tree name, 2: base name,
        3: bias, 4: (pt_min, pt_max)).

    Returns
    -------
    list of tuple
        One (csv_path, base_name, bias) tuple per entry, in input order.

    Notes
    -----
    When an entry's CSV backup does not exist yet,
    ``Build_FeatureArrays_FromROOT`` is called with the ROOT file path, the
    tree name, the CSV path and the two pT limits (presumably this writes the
    CSV -- TODO confirm against the helper). The arrays it returns are
    discarded: the previous code bound them to names only when the CSV was
    missing, so no later code could rely on them being defined.
    """
    bundle = []
    for file_info in file_prep_arr:
        root_path = file_directory + file_info[0]
        csv_path = csv_directory + "ML_CSV_" + file_info[2] + ".csv"
        bundle.append((csv_path, file_info[2], file_info[3]))
        if not os.path.exists(csv_path):
            pt_min, pt_max = file_info[4][0], file_info[4][1]
            Build_FeatureArrays_FromROOT(
                root_path, file_info[1], csv_path, pt_min, pt_max)
    return bundle


train_file_bundle = _prepare_file_bundle(train_file_prep_arr)

# Testing data
test_file_bundle = _prepare_file_bundle(test_file_prep_arr)

# Set Features to train with
# Column order of X_values; a name's position in this tuple is the index used
# to select that feature.
x_value_columns = (
    "jet_pt_raw",      "jet_pt_corr",     "jet_mass",        "jet_area",         #  0 -  3
    "jet_area_err",    "jet_const_n",     "const_pt_mean",   "const_pt_median",  #  4 -  7
    "const_1_pt",      "const_2_pt",      "const_3_pt",      "const_4_pt",       #  8 - 11
    "const_5_pt",      "const_6_pt",      "const_7_pt",      "const_8_pt",       # 12 - 15
    "const_9_pt",      "const_10_pt",     "jet_y",           "jet_phi",          # 16 - 19
    "jet_rho",                                                                   # 20
)


def _feature_indices(labels):
    """Return the X_values column index of every label, in the given order."""
    return [x_value_columns.index(label) for label in labels]


# Training with 1 feature
feature_label_1feat = ["jet_pt_raw"]
feature_index_1feat = _feature_indices(feature_label_1feat)     # [0]

# Training with 3 features
feature_label_3feat = ["jet_pt_raw", "jet_area", "jet_rho"]
feature_index_3feat = _feature_indices(feature_label_3feat)     # [0, 3, 20]

# Training with 11 features (removes jet_pt_corr)
feature_label_11feat = [
    "jet_pt_raw",
    "jet_mass",
    "jet_area",
    "jet_const_n",
    "const_pt_mean",
    "const_1_pt",
    "const_2_pt",
    "const_3_pt",
    "const_4_pt",
    "jet_y",
    "jet_rho",
]
feature_index_11feat = _feature_indices(feature_label_11feat)   # [0, 2, 3, 5, 6, 8, 9, 10, 11, 18, 20]

# Training with 12 features
feature_label_12feat = [
    "jet_pt_raw",
    "jet_pt_corr",
    "jet_mass",
    "jet_area",
    "jet_const_n",
    "const_pt_mean",
    "const_1_pt",
    "const_2_pt",
    "const_3_pt",
    "const_4_pt",
    "jet_y",
    "jet_rho",
]
feature_index_12feat = _feature_indices(feature_label_12feat)   # [0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 18, 20]

# CONSIDERATIONS:
# get rid of jet_pt_corr - why not try to correct without it?


# Print a timestamp so the output shows when the setup cell last finished.
now = datetime.now()
dt_string = "{:%Y/%m/%d %H:%M:%S}".format(now)
print("\nReady!", dt_string)

Output directory already exists or not made
'CSV_Backup/' already exists or not made

Ready! 2023/03/13 13:22:32


In [12]:
# Feature sets handed to Full_TrainTest: one (labels, indices) pair per set.
# Other pairs defined in the setup cell (add them to the list to include them):
#     (feature_label_1feat,  feature_index_1feat)
#     (feature_label_3feat,  feature_index_3feat)
#     (feature_label_12feat, feature_index_12feat)
feature_bundle = [(feature_label_11feat, feature_index_11feat)]

# Keyword options for Full_TrainTest, gathered in one place.
# lr / rf / mlp stand for the linear regression, random forest and multilayer
# perceptron estimators.
full_traintest_options = {
    # Estimator switches
    "use_lr": True,
    "use_rf": True,
    "use_mlp": True,
    # Estimator switches for test/train
    "use_lr_tt": False,    # use lr for test/train
    "use_rf_tt": False,    # use rf for test/train
    "use_mlp_tt": False,   # use mlp for test/train
    # Random forest settings
    "rf_n_estimators": 150,
    "rf_max_depth": 50,
    "rf_n_jobs": 4,
    # Multilayer perceptron settings
    "mlp_max_iter": 500,
    "mlp_hidden_layer_sizes": 150,
}

# The first eight arguments are positional; the trailing comments give the
# parameter each one is meant for.
Full_TrainTest(
    train_file_bundle,     # train_file_bundle
    test_file_bundle,      # test_file_bundle
    feature_bundle,        # feature_bundle
    test_bin_array,        # test_bin_array
    traintest_bin_array,   # traintest_bin_array
    output_directory,      # output_directory
    10.,                   # train_pt_min
    90.,                   # train_pt_max
    **full_traintest_options
)

Preparing to collect data from csv backup file...
Data collected!
Backup .csv file closed.
All data transferred to array. Testing with 348075 jets.

Data set lengths: 348075 / 348075 / 348075

Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 348075 


Training linear regression estimator...
Loading ../../Files/Thesis_Data/JOBLIB_Backup/Train_B8_Flat_F11_10_90_Pipeline_LR.joblib
Loading ../../Files/Thesis_Data/JOBLIB_Backup/Train_B8_Flat_F11_10_90_Pipeline_LR_Coeffs.joblib

Training random forest regression estimator...
Loading ../../Files/Thesis_Data/JOBLIB_Backup/Train_B8_Flat_F11_10_90_Pipeline_RF.joblib
Loading ../../Files/Thesis_Data/JOBLIB_Backup/Train_B8_Flat_F11_10_90_Pipeline_RF_Coeffs.joblib
Writing coefficients to CSV...
Coefficient CSV file complete.

Training multilayer perceptron (neural net) regression estimator...
Loading ../../Files/Thesis_Data/JOBLIB_Backup/Train_B8_Flat_F11_10_90_Pipeline_ML