## Core Functions
These cells import the necessary libraries and define helper functions that prepare the feature arrays for machine learning and run the machine-learning training and testing.

In [1]:
from ML_Python.ML_Python_Build_FeatureArrays_FromROOT import Build_FeatureArrays_FromROOT
from ML_Python.ML_Python_TrainTest import (
    Build_FeatureArrays_FromCSV,
    Write_MLResults_ToCSV,
    Write_MLWeights_ToCSV,
    Train_All_Estimators,
    Train_LinearRegression,
    Train_RandomForestRegression,
    Train_MLPRegression,
    Test_Estimator,
    Test_All_Estimators)
from datetime import datetime
import os



def Build_SelectFeatureArray(
    X_features,
    feature_index
    ) :
    """
    Picks a subset of columns out of every row of a feature array.

    Parameters:
        X_features:    rows of feature values; must support len() and
                       integer indexing, and each row must be indexable
                       by the entries of feature_index
        feature_index: column indices to keep, in the order they should
                       appear in the output rows

    Returns:
        A new list with one list per input row, holding the selected
        values. X_features itself is not modified.
    """

    print("Selecting data from master array...")

    # Output column order follows feature_index, not the master array.
    X_features_select = [
        [X_features[row][column] for column in feature_index]
        for row in range(len(X_features))
    ]

    print("Data ready. Feature array length:", len(X_features_select), "\n")

    return X_features_select

    

def TestAndSave_LinearRegression(
    feature_label,    # Array of labels corresponding to each feature
    feature_index,    # Array of indices for each feature used in X_train
    lr_pipeline,      # Trained Linear Regression Pipeline
    lr_coeffs,        # Array of coefficient values from trained linear regression pipeline
    X_test_select,    # Array of testing data features
    y_test,           # Array of testing data targets
    sc_test,          # Array of testing data simple corrections
    pt_test_min,      # Float of min pT to test with
    pt_test_max,      # Float of max pT to test with
    output_filename,  # Directory path + name for output csv file
    use_scaler = True # If true, rescales data
    ) :
    """
    Tests a trained pipeline on the test entries whose target lies strictly
    between pt_test_min and pt_test_max, then writes the results to a csv.

    Only entries with pt_test_min < y_test[i] < pt_test_max are kept (both
    bounds are exclusive). The kept features and targets are passed to
    Test_Estimator, and the first value it returns is handed to
    Write_MLResults_ToCSV together with the kept targets, simple
    corrections, features and feature_label.

    feature_index, lr_coeffs and use_scaler are accepted but are not read
    anywhere in this function.

    Returns:
        None
    """

    # Positions of the test entries that fall inside the open window.
    in_window = [
        i for i in range(len(y_test))
        if pt_test_min < y_test[i] < pt_test_max
    ]

    X_test_temp  = [X_test_select[i] for i in in_window]
    y_test_temp  = [y_test[i] for i in in_window]
    sc_test_temp = [sc_test[i] for i in in_window]

    # Tests estimator

    print(type(lr_pipeline))

    # Second returned value is not used here.
    lr_results, _ = Test_Estimator(lr_pipeline, X_test_temp, y_test_temp)

    # Writes outputs to a csv file
    Write_MLResults_ToCSV(
        output_filename, y_test_temp, sc_test_temp,
        lr_results, X_test_temp, feature_label)

    return



def TrainTestPlot_All_Estimators(
    feature_label,    # Array of labels corresponding to each feature
    feature_index,    # Array of indices for each feature used in X_train
    X_train,          # Array of training data features
    y_train,          # Array of training data targets
    sc_train,         # Array of training data simple correction values
    X_test,           # Array of testing data features
    y_test,           # Array of testing data targets
    sc_test,          # Array of testing data simple corrections
    output_file_path, # File path for outputs
    use_scaler = True,
    use_lr = True,
    use_rf = True,
    use_mlp = True,
    ) :
    """
    Trains the enabled estimators on the selected feature columns and then
    runs them on the test set.

    The columns listed in feature_index are extracted from X_train and
    X_test, the pipelines are built with Train_All_Estimators (use_scaler,
    use_lr, use_rf and use_mlp are forwarded to it), and the pipelines are
    evaluated with Test_All_Estimators.

    NOTE: despite the name, nothing is plotted, written to disk or returned;
    the test results are discarded when the function exits. sc_train,
    sc_test and output_file_path are accepted but not read.

    Returns:
        None
    """

    # Builds training data set
    print("Selecting training data...")
    X_train_select = [
        [X_train[i][column] for column in feature_index]
        for i in range(len(X_train))
    ]
    # "\n" (not "/n") so the status line is followed by a blank line
    print("Training data ready. X/Y length:", len(X_train_select), len(y_train), "\n")

    # Builds pipelines from selected training features
    print("Building estimator pipelines...")
    lr_pipeline, rf_pipeline, mlp_pipeline, lr_coeffs, rf_features = Train_All_Estimators(
        X_train_select, y_train, feature_label,
        use_StandardScaler = use_scaler,
        use_LinearRegression = use_lr,
        use_RandomForest = use_rf,
        use_MLP = use_mlp)
    print("Pipelines built.\n")

    # Builds testing data set with the same column selection
    print("Selecting testing data...")
    X_test_select = [
        [X_test[i][column] for column in feature_index]
        for i in range(len(X_test))
    ]
    print("Testing data ready. X/Y length:", len(X_test_select), len(y_test), "\n")

    # Test estimators
    print("Testing all estimators...")
    lr_results, lr_results_delta, rf_results, rf_results_delta, mlp_results, mlp_results_delta = Test_All_Estimators(
        X_test_select,
        y_test,
        lr_pipeline,
        rf_pipeline,
        mlp_pipeline)
    print("Estimator testing complete!\n")

    return



# Print a timestamp so the cell output shows when the helpers were last defined.
now = datetime.now()
dt_string = datetime.strftime(now, "%Y/%m/%d %H:%M:%S")
print("\nReady!", dt_string)

Welcome to JupyROOT 6.26/04

Ready! 2022/12/31 10:15:52


## Data Preparation
Sets up the input directories and files for training and testing, builds the initial training and testing data arrays, and defines the feature sets used in the cells below.

In [7]:
# Directory that holds the input ROOT files, their csv backups and the results
file_directory   = "../Files/Comparison_Trial2/Data/"

train_file_name  = "ML_Prep_Train_B0_10_90_N500000.root"
train_tree_name  = "Jet_ML_Train_B0_10_90_N500000"
train_file_path  = file_directory + train_file_name
# csv backup sits next to the ROOT file: same base name with "_Backup.csv" appended
train_csv_path   = file_directory + os.path.splitext(train_file_name)[0] + "_Backup.csv"
# (min, max) forwarded to Build_FeatureArrays_FromROOT and reused in output file names
# NOTE(review): presumably the pT window in GeV -- confirm against the builder
train_range      = (10., 90.)

test_file_name   = "ML_Prep_Test_B8_10_90_N500000.root"
test_tree_name   = "Jet_ML_Test_B8_10_90_N500000"
test_file_path   = file_directory + test_file_name
test_csv_path    = file_directory + os.path.splitext(test_file_name)[0] + "_Backup.csv"
test_range       = (10., 90.)

# Every results file written by the later cells starts with output_csv_name
output_directory = file_directory + "ML_Results/30GeV_Bins"
output_csv_name  = output_directory + "/Train_B0"

# makedirs also creates the intermediate "ML_Results" directory when it is
# missing, and exist_ok=True makes re-running this cell a no-op. Any other
# failure (e.g. permissions) now raises instead of being reported as
# "directory already exists".
os.makedirs(output_directory, exist_ok=True)

# Builds feature and target arrays from ROOT file
X_train, y_train, sc_train = Build_FeatureArrays_FromROOT(
    train_file_path, train_tree_name, train_csv_path, train_range[0], train_range[1])
X_test, y_test, sc_test = Build_FeatureArrays_FromROOT(
    test_file_path,  test_tree_name,  test_csv_path,  test_range[0],  test_range[1])

# # Rebuilds feature and target arrays from csv file (MUCH faster if csv has been made already)
# X_train, y_train, sc_train = Build_FeatureArrays_FromCSV(train_csv_path)
# X_test,  y_test,  sc_test  = Build_FeatureArrays_FromCSV(test_csv_path)



# Set Features to train with
# Column layout of each row of X_values: the position of a name in this list
# is its column index (0 .. 20).
X_value_names = [
    "jet_pt_raw",    "jet_pt_corr",   "jet_mass",      "jet_area",         #  0 -  3
    "jet_area_err",  "jet_const_n",   "const_pt_mean", "const_pt_median",  #  4 -  7
    "const_1_pt",    "const_2_pt",    "const_3_pt",    "const_4_pt",       #  8 - 11
    "const_5_pt",    "const_6_pt",    "const_7_pt",    "const_8_pt",       # 12 - 15
    "const_9_pt",    "const_10_pt",   "jet_y",         "jet_phi",          # 16 - 19
    "jet_rho"]                                                             # 20

# Training with 1 feature
feature_label_1feat = ["jet_pt_raw"]
feature_index_1feat = [X_value_names.index(name) for name in feature_label_1feat]

# Training with 3 features
feature_label_3feat = ["jet_pt_raw", "jet_area", "jet_rho"]
feature_index_3feat = [X_value_names.index(name) for name in feature_label_3feat]

# Training with 12 features
feature_label_12feat = [
    "jet_pt_raw",    "jet_pt_corr",    "jet_mass",      "jet_area",
    "jet_const_n",   "const_pt_mean",  "const_1_pt",    "const_2_pt",
    "const_3_pt",    "const_4_pt",     "jet_y",         "jet_rho"]
feature_index_12feat = [X_value_names.index(name) for name in feature_label_12feat]



# Timestamp marking the end of data preparation
now = datetime.now()
dt_string = datetime.strftime(now, "%Y/%m/%d %H:%M:%S")
print("\nReady!", dt_string)

directory already exists
Input file accessed successfully. Output file generated.
Accessing input tree...
Input tree accessed successfully.
Creating .csv backup file...
Backup file started.
Preparing to collect data from TTree...
Jet: 10000 | pTraw: 62.258 | pTcorr:  8.813 | pTtrue:  11.416
Jet: 20000 | pTraw: 64.764 | pTcorr:  10.686 | pTtrue:  14.582
Jet: 30000 | pTraw: 74.677 | pTcorr:  15.376 | pTtrue:  10.248
Jet: 40000 | pTraw: 77.999 | pTcorr:  14.600 | pTtrue:  10.555
Jet: 50000 | pTraw: 67.386 | pTcorr:  12.472 | pTtrue:  13.537
Jet: 60000 | pTraw: 57.465 | pTcorr:  4.565 | pTtrue:  19.211
Jet: 70000 | pTraw: 82.745 | pTcorr:  24.559 | pTtrue:  12.792
Jet: 80000 | pTraw: 54.911 | pTcorr:  10.610 | pTtrue:  10.957
Jet: 90000 | pTraw: 86.909 | pTcorr:  19.999 | pTtrue:  19.625
Jet: 100000 | pTraw: 54.377 | pTcorr: -1.509 | pTtrue:  11.781
Jet: 110000 | pTraw: 35.099 | pTcorr: -2.911 | pTtrue:  10.075
Jet: 120000 | pTraw: 57.615 | pTcorr:  2.611 | pTtrue:  10.821
Jet: 130000 | pT

Jet: 670000 | pTraw: 137.986 | pTcorr:  84.921 | pTtrue:  74.864
Jet: 680000 | pTraw: 60.335 | pTcorr:  23.807 | pTtrue:  19.029
Jet: 690000 | pTraw: 5.798 | pTcorr: -0.322 | pTtrue:  14.576
Jet: 700000 | pTraw: 96.868 | pTcorr:  57.531 | pTtrue:  67.429
Jet: 710000 | pTraw: 65.482 | pTcorr:  23.698 | pTtrue:  33.125
Jet: 720000 | pTraw: 89.073 | pTcorr:  27.160 | pTtrue:  28.951
Jet: 730000 | pTraw: 75.409 | pTcorr:  3.495 | pTtrue:  21.793
Jet: 740000 | pTraw: 5.075 | pTcorr: -4.520 | pTtrue:  11.066
Jet: 750000 | pTraw: 77.485 | pTcorr:  31.165 | pTtrue:  19.686
Jet: 760000 | pTraw: 74.166 | pTcorr:  22.251 | pTtrue:  30.539
Jet: 770000 | pTraw: 113.546 | pTcorr:  72.085 | pTtrue:  78.752
Jet: 780000 | pTraw: 85.929 | pTcorr:  36.962 | pTtrue:  22.683
Jet: 790000 | pTraw: 128.616 | pTcorr:  74.256 | pTtrue:  83.493
Jet: 800000 | pTraw: 101.842 | pTcorr:  51.454 | pTtrue:  37.574
Jet: 810000 | pTraw: 63.034 | pTcorr:  4.317 | pTtrue:  20.618
Jet: 820000 | pTraw: 6.279 | pTcorr: -1.59

## Training & Testing
1 Feature: jet_pt_raw ONLY

3 Features: jet_pt_raw, jet_area, jet_rho

12 Features: jet_pt_raw, jet_pt_corr, jet_mass, jet_area, jet_const_n, const_pt_mean, const_1_pt, const_2_pt, const_3_pt, const_4_pt, jet_y, jet_rho

In [5]:
#################################################
#                                               #
#  TRAIN ML ON ONE BIN, WITH 1, 3, 12 FEATURES  #
#                                               #
#################################################

# For each feature set: trains one linear regression on the whole training
# sample, saves its coefficients, then tests the trained pipeline on every
# [min, max] window listed in test_min_max_array.

test_min_max_array = [  # Array of min and max for pT ranges to test on
    [18,22], [28,32], [38,42], [48,52],
    [58,62], [68,72], [78,82],
    ]
feature_bundle = [      # One [labels, indices] pair per feature set
    [feature_label_1feat,  feature_index_1feat],
    [feature_label_3feat,  feature_index_3feat],
    [feature_label_12feat, feature_index_12feat],
    ]
train_bundle = [ # This may be implemented later to iterate through multiple training sets
    [X_train, y_train, sc_train]
    ]

for feature_label, feature_index in feature_bundle:

    # Common prefix of every file written for this feature set
    output_csv_name_2 = "{}_F{}_{}_{}".format(
        output_csv_name, len(feature_label), int(train_range[0]), int(train_range[1]))

    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train, feature_index)
    X_test_select  = Build_SelectFeatureArray(X_test, feature_index)

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select, y_train, feature_label, use_scaler = True)
    print(type(lr_pipeline))

    Write_MLWeights_ToCSV(output_csv_name_2 + "_LR_Coeffs.csv", lr_coeffs, feature_label)

    # Tests estimator and saves results, one csv per test window
    for min_max in test_min_max_array:
        window_min, window_max = min_max[0], min_max[1]

        output = "\nTesting {} features on {}-{} GeV...".format(
            len(feature_index), window_min, window_max)
        print(output)

        csv_path = "{}_Test_{}_{}.csv".format(
            output_csv_name_2, int(window_min), int(window_max))

        TestAndSave_LinearRegression(
            feature_label, feature_index,
            lr_pipeline, lr_coeffs,
            X_test_select, y_test, sc_test,
            window_min, window_max,
            csv_path,
            use_scaler = True)

        print("Test and save complete!\n")



# Timestamp marking the end of this cell
now = datetime.now()
dt_string = datetime.strftime(now, "%Y/%m/%d %H:%M:%S")
print("\nComplete!", dt_string)


Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 542699 

Selecting data from master array...
Data ready. Feature array length: 961713 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
Regression Coefficients:
jet_pt_raw 1.32688560728391
<class 'sklearn.pipeline.Pipeline'>
<class 'sklearn.pipeline.Pipeline'>
ML weights .csv file closed.

Testing 1 features on 18-22 GeV...
<class 'sklearn.pipeline.Pipeline'>
ML results .csv file closed.
Test and save complete!


Testing 1 features on 28-32 GeV...
<class 'sklearn.pipeline.Pipeline'>
ML results .csv file closed.
Test and save complete!


Testing 1 features on 38-42 GeV...
<class 'sklearn.pipeline.Pipeline'>
ML results .c

In [6]:
#################################################
#                                               #
#  TRAIN ML OVER 20 GeV BINS, USES 12 FEATURES  #
#                                               #
#################################################

# For each [min, max] training range: keeps the training and testing entries
# whose target lies strictly inside the range, trains a 12-feature linear
# regression on the training cut, saves its coefficients, and tests it on
# the testing cut for the same range.

test_min_max_array = [  # Array of min and max for pT ranges to test on
    [18,22], [28,32], [38,42], [48,52],
    [58,62], [68,72], [78,82],
    ]
training_bundle = [     # [min, max] of each training range
    [10.,30.], [20.,40.], [30.,50.], [40.,60.],
    [50.,70.], [60.,80.], [70.,90.],
    ]

# ONLY runs 12 features
for train_min, train_max in training_bundle:

    # Common prefix of every file written for this training range
    output_csv_name_2 = "{}_F{}_{}_{}".format(
        output_csv_name, len(feature_label_12feat), int(train_min), int(train_max))

    # Training entries with train_min < target < train_max (bounds exclusive)
    train_rows = [
        i for i in range(len(X_train))
        if train_min < y_train[i] < train_max
    ]
    X_train_cut  = [X_train[i] for i in train_rows]
    y_train_cut  = [y_train[i] for i in train_rows]
    sc_train_cut = [sc_train[i] for i in train_rows]

    # Testing entries selected with the same range
    test_rows = [
        i for i in range(len(X_test))
        if train_min < y_test[i] < train_max
    ]
    X_test_cut  = [X_test[i] for i in test_rows]
    y_test_cut  = [y_test[i] for i in test_rows]
    sc_test_cut = [sc_test[i] for i in test_rows]

    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train_cut, feature_index_12feat)
    X_test_select  = Build_SelectFeatureArray(X_test_cut, feature_index_12feat)

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select, y_train_cut, feature_label_12feat, use_scaler = True)
    print(type(lr_pipeline))

    Write_MLWeights_ToCSV(
        output_csv_name_2 + "_LR_Coeffs.csv", lr_coeffs, feature_label_12feat)

    # Tests estimator and saves results
    output = "\nTesting {} features on {}-{} GeV...".format(
        len(feature_index_12feat), train_min, train_max)
    print(output)

    csv_path = "{}_Test_{}_{}.csv".format(
        output_csv_name_2, int(train_min), int(train_max))

    TestAndSave_LinearRegression(
        feature_label_12feat, feature_index_12feat,
        lr_pipeline, lr_coeffs,
        X_test_select, y_test_cut, sc_test_cut,
        train_min, train_max,
        csv_path,
        use_scaler = True)

    print("Test and save complete!\n")



# Timestamp marking the end of this cell
now = datetime.now()
dt_string = datetime.strftime(now, "%Y/%m/%d %H:%M:%S")
print("\nComplete!", dt_string)


Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 533544 

Selecting data from master array...
Data ready. Feature array length: 498734 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
Regression Coefficients:
jet_pt_raw 2.615752355931794
jet_pt_corr 0.15663272188135036
jet_mass 3.711311407582237
jet_area -1.076648157099351
jet_const_n -4.871936142641229
const_pt_mean 0.05558063793870576
const_1_pt 0.4147796840138919
const_2_pt 0.16432699642885681
const_3_pt 0.23435196105498954
const_4_pt -0.41251385440136507
jet_y 0.008780589992905426
jet_rho -0.13859868130078729
<class 'sklearn.pipeline.Pipeline'>
<class 'sklearn.pipeline.Pipeline'>
ML weights .csv file closed.

Tes

In [8]:
#################################################
#                                               #
#  TRAIN ML OVER 30 GeV BINS, USES 12 FEATURES  #
#                                               #
#################################################

# Same procedure as the 20 GeV cell, with 30 GeV wide training ranges: cut
# the training and testing entries to the range, train a 12-feature linear
# regression, save its coefficients, and test it on the same range.

training_bundle = [     # [min, max] of each training range
    [10.,40.], [20.,50.], [30.,60.], [40.,70.],
    [50.,80.], [60.,90.],
    ]

# ONLY runs 12 features
for train_min, train_max in training_bundle:

    # Common prefix of every file written for this training range
    output_csv_name_2 = "{}_F{}_{}_{}".format(
        output_csv_name, len(feature_label_12feat), int(train_min), int(train_max))

    # Training entries with train_min < target < train_max (bounds exclusive)
    train_rows = [
        i for i in range(len(X_train))
        if train_min < y_train[i] < train_max
    ]
    X_train_cut  = [X_train[i] for i in train_rows]
    y_train_cut  = [y_train[i] for i in train_rows]
    sc_train_cut = [sc_train[i] for i in train_rows]

    # Testing entries selected with the same range
    test_rows = [
        i for i in range(len(X_test))
        if train_min < y_test[i] < train_max
    ]
    X_test_cut  = [X_test[i] for i in test_rows]
    y_test_cut  = [y_test[i] for i in test_rows]
    sc_test_cut = [sc_test[i] for i in test_rows]

    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train_cut, feature_index_12feat)
    X_test_select  = Build_SelectFeatureArray(X_test_cut, feature_index_12feat)

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select, y_train_cut, feature_label_12feat, use_scaler = True)
    print(type(lr_pipeline))

    Write_MLWeights_ToCSV(
        output_csv_name_2 + "_LR_Coeffs.csv", lr_coeffs, feature_label_12feat)

    # Tests estimator and saves results
    output = "\nTesting {} features on {}-{} GeV...".format(
        len(feature_index_12feat), train_min, train_max)
    print(output)

    csv_path = "{}_Test_{}_{}.csv".format(
        output_csv_name_2, int(train_min), int(train_max))

    TestAndSave_LinearRegression(
        feature_label_12feat, feature_index_12feat,
        lr_pipeline, lr_coeffs,
        X_test_select, y_test_cut, sc_test_cut,
        train_min, train_max,
        csv_path,
        use_scaler = True)

    print("Test and save complete!\n")



# Timestamp marking the end of this cell
now = datetime.now()
dt_string = datetime.strftime(now, "%Y/%m/%d %H:%M:%S")
print("\nComplete!", dt_string)


Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 539932 

Selecting data from master array...
Data ready. Feature array length: 608594 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
Regression Coefficients:
jet_pt_raw 3.495511488493191
jet_pt_corr 0.21222265163745857
jet_mass 4.353110742323748
jet_area -1.4021797036377104
jet_const_n -6.003130685528323
const_pt_mean 0.05606349702381845
const_1_pt 0.5280578018476776
const_2_pt 0.19157595335982525
const_3_pt 0.31816506131694167
const_4_pt -0.5243539242716067
jet_y 0.01269841050808216
jet_rho -0.17363881765139577
<class 'sklearn.pipeline.Pipeline'>
<class 'sklearn.pipeline.Pipeline'>
ML weights .csv file closed.

Test