## Core Functions
These cells import the required libraries, prepare the feature arrays for machine learning, and define the helper functions that run the machine-learning training and testing.

In [1]:
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from Scripts_Python.ML_Python_Build_FeatureArrays_FromROOT import (
    Build_FeatureArrays_FromROOT,
    Build_FeatureArrays_FromROOT_ByEvent)
from Scripts_Python.ML_Python_TrainTest import (
    Build_FeatureArrays_FromCSV,
    Write_MLResults_ToCSV,
    Write_MLWeights_ToCSV,
    Train_All_Estimators,
    Train_LinearRegression,
    Train_RandomForestRegression,
    Train_MLPRegression,
    Test_Estimator,
    Test_All_Estimators)
from datetime import datetime



def Build_SelectFeatureArray(
    X_features,
    feature_index
    ) :
    """
    Builds a reduced feature array that keeps only the selected columns.

    For every row of X_features, the entries at the positions listed in
    feature_index are copied, in the order given, into a new row.

    Parameters:
        X_features    : Iterable of rows; each row must be indexable by the
                        values in feature_index
        feature_index : Iterable of column positions to keep

    Returns:
        List with one inner list per row of X_features, each holding
        len(feature_index) values. The input is not modified.
    """

    print("Selecting data from master array...")

    X_features_select = [
        [row[column] for column in feature_index]
        for row in X_features
        ]

    print("Data ready. Feature array length:", len(X_features_select), "\n")

    return X_features_select

    

def TestAndSave_LinearRegression(
    feature_label,    # Array of labels corresponding to each feature
    feature_index,    # Array of indices for each feature used in X_train (not read here)
    lr_pipeline,      # Trained Linear Regression Pipeline
    lr_coeffs,        # Array of coefficient values from trained pipeline (not read here)
    X_test_select,    # Array of testing data features
    y_test,           # Array of testing data targets
    sc_test,          # Array of testing data simple corrections
    pt_test_min,      # Float of min pT to test with
    pt_test_max,      # Float of max pT to test with
    output_filename,  # Directory path + name for output csv file
    use_scaler = True # Accepted for call compatibility (not read here)
    ) :
    """
    Tests a trained pipeline on a pT window of the test data and writes the
    results to a csv file.

    Only entries whose target value lies strictly between pt_test_min and
    pt_test_max are kept. The kept features and targets are handed to
    Test_Estimator, and the outcome is passed to Write_MLResults_ToCSV together
    with the kept simple corrections and the feature labels.

    feature_index, lr_coeffs and use_scaler are part of the signature but are
    never used inside the function body.

    Returns:
        None
    """

    # Positions of the entries that fall strictly inside the open pT window
    kept_rows = [
        row for row in range(len(y_test))
        if pt_test_min < y_test[row] < pt_test_max
        ]

    X_window  = [X_test_select[row] for row in kept_rows]
    y_window  = [y_test[row] for row in kept_rows]
    sc_window = [sc_test[row] for row in kept_rows]

    # Shows which kind of object is about to be tested
    print(type(lr_pipeline))

    # Tests estimator; the second returned value is not needed here
    lr_results, _lr_results_delta = Test_Estimator(lr_pipeline, X_window, y_window)

    # Writes outputs to a csv file
    Write_MLResults_ToCSV(
        output_filename,
        y_window,
        sc_window,
        lr_results,
        X_window,
        feature_label
        )

    return



def TrainTestPlot_All_Estimators(
    feature_label,    # Array of labels corresponding to each feature
    feature_index,    # Array of indices for each feature used in X_train
    X_train,          # Array of training data features
    y_train,          # Array of training data targets
    sc_train,         # Array of training data simple correction values
    X_test,           # Array of testing data features
    y_test,           # Array of testing data targets
    sc_test,          # Array of testing data simple corrections
    output_file_path, # File path for outputs
    use_scaler = True,
    use_lr = True,
    use_rf = True,
    use_mlp = True,
    ) :
    """
    Trains every enabled estimator on the selected features and tests them.

    The columns listed in feature_index are picked out of X_train and X_test,
    the pipelines are built with Train_All_Estimators, and the pipelines are
    then evaluated with Test_All_Estimators.

    use_scaler, use_lr, use_rf and use_mlp are forwarded to
    Train_All_Estimators as use_StandardScaler, use_LinearRegression,
    use_RandomForest and use_MLP.

    NOTE: sc_train, sc_test and output_file_path are accepted but never read,
    and the test results are neither returned, written to disk nor plotted by
    this function.

    Returns:
        None
    """

    # Builds training data set
    print("Selecting training data...")
    X_train_select = [
        [row[column] for column in feature_index]
        for row in X_train
        ]
    print("Training data ready. X/Y length:", len(X_train_select), len(y_train), "\n")

    # Builds pipelines from selected training features
    print("Building estimator pipelines...")
    lr_pipeline, rf_pipeline, mlp_pipeline, lr_coeffs, rf_features = Train_All_Estimators(
        X_train_select, y_train, feature_label,
        use_StandardScaler = use_scaler,
        use_LinearRegression = use_lr,
        use_RandomForest = use_rf,
        use_MLP = use_mlp)
    print("Pipelines built.\n")

    # Builds testing data set
    print("Selecting testing data...")
    X_test_select = [
        [row[column] for column in feature_index]
        for row in X_test
        ]
    print("Testing data ready. X/Y length:", len(X_test_select), len(y_test), "\n")

    # Test estimators
    print("Testing all estimators...")
    lr_results, lr_results_delta, rf_results, rf_results_delta, mlp_results, mlp_results_delta = Test_All_Estimators(
        X_test_select,
        y_test,
        lr_pipeline,
        rf_pipeline,
        mlp_pipeline)
    print("Estimator testing complete!\n")

    return



###########################################
#                                         #
#     DATA PREPARATION - CHANGE BELOW     #
#                                         #
###########################################



file_directory   = "../../../Jet_Reco_Joey/CU-Heavy-Ions-Jet-Reco-ML/Data/"

train_base_name  = "Train_B8_10_90_N500000" # Cut off 'ML_Prep_' and '.root' parts of input file names
train_bias       = "B8"
train_range      = (10., 90.) # pT min/max of training file

test_base_name   = "Train_B8_10_90_N500000" # Cut off 'ML_Prep_' and '.root' parts of input file names
test_range       = (10., 90.) # pT min/max of testing file



##### ANYTHING BELOW THIS SHOULDN'T NEED TO CHANGE #####

# NOTE(review): the .root file names below are hard-coded and are not derived
# from train_base_name / test_base_name -- confirm that this is intended.
train_file_name  = "ML_Prep_10_90_Train8.root"
train_tree_name  = "Tree_Tree"
train_file_path  = file_directory + train_file_name
train_csv_path   = file_directory + "ML_Prep_" + train_base_name + "_Backup.csv"

test_file_name   = "ML_Prep_10_90_Train8.root"
test_tree_name   = "Tree_Tree"
test_file_path   = file_directory + test_file_name
# The testing backup is named after the testing base name, so it cannot be
# confused with the training backup when the two base names differ.
test_csv_path    = file_directory + "ML_Prep_" + test_base_name + "_Backup.csv"

# Builds ML output directories
output_directory = file_directory + "ML_Results/"

# The existence check is explicit so that a real failure (e.g. missing
# permissions) raises instead of being reported as "directory already exists".
if os.path.isdir(output_directory):
    print("directory already exists")
else:
    os.makedirs(output_directory)
    print("made 'ML_Results' directory")

# Reloads feature and target arrays from the csv backup, or builds them from
# the ROOT file if the csv backup doesn't exist

# Training data
if os.path.exists(train_csv_path):
    X_train, y_train, sc_train = Build_FeatureArrays_FromCSV(train_csv_path)
else:
    X_train, y_train, sc_train = Build_FeatureArrays_FromROOT_ByEvent(
        train_file_path, train_tree_name, train_csv_path, train_range[0], train_range[1])

# Testing data (checks the same csv path that is loaded)
if os.path.exists(test_csv_path):
    X_test,  y_test,  sc_test  = Build_FeatureArrays_FromCSV(test_csv_path)
else:
    X_test, y_test, sc_test = Build_FeatureArrays_FromROOT_ByEvent(
        test_file_path,  test_tree_name,  test_csv_path,  test_range[0],  test_range[1])

# Set Features to train with
# Column order of the master feature array; the position of a name in this
# list is the index used to select that feature:
#    0  jet_pt_raw,      1  jet_pt_corr,     2  jet_mass,        3  jet_area,
#    4  jet_area_err,    5  jet_const_n,     6  const_pt_mean,   7  const_pt_median,
#    8  const_1_pt  ...  17 const_10_pt,     18 jet_y,           19 jet_phi,
#    20 jet_rho
# NOTE(review): this order is taken from the column table kept in this
# notebook; it must match the arrays produced by the Build_FeatureArrays_* helpers.
feature_names_all = (
    ["jet_pt_raw", "jet_pt_corr", "jet_mass", "jet_area",
     "jet_area_err", "jet_const_n", "const_pt_mean", "const_pt_median"]
    + ["const_" + str(rank) + "_pt" for rank in range(1, 11)]
    + ["jet_y", "jet_phi", "jet_rho"])

# Training with 1 feature
feature_label_1feat = ["jet_pt_raw"]
feature_index_1feat = [feature_names_all.index(name) for name in feature_label_1feat]

# Training with 3 features
feature_label_3feat = ["jet_pt_raw", "jet_area", "jet_rho"]
feature_index_3feat = [feature_names_all.index(name) for name in feature_label_3feat]

# Training with 12 features
feature_label_12feat = [
    "jet_pt_raw",    "jet_pt_corr",    "jet_mass",      "jet_area",
    "jet_const_n",   "const_pt_mean",  "const_1_pt",    "const_2_pt",
    "const_3_pt",    "const_4_pt",     "jet_y",         "jet_rho"]
feature_index_12feat = [feature_names_all.index(name) for name in feature_label_12feat]



# Timestamp marking the end of the setup cell
now = datetime.now()
dt_string = now.strftime("%Y/%m/%d %H:%M:%S")
print("\nReady!", dt_string)

Welcome to JupyROOT 6.26/04
made 'ML_Results' directory
Input file accessed successfully. Output file generated.
Accessing input tree...
Input tree accessed successfully.
Creating .csv backup file...
Backup file started.
Preparing to collect data from TTree...
Jet: 10000 | pTraw: 123.391 | pTcorr:  71.410 | pTtrue:  84.804
Jet: 20000 | pTraw: 75.000 | pTcorr:  22.868 | pTtrue:  24.145
Jet: 30000 | pTraw: 112.965 | pTcorr:  68.582 | pTtrue:  64.775
Jet: 40000 | pTraw: 135.919 | pTcorr:  82.684 | pTtrue:  84.196
Jet: 50000 | pTraw: 83.135 | pTcorr:  30.042 | pTtrue:  39.726
Jet: 60000 | pTraw: 137.648 | pTcorr:  79.297 | pTtrue:  76.884
Jet: 70000 | pTraw: 75.015 | pTcorr:  15.004 | pTtrue:  20.667
Jet: 80000 | pTraw: 131.227 | pTcorr:  79.413 | pTtrue:  79.358
Jet: 90000 | pTraw: 75.028 | pTcorr:  15.992 | pTtrue:  30.021
Jet: 100000 | pTraw: 110.513 | pTcorr:  45.566 | pTtrue:  46.578
Jet: 110000 | pTraw: 81.198 | pTcorr:  23.460 | pTtrue:  17.140
Jet: 120000 | pTraw: 64.572 | pTcorr: 

Jet: 580000 | pTraw: 64.622 | pTcorr:  17.624 | pTtrue:  14.322
Jet: 590000 | pTraw: 155.418 | pTcorr:  83.456 | pTtrue:  73.281
Jet: 600000 | pTraw: 74.602 | pTcorr:  27.521 | pTtrue:  10.418
Jet: 610000 | pTraw: 113.958 | pTcorr:  64.817 | pTtrue:  69.698
Jet: 620000 | pTraw: 67.857 | pTcorr:  25.612 | pTtrue:  32.178
Jet: 630000 | pTraw: 55.966 | pTcorr:  10.392 | pTtrue:  13.993
Jet: 640000 | pTraw: 81.407 | pTcorr:  11.150 | pTtrue:  12.341
Jet: 650000 | pTraw: 83.193 | pTcorr:  31.343 | pTtrue:  34.873
Jet: 660000 | pTraw: 102.640 | pTcorr:  57.943 | pTtrue:  57.166
Backup .csv file closed.
All data transferred to array. Testing with 662946 jets.

Data set lengths: 662946 / 662946 / 662946

Ready! 2023/01/26 13:39:58


## Training & Testing
1 Feature: jet_pt_raw ONLY

3 Features: jet_pt_raw, jet_area, jet_rho

12 Features: jet_pt_raw, jet_pt_corr, jet_mass, jet_area, jet_const_n, const_pt_mean, const_1_pt, const_2_pt, const_3_pt, const_4_pt, jet_y, jet_rho

In [2]:
#################################################
#                                               #
#  TRAIN ML ON ONE BIN, WITH 1, 3, 12 FEATURES  #
#                                               #
#################################################

# For every enabled feature set: trains one linear regression on the whole
# training sample, saves its coefficients, then tests it on each pT window
# listed in test_min_max_array and saves one results csv per window.

# Builds outputs directories
output_directory = file_directory + "ML_Results/Test_4GeV_Bins/"

if os.path.isdir(output_directory):
    print("directory already exists")
else:
    os.makedirs(output_directory)
    print("made directory")

# Each plot directory is checked on its own so that an existing Plots_Actual/
# does not stop Plots_Delta/ from being created.
made_subdirectory = False
for plot_subdirectory in ("Plots_Actual/", "Plots_Delta/"):
    if not os.path.isdir(output_directory + plot_subdirectory):
        os.makedirs(output_directory + plot_subdirectory)
        made_subdirectory = True
if made_subdirectory:
    print("made subdirectories")
else:
    print("directory already exists")

test_min_max_array = [  # Array of min and max for pT ranges to test on
    [18,22], [28,32], [38,42], [48,52],
    [58,62], [68,72], [78,82]
    ]
feature_bundle = [  # Uncomment an entry to also run that feature set
#     [feature_label_1feat,  feature_index_1feat],
#     [feature_label_3feat,  feature_index_3feat],
    [feature_label_12feat, feature_index_12feat]
    ]
train_bundle = [ # This may be implemented later to iterate through multiple training sets
    [X_train, y_train, sc_train]
    ]

for feature_set in feature_bundle:
    feature_label = feature_set[0]
    feature_coeff_label = feature_set[0].copy()  # copy, so the shared label list is not extended
    feature_index = feature_set[1]

    # Common prefix of every csv written for this feature set
    output_csv_name = "{}Train_{}_F{}_{}_{}".format(
        output_directory, train_bias, len(feature_label),
        int(train_range[0]), int(train_range[1]))

    feature_coeff_label.append("lr_intercept") # Adds field for linear regression y-intercept

    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train, feature_index)
    X_test_select  = Build_SelectFeatureArray(X_test, feature_index)

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select,
        y_train,
        feature_coeff_label,
        use_scaler = True)
    print(type(lr_pipeline))

    Write_MLWeights_ToCSV(
        output_csv_name + "_LR_Coeffs.csv",
        lr_coeffs,
        feature_coeff_label
        )

    # Tests estimator and saves results, one csv per pT window

    for min_max in test_min_max_array:

        output = "\nTesting {} features on {}-{} GeV...".format(
            len(feature_index), min_max[0], min_max[1])
        print(output)

        csv_path = "{}_Test_{}_{}.csv".format(
            output_csv_name, int(min_max[0]), int(min_max[1]))

        TestAndSave_LinearRegression(
            feature_label,
            feature_index,
            lr_pipeline,
            lr_coeffs,
            X_test_select,
            y_test,
            sc_test,
            min_max[0],
            min_max[1],
            csv_path,
            use_scaler = True
            )

        print("Test and save complete!\n")



#################################################
#                                               #
#  TRAIN ML OVER 20 GeV BINS, USES 12 FEATURES  #
#                                               #
#################################################

# For every 20 GeV window: cuts the training and testing samples to the
# window, trains a linear regression on the 12-feature selection, saves its
# coefficients, then tests on the same window and saves the results.

# Builds outputs directories
output_directory = file_directory + "ML_Results/Train_20GeV_Bins/"

if os.path.isdir(output_directory):
    print("directory already exists")
else:
    os.makedirs(output_directory)
    print("made directory")

# Each plot directory is checked on its own so that an existing Plots_Actual/
# does not stop Plots_Delta/ from being created.
made_subdirectory = False
for plot_subdirectory in ("Plots_Actual/", "Plots_Delta/"):
    if not os.path.isdir(output_directory + plot_subdirectory):
        os.makedirs(output_directory + plot_subdirectory)
        made_subdirectory = True
if made_subdirectory:
    print("made subdirectories")
else:
    print("directory already exists")

training_bundle = [  # [min, max] of each training window
    [10.,30.], [20.,40.], [30.,50.], [40.,60.],
    [50.,70.], [60.,80.], [70.,90.]
    ]

# ONLY runs 12 features
for training_range in training_bundle:
    train_min = training_range[0]
    train_max = training_range[1]

    # Common prefix of every csv written for this window
    output_csv_name = "{}Train_{}_F12_{}_{}".format(
        output_directory, train_bias, int(train_min), int(train_max))

    # Keeps the training entries whose target lies strictly inside the window
    X_train_cut = []
    y_train_cut = []
    sc_train_cut = []

    for features, target_pt, simple_correction in zip(X_train, y_train, sc_train):
        if train_min < target_pt < train_max:
            X_train_cut.append(features)
            y_train_cut.append(target_pt)
            sc_train_cut.append(simple_correction)

    # Keeps the testing entries whose target lies strictly inside the window
    X_test_cut = []
    y_test_cut = []
    sc_test_cut = []

    for features, target_pt, simple_correction in zip(X_test, y_test, sc_test):
        if train_min < target_pt < train_max:
            X_test_cut.append(features)
            y_test_cut.append(target_pt)
            sc_test_cut.append(simple_correction)

    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train_cut, feature_index_12feat)
    X_test_select  = Build_SelectFeatureArray(X_test_cut, feature_index_12feat)

    feature_coeff_label = feature_label_12feat.copy()  # copy, so the shared label list is not extended
    feature_coeff_label.append("lr_intercept") # Adds field for linear regression y-intercept

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select,
        y_train_cut,
        feature_coeff_label,
        use_scaler = True)
    print(type(lr_pipeline))

    Write_MLWeights_ToCSV(
        output_csv_name + "_LR_Coeffs.csv",
        lr_coeffs,
        feature_coeff_label
        )

    # Tests estimator and saves results
    # (range shown as "min-max", matching the other sections of this cell)
    output = "\nTesting {} features on {}-{} GeV...".format(
        len(feature_index_12feat), int(train_min), int(train_max))
    print(output)

    csv_path = "{}_Test_{}_{}.csv".format(
        output_csv_name, int(train_min), int(train_max))

    # TestAndSave_LinearRegression applies the same pT window again, which
    # leaves the already-cut arrays unchanged.
    TestAndSave_LinearRegression(
        feature_label_12feat,
        feature_index_12feat,
        lr_pipeline,
        lr_coeffs,
        X_test_select,
        y_test_cut,
        sc_test_cut,
        train_min,
        train_max,
        csv_path,
        use_scaler = True
        )

    print("Test and save complete!\n")



#################################################
#                                               #
#  TRAIN ML OVER 30 GeV BINS, USES 12 FEATURES  #
#                                               #
#################################################

# For every 30 GeV window: cuts the training and testing samples to the
# window, trains a linear regression on the 12-feature selection, saves its
# coefficients, then tests on the same window and saves the results.

# Builds outputs directories
output_directory = file_directory + "ML_Results/Train_30GeV_Bins/"

if os.path.isdir(output_directory):
    print("directory already exists")
else:
    os.makedirs(output_directory)
    print("made directory")

# Each plot directory is checked on its own so that an existing Plots_Actual/
# does not stop Plots_Delta/ from being created.
made_subdirectory = False
for plot_subdirectory in ("Plots_Actual/", "Plots_Delta/"):
    if not os.path.isdir(output_directory + plot_subdirectory):
        os.makedirs(output_directory + plot_subdirectory)
        made_subdirectory = True
if made_subdirectory:
    print("made subdirectories")
else:
    print("directory already exists")

training_bundle = [  # [min, max] of each training window
    [10.,40.], [20.,50.], [30.,60.], [40.,70.],
    [50.,80.], [60.,90.]
    ]

# ONLY runs 12 features
for training_range in training_bundle:
    train_min = training_range[0]
    train_max = training_range[1]

    # Common prefix of every csv written for this window
    output_csv_name = "{}Train_{}_F12_{}_{}".format(
        output_directory, train_bias, int(train_min), int(train_max))

    # Keeps the training entries whose target lies strictly inside the window
    X_train_cut = []
    y_train_cut = []
    sc_train_cut = []

    for features, target_pt, simple_correction in zip(X_train, y_train, sc_train):
        if train_min < target_pt < train_max:
            X_train_cut.append(features)
            y_train_cut.append(target_pt)
            sc_train_cut.append(simple_correction)

    # Keeps the testing entries whose target lies strictly inside the window
    X_test_cut = []
    y_test_cut = []
    sc_test_cut = []

    for features, target_pt, simple_correction in zip(X_test, y_test, sc_test):
        if train_min < target_pt < train_max:
            X_test_cut.append(features)
            y_test_cut.append(target_pt)
            sc_test_cut.append(simple_correction)

    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train_cut, feature_index_12feat)
    X_test_select  = Build_SelectFeatureArray(X_test_cut, feature_index_12feat)

    feature_coeff_label = feature_label_12feat.copy()  # copy, so the shared label list is not extended
    feature_coeff_label.append("lr_intercept") # Adds field for linear regression y-intercept

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select,
        y_train_cut,
        feature_coeff_label,
        use_scaler = True)
    print(type(lr_pipeline))

    Write_MLWeights_ToCSV(
        output_csv_name + "_LR_Coeffs.csv",
        lr_coeffs,
        feature_coeff_label
        )

    # Tests estimator and saves results
    output = "\nTesting {} features on {}-{} GeV...".format(
        len(feature_index_12feat), int(train_min), int(train_max))
    print(output)

    csv_path = "{}_Test_{}_{}.csv".format(
        output_csv_name, int(train_min), int(train_max))

    # TestAndSave_LinearRegression applies the same pT window again, which
    # leaves the already-cut arrays unchanged.
    TestAndSave_LinearRegression(
        feature_label_12feat,
        feature_index_12feat,
        lr_pipeline,
        lr_coeffs,
        X_test_select,
        y_test_cut,
        sc_test_cut,
        train_min,
        train_max,
        csv_path,
        use_scaler = True
        )

    print("Test and save complete!\n")



# Timestamp marking the end of the training/testing cell
now = datetime.now()
dt_string = now.strftime("%Y/%m/%d %H:%M:%S")
print("\nComplete!", dt_string)

made directory
made subdirectories

Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 662946 

Selecting data from master array...
Data ready. Feature array length: 662946 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
36.82004639273881
[ 1.18318673e+01  5.76391459e+00 -1.62024061e+00 -1.52315563e-01
 -4.51422150e+00 -2.04633807e+00  4.31537171e+00  2.41122273e+00
  1.68885217e+00  3.98542130e+00 -3.86194049e-03  1.63262835e-01]
[ 1.18318673e+01  5.76391459e+00 -1.62024061e+00 -1.52315563e-01
 -4.51422150e+00 -2.04633807e+00  4.31537171e+00  2.41122273e+00
  1.68885217e+00  3.98542130e+00 -3.86194049e-03  1.63262835e-01
  3.68200464e+01]
Regression Coefficients:
jet_

ML results .csv file closed.
Test and save complete!


Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 98174 

Selecting data from master array...
Data ready. Feature array length: 98174 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
59.74922664044022
[-0.05420434  2.66379938 -1.4967184   0.74164383  0.47030287  0.58489092
  1.02302081  0.47551431  0.38471411  0.95031199  0.01745452  0.78175661]
[-5.42043384e-02  2.66379938e+00 -1.49671840e+00  7.41643830e-01
  4.70302866e-01  5.84890925e-01  1.02302081e+00  4.75514309e-01
  3.84714113e-01  9.50311986e-01  1.74545213e-02  7.81756606e-01
  5.97492266e+01]
Regression Coefficients:
jet_pt_raw -0.05420433839729611
jet_

ML results .csv file closed.
Test and save complete!


Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 157569 

Selecting data from master array...
Data ready. Feature array length: 157569 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
54.138259154335266
[ 1.9095147   3.27086906 -2.29347393  0.63092872  0.08645927  0.51541166
  1.86414408  0.88516572  0.70533546  1.70396529  0.00345192  0.61345708]
[ 1.90951470e+00  3.27086906e+00 -2.29347393e+00  6.30928724e-01
  8.64592674e-02  5.15411661e-01  1.86414408e+00  8.85165724e-01
  7.05335459e-01  1.70396529e+00  3.45192248e-03  6.13457084e-01
  5.41382592e+01]
Regression Coefficients:
jet_pt_raw 1.9095146989542862
jet