## Core Functions
These cells import the necessary libraries, prepare the feature arrays for machine learning, and run the machine-learning training and testing.

In [1]:
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from Scripts_Python.ML_Python_Build_FeatureArrays_FromROOT import Build_FeatureArrays_FromROOT
from Scripts_Python.ML_Python_TrainTest import (
    Build_FeatureArrays_FromCSV,
    Write_MLResults_ToCSV,
    Write_MLWeights_ToCSV,
    Train_All_Estimators,
    Train_LinearRegression,
    Train_RandomForestRegression,
    Train_MLPRegression,
    Test_Estimator,
    Test_All_Estimators)
from datetime import datetime



def Build_SelectFeatureArray(
    X_features,
    feature_index
    ) :
    """
    Picks a subset of feature columns out of every row of the master array.

    X_features    : Sequence of rows; each row is indexable by feature position
    feature_index : Sequence of column positions to keep, in output order

    Returns a new list of lists with one entry per input row, each holding
    only the requested columns. A progress message is printed before and
    after the selection.
    """

    print("Selecting data from master array...")

    # One output row per input row, columns taken in feature_index order
    X_features_select = [
        [row[column] for column in feature_index]
        for row in X_features
        ]

    print("Data ready. Feature array length:", len(X_features_select), "\n")

    return X_features_select

    

def TestAndSave_LinearRegression(
    feature_label,    # Array of labels corresponding to each feature
    feature_index,    # Array of indices for each feature used in X_train
    lr_pipeline,      # Trained Linear Regression Pipeline
    lr_coeffs,        # Array of coefficient values from trained linear regression pipeline
    X_test_select,    # Array of testing data features
    y_test,           # Array of testing data targets
    sc_test,          # Array of testing data simple corrections
    pt_test_min,      # Float of min pT to test with
    pt_test_max,      # Float of max pT to test with
    output_filename,  # Directory path + name for output csv file
    use_scaler = True # If true, rescales data
    ) :
    """
    Tests a trained linear regression pipeline on one pT window and writes
    the results to a csv file.

    Only test entries whose target lies strictly between pt_test_min and
    pt_test_max are kept. The kept features and targets are handed to
    Test_Estimator, and the predictions are written with
    Write_MLResults_ToCSV together with the kept targets, simple
    corrections, features and the feature labels.

    feature_index, lr_coeffs and use_scaler are accepted but not read by
    this function. Returns None.
    """

    # Positions of the test entries whose target is inside the open window
    kept = [
        i for i, pt_true in enumerate(y_test)
        if pt_true > pt_test_min and pt_true < pt_test_max
        ]

    X_test_temp  = [X_test_select[i] for i in kept]
    y_test_temp  = [y_test[i] for i in kept]
    sc_test_temp = [sc_test[i] for i in kept]

    # Tests estimator

    print(type(lr_pipeline))

    # Second return value of Test_Estimator is not used here
    lr_results, _ = Test_Estimator(lr_pipeline, X_test_temp, y_test_temp)

    # Writes outputs to a csv file
    Write_MLResults_ToCSV(
        output_filename,
        y_test_temp,
        sc_test_temp,
        lr_results,
        X_test_temp,
        feature_label)

    return



def TrainTestPlot_All_Estimators(
    feature_label,    # Array of labels corresponding to each feature
    feature_index,    # Array of indices for each feature used in X_train
    X_train,          # Array of training data features
    y_train,          # Array of training data targets
    sc_train,         # Array of training data simple correction values
    X_test,           # Array of testing data features
    y_test,           # Array of testing data targets
    sc_test,          # Array of testing data simple corrections
    output_file_path, # File path for outputs
    use_scaler = True,
    use_lr = True,
    use_rf = True,
    use_mlp = True,
    ) :
    """
    Selects the requested feature columns, trains the enabled estimators on
    the training set and tests the resulting pipelines on the testing set.

    The columns listed in feature_index are taken from every row of X_train
    and X_test. Training is delegated to Train_All_Estimators (use_scaler,
    use_lr, use_rf and use_mlp are forwarded as its use_StandardScaler,
    use_LinearRegression, use_RandomForest and use_MLP options) and testing
    to Test_All_Estimators.

    sc_train, sc_test and output_file_path are accepted but not read, and
    the training/testing results are currently discarded: the function only
    prints progress messages and returns None.
    """

    # Builds training data set
    print("Selecting training data...")
    X_train_select = [
        [row[column] for column in feature_index]
        for row in X_train
        ]
    print("Training data ready. X/Y length:", len(X_train_select), len(y_train), "\n")

    # Builds pipelines from selected training features
    print("Building estimator pipelines...")
    lr_pipeline, rf_pipeline, mlp_pipeline, lr_coeffs, rf_features = Train_All_Estimators(
        X_train_select, y_train, feature_label,
        use_StandardScaler = use_scaler,
        use_LinearRegression = use_lr,
        use_RandomForest = use_rf,
        use_MLP = use_mlp)
    print("Pipelines built.\n")

    # Builds testing data set with the same columns as the training data
    print("Selecting testing data...")
    X_test_select = [
        [row[column] for column in feature_index]
        for row in X_test
        ]
    print("Testing data ready. X/Y length:", len(X_test_select), len(y_test), "\n")

    # Test estimators
    print("Testing all estimators...")
    lr_results, lr_results_delta, rf_results, rf_results_delta, mlp_results, mlp_results_delta = Test_All_Estimators(
        X_test_select,
        y_test,
        lr_pipeline,
        rf_pipeline,
        mlp_pipeline)
    print("Estimator testing complete!\n")

    return



###########################################
#                                         #
#     DATA PREPARATION - CHANGE BELOW     #
#                                         #
###########################################



file_directory   = "../../Files/Comparison_Trial2/Data/"

train_base_name  = "Train_B8_10_90_N500000" # Cut off 'ML_Prep_' and '.root' parts of input file names
train_bias       = "B8"
train_range      = (10., 90.) # pT min/max of training file

# NOTE(review): the testing base name is currently identical to the training
# base name, so the same file is used for training and testing - confirm intended.
test_base_name   = "Train_B8_10_90_N500000" # Cut off 'ML_Prep_' and '.root' parts of input file names
test_range       = (10., 90.) # pT min/max of testing file



##### ANYTHING BELOW THIS SHOULDN'T NEED TO CHANGE #####

train_file_name  = "ML_Prep_" + train_base_name + ".root"
train_tree_name  = "Jet_ML_" + train_base_name
train_file_path  = file_directory + train_file_name
train_csv_path   = file_directory + "ML_Prep_" + train_base_name + "_Backup.csv"

test_file_name   = "ML_Prep_" + test_base_name + ".root"
test_tree_name   = "Jet_ML_" + test_base_name
test_file_path   = file_directory + test_file_name
# The testing backup is named after the testing file, not the training file
test_csv_path    = file_directory + "ML_Prep_" + test_base_name + "_Backup.csv"

# Builds ML output directories
output_directory = file_directory + "ML_Results/"

# Only an already-existing directory is treated as harmless; any other
# failure (e.g. missing permissions) is raised instead of being misreported.
try:
    os.makedirs(output_directory)
except FileExistsError:
    print("directory already exists")
else:
    print("made 'ML_Results' directory")

# Loads feature and target arrays from the csv backup if it exists,
# otherwise builds them from the ROOT file (the csv path is passed along,
# presumably so the backup gets written - confirm in Build_FeatureArrays_FromROOT)

# Training data
if os.path.exists(train_csv_path):
    X_train, y_train, sc_train = Build_FeatureArrays_FromCSV(train_csv_path)
else:
    X_train, y_train, sc_train = Build_FeatureArrays_FromROOT(
        train_file_path, train_tree_name, train_csv_path, train_range[0], train_range[1])

# Testing data (checks for the testing backup, not the training one)
if os.path.exists(test_csv_path):
    X_test,  y_test,  sc_test  = Build_FeatureArrays_FromCSV(test_csv_path)
else:
    X_test, y_test, sc_test = Build_FeatureArrays_FromROOT(
        test_file_path,  test_tree_name,  test_csv_path,  test_range[0],  test_range[1])

# Set Features to train with
# Names of the X_values columns; the position in this list is the feature index
_x_value_names = [
    "jet_pt_raw",       # 0
    "jet_pt_corr",      # 1
    "jet_mass",         # 2
    "jet_area",         # 3
    "jet_area_err",     # 4
    "jet_const_n",      # 5
    "const_pt_mean",    # 6
    "const_pt_median",  # 7
    "const_1_pt",       # 8
    "const_2_pt",       # 9
    "const_3_pt",       # 10
    "const_4_pt",       # 11
    "const_5_pt",       # 12
    "const_6_pt",       # 13
    "const_7_pt",       # 14
    "const_8_pt",       # 15
    "const_9_pt",       # 16
    "const_10_pt",      # 17
    "jet_y",            # 18
    "jet_phi",          # 19
    "jet_rho",          # 20
    ]

# Each feature set is a list of column indices plus the matching labels,
# looked up from the table above so the two always stay in step.

# Training with 1 feature
feature_index_1feat = [0]
feature_label_1feat = [_x_value_names[i] for i in feature_index_1feat]

# Training with 3 features
feature_index_3feat = [0, 3, 20]
feature_label_3feat = [_x_value_names[i] for i in feature_index_3feat]

# Training with 12 features
feature_index_12feat = [0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 18, 20]
feature_label_12feat = [_x_value_names[i] for i in feature_index_12feat]



# Timestamp marking the end of the data preparation
now = datetime.now()
dt_string = "{:%Y/%m/%d %H:%M:%S}".format(now)
print("\nReady!", dt_string)

Welcome to JupyROOT 6.26/04
directory already exists
Preparing to collect data from csv backup file...
Jet: 10000 | pTraw: 69.039 | pTcorr:  12.499 | pTtrue:  32.134
Jet: 20000 | pTraw: 127.286 | pTcorr:  75.146 | pTtrue:  62.960
Jet: 30000 | pTraw: 88.468 | pTcorr:  44.166 | pTtrue:  41.243
Jet: 40000 | pTraw: 108.331 | pTcorr:  42.789 | pTtrue:  67.764
Jet: 50000 | pTraw: 56.171 | pTcorr:  21.615 | pTtrue:  24.684
Jet: 60000 | pTraw: 116.375 | pTcorr:  59.762 | pTtrue:  65.163
Jet: 70000 | pTraw: 77.024 | pTcorr:  25.210 | pTtrue:  16.547
Jet: 80000 | pTraw: 82.198 | pTcorr:  24.789 | pTtrue:  20.457
Jet: 90000 | pTraw: 6.586 | pTcorr:  4.330 | pTtrue:  30.058
Jet: 100000 | pTraw: 5.084 | pTcorr: -0.270 | pTtrue:  41.968
Jet: 110000 | pTraw: 147.600 | pTcorr:  84.399 | pTtrue:  83.597
Jet: 120000 | pTraw: 101.184 | pTcorr:  52.551 | pTtrue:  36.231
Jet: 130000 | pTraw: 42.130 | pTcorr: -0.521 | pTtrue:  11.632
Jet: 140000 | pTraw: 69.807 | pTcorr:  17.179 | pTtrue:  10.456
Jet: 15000

Jet: 310000 | pTraw: 74.655 | pTcorr:  25.022 | pTtrue:  24.154
Jet: 320000 | pTraw: 69.683 | pTcorr:  23.111 | pTtrue:  36.670
Jet: 330000 | pTraw: 93.152 | pTcorr:  37.485 | pTtrue:  19.547
Jet: 340000 | pTraw: 7.790 | pTcorr: -2.278 | pTtrue:  12.054
Jet: 350000 | pTraw: 8.535 | pTcorr:  3.619 | pTtrue:  14.253
Jet: 360000 | pTraw: 61.232 | pTcorr:  16.374 | pTtrue:  21.734
Jet: 370000 | pTraw: 82.290 | pTcorr:  36.779 | pTtrue:  36.992
Jet: 380000 | pTraw: 44.184 | pTcorr:  19.203 | pTtrue:  17.229
Jet: 390000 | pTraw: 95.230 | pTcorr:  33.253 | pTtrue:  27.909
Jet: 400000 | pTraw: 26.673 | pTcorr: -3.252 | pTtrue:  26.602
Jet: 410000 | pTraw: 92.897 | pTcorr:  34.061 | pTtrue:  45.306
Jet: 420000 | pTraw: 95.345 | pTcorr:  45.336 | pTtrue:  33.139
Jet: 430000 | pTraw: 61.067 | pTcorr:  13.748 | pTtrue:  14.365
Jet: 440000 | pTraw: 83.100 | pTcorr:  32.265 | pTtrue:  41.430
Jet: 450000 | pTraw: 105.815 | pTcorr:  56.882 | pTtrue:  74.527
Jet: 460000 | pTraw: 64.682 | pTcorr:  15.29

## Training & Testing
1 Feature: jet_pt_raw ONLY

3 Features: jet_pt_raw, jet_area, jet_rho

12 Features: jet_pt_raw, jet_pt_corr, jet_mass, jet_area, jet_const_n, const_pt_mean, const_1_pt, const_2_pt, const_3_pt, const_4_pt, jet_y, jet_rho

In [9]:
#################################################
#                                               #
#  TRAIN ML ON ONE BIN, WITH 1, 3, 12 FEATURES  #
#                                               #
#################################################

# Builds outputs directories
output_directory = file_directory + "ML_Results/Test_4GeV_Bins/"

# Only "already exists" is tolerated; other failures are raised rather than
# being reported as an existing directory.
try:
    os.makedirs(output_directory)
except FileExistsError:
    print("directory already exists")
else:
    print("made directory")

# Each subdirectory is created on its own, so an existing 'Plots_Actual/'
# no longer prevents 'Plots_Delta/' from being created.
made_subdirectory = False
for subdirectory in ("Plots_Actual/", "Plots_Delta/"):
    try:
        os.mkdir(output_directory + subdirectory)
    except FileExistsError:
        continue
    made_subdirectory = True
if made_subdirectory:
    print("made subdirectories")
else:
    print("directory already exists")

test_min_max_array = [  # Array of min and max for pT ranges to test on
    [18,22], [28,32], [38,42], [48,52],
    [58,62], [68,72], [78,82]
    ]
feature_bundle = [
#     [feature_label_1feat,  feature_index_1feat],
#     [feature_label_3feat,  feature_index_3feat],
    [feature_label_12feat, feature_index_12feat]
    ]
train_bundle = [ # This may be implemented later to iterate through multiple training sets
    [X_train, y_train, sc_train]
    ]

# One estimator is trained per feature set on the full training data and
# then tested separately on every pT window in test_min_max_array.
for feature_label, feature_index in feature_bundle:

    output_csv_name = (
        output_directory + "Train_" + train_bias
        + "_F" + str(len(feature_label))
        + "_" + str(int(train_range[0])) + "_" + str(int(train_range[1])))

    # Coefficient labels are the feature labels plus a field for the
    # linear regression y-intercept (built as a new list, labels untouched)
    feature_coeff_label = feature_label + ["lr_intercept"]

    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train, feature_index)
    X_test_select  = Build_SelectFeatureArray(X_test, feature_index)

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select,
        y_train,
        feature_coeff_label,
        use_scaler = True)
    print(type(lr_pipeline))

    Write_MLWeights_ToCSV(
        output_csv_name + "_LR_Coeffs.csv",
        lr_coeffs,
        feature_coeff_label
        )

    # Tests estimator and saves results, one csv file per pT window
    for pt_min, pt_max in test_min_max_array:

        output = "\nTesting " + str(len(feature_index)) + " features on " + str(pt_min) + "-" + str(pt_max) + " GeV..."
        print(output)

        csv_path = output_csv_name + "_Test_" + str(int(pt_min)) + "_" + str(int(pt_max)) + ".csv"

        TestAndSave_LinearRegression(
            feature_label,
            feature_index,
            lr_pipeline,
            lr_coeffs,
            X_test_select,
            y_test,
            sc_test,
            pt_min,
            pt_max,
            csv_path,
            use_scaler = True
            )

        print("Test and save complete!\n")



#################################################
#                                               #
#  TRAIN ML OVER 20 GeV BINS, USES 12 FEATURES  #
#                                               #
#################################################

# Builds outputs directories
output_directory = file_directory + "ML_Results/Train_20GeV_Bins/"

# Only "already exists" is tolerated; other failures are raised rather than
# being reported as an existing directory.
try:
    os.makedirs(output_directory)
except FileExistsError:
    print("directory already exists")
else:
    print("made directory")

# Each subdirectory is created on its own, so an existing 'Plots_Actual/'
# no longer prevents 'Plots_Delta/' from being created.
made_subdirectory = False
for subdirectory in ("Plots_Actual/", "Plots_Delta/"):
    try:
        os.mkdir(output_directory + subdirectory)
    except FileExistsError:
        continue
    made_subdirectory = True
if made_subdirectory:
    print("made subdirectories")
else:
    print("directory already exists")

# Overlapping 20 GeV wide training windows, as [min, max] of the target pT
training_bundle = [
    [10.,30.], [20.,40.], [30.,50.], [40.,60.],
    [50.,70.], [60.,80.], [70.,90.]
    ]

# ONLY runs 12 features
for train_min, train_max in training_bundle:

    output_csv_name = output_directory + "Train_" + train_bias + "_F12_" + str(int(train_min)) + "_" + str(int(train_max))

    # Keeps only training entries whose target is strictly inside the window
    train_keep = [
        i for i in range(len(X_train))
        if (y_train[i] > train_min) and (y_train[i] < train_max)
        ]
    X_train_cut  = [X_train[i] for i in train_keep]
    y_train_cut  = [y_train[i] for i in train_keep]
    sc_train_cut = [sc_train[i] for i in train_keep]

    # Same cut on the testing data: each estimator is tested on its own window
    test_keep = [
        i for i in range(len(X_test))
        if (y_test[i] > train_min) and (y_test[i] < train_max)
        ]
    X_test_cut  = [X_test[i] for i in test_keep]
    y_test_cut  = [y_test[i] for i in test_keep]
    sc_test_cut = [sc_test[i] for i in test_keep]

    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train_cut, feature_index_12feat)
    X_test_select  = Build_SelectFeatureArray(X_test_cut, feature_index_12feat)

    # Coefficient labels are the feature labels plus a field for the
    # linear regression y-intercept
    feature_coeff_label = feature_label_12feat + ["lr_intercept"]

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select,
        y_train_cut,
        feature_coeff_label,
        use_scaler = True)
    print(type(lr_pipeline))

    Write_MLWeights_ToCSV(
        output_csv_name + "_LR_Coeffs.csv",
        lr_coeffs,
        feature_coeff_label
        )

    # Tests estimator and saves results
    output = "\nTesting " + str(len(feature_index_12feat)) + " features on " + str(int(train_min)) + "-" + str(int(train_max)) + " GeV..."
    print(output)

    csv_path = output_csv_name + "_Test_" + str(int(train_min)) + "_" + str(int(train_max)) + ".csv"

    # The window is passed again as the test range; the data is already cut
    # to it, so the filter inside TestAndSave_LinearRegression keeps everything
    TestAndSave_LinearRegression(
        feature_label_12feat,
        feature_index_12feat,
        lr_pipeline,
        lr_coeffs,
        X_test_select,
        y_test_cut,
        sc_test_cut,
        train_min,
        train_max,
        csv_path,
        use_scaler = True
        )

    print("Test and save complete!\n")



#################################################
#                                               #
#  TRAIN ML OVER 30 GeV BINS, USES 12 FEATURES  #
#                                               #
#################################################

# Builds outputs directories
output_directory = file_directory + "ML_Results/Train_30GeV_Bins/"

# Only "already exists" is tolerated; other failures are raised rather than
# being reported as an existing directory.
try:
    os.makedirs(output_directory)
except FileExistsError:
    print("directory already exists")
else:
    print("made directory")

# Each subdirectory is created on its own, so an existing 'Plots_Actual/'
# no longer prevents 'Plots_Delta/' from being created.
made_subdirectory = False
for subdirectory in ("Plots_Actual/", "Plots_Delta/"):
    try:
        os.mkdir(output_directory + subdirectory)
    except FileExistsError:
        continue
    made_subdirectory = True
if made_subdirectory:
    print("made subdirectories")
else:
    print("directory already exists")

# Overlapping 30 GeV wide training windows, as [min, max] of the target pT
training_bundle = [
    [10.,40.], [20.,50.], [30.,60.], [40.,70.],
    [50.,80.], [60.,90.]
    ]

# ONLY runs 12 features
for train_min, train_max in training_bundle:

    output_csv_name = output_directory + "Train_" + train_bias + "_F12_" + str(int(train_min)) + "_" + str(int(train_max))

    # Keeps only training entries whose target is strictly inside the window
    train_keep = [
        i for i in range(len(X_train))
        if (y_train[i] > train_min) and (y_train[i] < train_max)
        ]
    X_train_cut  = [X_train[i] for i in train_keep]
    y_train_cut  = [y_train[i] for i in train_keep]
    sc_train_cut = [sc_train[i] for i in train_keep]

    # Same cut on the testing data: each estimator is tested on its own window
    test_keep = [
        i for i in range(len(X_test))
        if (y_test[i] > train_min) and (y_test[i] < train_max)
        ]
    X_test_cut  = [X_test[i] for i in test_keep]
    y_test_cut  = [y_test[i] for i in test_keep]
    sc_test_cut = [sc_test[i] for i in test_keep]

    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train_cut, feature_index_12feat)
    X_test_select  = Build_SelectFeatureArray(X_test_cut, feature_index_12feat)

    # Coefficient labels are the feature labels plus a field for the
    # linear regression y-intercept
    feature_coeff_label = feature_label_12feat + ["lr_intercept"]

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select,
        y_train_cut,
        feature_coeff_label,
        use_scaler = True)
    print(type(lr_pipeline))

    Write_MLWeights_ToCSV(
        output_csv_name + "_LR_Coeffs.csv",
        lr_coeffs,
        feature_coeff_label
        )

    # Tests estimator and saves results
    output = "\nTesting " + str(len(feature_index_12feat)) + " features on " + str(int(train_min)) + "-" + str(int(train_max)) + " GeV..."
    print(output)

    csv_path = output_csv_name + "_Test_" + str(int(train_min)) + "_" + str(int(train_max)) + ".csv"

    # The window is passed again as the test range; the data is already cut
    # to it, so the filter inside TestAndSave_LinearRegression keeps everything
    TestAndSave_LinearRegression(
        feature_label_12feat,
        feature_index_12feat,
        lr_pipeline,
        lr_coeffs,
        X_test_select,
        y_test_cut,
        sc_test_cut,
        train_min,
        train_max,
        csv_path,
        use_scaler = True
        )

    print("Test and save complete!\n")



# Timestamp marking the end of all training and testing
now = datetime.now()
dt_string = now.strftime("%Y/%m/%d %H:%M:%S")
print("\nComplete!", dt_string)

directory already exists

Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 961713 

Selecting data from master array...
Data ready. Feature array length: 961713 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
36.38890792680406
[ 2.37647474e+01  9.49338713e-01  1.04491241e+01 -8.32120399e+00
 -1.57317687e+01  3.12398618e+00 -5.56442947e-01 -2.34457912e-01
  7.27674342e-02 -8.43756125e-01  9.15854183e-03 -1.47858449e+00]
[ 2.37647474e+01  9.49338713e-01  1.04491241e+01 -8.32120399e+00
 -1.57317687e+01  3.12398618e+00 -5.56442947e-01 -2.34457912e-01
  7.27674342e-02 -8.43756125e-01  9.15854183e-03 -1.47858449e+00
  3.63889079e+01]
Regression Coefficients:
jet_pt_raw 23.

ML results .csv file closed.
Test and save complete!


Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 140244 

Selecting data from master array...
Data ready. Feature array length: 140244 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
59.7229005676726
[ 4.52586157  0.16175359  1.8705025  -2.38066498 -3.45589565  0.50470651
 -0.20595634 -0.14137604 -0.04115627 -0.22463318  0.03003231 -0.2695141 ]
[ 4.52586157e+00  1.61753593e-01  1.87050250e+00 -2.38066498e+00
 -3.45589565e+00  5.04706512e-01 -2.05956339e-01 -1.41376042e-01
 -4.11562728e-02 -2.24633177e-01  3.00323109e-02 -2.69514100e-01
  5.97229006e+01]
Regression Coefficients:
jet_pt_raw 4.525861566265633
jet_pt


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
43.44102281371701
[ 7.80740948e+00  9.99704944e-01  5.52467871e+00 -4.04424117e+00
 -8.45199869e+00  1.00985133e+00 -3.69864098e-01 -2.17309070e-01
  5.16646782e-05 -3.60107199e-01 -8.16831326e-03 -3.75035838e-01]
[ 7.80740948e+00  9.99704944e-01  5.52467871e+00 -4.04424117e+00
 -8.45199869e+00  1.00985133e+00 -3.69864098e-01 -2.17309070e-01
  5.16646782e-05 -3.60107199e-01 -8.16831326e-03 -3.75035838e-01
  4.34410228e+01]
Regression Coefficients:
jet_pt_raw 7.807409480148057
jet_pt_corr 0.9997049439942503
jet_mass 5.524678705401544
jet_area -4.044241171237165
jet_const_n -8.451998692249939
const_pt_mean 1.0098513295080036
const_1_pt -0.3698640979115826
const_2_pt -0.21730907030104807
const_3_pt 5.1664678205422003e-05
const_4_pt -0.3601071991298382
jet_y -0.008168313262793832
jet_rho -0.37503583812289654
lr_intercept 43.44102281371701
<class 'sklearn