## Core Functions
These cells import the necessary libraries, prepare the feature arrays for machine learning, and run the machine learning training and testing.

In [7]:
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from Scripts_Python.ML_Python_Build_FeatureArrays_FromROOT import Build_FeatureArrays_FromROOT
from Scripts_Python.ML_Python_TrainTest import (
    Build_FeatureArrays_FromCSV,
    Write_MLResults_ToCSV,
    Write_MLWeights_ToCSV,
    Train_All_Estimators,
    Train_LinearRegression,
    Train_RandomForestRegression,
    Train_MLPRegression,
    Test_Estimator,
    Test_All_Estimators)
from datetime import datetime



def Build_SelectFeatureArray(
    X_features,
    feature_index
    ) :
    """
    Selects a subset of feature columns from every row of a feature array.

    Args:
        X_features: Sequence of rows; each row must be indexable by the
            entries of feature_index.
        feature_index: Sequence of column indices to keep, in output order.

    Returns:
        A new list of lists. Row i holds X_features[i][k] for each k in
        feature_index. The input array is not modified.
    """

    print("Selecting data from master array...")

    # Picks the requested columns from each row, in the order given
    X_features_select = [
        [row[column] for column in feature_index]
        for row in X_features
        ]

    print("Data ready. Feature array length:", len(X_features_select), "\n")

    return X_features_select

    

def TestAndSave_LinearRegression(
    feature_label,    # Array of labels corresponding to each feature
    feature_index,    # Array of indices for each feature (not used in this function)
    lr_pipeline,      # Trained Linear Regression Pipeline
    lr_coeffs,        # Array of coefficient values (not used in this function)
    X_test_select,    # Array of testing data features
    y_test,           # Array of testing data targets
    sc_test,          # Array of testing data simple corrections
    pt_test_min,      # Float of min pT to test with (exclusive)
    pt_test_max,      # Float of max pT to test with (exclusive)
    output_filename,  # Directory path + name for output csv file
    use_scaler = True # Accepted but not used in this function
    ) :
    """
    Tests a trained pipeline on one pT window and writes the results to csv.

    Only test entries whose target y_test[i] lies strictly between
    pt_test_min and pt_test_max are kept. The kept entries are passed to
    Test_Estimator, and the targets, simple corrections, estimator results,
    features, and feature labels are passed to Write_MLResults_ToCSV.

    Returns:
        None.
    """

    # Positions of the test entries that fall inside the open pT window
    in_window = [
        k for k in range(len(y_test))
        if pt_test_min < y_test[k] < pt_test_max
        ]

    X_test_temp  = [X_test_select[k] for k in in_window]
    y_test_temp  = [y_test[k] for k in in_window]
    sc_test_temp = [sc_test[k] for k in in_window]

    # Tests estimator

    print(type(lr_pipeline))

    lr_results, lr_results_delta = Test_Estimator(
        lr_pipeline, X_test_temp, y_test_temp)

    # Writes outputs to a csv file
    Write_MLResults_ToCSV(
        output_filename, y_test_temp, sc_test_temp,
        lr_results, X_test_temp, feature_label)

    return



def TrainTestPlot_All_Estimators(
    feature_label,    # Array of labels corresponding to each feature
    feature_index,    # Array of indices for each feature used in X_train
    X_train,          # Array of training data features
    y_train,          # Array of training data targets
    sc_train,         # Array of training data simple correction values
    X_test,           # Array of testing data features
    y_test,           # Array of testing data targets
    sc_test,          # Array of testing data simple corrections
    output_file_path, # File path for outputs
    use_scaler = True,
    use_lr = True,
    use_rf = True,
    use_mlp = True,
    ) :
    """
    Trains the enabled estimators on the selected features, then tests them.

    The columns listed in feature_index are selected from X_train and
    X_test. The training selection is passed to Train_All_Estimators with
    the use_* flags, and the resulting pipelines are passed to
    Test_All_Estimators with the testing selection.

    NOTE: sc_train, sc_test, and output_file_path are accepted but not used,
    and the test results are neither returned, written, nor plotted here.

    Returns:
        None.
    """

    # Builds training data set
    print("Selecting training data...")
    X_train_select = [
        [row[column] for column in feature_index]
        for row in X_train
        ]
    print("Training data ready. X/Y length:", len(X_train_select), len(y_train), "\n")

    # Builds pipelines from selected training features
    print("Building estimator pipelines...")
    lr_pipeline, rf_pipeline, mlp_pipeline, lr_coeffs, rf_features = Train_All_Estimators(
        X_train_select, y_train, feature_label,
        use_StandardScaler = use_scaler,
        use_LinearRegression = use_lr,
        use_RandomForest = use_rf,
        use_MLP = use_mlp)
    print("Pipelines built.\n")

    # Builds testing data set with the same columns as the training data
    print("Selecting testing data...")
    X_test_select = [
        [row[column] for column in feature_index]
        for row in X_test
        ]
    print("Testing data ready. X/Y length:", len(X_test_select), len(y_test), "\n")

    # Test estimators
    print("Testing all estimators...")
    lr_results, lr_results_delta, rf_results, rf_results_delta, mlp_results, mlp_results_delta = Test_All_Estimators(
        X_test_select,
        y_test,
        lr_pipeline,
        rf_pipeline,
        mlp_pipeline)
    print("Estimator testing complete!\n")

    return



###########################################
#                                         #
#     DATA PREPARATION - CHANGE BELOW     #
#                                         #
###########################################



file_directory   = "../../Files/Comparison_Test_4/Data/"

# Base names only: the 'Full_' prefix and '_ML_Prep.root' suffix are added below
train_base_name  = "Train_B8_10_90_N500000"
train_bias       = "B8"
train_range      = (10., 90.) # pT min/max of training file

test_base_name   = "Test_B8_10_90_N500000"
test_range       = (10., 90.) # pT min/max of testing file



##### ANYTHING BELOW THIS SHOULDN'T NEED TO CHANGE #####

# Training inputs: ROOT file name/path, tree name, and csv backup path
train_file_name  = f"Full_{train_base_name}_ML_Prep.root"
# train_file_name  = "Full_Train_TopInRange_B8_10_90_N500000_ML_Prep.root"
train_tree_name  = f"ML_{train_base_name}"
train_file_path  = f"{file_directory}{train_file_name}"
train_csv_path   = f"{file_directory}ML_Prep_{train_base_name}_Backup.csv"
# train_csv_path   = file_directory + "ML_Prep_Train_TopInRange_B8_10_90_N500000_Backup.csv"

# Testing inputs, built the same way as the training inputs
test_file_name   = f"Full_{test_base_name}_ML_Prep.root"
test_tree_name   = f"ML_{test_base_name}"
test_file_path   = f"{file_directory}{test_file_name}"
test_csv_path    = f"{file_directory}ML_Prep_{test_base_name}_Backup.csv"

# Builds ML output directories
output_directory = file_directory + "ML_Results/"

try:
    os.mkdir(output_directory)
    print("made 'ML_Results' directory")
except FileExistsError:
    # Only an existing directory is expected here. Other errors (missing
    # parent directory, permissions) propagate instead of being misreported.
    print("directory already exists")

# Loads the feature, target, and simple-correction arrays from the csv backup
# when it exists; otherwise builds them from the ROOT file (the builder is
# also given the csv path)

# Training data
if not os.path.exists(train_csv_path):
    X_train, y_train, sc_train = Build_FeatureArrays_FromROOT(
        train_file_path, train_tree_name, train_csv_path, train_range[0], train_range[1])
else:
    X_train, y_train, sc_train = Build_FeatureArrays_FromCSV(train_csv_path)

# Testing data
if not os.path.exists(test_csv_path):
    X_test, y_test, sc_test = Build_FeatureArrays_FromROOT(
        test_file_path,  test_tree_name,  test_csv_path,  test_range[0],  test_range[1])
else:
    X_test,  y_test,  sc_test  = Build_FeatureArrays_FromCSV(test_csv_path)

# Set Features to train with
# Column index of each feature in the X arrays:
# X_values[
#    0  jet_pt_raw,      1  jet_pt_corr,     2  jet_mass,        3  jet_area, 
#    4  jet_area_err,    5  jet_const_n,     6  const_pt_mean,   7  const_pt_median, 
#    8  const_1_pt,      9  const_2_pt,      10 const_3_pt,      11 const_4_pt,
#    12 const_5_pt,      13 const_6_pt,      14 const_7_pt,      15 const_8_pt,
#    16 const_9_pt,      17 const_10_pt,     18 jet_y,           19 jet_phi,
#    20 jet_rho]

# Each selection is written as (column index, label) pairs so that an index
# and its label stay together; the label and index lists are derived from it.

# Training with 1 feature
_selection_1feat = [
    (0, "jet_pt_raw"),
    ]
feature_label_1feat = [label for _, label in _selection_1feat]
feature_index_1feat = [index for index, _ in _selection_1feat]

# Training with 3 features
_selection_3feat = [
    (0, "jet_pt_raw"), (3, "jet_area"), (20, "jet_rho"),
    ]
feature_label_3feat = [label for _, label in _selection_3feat]
feature_index_3feat = [index for index, _ in _selection_3feat]

# Training with 12 features
_selection_12feat = [
    (0,  "jet_pt_raw"),  (1,  "jet_pt_corr"),   (2,  "jet_mass"),
    (3,  "jet_area"),    (5,  "jet_const_n"),   (6,  "const_pt_mean"),
    (8,  "const_1_pt"),  (9,  "const_2_pt"),    (10, "const_3_pt"),
    (11, "const_4_pt"),  (18, "jet_y"),         (20, "jet_rho"),
    ]
feature_label_12feat = [label for _, label in _selection_12feat]
feature_index_12feat = [index for index, _ in _selection_12feat]



# Timestamp marking the end of data preparation
now = datetime.now()
dt_string = f"{now:%Y/%m/%d %H:%M:%S}"
print("\nReady!", dt_string)

directory already exists
Input file accessed successfully. Output file generated.
Accessing input tree...
Input tree accessed successfully.
Creating .csv backup file...
Backup file started.
Preparing to collect data from TTree...
Jet: 10000 | pTraw: 94.586 | pTcorr:  27.827 | pTtrue:  20.277
Jet: 20000 | pTraw: 129.452 | pTcorr:  79.065 | pTtrue:  67.456
Jet: 30000 | pTraw: 133.441 | pTcorr:  84.407 | pTtrue:  81.975
Jet: 40000 | pTraw: 114.348 | pTcorr:  51.083 | pTtrue:  47.653
Jet: 50000 | pTraw: 80.998 | pTcorr:  24.839 | pTtrue:  22.685
Jet: 60000 | pTraw: 113.554 | pTcorr:  56.293 | pTtrue:  36.947
Jet: 70000 | pTraw: 61.468 | pTcorr:  15.503 | pTtrue:  22.712
Jet: 80000 | pTraw: 156.366 | pTcorr:  104.205 | pTtrue:  77.724
Jet: 90000 | pTraw: 115.208 | pTcorr:  56.829 | pTtrue:  51.045
Jet: 100000 | pTraw: 82.505 | pTcorr:  17.530 | pTtrue:  39.483
Jet: 110000 | pTraw: 131.879 | pTcorr:  70.023 | pTtrue:  78.585
Jet: 120000 | pTraw: 84.898 | pTcorr:  29.525 | pTtrue:  22.154
Jet

Jet: 500000 | pTraw: 50.057 | pTcorr:  3.176 | pTtrue:  11.809
Jet: 510000 | pTraw: 140.771 | pTcorr:  82.600 | pTtrue:  86.668
Jet: 520000 | pTraw: 113.060 | pTcorr:  56.081 | pTtrue:  59.225
Jet: 530000 | pTraw: 107.227 | pTcorr:  36.932 | pTtrue:  29.370
Jet: 540000 | pTraw: 115.388 | pTcorr:  65.942 | pTtrue:  59.544
Jet: 550000 | pTraw: 127.401 | pTcorr:  64.032 | pTtrue:  82.112
Jet: 560000 | pTraw: 61.743 | pTcorr:  3.662 | pTtrue:  11.740
Jet: 570000 | pTraw: 97.437 | pTcorr:  32.444 | pTtrue:  24.643
Jet: 580000 | pTraw: 129.455 | pTcorr:  87.953 | pTtrue:  89.743
Jet: 590000 | pTraw: 50.018 | pTcorr:  8.318 | pTtrue:  13.509
Jet: 600000 | pTraw: 109.059 | pTcorr:  40.736 | pTtrue:  55.587
Jet: 610000 | pTraw: 57.724 | pTcorr: -4.254 | pTtrue:  15.480
Jet: 620000 | pTraw: 63.374 | pTcorr:  17.767 | pTtrue:  26.256
Jet: 630000 | pTraw: 141.039 | pTcorr:  86.182 | pTtrue:  72.406
Jet: 640000 | pTraw: 86.987 | pTcorr:  46.515 | pTtrue:  25.880
Jet: 650000 | pTraw: 69.871 | pTcorr

## Training & Testing
1 Feature: jet_pt_raw ONLY

3 Features: jet_pt_raw, jet_area, jet_rho

12 Features: jet_pt_raw, jet_pt_corr, jet_mass, jet_area, jet_const_n, const_pt_mean, const_1_pt, const_2_pt, const_3_pt, const_4_pt, jet_y, jet_rho

In [8]:
#################################################
#                                               #
#  TRAIN ML ON ONE BIN, WITH 1, 3, 12 FEATURES  #
#                                               #
#################################################

# Builds outputs directories
# output_directory already ends with "/", so no leading slash is added here
output_directory_alt = output_directory + "Test_4GeV_Bins/"

try:
    os.mkdir(output_directory_alt)
    print("made directory")
except FileExistsError:
    # Other errors (missing parent, permissions) propagate instead of being
    # misreported as an existing directory
    print("directory already exists")

# Each plot subdirectory is created on its own, so one that already exists
# does not stop the other from being created
made_subdirectory = False
for plot_subdirectory in ("Plots_Actual/", "Plots_Delta/"):
    try:
        os.mkdir(output_directory_alt + plot_subdirectory)
        made_subdirectory = True
    except FileExistsError:
        pass
if made_subdirectory:
    print("made subdirectories")
else:
    print("directory already exists")

# [min, max] pT ranges to test on
test_min_max_array = [
    [18,22], [28,32], [38,42], [48,52],
    [58,62], [68,72], [78,82]
    ]
# Each entry pairs a list of feature labels with the matching column indices;
# the 1- and 3-feature entries are currently commented out
feature_bundle = [
#     [feature_label_1feat,  feature_index_1feat], 
#     [feature_label_3feat,  feature_index_3feat],
    [feature_label_12feat, feature_index_12feat]
    ]
# This may be implemented later to iterate through multiple training sets
train_bundle = [
    [X_train, y_train, sc_train]
    ]

for feature_set in feature_bundle:
    feature_label, feature_index = feature_set

    # Coefficient labels are the feature labels plus a field for the
    # linear regression y-intercept
    feature_coeff_label = feature_label + ["lr_intercept"]

    # Common prefix of every csv file written for this feature set
    output_csv_name = (
        f"{output_directory_alt}Train_{train_bias}_F{len(feature_label)}"
        f"_{int(train_range[0])}_{int(train_range[1])}"
        )

    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train, feature_index)
    X_test_select  = Build_SelectFeatureArray(X_test, feature_index)

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select, y_train, feature_coeff_label, use_scaler = True)
    print(type(lr_pipeline))

    Write_MLWeights_ToCSV(
        f"{output_csv_name}_LR_Coeffs.csv", lr_coeffs, feature_coeff_label)

    # Tests the single trained estimator on each pT range and saves results
    for min_max in test_min_max_array:

        output = f"\nTesting {len(feature_index)} features on {min_max[0]}-{min_max[1]} GeV..."
        print(output)

        csv_path = f"{output_csv_name}_Test_{int(min_max[0])}_{int(min_max[1])}.csv"

        TestAndSave_LinearRegression(
            feature_label, feature_index, lr_pipeline, lr_coeffs,
            X_test_select, y_test, sc_test,
            min_max[0], min_max[1], csv_path,
            use_scaler = True)

        print("Test and save complete!\n")



#################################################
#                                               #
#  TRAIN ML OVER 20 GeV BINS, USES 12 FEATURES  #
#                                               #
#################################################

# Builds outputs directories
output_directory_alt = output_directory + "Train_20GeV_Bins/"

try:
    os.mkdir(output_directory_alt)
    print("made directory")
except FileExistsError:
    # Other errors (missing parent, permissions) propagate instead of being
    # misreported as an existing directory
    print("directory already exists")

# Each plot subdirectory is created on its own, so one that already exists
# does not stop the other from being created
made_subdirectory = False
for plot_subdirectory in ("Plots_Actual/", "Plots_Delta/"):
    try:
        os.mkdir(output_directory_alt + plot_subdirectory)
        made_subdirectory = True
    except FileExistsError:
        pass
if made_subdirectory:
    print("made subdirectories")
else:
    print("directory already exists")

# [min, max] target pT windows; one estimator is trained and tested per window
training_bundle = [
    [10.,30.], [20.,40.], [30.,50.], [40.,60.],
    [50.,70.], [60.,80.], [70.,90.]
    ]

# ONLY runs 12 features
for training_range in training_bundle:
    train_min = training_range[0]
    train_max = training_range[1]

    output_csv_name = output_directory_alt + "Train_" + train_bias + "_F12_" + str(int(train_min)) + "_" + str(int(train_max))

    # Keeps only training entries whose target lies strictly inside the window
    train_keep = [i for i in range(len(X_train)) if train_min < y_train[i] < train_max]
    X_train_cut  = [X_train[i] for i in train_keep]
    y_train_cut  = [y_train[i] for i in train_keep]
    sc_train_cut = [sc_train[i] for i in train_keep]

    # Applies the same window to the testing entries
    test_keep = [i for i in range(len(X_test)) if train_min < y_test[i] < train_max]
    X_test_cut  = [X_test[i] for i in test_keep]
    y_test_cut  = [y_test[i] for i in test_keep]
    sc_test_cut = [sc_test[i] for i in test_keep]

    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train_cut, feature_index_12feat)
    X_test_select  = Build_SelectFeatureArray(X_test_cut, feature_index_12feat)

    feature_coeff_label = feature_label_12feat.copy()
    feature_coeff_label.append("lr_intercept") # Adds field for linear regression y-intercept

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select,
        y_train_cut,
        feature_coeff_label,
        use_scaler = True)
    print(type(lr_pipeline))

    Write_MLWeights_ToCSV(
        output_csv_name + "_LR_Coeffs.csv",
        lr_coeffs,
        feature_coeff_label
        )

    # Tests estimator and saves results
    # The range is printed as "min-max", matching the other test messages
    output = "\nTesting " + str(len(feature_index_12feat)) + " features on " + str(int(train_min)) + "-" + str(int(train_max)) + " GeV..."
    print(output)

    csv_path = output_csv_name + "_Test_" + str(int(train_min)) + "_" + str(int(train_max)) + ".csv"

    # TestAndSave_LinearRegression applies the same window again, which
    # leaves the already-cut arrays unchanged
    TestAndSave_LinearRegression(
        feature_label_12feat,
        feature_index_12feat,
        lr_pipeline,
        lr_coeffs,
        X_test_select,
        y_test_cut,
        sc_test_cut,
        train_min,
        train_max,
        csv_path,
        use_scaler = True
        )

    print("Test and save complete!\n")



#################################################
#                                               #
#  TRAIN ML OVER 30 GeV BINS, USES 12 FEATURES  #
#                                               #
#################################################

# Builds outputs directories
output_directory_alt = output_directory + "Train_30GeV_Bins/"

try:
    os.mkdir(output_directory_alt)
    print("made directory")
except FileExistsError:
    # Other errors (missing parent, permissions) propagate instead of being
    # misreported as an existing directory
    print("directory already exists")

# Each plot subdirectory is created on its own, so one that already exists
# does not stop the other from being created
made_subdirectory = False
for plot_subdirectory in ("Plots_Actual/", "Plots_Delta/"):
    try:
        os.mkdir(output_directory_alt + plot_subdirectory)
        made_subdirectory = True
    except FileExistsError:
        pass
if made_subdirectory:
    print("made subdirectories")
else:
    print("directory already exists")

# [min, max] target pT windows; one estimator is trained and tested per window
training_bundle = [
    [10.,40.], [20.,50.], [30.,60.], [40.,70.],
    [50.,80.], [60.,90.]
    ]

# ONLY runs 12 features
for training_range in training_bundle:
    train_min, train_max = training_range

    # Common prefix of every csv file written for this window
    output_csv_name = f"{output_directory_alt}Train_{train_bias}_F12_{int(train_min)}_{int(train_max)}"

    # Marks training entries whose target lies strictly inside the window
    train_mask = [train_min < y_train[i] < train_max for i in range(len(X_train))]
    X_train_cut  = [X_train[i] for i, keep in enumerate(train_mask) if keep]
    y_train_cut  = [y_train[i] for i, keep in enumerate(train_mask) if keep]
    sc_train_cut = [sc_train[i] for i, keep in enumerate(train_mask) if keep]

    # Marks testing entries with the same window
    test_mask = [train_min < y_test[i] < train_max for i in range(len(X_test))]
    X_test_cut  = [X_test[i] for i, keep in enumerate(test_mask) if keep]
    y_test_cut  = [y_test[i] for i, keep in enumerate(test_mask) if keep]
    sc_test_cut = [sc_test[i] for i, keep in enumerate(test_mask) if keep]

    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train_cut, feature_index_12feat)
    X_test_select  = Build_SelectFeatureArray(X_test_cut, feature_index_12feat)

    # Coefficient labels are the feature labels plus a field for the
    # linear regression y-intercept
    feature_coeff_label = feature_label_12feat + ["lr_intercept"]

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select, y_train_cut, feature_coeff_label, use_scaler = True)
    print(type(lr_pipeline))

    Write_MLWeights_ToCSV(
        f"{output_csv_name}_LR_Coeffs.csv", lr_coeffs, feature_coeff_label)

    # Tests estimator and saves results
    output = f"\nTesting {len(feature_index_12feat)} features on {int(train_min)}-{int(train_max)} GeV..."
    print(output)

    csv_path = f"{output_csv_name}_Test_{int(train_min)}_{int(train_max)}.csv"

    TestAndSave_LinearRegression(
        feature_label_12feat, feature_index_12feat, lr_pipeline, lr_coeffs,
        X_test_select, y_test_cut, sc_test_cut,
        train_min, train_max, csv_path,
        use_scaler = True)

    print("Test and save complete!\n")

    
    
# Timestamp marking the end of all training and testing
now = datetime.now()
dt_string = f"{now:%Y/%m/%d %H:%M:%S}"
print("\nComplete!", dt_string)

made directory
made subdirectories

Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 753551 

Selecting data from master array...
Data ready. Feature array length: 661476 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
49.00813326631894
[ 2.06653645e+01  5.19999535e+00  1.11085701e+00 -4.31725440e-01
 -8.18801576e+00 -3.24574353e-01  2.13031202e-02  1.24057552e-01
  1.96540253e-01  1.98312957e-01  6.64943849e-03 -1.27301071e-01]
[ 2.06653645e+01  5.19999535e+00  1.11085701e+00 -4.31725440e-01
 -8.18801576e+00 -3.24574353e-01  2.13031202e-02  1.24057552e-01
  1.96540253e-01  1.98312957e-01  6.64943849e-03 -1.27301071e-01
  4.90081333e+01]
Regression Coefficients:
jet_

ML results .csv file closed.
Test and save complete!


Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 177226 

Selecting data from master array...
Data ready. Feature array length: 98214 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
60.20660910193599
[ 4.26399604  0.37849419 -0.21679293 -0.22307307 -2.48912173 -0.09817032
 -0.01362956  0.01707158 -0.00636222  0.12206884  0.0187489  -0.29428055]
[ 4.26399604e+00  3.78494193e-01 -2.16792935e-01 -2.23073068e-01
 -2.48912173e+00 -9.81703157e-02 -1.36295596e-02  1.70715775e-02
 -6.36222477e-03  1.22068845e-01  1.87489028e-02 -2.94280550e-01
  6.02066091e+01]
Regression Coefficients:
jet_pt_raw 4.263996039969224
jet_pt

ML results .csv file closed.
Test and save complete!


Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 258145 

Selecting data from master array...
Data ready. Feature array length: 157452 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
55.44874259871741
[ 7.63865100e+00  6.65866524e-01 -4.87416263e-01 -4.17257438e-01
 -3.87467537e+00 -1.63806839e-01  6.80916002e-03  3.51754627e-02
  3.39622551e-02  1.51001987e-01  9.08179871e-03 -4.63204388e-01]
[ 7.63865100e+00  6.65866524e-01 -4.87416263e-01 -4.17257438e-01
 -3.87467537e+00 -1.63806839e-01  6.80916002e-03  3.51754627e-02
  3.39622551e-02  1.51001987e-01  9.08179871e-03 -4.63204388e-01
  5.54487426e+01]
Regression