## Core Functions
These cells import the necessary libraries, prepare the feature arrays for machine learning, and run the machine-learning training and testing.

In [2]:
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from Scripts_Python.ML_Python_Build_FeatureArrays_FromROOT import (
    Build_FeatureArrays_FromROOT,
    Build_FeatureArrays_FromROOT_ByEvent)
from Scripts_Python.ML_Python_TrainTest import (
    Build_FeatureArrays_FromCSV,
    Write_MLResults_ToCSV,
    Write_MLWeights_ToCSV,
    Train_All_Estimators,
    Train_LinearRegression,
    Train_RandomForestRegression,
    Train_MLPRegression,
    Test_Estimator,
    Test_All_Estimators)
from datetime import datetime



def Build_SelectFeatureArray(
    X_features,
    feature_index
    ) :
    """
    Picks a subset of columns out of every row of a feature array.

    X_features    : indexable collection of rows; each row is indexable by
                    the values held in feature_index
    feature_index : positions of the features to keep, in output order

    Returns a new list with one list per row of X_features, holding that
    row's values at the positions in feature_index. Prints a status line
    before and after the selection.
    """

    print("Selecting data from master array...")

    # Rows are fetched by position, one at a time, in their original order
    rows = (X_features[row_number] for row_number in range(len(X_features)))
    X_features_select = [
        [row[column] for column in feature_index]
        for row in rows
        ]

    print("Data ready. Feature array length:", len(X_features_select), "\n")

    return X_features_select

    

def TestAndSave_LinearRegression(
    feature_label,    # Array of labels corresponding to each feature
    feature_index,    # Array of indices for each feature used in X_train
    lr_pipeline,      # Trained Linear Regression Pipeline
    lr_coeffs,        # Array of coefficient values from trained linear regression pipeline
    X_test_select,    # Array of testing data features
    y_test,           # Array of testing data targets
    sc_test,          # Array of testing data simple corrections
    pt_test_min,      # Float of min pT to test with
    pt_test_max,      # Float of max pT to test with
    output_filename,  # Directory path + name for output csv file
    use_scaler = True # If true, rescales data
    ) :
    """
    Tests a trained pipeline on the events inside a target window and
    writes the outcome to a csv file.

    Only events whose target lies strictly between pt_test_min and
    pt_test_max are kept. The kept features and targets are passed to
    Test_Estimator, and the first value it returns is handed to
    Write_MLResults_ToCSV together with the kept targets, simple
    corrections, features, and the feature labels.

    feature_index, lr_coeffs and use_scaler are accepted but not read here.
    Returns None.
    """

    # Positions of the events that fall inside the open window
    in_window = [
        position for position in range(len(y_test))
        if y_test[position] > pt_test_min and y_test[position] < pt_test_max
        ]

    X_test_temp  = [X_test_select[position] for position in in_window]
    y_test_temp  = [y_test[position] for position in in_window]
    sc_test_temp = [sc_test[position] for position in in_window]

    # Tests estimator
    print(type(lr_pipeline))

    # Second returned value is not used by this function
    lr_results, _unused_delta = Test_Estimator(lr_pipeline, X_test_temp, y_test_temp)

    # Writes outputs to a csv file
    Write_MLResults_ToCSV(
        output_filename, y_test_temp, sc_test_temp,
        lr_results, X_test_temp, feature_label)

    return



def TrainTestPlot_All_Estimators(
    feature_label,    # Array of labels corresponding to each feature
    feature_index,    # Array of indices for each feature used in X_train
    X_train,          # Array of training data features
    y_train,          # Array of training data targets
    sc_train,         # Array of training data simple correction values
    X_test,           # Array of testing data features
    y_test,           # Array of testing data targets
    sc_test,          # Array of testing data simple corrections
    output_file_path, # File path for outputs
    use_scaler = True,
    use_lr = True,
    use_rf = True,
    use_mlp = True,
    ) :
    """
    Trains the enabled estimators on the selected features, then tests them.

    The columns listed in feature_index are picked out of every row of
    X_train and X_test. The selected training rows go to
    Train_All_Estimators, with use_scaler, use_lr, use_rf and use_mlp
    forwarded as use_StandardScaler, use_LinearRegression,
    use_RandomForest and use_MLP. The returned pipelines are then run on
    the selected testing rows by Test_All_Estimators.

    sc_train, sc_test and output_file_path are accepted but not read here,
    and the test results are not saved, plotted, or returned.
    Returns None.
    """

    # Builds training data set
    print("Selecting training data...")
    X_train_select = [[event[k] for k in feature_index] for event in X_train]
    print("Training data ready. X/Y length:", len(X_train_select), len(y_train), "\n")

    # Builds pipelines from selected training features
    print("Building estimator pipelines...")
    lr_pipeline, rf_pipeline, mlp_pipeline, lr_coeffs, rf_features = Train_All_Estimators(
        X_train_select, y_train, feature_label,
        use_StandardScaler = use_scaler,
        use_LinearRegression = use_lr,
        use_RandomForest = use_rf,
        use_MLP = use_mlp)
    print("Pipelines built.\n")

    # Builds testing data set with the same feature columns
    print("Selecting testing data...")
    X_test_select = [[event[k] for k in feature_index] for event in X_test]
    print("Testing data ready. X/Y length:", len(X_test_select), len(y_test), "\n")

    # Test estimators
    print("Testing all estimators...")
    lr_results, lr_results_delta, rf_results, rf_results_delta, mlp_results, mlp_results_delta = Test_All_Estimators(
        X_test_select,
        y_test,
        lr_pipeline,
        rf_pipeline,
        mlp_pipeline)
    print("Estimator testing complete!\n")

    return



###########################################
#                                         #
#     DATA PREPARATION - CHANGE BELOW     #
#                                         #
###########################################



file_directory   = "../../Files/Joey_Data/Data/"

train_base_name  = "Train_B8_10_90_N500000" # Cut off 'ML_Prep_' and '.root' parts of input file names
train_bias       = "B8"
train_range      = (10., 90.) # pT min/max of training file

test_base_name   = "Train_B8_10_90_N500000" # Cut off 'ML_Prep_' and '.root' parts of input file names
test_range       = (10., 90.) # pT min/max of testing file



##### ANYTHING BELOW THIS SHOULDN'T NEED TO CHANGE #####

# NOTE(review): the .root file names below are hard-coded and are not built
# from train_base_name / test_base_name - confirm this is intended.
train_file_name  = "ML_Prep_10_90_Train8.root"
train_tree_name  = "Tree_Tree"
train_file_path  = file_directory + train_file_name
train_csv_path   = file_directory + "ML_Prep_" + train_base_name + "_Backup.csv"

test_file_name   = "ML_Prep_10_90_Train8.root"
test_tree_name   = "Tree_Tree"
test_file_path   = file_directory + test_file_name
# The testing backup is named after the testing base name, so a different
# testing sample gets its own csv file
test_csv_path    = file_directory + "ML_Prep_" + test_base_name + "_Backup.csv"

# Builds ML output directories
output_directory = file_directory + "ML_Results/"

try:
    os.mkdir(output_directory)
except FileExistsError:
    print("directory already exists")
except OSError as err:
    # Any other failure (e.g. missing parent directory) is reported as such
    # rather than being mistaken for an existing directory
    print("could not create 'ML_Results' directory:", err)
else:
    print("made 'ML_Results' directory")

# Loads feature, target, and simple-correction arrays from the csv backup when
# it exists; otherwise builds them from the ROOT file. The ROOT builder is also
# given the csv path (presumably so it can write the backup - confirm).

# Training data
if os.path.exists(train_csv_path):
    X_train, y_train, sc_train = Build_FeatureArrays_FromCSV(train_csv_path)
else:
    X_train, y_train, sc_train = Build_FeatureArrays_FromROOT_ByEvent(
        train_file_path, train_tree_name, train_csv_path, train_range[0], train_range[1])

# Testing data
# The check is made on the testing backup itself, which is the file read below
if os.path.exists(test_csv_path):
    X_test,  y_test,  sc_test  = Build_FeatureArrays_FromCSV(test_csv_path)
else:
    X_test, y_test, sc_test = Build_FeatureArrays_FromROOT_ByEvent(
        test_file_path,  test_tree_name,  test_csv_path,  test_range[0],  test_range[1])

# Set Features to train with
# Names of the X_values columns; the position in this table is the column index
_all_feature_names = (
    "jet_pt_raw",      # 0
    "jet_pt_corr",     # 1
    "jet_mass",        # 2
    "jet_area",        # 3
    "jet_area_err",    # 4
    "jet_const_n",     # 5
    "const_pt_mean",   # 6
    "const_pt_median", # 7
    "const_1_pt",      # 8
    "const_2_pt",      # 9
    "const_3_pt",      # 10
    "const_4_pt",      # 11
    "const_5_pt",      # 12
    "const_6_pt",      # 13
    "const_7_pt",      # 14
    "const_8_pt",      # 15
    "const_9_pt",      # 16
    "const_10_pt",     # 17
    "jet_y",           # 18
    "jet_phi",         # 19
    "jet_rho",         # 20
    )

# Training with 1 feature
feature_index_1feat = [0]
feature_label_1feat = [_all_feature_names[k] for k in feature_index_1feat]

# Training with 3 features
feature_index_3feat = [0, 3, 20]
feature_label_3feat = [_all_feature_names[k] for k in feature_index_3feat]

# Training with 12 features
feature_index_12feat = [0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 18, 20]
feature_label_12feat = [_all_feature_names[k] for k in feature_index_12feat]



# Prints a timestamp once the setup has finished
now = datetime.now()
dt_string = now.strftime("%Y/%m/%d %H:%M:%S")
print("\nReady!", dt_string)

directory already exists
Input file accessed successfully. Output file generated.
Accessing input tree...
Input tree accessed successfully.
Creating .csv backup file...
Backup file started.
Preparing to collect data from TTree...
Jet: 10000 | pTraw: 84.443 | pTcorr:  7.454 | pTtrue:  18.926
Jet: 20000 | pTraw: 77.134 | pTcorr:  19.955 | pTtrue:  21.726
Jet: 30000 | pTraw: 127.356 | pTcorr:  62.739 | pTtrue:  74.138
Jet: 40000 | pTraw: 66.337 | pTcorr:  26.882 | pTtrue:  21.923
Jet: 50000 | pTraw: 118.369 | pTcorr:  64.838 | pTtrue:  58.220
Jet: 60000 | pTraw: 95.020 | pTcorr:  50.508 | pTtrue:  47.732
Jet: 70000 | pTraw: 91.805 | pTcorr:  31.265 | pTtrue:  39.645
Jet: 80000 | pTraw: 72.931 | pTcorr:  7.668 | pTtrue:  16.388
Jet: 90000 | pTraw: 68.348 | pTcorr:  21.350 | pTtrue:  35.245
Jet: 100000 | pTraw: 71.445 | pTcorr:  8.485 | pTtrue:  22.990
Jet: 110000 | pTraw: 149.970 | pTcorr:  86.229 | pTtrue:  87.505
Jet: 120000 | pTraw: 78.086 | pTcorr:  20.633 | pTtrue:  26.490
Jet: 130000

Jet: 340000 | pTraw: 135.940 | pTcorr:  89.956 | pTtrue:  83.047
Jet: 350000 | pTraw: 143.240 | pTcorr:  83.008 | pTtrue:  87.770
Jet: 360000 | pTraw: 65.477 | pTcorr:  13.818 | pTtrue:  12.015
Jet: 370000 | pTraw: 89.527 | pTcorr:  37.907 | pTtrue:  37.820
Jet: 380000 | pTraw: 122.035 | pTcorr:  51.989 | pTtrue:  60.723
Jet: 390000 | pTraw: 59.621 | pTcorr:  13.421 | pTtrue:  18.796
Jet: 400000 | pTraw: 120.945 | pTcorr:  62.609 | pTtrue:  50.739
Jet: 410000 | pTraw: 64.904 | pTcorr:  21.511 | pTtrue:  14.390
Jet: 420000 | pTraw: 68.030 | pTcorr:  30.456 | pTtrue:  28.310
Jet: 430000 | pTraw: 93.534 | pTcorr:  14.158 | pTtrue:  24.195
Jet: 440000 | pTraw: 86.579 | pTcorr:  30.866 | pTtrue:  33.426
Jet: 450000 | pTraw: 112.294 | pTcorr:  51.062 | pTtrue:  46.194
Jet: 460000 | pTraw: 131.897 | pTcorr:  76.580 | pTtrue:  71.342
Jet: 470000 | pTraw: 106.784 | pTcorr:  59.426 | pTtrue:  71.123
Jet: 480000 | pTraw: 119.461 | pTcorr:  52.433 | pTtrue:  63.744
Jet: 490000 | pTraw: 126.140 | p

## Training & Testing
1 Feature: jet_pt_raw ONLY

3 Features: jet_pt_raw, jet_area, jet_rho

12 Features: jet_pt_raw, jet_pt_corr, jet_mass, jet_area, jet_const_n, const_pt_mean, const_1_pt, const_2_pt, const_3_pt, const_4_pt, jet_y, jet_rho

In [3]:
#################################################
#                                               #
#  TRAIN ML ON ONE BIN, WITH 1, 3, 12 FEATURES  #
#                                               #
#################################################

# Trains one linear regression per active feature set on the full training
# sample, then tests it separately on each pT window in test_min_max_array.

# Builds outputs directories
output_directory = file_directory + "ML_Results/Test_4GeV_Bins/"

# Each directory is created on its own, so one that already exists does not
# stop the remaining ones from being made
for new_directory in (
    output_directory,
    output_directory + "Plots_Actual/",
    output_directory + "Plots_Delta/"
    ):
    try:
        os.mkdir(new_directory)
    except FileExistsError:
        print("directory already exists:", new_directory)
    except OSError as err:
        print("could not create directory:", err)
    else:
        print("made directory:", new_directory)

test_min_max_array = [  # Array of min and max for pT ranges to test on
    [18,22], [28,32], [38,42], [48,52], 
    [58,62], [68,72], [78,82]
    ]
# Un-comment the entries below to also run the 1- and 3-feature sets
feature_bundle = [
#     [feature_label_1feat,  feature_index_1feat], 
#     [feature_label_3feat,  feature_index_3feat],
    [feature_label_12feat, feature_index_12feat]
    ]
train_bundle = [ # This may be implemented later to iterate through multiple training sets
    [X_train, y_train, sc_train]
    ]

for feature_label, feature_index in feature_bundle:
    # Separate list, so the shared label list is not modified
    feature_coeff_label = feature_label + ["lr_intercept"] # Adds field for linear regression y-intercept
    
    output_csv_name = "{}Train_{}_F{}_{}_{}".format(
        output_directory, train_bias, len(feature_label),
        int(train_range[0]), int(train_range[1]))
    
    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train, feature_index)
    X_test_select  = Build_SelectFeatureArray(X_test, feature_index)

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select, 
        y_train, 
        feature_coeff_label, 
        use_scaler = True)
    print(type(lr_pipeline))
    
    Write_MLWeights_ToCSV(
        output_csv_name + "_LR_Coeffs.csv",
        lr_coeffs, 
        feature_coeff_label
        )

    # Tests estimator and saves results, one csv file per pT window
    for min_max in test_min_max_array:
        
        output = "\nTesting {} features on {}-{} GeV...".format(
            len(feature_index), min_max[0], min_max[1])
        print(output)
        
        csv_path = "{}_Test_{}_{}.csv".format(
            output_csv_name, int(min_max[0]), int(min_max[1]))
        
        TestAndSave_LinearRegression(
            feature_label,
            feature_index, 
            lr_pipeline, 
            lr_coeffs,
            X_test_select,
            y_test, 
            sc_test,
            min_max[0],
            min_max[1],   
            csv_path,
            use_scaler = True
            )
        
        print("Test and save complete!\n")



#################################################
#                                               #
#  TRAIN ML OVER 20 GeV BINS, USES 12 FEATURES  #
#                                               #
#################################################

# For every pT window in training_bundle, trains a linear regression on the
# training events inside that window and tests it on the testing events
# inside the same window.

# Builds outputs directories
output_directory = file_directory + "ML_Results/Train_20GeV_Bins/"

# Each directory is created on its own, so one that already exists does not
# stop the remaining ones from being made
for new_directory in (
    output_directory,
    output_directory + "Plots_Actual/",
    output_directory + "Plots_Delta/"
    ):
    try:
        os.mkdir(new_directory)
    except FileExistsError:
        print("directory already exists:", new_directory)
    except OSError as err:
        print("could not create directory:", err)
    else:
        print("made directory:", new_directory)

training_bundle = [
    [10.,30.], [20.,40.], [30.,50.], [40.,60.],
    [50.,70.], [60.,80.], [70.,90.]
    ]

# ONLY runs 12 features
for training_range in training_bundle:
    train_min = training_range[0]
    train_max = training_range[1]
    
    output_csv_name = "{}Train_{}_F12_{}_{}".format(
        output_directory, train_bias, int(train_min), int(train_max))
    
    # Keeps only the training events whose target lies strictly inside the window
    train_keep = [
        i for i in range(len(X_train))
        if (y_train[i] > train_min) and (y_train[i] < train_max)]
    X_train_cut  = [X_train[i] for i in train_keep]
    y_train_cut  = [y_train[i] for i in train_keep]
    sc_train_cut = [sc_train[i] for i in train_keep]
    
    # Same cut on the testing events
    test_keep = [
        i for i in range(len(X_test))
        if (y_test[i] > train_min) and (y_test[i] < train_max)]
    X_test_cut  = [X_test[i] for i in test_keep]
    y_test_cut  = [y_test[i] for i in test_keep]
    sc_test_cut = [sc_test[i] for i in test_keep]
    
    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train_cut, feature_index_12feat)
    X_test_select  = Build_SelectFeatureArray(X_test_cut, feature_index_12feat)
    
    # Separate list, so feature_label_12feat itself is not modified
    feature_coeff_label = feature_label_12feat + ["lr_intercept"] # Adds field for linear regression y-intercept

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select, 
        y_train_cut, 
        feature_coeff_label, 
        use_scaler = True)
    print(type(lr_pipeline))
    
    Write_MLWeights_ToCSV(
        output_csv_name + "_LR_Coeffs.csv",
        lr_coeffs, 
        feature_coeff_label
        )

    # Tests estimator and saves results
    # Range is printed as "min-max", matching the other training sections
    output = "\nTesting {} features on {}-{} GeV...".format(
        len(feature_index_12feat), int(train_min), int(train_max))
    print(output)

    csv_path = "{}_Test_{}_{}.csv".format(
        output_csv_name, int(train_min), int(train_max))

    TestAndSave_LinearRegression(
        feature_label_12feat,
        feature_index_12feat, 
        lr_pipeline, 
        lr_coeffs,
        X_test_select,
        y_test_cut, 
        sc_test_cut,
        train_min,
        train_max,   
        csv_path,
        use_scaler = True
        )

    print("Test and save complete!\n")



#################################################
#                                               #
#  TRAIN ML OVER 30 GeV BINS, USES 12 FEATURES  #
#                                               #
#################################################

# For every pT window in training_bundle, trains a linear regression on the
# training events inside that window and tests it on the testing events
# inside the same window.

# Builds outputs directories
output_directory = file_directory + "ML_Results/Train_30GeV_Bins/"

# Each directory is created on its own, so one that already exists does not
# stop the remaining ones from being made
for new_directory in (
    output_directory,
    output_directory + "Plots_Actual/",
    output_directory + "Plots_Delta/"
    ):
    try:
        os.mkdir(new_directory)
    except FileExistsError:
        print("directory already exists:", new_directory)
    except OSError as err:
        print("could not create directory:", err)
    else:
        print("made directory:", new_directory)

training_bundle = [
    [10.,40.], [20.,50.], [30.,60.], [40.,70.],
    [50.,80.], [60.,90.]
    ]

# ONLY runs 12 features
for training_range in training_bundle:
    train_min = training_range[0]
    train_max = training_range[1]
    
    output_csv_name = "{}Train_{}_F12_{}_{}".format(
        output_directory, train_bias, int(train_min), int(train_max))
    
    # Keeps only the training events whose target lies strictly inside the window
    train_keep = [
        i for i in range(len(X_train))
        if (y_train[i] > train_min) and (y_train[i] < train_max)]
    X_train_cut  = [X_train[i] for i in train_keep]
    y_train_cut  = [y_train[i] for i in train_keep]
    sc_train_cut = [sc_train[i] for i in train_keep]
    
    # Same cut on the testing events
    test_keep = [
        i for i in range(len(X_test))
        if (y_test[i] > train_min) and (y_test[i] < train_max)]
    X_test_cut  = [X_test[i] for i in test_keep]
    y_test_cut  = [y_test[i] for i in test_keep]
    sc_test_cut = [sc_test[i] for i in test_keep]
    
    # Builds training and testing arrays
    print("\nBuilding training and testing selected feature arrays...")
    X_train_select = Build_SelectFeatureArray(X_train_cut, feature_index_12feat)
    X_test_select  = Build_SelectFeatureArray(X_test_cut, feature_index_12feat)
    
    # Separate list, so feature_label_12feat itself is not modified
    feature_coeff_label = feature_label_12feat + ["lr_intercept"] # Adds field for linear regression y-intercept

    # Trains estimator
    print("\nTraining linear regression estimator...")
    lr_pipeline, lr_coeffs = Train_LinearRegression(
        X_train_select, 
        y_train_cut, 
        feature_coeff_label, 
        use_scaler = True)
    print(type(lr_pipeline))
    
    Write_MLWeights_ToCSV(
        output_csv_name + "_LR_Coeffs.csv",
        lr_coeffs, 
        feature_coeff_label
        )

    # Tests estimator and saves results
    output = "\nTesting {} features on {}-{} GeV...".format(
        len(feature_index_12feat), int(train_min), int(train_max))
    print(output)

    csv_path = "{}_Test_{}_{}.csv".format(
        output_csv_name, int(train_min), int(train_max))

    TestAndSave_LinearRegression(
        feature_label_12feat,
        feature_index_12feat, 
        lr_pipeline, 
        lr_coeffs,
        X_test_select,
        y_test_cut, 
        sc_test_cut,
        train_min,
        train_max,   
        csv_path,
        use_scaler = True
        )

    print("Test and save complete!\n")

    
    
# Prints a timestamp once every section above has finished
now = datetime.now()
dt_string = now.strftime("%Y/%m/%d %H:%M:%S")
print("\nComplete!", dt_string)

made directory
made subdirectories

Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 891137 

Selecting data from master array...
Data ready. Feature array length: 891137 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
36.56230714574322
[ 3.19569390e+01  3.23702874e+00 -1.24176360e+01  5.86214570e-01
 -5.93038042e+00 -1.05422922e-01 -6.74393028e-02 -6.28924916e-02
  7.49967342e-03 -6.24078550e-02  7.69120245e-03  5.08189295e-02]
[ 3.19569390e+01  3.23702874e+00 -1.24176360e+01  5.86214570e-01
 -5.93038042e+00 -1.05422922e-01 -6.74393028e-02 -6.28924916e-02
  7.49967342e-03 -6.24078550e-02  7.69120245e-03  5.08189295e-02
  3.65623071e+01]
Regression Coefficients:
jet_

ML results .csv file closed.
Test and save complete!


Building training and testing selected feature arrays...
Selecting data from master array...
Data ready. Feature array length: 131234 

Selecting data from master array...
Data ready. Feature array length: 131234 


Training linear regression estimator...

----- Fitting Linear Regression Estimator -----


Using StandardScaler. Data will be recentered and normalized.


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
59.72017344070706
[ 6.58758547e+00  7.64241479e-01 -4.19773180e+00  3.39973275e-01
 -1.74676519e+00  4.11449016e-02  1.40809776e-04 -3.16511455e-03
 -1.89913629e-02 -9.17208011e-03  2.23311924e-02  1.29376658e-02]
[ 6.58758547e+00  7.64241479e-01 -4.19773180e+00  3.39973275e-01
 -1.74676519e+00  4.11449016e-02  1.40809776e-04 -3.16511455e-03
 -1.89913629e-02 -9.17208011e-03  2.23311924e-02  1.29376658e-02
  5.97201734e+01]
Regression


Linear Regression Fit:
 Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
43.445023221114226
[ 1.25828980e+01  1.31087461e+00 -7.61500507e+00  6.04537220e-01
 -3.12828974e+00  3.62528936e-02  2.62080541e-03 -1.97699262e-02
 -1.95927641e-02 -2.74754303e-02  4.36854873e-03  7.00252879e-02]
[ 1.25828980e+01  1.31087461e+00 -7.61500507e+00  6.04537220e-01
 -3.12828974e+00  3.62528936e-02  2.62080541e-03 -1.97699262e-02
 -1.95927641e-02 -2.74754303e-02  4.36854873e-03  7.00252879e-02
  4.34450232e+01]
Regression Coefficients:
jet_pt_raw 12.58289803091282
jet_pt_corr 1.3108746143632724
jet_mass -7.615005065701468
jet_area 0.6045372195869001
jet_const_n -3.128289738131
const_pt_mean 0.03625289355084962
const_1_pt 0.0026208054055479116
const_2_pt -0.019769926194660798
const_3_pt -0.019592764101542565
const_4_pt -0.027475430277848288
jet_y 0.004368548730553293
jet_rho 0.07002528792801373
lr_intercept 43.445023221114226
<class 'skle