# This notebook applies preprocessing similar to that in the explore.ipynb notebook to the testing data set, so that our models can be tested, and saves the result to a file called "test_<# of features>variables.npz" inside the "root_files" folder

In [1]:
import keras
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the testing sample from the .npz archive: the feature matrix, the
# label matrix and the list of feature (column) names.
with np.load("root_files/combined_test.npz") as file:
    feature_array_test, label_array_test = file['features'], file['labels']
    features = file['names'].tolist()

# Bookkeeping values: number of feature names loaded, and a hard-coded
# count of label classes.
nfeatures = len(features)
nlabels = 2

In [3]:
# Put the features in a DataFrame with named columns, then split its rows by
# class: rows whose label column 0 equals 1 go to bkg_test, rows whose label
# column 1 equals 1 go to sig_test.
df_test = pd.DataFrame(data=feature_array_test, columns=features)
bkg_test, sig_test = (
    df_test[label_array_test[:, col] == 1] for col in (0, 1)
)

In [4]:
# Drop the raw array names; only the DataFrames are used from here on.
del feature_array_test
del label_array_test

In [5]:
# Bind the "_out" names to the same DataFrame objects (aliases, not copies).
sig_test_out, bkg_test_out = sig_test, bkg_test

In [6]:
# Sanity check: print the background frame's shape under both of its names,
# one per line.
print(bkg_test.shape, bkg_test_out.shape, sep="\n")

(1499756, 49)
(1499756, 49)


In [7]:
# Remove the intermediate names. The signal and background frames stay
# reachable through sig_test_out / bkg_test_out.
del bkg_test
del sig_test
del df_test

In [8]:
# Balance the classes: randomly down-sample the background to the number of
# signal rows, then renumber the index from 0. Sampling is without
# replacement (the pandas default), so this needs at least as many background
# rows as signal rows. A fixed random_state makes the selected subset, and
# therefore the saved test file, reproducible between runs.
bkg_test_out = (
    bkg_test_out
    .sample(n=len(sig_test_out), random_state=42)
    .reset_index(drop=True)
)

In [9]:
# Sanity check: after down-sampling, both classes should have the same number
# of rows. Print the background shape, then the signal shape.
print(bkg_test_out.shape, sig_test_out.shape, sep="\n")

(227871, 49)
(227871, 49)


In [10]:
# Combine the testing signal and background dataframes into one and append two
# label columns: 'hbb' is 1 for signal rows and 0 for background rows, 'QCD'
# is the reverse. The label arrays are built in the same
# signal-then-background order as the concatenated frame, and are assigned
# positionally (they are plain NumPy arrays, so the frame's index is ignored).
hbb = np.concatenate(
    (np.ones(len(sig_test_out)), np.zeros(len(bkg_test_out))), axis=None
)
QCD = np.concatenate(
    (np.zeros(len(sig_test_out)), np.ones(len(bkg_test_out))), axis=None
)
testdata = pd.concat([sig_test_out, bkg_test_out])
testdata['hbb'] = hbb
testdata['QCD'] = QCD

# Shuffle all rows so signal and background are interleaved, then renumber the
# index. A fixed random_state keeps the row order reproducible between runs.
test = testdata.sample(frac=1, random_state=42).reset_index(drop=True)

# The two label columns were appended last, so they are the final two columns:
# everything before them is a feature, the last two are the labels.
X_test = test.iloc[:, :-2]
y_test = test.iloc[:, -2:]

In [11]:
# The unshuffled combined frame is no longer needed; its shuffled version is
# held in `test`.
del testdata

In [12]:
# Standardise the feature columns with scikit-learn's StandardScaler (default
# settings) and wrap the result back into a DataFrame. No column names are
# passed, so the new frame gets pandas' default column labels.
# NOTE(review): the scaler is fitted on the test sample itself. If the models
# were trained on data scaled with a scaler fitted on the training sample,
# that fitted scaler should presumably be reused here instead. Confirm.
scaler = StandardScaler()
X_test = pd.DataFrame(scaler.fit_transform(X_test))

In [13]:
# Alias (not a copy) of the scaled feature frame, under the name used when
# saving.
X_test_final =  X_test

In [14]:
# Display the shape of the final feature frame as a sanity check.
X_test_final.shape

(455742, 49)

In [15]:
# Save the preprocessed testing sample. The file name embeds the number of
# feature names. 'variables' stores the feature names, and 'labels' names the
# two label columns in the same order as the columns of y_test (hbb, QCD).
np.savez(
    f'root_files/test_{len(features)}variables.npz',
    X_test=X_test_final,
    y_test=y_test,
    variables=features,
    labels=['Hbb', 'QCD'],
)