In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def get_data(dataset, base_dir='/storage/bk17414/ForStudents/2021_CMS_ML/ml_postproc'):
    """Read one dataset's ``df_ml_inputs.hd5`` file into a pandas object.

    Parameters
    ----------
    dataset : str
        Name of the dataset sub-directory (e.g. ``'ttH125'``).
    base_dir : str, optional
        Directory that contains one sub-directory per dataset. The default
        is the location this notebook was originally written against, so
        existing ``get_data(dataset)`` calls resolve to the same file.

    Returns
    -------
    Whatever ``pd.read_hdf`` returns for the file at
    ``{base_dir}/{dataset}/df_ml_inputs.hd5``.

    Raises
    ------
    FileNotFoundError
        Raised by ``pd.read_hdf`` when the file does not exist.
    """
    # The file name is fixed; only the directory parts vary.
    path = f'{base_dir}/{dataset}/df_ml_inputs.hd5'
    return pd.read_hdf(path)

def preprocess(signal, background, columns, train_size=0.8):
    """Label, merge, split and scale signal/background events.

    Parameters
    ----------
    signal : pandas.DataFrame
        Events that receive the label ``1.0``.
    background : pandas.DataFrame
        Events that receive the label ``0.0``.
    columns : list of str
        Feature columns to keep; each must exist in the combined frame.
    train_size : float or int, optional
        Forwarded unchanged to ``train_test_split`` (default ``0.8``).

    Returns
    -------
    numpy.ndarray
        1-D object array of length 4 holding, in order,
        ``X_train_scaled``, ``X_test_scaled``, ``y_train``, ``y_test``.
        It can still be unpacked or indexed like the original return value.

    Notes
    -----
    The input frames are not modified: labels are added on copies.
    """
    # Attach labels on copies so the caller's DataFrames keep their columns.
    background_labelled = background.assign(signal=0.0)
    signal_labelled = signal.assign(signal=1.0)

    # Combine background and signal into one frame with a fresh 0..n-1 index.
    data = pd.concat([background_labelled, signal_labelled], ignore_index=True)

    # Separate features from labels.
    X, y = data[columns], data['signal']

    # Split stratified on the label, with a fixed seed for repeatability.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, stratify=y, random_state=42
    )

    # Fit the scaler on the training split only, then reuse it for the test
    # split so no test-set statistics enter the transformation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # The four parts have different shapes. np.array([...]) on such a ragged
    # list is deprecated and raises ValueError on NumPy >= 1.24, so fill an
    # object array element by element instead.
    parts = (X_train_scaled, X_test_scaled, y_train, y_test)
    packed = np.empty(len(parts), dtype=object)
    for position, part in enumerate(parts):
        packed[position] = part
    return packed


In [2]:
# Load the ttH125 dataset and list its column names.
test = get_data(dataset='ttH125')
test.columns

Index(['dataset', 'entry', 'BiasedDPhi', 'DiJet_mass', 'HT',
       'InputMet_InputJet_mindPhi', 'InputMet_phi', 'InputMet_pt', 'MHT_phi',
       'MHT_pt', 'MinChi', 'MinOmegaHat', 'MinOmegaTilde', 'cleanedJet_area',
       'cleanedJet_btagDeepB', 'cleanedJet_chHEF', 'cleanedJet_eta',
       'cleanedJet_mass', 'cleanedJet_neHEF', 'cleanedJet_phi',
       'cleanedJet_pt', 'hashed_filename', 'ncleanedBJet', 'ncleanedJet',
       'weight_nominal', 'xs_weight'],
      dtype='object')

In [3]:
import matplotlib.pyplot as plt

# Take the first row of the loaded frame to inspect a single event.
event = test.iloc[0]
event
# TODO (unfinished draft): jet_image = np.zeros((30, 30))


dataset                                                                 ttH125
entry                                                                      103
BiasedDPhi                                                           -3.033725
DiJet_mass                                                          216.669163
HT                                                                  483.973785
InputMet_InputJet_mindPhi                                             0.987305
InputMet_phi                                                         -2.919922
InputMet_pt                                                             267.75
MHT_phi                                                              -2.735934
MHT_pt                                                              229.776886
MinChi                                                                1.249071
MinOmegaHat                                                           1.091516
MinOmegaTilde                                       

In [4]:
# Feature columns selected from the combined frame during preprocessing.
columns = [
    'ncleanedBJet',
    'ncleanedJet',
    'DiJet_mass',
    'HT',
    'MHT_pt',
    'InputMet_pt',
    'InputMet_InputJet_mindPhi',
    'BiasedDPhi',
    'MinChi',
    'MinOmegaHat',
    'MinOmegaTilde',
]
# ttH125 is passed as the signal sample, TTToSemiLeptonic as the background.
data = preprocess(
    signal=get_data('ttH125'),
    background=get_data('TTToSemiLeptonic'),
    columns=columns,
)



In [5]:
import os

# Show the kernel's working directory; the relative file name given to
# np.save in the following cell is resolved against it.
os.getcwd()

'/usersc/ac18804/ttbar_notebooks'

In [6]:
# Write the preprocessed splits to disk; np.save appends ".npy" to a file
# name that does not already end with it.
# NOTE(review): `data` presumably is an object array (preprocess packs items
# of different shapes), so it is stored via pickle and reading it back would
# need np.load(..., allow_pickle=True) - confirm.
np.save(file='SL_Higgs_input_data', arr=data, allow_pickle=True)