In [27]:
from ROOT import *
from root_numpy import tree2array
from ROOT import TFile
import pandas as pd
import numpy as np
import deepdish.io as io
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils
from sklearn.metrics import accuracy_score, roc_auc_score
import os

In [28]:
data = TFile.Open("/home/minerva1993/public/v808/nosplit/ttHbb_PowhegPythia.root")
data2 = TFile.Open("/home/minerva1993/public/v808/nosplit/TTLJ_PowhegPythia_ttbb.root")
tree = data.Get("ttbbLepJets/tree")
tree2 = data2.Get("ttbbLepJets/tree")

In [29]:
def tree_to_df(tree, branch_names=[], index_name='', drop_roofit_labels=False):
    if tree is None:
        return None

    branch_list = tree.GetListOfBranches()
    all_branch_names = [branch_list.At(i).GetName() for i in range(branch_list.GetEntries())]
    if len(branch_names) == 0:
        branch_names = all_branch_names
    for bn in branch_names[:]:
        if bn not in all_branch_names:
            branch_names.remove(bn)
        if drop_roofit_labels:
            if bn.endswith('_lbl'):
                branch_names.remove(bn)

    arrs = tree2array(tree, branch_names, start = 0, stop = 20000)
    df = pd.DataFrame(arrs)

    if len(index_name) == 0:
        for col in df.columns:
            if col.startswith('__index__'):
                index_name = col
                break
    if len(index_name):
        try:
            df[index_name] = df[index_name].astype(np.int32)
            df.set_index(index_name, inplace=True)
        except BaseException:
            pass

    if drop_roofit_labels:
        df.columns = [col.replace('_idx', '') for col in df.columns]

    n_tree = tree.GetEntries()
    n_df = len(df.index)

    return df 

In [30]:
dftree = tree_to_df(tree)
dftree_bg = tree_to_df(tree2)

In [31]:
# results = []
# for i in range(len(dftree['addbjet1_pt'])):
#     k = np.sqrt((dftree['addbjet1_e'][i] + dftree['addbjet2_e'][i])**2 - (dftree['addbjet1_pt'][i] + dftree['addbjet2_pt'][i])**2 - (dftree['addbjet1_eta'][i] + dftree['addbjet2_eta'][i])**2 - (dftree['addbjet1_phi'][i] + dftree['addbjet2_phi'][i])**2)
#     results.append(k)

In [32]:
def process(df):
    columns = ['draddjets','lepton_pT','lepton_eta','lepton_phi','lepton_E','MET','MET_phi','channel','event_weight']
    
    for t in range(1,7):
        for i in ['jet_pT','jet_eta','jet_phi','jet_E','jet_CvsB']:
            columns.append(i+'_'+str(t))
            
    end = []
    
    for i in range(len(df['lepton_SF'])):
        if df['jet_number'][i] >= 6:
            parts = []
            for t in ['draddjets','lepton_pT','lepton_eta','lepton_phi','lepton_E','MET','MET_phi','channel']:
                parts.append(df[t][i])
            
            product = df['lepton_SF'][i][0] * df['jet_SF_CSV_30'][i][0] * df['PUWeight'][i][0] * df['genweight'][i]
            
            parts.append(product)
            
            for t in range(len(df['jet_pT'][i])):
                passed = True
                partial = []
                for k in ['jet_pT','jet_eta','jet_phi','jet_E','jet_CvsB']:
                    if k == 'jet_pT':
                        if df[k][i][t] < 30:
                            passed = False
                            break
                    elif k == 'jet_eta':
                        if df[k][i][t] > 2.4 or df[k][i][t] < -2.4:
                            passed = False
                            break
                    partial.append(df[k][i][t])
                
                if passed:
                    parts += partial
                    
                if len(parts) == len(columns):
                    break
                    
            end.append(parts)
            
    train_tree = pd.DataFrame(end, columns=columns)
    return train_tree

In [57]:
train_tree = process(dftree)

In [58]:
train_tree_2 = process(dftree_bg)

In [59]:
train_tree['result'] = np.zeros(len(train_tree))

In [60]:
train_tree_2['result'] = [1 for i in range(len(train_tree_2))]

In [63]:
train = train_tree.append(train_tree_2, ignore_index=True)

In [65]:
y_pred = train['result']
train = train.drop('result',1)

In [67]:
def preprocess_data(X, scaler=None):
    if not scaler:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler

In [68]:
train, scaler = preprocess_data(train)

  """


In [69]:
X_train, X_valid, Y_train, Y_valid = train_test_split(train, y_pred, test_size = 0.2, random_state = 42)

In [70]:
model = Sequential()
model.add(Dropout(0.13, input_shape=(X_train.shape[1],)))
model.add(Dense(75))
model.add(PReLU())

model.add(Dropout(0.11))
model.add(Dense(50))
model.add(PReLU())

model.add(Dropout(0.09))
model.add(Dense(30))
model.add(PReLU())

model.add(Dropout(0.07))
model.add(Dense(25))
model.add(PReLU())

model.add(Dense(2))
model.add(Activation('sigmoid'))

In [71]:
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.SGD(lr=0.01, nesterov=True), metrics=['accuracy'])

In [72]:
Y_train_nn = np_utils.to_categorical(Y_train)
Y_valid_nn = np_utils.to_categorical(Y_valid)

In [73]:
model.fit(X_train, Y_train_nn, batch_size=64, epochs=40, verbose=2, shuffle=True, validation_data = (X_valid, Y_valid_nn))

Train on 12003 samples, validate on 3001 samples
Epoch 1/40
 - 2s - loss: 0.7082 - acc: 0.5085 - val_loss: 0.6942 - val_acc: 0.5308
Epoch 2/40
 - 1s - loss: 0.6992 - acc: 0.5095 - val_loss: 0.6924 - val_acc: 0.5292
Epoch 3/40
 - 0s - loss: 0.6955 - acc: 0.5110 - val_loss: 0.6916 - val_acc: 0.5298
Epoch 4/40
 - 0s - loss: 0.6930 - acc: 0.5221 - val_loss: 0.6912 - val_acc: 0.5278
Epoch 5/40
 - 1s - loss: 0.6930 - acc: 0.5205 - val_loss: 0.6909 - val_acc: 0.5278
Epoch 6/40
 - 0s - loss: 0.6916 - acc: 0.5149 - val_loss: 0.6908 - val_acc: 0.5275
Epoch 7/40
 - 0s - loss: 0.6928 - acc: 0.5190 - val_loss: 0.6907 - val_acc: 0.5285
Epoch 8/40
 - 0s - loss: 0.6915 - acc: 0.5204 - val_loss: 0.6904 - val_acc: 0.5308
Epoch 9/40
 - 0s - loss: 0.6924 - acc: 0.5165 - val_loss: 0.6904 - val_acc: 0.5298
Epoch 10/40
 - 0s - loss: 0.6903 - acc: 0.5215 - val_loss: 0.6902 - val_acc: 0.5345
Epoch 11/40
 - 0s - loss: 0.6899 - acc: 0.5286 - val_loss: 0.6901 - val_acc: 0.5335
Epoch 12/40
 - 0s - loss: 0.6903 - a

<keras.callbacks.History at 0x7f66b4de7310>

2018-10-17 17:57:33.356699: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
