In [9]:
from ROOT import *
from root_numpy import tree2array
from ROOT import TFile
import pandas as pd
import numpy as np
import deepdish.io as io
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, train_test_split
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor
from sklearn.feature_selection import RFE, f_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RandomizedLasso
import os
import math
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp

In [10]:
data = TFile.Open("/home/minerva1993/public/v808/nosplit/ttHbb_PowhegPythia.root")
data2 = TFile.Open("/home/minerva1993/public/v808/nosplit/TTLJ_PowhegPythia_ttbb.root")
tree = data.Get("ttbbLepJets/tree")
tree2 = data2.Get("ttbbLepJets/tree")

In [11]:
def tree_to_df(tree, branch_names=[], index_name='', drop_roofit_labels=False):
    if tree is None:
        return None

    branch_list = tree.GetListOfBranches()
    all_branch_names = [branch_list.At(i).GetName() for i in range(branch_list.GetEntries())]
    if len(branch_names) == 0:
        branch_names = all_branch_names
    for bn in branch_names[:]:
        if bn not in all_branch_names:
            branch_names.remove(bn)
        if drop_roofit_labels:
            if bn.endswith('_lbl'):
                branch_names.remove(bn)

    arrs = tree2array(tree, branch_names, start = 0, stop = 40000)
    df = pd.DataFrame(arrs)

    if len(index_name) == 0:
        for col in df.columns:
            if col.startswith('__index__'):
                index_name = col
                break
    if len(index_name):
        try:
            df[index_name] = df[index_name].astype(np.int32)
            df.set_index(index_name, inplace=True)
        except BaseException:
            pass

    if drop_roofit_labels:
        df.columns = [col.replace('_idx', '') for col in df.columns]

    n_tree = tree.GetEntries()
    n_df = len(df.index)

    return df 

In [12]:
dftree = tree_to_df(tree)
dftree_bg = tree_to_df(tree2)

In [13]:
def process_delta_phi(x):
    if x > math.pi:
        delta_phi = x - 2*math.pi
    elif x < -math.pi:
        delta_phi = x + 2*math.pi
    else:
        delta_phi = x
    return delta_phi

def calculate_delta_R(phi_1, phi_2, eta_1, eta_2):
    x = phi_1 - phi_2
    delta_phi = process_delta_phi(x)
    delta_eta = eta_1 - eta_2
    return math.sqrt(delta_phi**2 + delta_eta**2)

In [25]:
def generate(df):
    
    columns = ['draddjets','lepton_pT','lepton_eta','lepton_E','MET','MET_phi','jet_number','event_weight','delta_phi','delta_eta','delta_R','invmass','lepton_delta_R','lepton_delta_eta','H']
    
    for t in range(1,3):
        for i in ['jet_pT','jet_eta','jet_E','jet_CvsB']:
            columns.append(i+'_'+str(t))
    
    columns.append('result')
    
    overall = []
    
    for i in range(len(df['lepton_SF'])):
        if df['jet_number'][i] >= 6 and df['jet_CSV'][i][2] > 0.8:
            checked = 0
            for m in range(df['jet_number'][i]):
                if df['jet_pT'][i][m] > 20 and np.abs(dftree_bg['jet_eta'][i][m]) < 2.4:
                    checked += 1
            if checked < 6:
                continue
                
            count = 0
            
            #append all the invariant columns
            invariants = []
            
            for t in ['draddjets','lepton_pT','lepton_eta','lepton_E','MET','MET_phi','jet_number']:
                invariants.append(df[t][i])
                
            product = df['lepton_SF'][i][0] * df['jet_SF_CSV_30'][i][0] * df['PUWeight'][i][0] * df['genweight'][i]
            invariants.append(product)
            
            #Loop over possible combinations
            for t in [(0,1),(0,2),(0,3),(1,2),(1,3),(2,3)]:
                
                #initialize variant data column
                variants = []
                
                #set the jet pair
                jet_pair = (t[0],t[1])
                
                #Delta_phi, delta_eta and delta_R
                x = df['jet_phi'][i][jet_pair[0]] - df['jet_phi'][i][jet_pair[1]]
                delta_phi = process_delta_phi(x)
                delta_eta = df['jet_eta'][i][jet_pair[0]] - df['jet_eta'][i][jet_pair[1]]
                delta_R = math.sqrt(delta_phi**2 + delta_eta**2)

                #invmass
                pt1, pt2 = math.fabs(df['jet_pT'][i][jet_pair[0]]), math.fabs(df['jet_pT'][i][jet_pair[1]])
                pX1, pX2 = pt1 * math.cos(df['jet_phi'][i][jet_pair[0]]), pt2 * math.cos(df['jet_phi'][i][jet_pair[1]])
                pY1, pY2 = pt1 * math.sin(df['jet_phi'][i][jet_pair[0]]), pt2 * math.sin(df['jet_phi'][i][jet_pair[1]])
                pZ1, pZ2 = pt1 / math.tan(2.0 * math.atan(math.exp(-df['jet_eta'][i][jet_pair[0]]))), pt2 / math.tan(2.0 * math.atan(math.exp(-df['jet_eta'][i][jet_pair[1]])))
                invmass = math.sqrt((df['jet_E'][i][jet_pair[0]] + df['jet_E'][i][jet_pair[1]])**2 - (pX1 + pX2)**2 - (pY1 + pY2)**2 - (pZ1 + pZ2)**2)

                #H
                H = df['jet_pT'][i][jet_pair[0]] + df['jet_pT'][i][jet_pair[1]] + df['lepton_pT'][i]

                #delta_lepton_R
                y = df['jet_phi'][i][1] - df['lepton_phi'][i]
                delta_phi_lep = process_delta_phi(x)
                delta_eta_lep = df['jet_eta'][i][1] - df['lepton_eta'][i]
                delta_R_lep = math.sqrt(delta_phi_lep**2 + delta_eta_lep**2)

                variants += [delta_phi, delta_eta, delta_R, invmass, delta_R_lep, delta_eta_lep, H]
                
                for m in [t[0], t[1]]:
                    for k in ['jet_pT','jet_eta','jet_E','jet_CvsB']:
                        variants += [df[k][i][m]]

                phi_1, phi_2 = dftree_bg['jet_phi'][i][t[0]], dftree_bg['jet_phi'][i][t[1]]
                mt_phi_1, mt_phi_2 = dftree_bg['addbjet1_phi'][i], dftree_bg['addbjet2_phi'][i]
                eta_1, eta_2 = dftree_bg['jet_eta'][i][t[0]], dftree_bg['jet_eta'][i][t[1]]
                mt_eta_1, mt_eta_2 = dftree_bg['addbjet1_eta'][i], dftree_bg['addbjet2_eta'][i]

                dR_11 = calculate_delta_R(phi_1, mt_phi_1, eta_1, mt_eta_1)
                dR_12 = calculate_delta_R(phi_1, mt_phi_2, eta_1, mt_eta_2)
                dR_21 = calculate_delta_R(phi_2, mt_phi_1, eta_2, mt_eta_1)
                dR_22 = calculate_delta_R(phi_2, mt_phi_2, eta_2, mt_eta_2)

                variants.append(1 if (dR_11 < 0.4 or dR_12 < 0.4) and (dR_21 < 0.4 or dR_22 < 0.4) else 0)
                count += 1
                
                overall.append(invariants + variants)
            
    print "Column Length: ", len(overall[0])
    print "Fixed Length: ", len(columns)

    train_tree = pd.DataFrame(overall, columns=columns)
    return train_tree

In [26]:
train = generate(dftree_bg)

Column Length:  24
Fixed Length:  24


In [31]:
train['result'].value_counts()

0    45455
1     6145
Name: result, dtype: int64

In [34]:
def preprocess_data(X, scaler=None):
    if not scaler:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler

In [35]:
def under_sample(data):
    
    pos_events = data[data['result'] == 1]
    neg_events = data[data['result'] == 0]
    
    #Randomize and pick same n number of events
    number_pos_events = len(pos_events)  

    pos_events = pos_events.reindex(np.random.permutation(pos_events.index))
    neg_events = neg_events.reindex(np.random.permutation(neg_events.index))
        
    undersampled_events = pd.concat([neg_events.head(number_pos_events), pos_events])
    X_data_u, scaler = preprocess_data(undersampled_events.drop('result',1))
    y_data_u = undersampled_events['result'] 

    X_train_u, X_test_u, y_train_u, y_test_u = train_test_split(X_data_u, y_data_u, test_size=0.3)
    
    return X_train_u, X_test_u, y_train_u, y_test_u, scaler

In [36]:
X_train, X_test, Y_train, Y_test, scaler = under_sample(train)

  return self.partial_fit(X, y)
  """


In [40]:
model = Sequential()
model.add(Dropout(0.13, input_shape=(X_train.shape[1],)))
model.add(Dense(75))
model.add(PReLU())

model.add(Dropout(0.11))
model.add(Dense(60))
model.add(PReLU())

model.add(Dropout(0.09))
model.add(Dense(45))
model.add(PReLU())

model.add(Dropout(0.07))
model.add(Dense(30))
model.add(PReLU())

model.add(Dropout(0.11))
model.add(Dense(15))
model.add(PReLU())

model.add(Dense(2))
model.add(Activation('sigmoid'))

In [41]:
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.SGD(lr=0.05, nesterov=True), metrics=['accuracy'])

In [42]:
Y_train_nn = np_utils.to_categorical(Y_train)
Y_test_nn = np_utils.to_categorical(Y_test)

In [43]:
model.fit(X_train, Y_train_nn, batch_size=64, epochs=70, verbose=2, shuffle=True, validation_data = (X_test, Y_test_nn))

Train on 8603 samples, validate on 3687 samples
Epoch 1/70
 - 1s - loss: 0.6793 - acc: 0.5693 - val_loss: 0.6532 - val_acc: 0.6306
Epoch 2/70
 - 0s - loss: 0.6373 - acc: 0.6464 - val_loss: 0.6099 - val_acc: 0.6786
Epoch 3/70
 - 0s - loss: 0.6130 - acc: 0.6738 - val_loss: 0.5950 - val_acc: 0.6919
Epoch 4/70
 - 0s - loss: 0.6050 - acc: 0.6774 - val_loss: 0.5898 - val_acc: 0.6919
Epoch 5/70
 - 0s - loss: 0.5985 - acc: 0.6873 - val_loss: 0.5828 - val_acc: 0.6984
Epoch 6/70
 - 0s - loss: 0.5921 - acc: 0.6892 - val_loss: 0.5809 - val_acc: 0.6970
Epoch 7/70
 - 0s - loss: 0.5904 - acc: 0.6910 - val_loss: 0.5786 - val_acc: 0.6992
Epoch 8/70
 - 0s - loss: 0.5839 - acc: 0.6927 - val_loss: 0.5750 - val_acc: 0.7025
Epoch 9/70
 - 0s - loss: 0.5838 - acc: 0.6936 - val_loss: 0.5729 - val_acc: 0.7068
Epoch 10/70
 - 0s - loss: 0.5811 - acc: 0.6938 - val_loss: 0.5699 - val_acc: 0.7044
Epoch 11/70
 - 0s - loss: 0.5787 - acc: 0.6979 - val_loss: 0.5679 - val_acc: 0.7065
Epoch 12/70
 - 0s - loss: 0.5772 - ac

<keras.callbacks.History at 0x7fe095efd5d0>

2018-11-03 15:55:20.963611: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA


In [44]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [45]:
r = rf.predict(X_test)
Y_valid = np.array(Y_test)
print("Accuracy for Random Forest: %.2f" % (accuracy_score(Y_test, r.round()) * 100))

Accuracy for Random Forest: 72.99
