In [44]:
from ROOT import *
from root_numpy import tree2array
from ROOT import TFile
import pandas as pd
import numpy as np
import deepdish.io as io
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import os

In [3]:
data = TFile.Open("/home/minerva1993/public/v808/nosplit/ttHbb_PowhegPythia.root")
data2 = TFile.Open("/home/minerva1993/public/v808/nosplit/TTLJ_PowhegPythia_ttbb.root")
tree = data.Get("ttbbLepJets/tree")
tree2 = data2.Get("ttbbLepJets/tree")

In [4]:
def tree_to_df(tree, branch_names=[], index_name='', drop_roofit_labels=False):
    if tree is None:
        return None

    branch_list = tree.GetListOfBranches()
    all_branch_names = [branch_list.At(i).GetName() for i in range(branch_list.GetEntries())]
    if len(branch_names) == 0:
        branch_names = all_branch_names
    for bn in branch_names[:]:
        if bn not in all_branch_names:
            branch_names.remove(bn)
        if drop_roofit_labels:
            if bn.endswith('_lbl'):
                branch_names.remove(bn)

    arrs = tree2array(tree, branch_names, start = 0, stop = 20000)
    df = pd.DataFrame(arrs)

    if len(index_name) == 0:
        for col in df.columns:
            if col.startswith('__index__'):
                index_name = col
                break
    if len(index_name):
        try:
            df[index_name] = df[index_name].astype(np.int32)
            df.set_index(index_name, inplace=True)
        except BaseException:
            pass

    if drop_roofit_labels:
        df.columns = [col.replace('_idx', '') for col in df.columns]

    n_tree = tree.GetEntries()
    n_df = len(df.index)

    return df 

In [5]:
dftree = tree_to_df(tree)
dftree_bg = tree_to_df(tree2)

In [6]:
# results = []
# for i in range(len(dftree['addbjet1_pt'])):
#     k = np.sqrt((dftree['addbjet1_e'][i] + dftree['addbjet2_e'][i])**2 - (dftree['addbjet1_pt'][i] + dftree['addbjet2_pt'][i])**2 - (dftree['addbjet1_eta'][i] + dftree['addbjet2_eta'][i])**2 - (dftree['addbjet1_phi'][i] + dftree['addbjet2_phi'][i])**2)
#     results.append(k)

In [7]:
def process(df):
    columns = ['draddjets','lepton_pT','lepton_eta','lepton_phi','lepton_E','MET','MET_phi','channel','event_weight']
    
    for t in range(1,7):
        for i in ['jet_pT','jet_eta','jet_phi','jet_E','jet_CvsB']:
            columns.append(i+'_'+str(t))
            
    end = []
    
    for i in range(len(df['lepton_SF'])):
        if df['jet_number'][i] >= 6:
            parts = []
            for t in ['draddjets','lepton_pT','lepton_eta','lepton_phi','lepton_E','MET','MET_phi','channel']:
                parts.append(df[t][i])
            
            product = df['lepton_SF'][i][0] * df['jet_SF_CSV_30'][i][0] * df['PUWeight'][i][0] * df['genweight'][i]
            
            parts.append(product)
            
            for t in range(len(df['jet_pT'][i])):
                passed = True
                partial = []
                for k in ['jet_pT','jet_eta','jet_phi','jet_E','jet_CvsB']:
                    if k == 'jet_pT':
                        if df[k][i][t] < 30:
                            passed = False
                            break
                    elif k == 'jet_eta':
                        if df[k][i][t] > 2.4 or df[k][i][t] < -2.4:
                            passed = False
                            break
                    partial.append(df[k][i][t])
                
                if passed:
                    parts += partial
                    
                if len(parts) == len(columns):
                    break
                    
            end.append(parts)
            
    train_tree = pd.DataFrame(end, columns=columns)
    return train_tree

In [8]:
train_tree = process(dftree)

In [9]:
train_tree_2 = process(dftree_bg)

In [10]:
train_tree['result'] = np.zeros(len(train_tree))

In [11]:
train_tree_2['result'] = [1 for i in range(len(train_tree_2))]

In [12]:
train = train_tree.append(train_tree_2, ignore_index=True)

In [13]:
y_pred = train['result']
train = train.drop('result',1)

In [14]:
def preprocess_data(X, scaler=None):
    if not scaler:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler

In [15]:
train, scaler = preprocess_data(train)

  return self.partial_fit(X, y)
  """


In [16]:
X_train, X_valid, Y_train, Y_valid = train_test_split(train, y_pred, test_size = 0.2, random_state = 42)

# Neural Networks

In [17]:
model = Sequential()
model.add(Dropout(0.13, input_shape=(X_train.shape[1],)))
model.add(Dense(75))
model.add(PReLU())

model.add(Dropout(0.11))
model.add(Dense(50))
model.add(PReLU())

model.add(Dropout(0.09))
model.add(Dense(30))
model.add(PReLU())

model.add(Dropout(0.07))
model.add(Dense(25))
model.add(PReLU())

model.add(Dense(2))
model.add(Activation('sigmoid'))

In [18]:
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.SGD(lr=0.01, nesterov=True), metrics=['accuracy'])

In [19]:
Y_train_nn = np_utils.to_categorical(Y_train)
Y_valid_nn = np_utils.to_categorical(Y_valid)

In [20]:
model.fit(X_train, Y_train_nn, batch_size=64, epochs=40, verbose=2, shuffle=True, validation_data = (X_valid, Y_valid_nn))

Train on 12003 samples, validate on 3001 samples
Epoch 1/40
 - 1s - loss: 0.6954 - acc: 0.5021 - val_loss: 0.6918 - val_acc: 0.5312
Epoch 2/40
 - 0s - loss: 0.6932 - acc: 0.5098 - val_loss: 0.6914 - val_acc: 0.5288
Epoch 3/40
 - 0s - loss: 0.6926 - acc: 0.5152 - val_loss: 0.6912 - val_acc: 0.5292
Epoch 4/40
 - 0s - loss: 0.6927 - acc: 0.5153 - val_loss: 0.6910 - val_acc: 0.5335
Epoch 5/40
 - 0s - loss: 0.6923 - acc: 0.5187 - val_loss: 0.6909 - val_acc: 0.5302
Epoch 6/40
 - 0s - loss: 0.6919 - acc: 0.5174 - val_loss: 0.6907 - val_acc: 0.5328
Epoch 7/40
 - 1s - loss: 0.6919 - acc: 0.5171 - val_loss: 0.6905 - val_acc: 0.5348
Epoch 8/40
 - 0s - loss: 0.6917 - acc: 0.5171 - val_loss: 0.6904 - val_acc: 0.5365
Epoch 9/40
 - 0s - loss: 0.6912 - acc: 0.5250 - val_loss: 0.6903 - val_acc: 0.5418
Epoch 10/40
 - 0s - loss: 0.6912 - acc: 0.5167 - val_loss: 0.6901 - val_acc: 0.5408
Epoch 11/40
 - 0s - loss: 0.6909 - acc: 0.5274 - val_loss: 0.6900 - val_acc: 0.5415
Epoch 12/40
 - 0s - loss: 0.6903 - a

<keras.callbacks.History at 0x7f324eaba650>

2018-10-18 18:10:00.964580: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA


# Gradient Boosting

In [27]:
Y_train = np.array(Y_train)
Y_valid = np.array(Y_valid)
xgb = XGBRegressor()

In [29]:
params={
    'objective': 'binary:logistic',
    'learning_rate': 0.1,
    'max_depth':5,
    'gamma': 0.3,
    'min_child_weight':1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 1000,
}

In [None]:
grs = GridSearchCV(xgb, param_grid=params, cv=2, n_jobs=4, verbose=2)
grs.fit(X_train, Y_train, eval_set=[(X_valid, Y_valid)], verbose=False)

print("Best parameters " + str(grs.best_params_))
gpd = pd.DataFrame(grs.cv_results_)
print("Estimated accuracy of this model for unseen data: {0:1.4f}".format(gpd['mean_test_score'][grs.best_index_]))

In [30]:
my_model = XGBRegressor(
    objective = 'binary:logistic',
    learning_rate = 0.1,
    max_depth = 5,
    gamma = 0,
    min_child_weight = 1,
    subsample = 0.8,
    colsample_bytree = 0.8,
    n_estimators = 1000,
    silent = 1
)

In [31]:
my_model.fit(X_train, Y_train, eval_set=[(X_valid, Y_valid)], verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=1,
       subsample=0.8)

In [32]:
a = my_model.predict(X_valid)

In [33]:
Y_valid = np.array(Y_valid)

In [35]:
print("Accuracy for Gradient Boosting: %.2f" % (accuracy_score(Y_valid, a.round()) * 100))

Accuracy for Gradient Boosting: 59.78


# Random Forest

In [37]:
rf = RandomForestClassifier(n_estimators=100, max_features=15, max_depth=11, random_state=1)
rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=11, max_features=15, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [38]:
r = rf.predict(X_valid)
Y_valid = np.array(Y_valid)
print("Accuracy for Random Forest: %.2f" % (accuracy_score(Y_valid, r.round()) * 100))

Accuracy for Random Forest: 59.85


In [39]:
r_train = rf.predict(X_train)
Y_train = np.array(Y_train)
print("Training Accuracy for Random Forest: %.2f" % (accuracy_score(Y_train, r_train.round()) * 100))

Training Accuracy for Random Forest: 90.92


# Linear Regression

In [46]:
estimator = LinearRegression()
selector = RFE(estimator, 5, step=1)
selector = selector.fit(X_train, Y_train)

In [50]:
for i in range(len(selector.support_)):
    if selector.support_[i]:
        print train_tree.columns[i]

jet_pT_1
jet_E_1
jet_pT_2
jet_E_2
jet_E_3


In [48]:
selector.ranking_

array([17, 22, 18, 32, 33, 25, 24, 29,  3,  1, 12, 19,  1,  7,  1, 13, 16,
        1, 15,  2, 20, 35,  1, 28,  5, 11, 30,  4, 14, 10, 34, 27,  6, 21,
        8, 23, 26,  9, 31])