In [None]:
# Setup notifications
from time import time as tme
from IPython import get_ipython
from IPython.display import Audio, display

class Beeper:
    """Play a sound when a notebook cell has been running for a long time.

    The two hook methods take no arguments so that they can be registered
    directly as IPython ``pre_execute`` / ``post_execute`` callbacks.

    Parameters
    ----------
    threshold : number
        Elapsed wall-clock time in seconds above which the sound is played.
    **audio_kwargs
        Keyword arguments forwarded to ``IPython.display.Audio`` (for example
        ``filename=...``). ``autoplay`` defaults to ``True`` and may be
        overridden here.
    """

    def __init__(self, threshold, **audio_kwargs):
        self.threshold = threshold
        self.start_time = None    # time in sec, or None
        self.audio = audio_kwargs

    def pre_execute(self):
        """Start the timer unless a measurement is already in progress."""
        # Compare against None explicitly: a timestamp is a float and must not
        # be judged by its truthiness.
        if self.start_time is None:
            self.start_time = tme()

    def post_execute(self):
        """Play the sound if the threshold was exceeded, then reset the timer."""
        end_time = tme()
        started = self.start_time
        # Reset before displaying so that an error raised while building or
        # showing the audio widget cannot leave a stale start time behind.
        self.start_time = None
        if started is not None and end_time - started > self.threshold:
            # Merge instead of passing autoplay=True separately, so a caller
            # supplied 'autoplay' no longer raises "multiple values" TypeError.
            audio = Audio(**{'autoplay': True, **self.audio})
            display(audio)

ipython = get_ipython()

# Re-running this cell would otherwise leave the previous Beeper's callbacks
# registered next to the new ones, producing one extra beep per re-run.
# Remove them before registering the fresh instance.
if 'beeper' in globals():
    for eventName, staleCallback in (('pre_execute', beeper.pre_execute),
                                     ('post_execute', beeper.post_execute)):
        try:
            ipython.events.unregister(eventName, staleCallback)
        except ValueError:
            pass    # that callback is not registered (any more)

# Beep when a cell has been running for more than 5 seconds
beeper = Beeper(5, filename='/usr/share/sounds/gnome/default/alerts/drip.ogg')

ipython.events.register('pre_execute', beeper.pre_execute)
ipython.events.register('post_execute', beeper.post_execute)

In [None]:
# Add the relevant scripts from LArMachineLearningData
# Nice the process so it can run with lots of cores on low priority
import os
os.nice(20)

# Add path for LArMachineLearningData
import sys
pandoraMVADir = os.environ['PANDORAMVADIR']
dataDir       = os.environ['PANDORABDTDATADIR']
plotsDir      = os.environ['PANDORABDTPLOTSDIR']

# os.path.join gives the same path whether or not PANDORAMVADIR ends with a
# slash; plain string concatenation silently produced a wrong path without it.
scriptsDir = os.path.join(pandoraMVADir, 'LArMachineLearningData', 'scripts')

# Avoid piling up duplicate entries when this cell is executed again
if scriptsDir not in sys.path:
    sys.path.append(scriptsDir)

# Import pandora libraries
from importlib import reload
from PandoraBDT import *

# Import concatenation tool
from itertools import chain

# Import relevant SKLearn libraries
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn import metrics

# Global parameters used by the cells below
nCores = -1                # NOTE(review): not referenced in the visible cells; -1 presumably means "all cores" - confirm
testTrainFraction = 0.5    # forwarded to Sample(...) when splitting into train and test samples

In [None]:
# Set some analysis specific things
# os.path.join accepts PANDORABDTDATADIR with or without a trailing slash,
# unlike plain concatenation which needed the slash to be present.
trainingFile = os.path.join(dataDir, 'training_file_vertex_mix.txt')
BDTName = "VertexVertex"

# Names of all feature columns, in column order. The numbers in the comments
# are the positions in allFeatureNames that the `removals` sets refer to.

# Event-level features (positions 0-18)
_eventFeatureNames = [
    'Event_Showeryness',                        # 0
    'Event_Energy',                             # 1
    'Event_Volume',                             # 2
    'Event_Longitudinality',                    # 3
    'Event_Num_Hits',                           # 4
    'Event_Num_Clusters',                       # 5
    'Event_Num_Candidates',                     # 6
    'Event_Area',                               # 7
    'Event_Longitudinality2D',                  # 8
    'Event_Area_Average_Z',                     # 9
    'Event_Longitudinality2D_Average_Z',        # 10
    'Event_Hit_Proportion',                     # 11
    'Event_Energy_Proportion',                  # 12
    'Event_Hit_Outlier_Proportion',             # 13
    'Event_Energy_Outlier_Proportion',          # 14
    'Event_Hit_Harsh_Proportion',               # 15
    'Event_Energy_Harsh_Proportion',            # 16
    'Event_Hit_Outlier_Harsh_Proportion',       # 17
    'Event_Energy_Outlier_Harsh_Proportion',    # 18
]

# Per-vertex features: the same quantities are listed for Vertex1 and then for
# Vertex2. Comments give the resulting position as "Vertex1 / Vertex2".
_vertexFeatureSuffixes = [
    'Beam_Deweighting',              # 19 / 52
    'Energy_Kick',                   # 20 / 53
    'Global_Asymmetry',              # 21 / 54
    'Local_Asymmetry',               # 22 / 55
    'Shower_Asymmetry',              # 23 / 56
    'Global_Smooth_Asymmetry',       # 24 / 57
    'Global_Asymmetry_AC',           # 25 / 58
    'Global_Smooth_Asymmetry_AC',    # 26 / 59
    'Global_Asymmetry_MC',           # 27 / 60
    'Global_Smooth_Asymmetry_MC',    # 28 / 61
    'Local_Smooth_Asymmetry',        # 29 / 62
    'Shower_Smooth_Asymmetry',       # 30 / 63
    'dEdx_Asymmetry',                # 31 / 64
    'dEdx_Smooth_Asymmetry',         # 32 / 65
    'dEdx_Asymmetry_AC',             # 33 / 66
    'dEdx_Smooth_Asymmetry_AC',      # 34 / 67
    'dEdx_Asymmetry_MC',             # 35 / 68
    'dEdx_Smooth_Asymmetry_MC',      # 36 / 69
    'dEdx_Deviation',                # 37 / 70
    'dEdx_Deviation_AC',             # 38 / 71
    'dEdx_Deviation_MC',             # 39 / 72
    'Braggness',                     # 40 / 73
    'Braggness_AC',                  # 41 / 74
    'Braggness_MC',                  # 42 / 75
    'Energy',                        # 43 / 76
    'Average_Energy',                # 44 / 77
    'Charge',                        # 45 / 78
    'Average_Charge',                # 46 / 79
    'Energy_Ratio',                  # 47 / 80
    'Charge_Ratio',                  # 48 / 81
    'Num_Local_Clusters',            # 49 / 82
    'Num_Local_Sliding_Fits',        # 50 / 83
    'rPhi',                          # 51 / 84
]

allFeatureNames = _eventFeatureNames + [
    vertexTag + '_' + suffix
    for vertexTag in ('Vertex1', 'Vertex2')
    for suffix in _vertexFeatureSuffixes
]

# Per-class plotting settings. In each list, entry 0 belongs to the
# background class and entry 1 to the signal class.
params = dict(
    labelNames=['Incorrect Vertex', 'Correct Vertex'],
    signalDefs=[0, 1],
    signalCols=['r', 'b'],
)

# Reference BDT: the starting point that parameter variations are compared to.
# It boosts 10 decision trees, each limited to a depth of 10.
weakLearner = DecisionTreeClassifier(max_depth=10)
baseBDT = AdaBoostClassifier(weakLearner,
                             n_estimators=10,
                             algorithm='SAMME',
                             random_state=42)

In [None]:
# Load the data
# LoadData is given the training file path and ',' (presumably the field
# separator used in that file - confirm against PandoraBDT).
# nFeatures is redefined later in this cell once the unwanted columns are removed.
allData, nFeatures, nExamples = LoadData(trainingFile, ',')

# Define version and variable usage
#
# Each preset maps a version name to the set of column positions (indices
# into allFeatureNames) that are removed before training. Keeping every
# variant in one table replaces the previous comment/uncomment switching.
removalPresets = {
    "Original": {
        *range(7,19), *range(24,51), *range(57,84)},

    "NewEventShapes": {
        *range(2,4), *range(7,9), *range(11,19), *range(24,51),
        *range(57,84)},

    "Smooth": {
        *range(7,19), *range(21,24), *range(25,29), *range(31,51),
        *range(54,57), *range(58,62), *range(64,84)},

    "New": {
        *range(7,12), *range(13,19), *range(24,31), *range(32,37),
        *range(38,44), *range(45,49), *range(50,51), *range(57,64),
        *range(65,70), *range(71,77), *range(78,82), *range(83,84)},

    "FirstPass": {
        *range(2,4), *range(7,9), *range(15,19), *range(21,28),
        *range(31,36), *range(37,39), *range(40,42), *range(43,44),
        *range(45,49), *range(54,61), *range(64,69), *range(70,72),
        *range(73,75), *range(76,77), *range(78,82)},

    "SmoothAndNew": {
        *range(2,4), *range(7,9), *range(15,19), *range(21,24),
        *range(25,29), *range(31,32), *range(33,37), *range(38,40),
        *range(41,44), *range(45,49), *range(54,57), *range(58,62),
        *range(64,65), *range(66,70), *range(71,73), *range(74,77),
        *range(78,82)},
}

# Feature set to train with: any key of removalPresets
selectedVersion = "Original"
removals = removalPresets[selectedVersion]

# Using better hyperparameters ?
# Tag appended to the version name; the alternative tag is "_HighHP"
hyperParamTag = "_ThinHP"
version = selectedVersion + hyperParamTag

# Plotting Directory
# Built with os.path.join so a trailing slash on PANDORABDTPLOTSDIR does not
# produce a doubled separator. The trailing '/' is kept because later code
# appends file names to topDir.
topDir = os.path.join(plotsDir, BDTName, version) + '/'

# The plotting cells save straight into this directory, which fails if it does
# not exist yet, so create it (and any missing parents) up front.
os.makedirs(topDir, exist_ok=True)

# Remove unwanted features
data = RemoveFeature(allData, removals)

# Names of the columns that survive, in their original order
featureNames = [allFeatureNames[idx]
                for idx in range(nFeatures)
                if idx not in removals]

# Redefine size
nFeatures = len(data[0]) - 1

# Check removals: the two numbers printed should agree
print(nFeatures, " =? ", len(featureNames))
print(featureNames)

In [None]:
# Separate features from labels, extract the weights and shuffle everything
featuresOrg, labelsOrg = SplitTrainingSet(data, nFeatures)
# NOTE(review): 6 looks like a column index into the already reduced `data`;
# which feature sits there would then depend on `removals` - confirm.
weightsOrg = GetWeights(data, 6)
features, labels, weights = Randomize(featuresOrg, labelsOrg, weightsOrg, True)

# Split into train and test samples
(xTrain, yTrain, weightsTrain,
 xTest, yTest, weightsTest) = Sample(features, labels, weights, testTrainFraction)

# Split into signal and background based on the true labels
isSignal = labels == 1
isBackground = labels == 0
signalFeatures = features[isSignal]
backgroundFeatures = features[isBackground]

# Check the features array is the same size as the feature names array
print (len(featureNames))
print (np.shape(features))
print('Total: ', len(features),
      ', signal: ', len(signalFeatures),
      ' and background: ', len(backgroundFeatures), sep='')
print (len(weights))

In [None]:
# Construct the Pandas dataframe
# Collect one column per feature name, plus the true labels
allDict = {}
for column in range(nFeatures):
    allDict[featureNames[column]] = features[:, column]
allDict['Labels'] = labels

# Single dataframe holding both classes; the signal and background rows are
# selected through the 'Labels' column where needed
df = pd.DataFrame(allDict)

In [None]:
# Make plots drawing the variables for signal/background
# save = False: presumably the plots are only displayed and nothing is written
# under topDir - confirm against PandoraBDT.
DrawVariablesDF(df, params, topDir, save = False)

In [None]:
# Make correlation matrices, one per class
for classValue, className in zip(params['signalDefs'], params['labelNames']):
    classRows = df[df['Labels'] == classValue]
    Correlation(classRows, className + ' Correlation Matrix', topDir, save = False)

In [None]:
# Reference BDT with controlled hyperparams
# The weighted fit below is kept for reference but disabled: no sample_weight
# is passed, so weightsTrain is not used by the training in this notebook.
#baseBDT.fit(xTrain,yTrain, sample_weight = weightsTrain)
baseBDT.fit(xTrain,yTrain)

In [None]:
# Plot ROC curves
fig, ax = plt.subplots()

# metrics.plot_roc_curve was removed in scikit-learn 1.2. Prefer its
# replacement, RocCurveDisplay.from_estimator (available from 1.0), and fall
# back to the old function only on installations that predate it.
rocDisplayCls = getattr(metrics, 'RocCurveDisplay', None)
if hasattr(rocDisplayCls, 'from_estimator'):
    rocDisplayCls.from_estimator(baseBDT, xTest, yTest, ax=ax)
else:
    metrics.plot_roc_curve(baseBDT, xTest, yTest, ax=ax)

plt.title("ROC Curves")
ax.invert_xaxis()
ax.legend()
ax.grid()

# Save the same figure in both formats
for extension in ("png", "pdf"):
    plt.savefig(topDir + '/' + "roc." + extension, bbox_inches='tight')

In [None]:
# Plot Confusion Matrices
fig, ax = plt.subplots()

# metrics.plot_confusion_matrix was removed in scikit-learn 1.2. Prefer its
# replacement, ConfusionMatrixDisplay.from_estimator (available from 1.0), and
# fall back to the old function only on installations that predate it.
confusionDisplayCls = getattr(metrics, 'ConfusionMatrixDisplay', None)
if hasattr(confusionDisplayCls, 'from_estimator'):
    confusionDisplayCls.from_estimator(baseBDT, xTest, yTest,
                                       display_labels=params['labelNames'],
                                       ax=ax, normalize='true')
else:
    metrics.plot_confusion_matrix(baseBDT, xTest, yTest,
                                  display_labels=params['labelNames'],
                                  ax=ax, normalize='true')
ax.invert_xaxis()
plt.title("Confusion matrix (True Normalised)")

# Save the same figure in both formats, then show it
for extension in ("png", "pdf"):
    plt.savefig(topDir + '/' + "confusion_matrix." + extension, bbox_inches='tight')
plt.show()

In [None]:
# Print more detailed performance info
bdtPredicted = baseBDT.predict(xTest)

# Remind the reader which class name belongs to which label value
labelNames = params['labelNames']
print ("Background (0): ", labelNames[0])
print ("Signal (1): ", labelNames[1])

report = metrics.classification_report(yTest, bdtPredicted)
print ("BDT:\n", report)

In [None]:
# Plot importance of features
importanceDF = pd.DataFrame({
    'Features': featureNames,
    'Importance Score': baseBDT.feature_importances_,
})

# Sort once (ascending score) and use the result for both the table and the plot
rankedImportance = importanceDF.sort_values(by=['Importance Score'])
print (rankedImportance)
ax = rankedImportance.plot(kind='barh', x='Features', y='Importance Score')

for extension in ("png", "pdf"):
    plt.savefig(topDir + '/' + "feature_importance." + extension, bbox_inches='tight')

In [None]:
# Print all tunable params
# Shown as the cell output: the keys of the dictionary returned by get_params()
baseBDT.get_params().keys()

In [None]:
# Reload PandoraBDT so that edits made to the module since it was first
# imported are picked up without restarting the kernel
import PandoraBDT
reload (PandoraBDT)
# Repeat the star-import so the names used in this notebook refer to the
# reloaded definitions
from PandoraBDT import *

# Settings handed to the PandoraBDT helpers called below.
# Here the signal class is listed first: entry 0 of each list is label 1.
parameters = dict(
    ClassNames=['True Vertex', 'Incorrect Vertex'],
    SignalDefinition=[1, 0],
    PlotColors=['b', 'r'],
    nBins=100,
    PlotStep=1.0,
    OptimalBinCut=0,
    OptimalScoreCut=0.0,
    nTrees=100,
    TreeDepth=3,
)

# Look for the optimal significance cut on the test sample, then plot the BDT
# scores for the test and train samples (save=True is passed together with topDir).
# NOTE(review): 'OptimalBinCut' / 'OptimalScoreCut' start at 0 in `parameters`;
# presumably FindOptimalSignificanceCut fills them in for PlotBdtScores, which
# would make the order of these two calls matter - confirm against PandoraBDT.
FindOptimalSignificanceCut(baseBDT, xTest, yTest, parameters)
PlotBdtScores(baseBDT, xTest, yTest, xTrain, yTrain, 'Vertex Vertex', parameters, topDir, save=True)

In [None]:
# Score of the reference BDT on the test sample, shown as the cell output
# (for a scikit-learn classifier, score() reports the mean accuracy)
baseBDT.score(xTest,yTest)

In [None]:
# Export the trained BDT under the BDT's name, once as XML and once as a pickle.
# The file names are relative, so both land in the current working directory.
xmlFileName = BDTName + ".xml"
pklFileName = BDTName + ".pkl"

WriteXmlFile(xmlFileName, baseBDT, BDTName)
SerializeToPkl(pklFileName, baseBDT)