# HIGGS Dataset
See https://archive.ics.uci.edu/ml/datasets/HIGGS for dataset information and feature descriptions.

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import csv
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
from scipy import stats
import pickle
import operator
import glob
from scipy.io.arff import loadarff 
from scipy.stats import ttest_rel

import seaborn as sns; sns.set_style('white')

from sklearn.utils import resample
from sklearn.metrics import accuracy_score, plot_confusion_matrix, f1_score, plot_roc_curve, roc_auc_score, make_scorer
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score

In [19]:
df = pd.read_csv('HIGGS.csv', header=None)
df.columns = ['class', 'lepton_pT', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude', 
              'missing_energy_phi', 'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag', 
              'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag', 'jet_3_pt', 'jet_3_eta', 
              'jet_3_phi', 'jet_3_b-tag', 'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag', 
              'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']

In [21]:
df

Unnamed: 0,class,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet_1_pt,jet_1_eta,jet_1_phi,jet_1_b-tag,...,jet_4_eta,jet_4_phi,jet_4_b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1.0,0.869293,-0.635082,0.225690,0.327470,-0.689993,0.754202,-0.248573,-1.092064,0.000000,...,-0.010455,-0.045767,3.101961,1.353760,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.497970,-0.313010,1.095531,-0.557525,-1.588230,2.173076,...,-1.138930,-0.000819,0.000000,0.302220,0.833048,0.985700,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.000000,...,1.128848,0.900461,0.000000,0.909753,1.108330,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.992050,0.882454,1.786066,-1.646778,-0.942383,0.000000,...,-0.678379,-1.360356,0.000000,0.946652,1.028704,0.998656,0.728281,0.869200,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.000000,...,-0.373566,0.113041,0.000000,0.755856,1.361057,0.986610,0.838085,1.133295,0.872245,0.808487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10999995,1.0,1.159912,1.013847,0.108615,1.495524,-0.537545,2.342396,-0.839740,1.320683,0.000000,...,-0.097068,1.190680,3.101961,0.822136,0.766772,1.002191,1.061233,0.837004,0.860472,0.772484
10999996,1.0,0.618388,-1.012982,1.110139,0.941023,-0.379199,1.004656,0.348535,-1.678593,2.173076,...,-0.216995,1.049177,3.101961,0.826829,0.989809,1.029104,1.199679,0.891481,0.938490,0.865269
10999997,1.0,0.700559,0.774251,1.520182,0.847112,0.211230,1.095531,0.052457,0.024553,2.173076,...,1.585235,1.713962,0.000000,0.337374,0.845208,0.987610,0.883422,1.888438,1.153766,0.931279
10999998,0.0,1.178030,0.117796,-1.276980,1.864457,-0.584370,0.998519,-1.264549,1.276333,0.000000,...,1.399515,-1.313189,0.000000,0.838842,0.882890,1.201380,0.939216,0.339705,0.759070,0.719119


In [4]:
# Separate majority and minority classes
df_majority = df[df['class']==0]
df_minority = df[df['class']==1]

# Downsample majority and minority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=6000,     # to match minority class
                                 random_state=123) # reproducible results

df_minority_downsampled = resample(df_minority, 
                                 replace=False,    # sample with replacement
                                 n_samples=6000,     # to match majority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority_downsampled])
 
# Display new class counts
df_downsampled['class'].value_counts()

1.0    6000
0.0    6000
Name: class, dtype: int64

In [5]:
df_downsampled

Unnamed: 0,class,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet_1_pt,jet_1_eta,jet_1_phi,jet_1_b-tag,...,jet_4_eta,jet_4_phi,jet_4_b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
2984264,0.0,0.558361,0.125588,-0.383650,0.499787,-1.464681,0.656732,-0.140638,0.583364,0.000000,...,0.719931,0.224578,3.101961,1.373681,0.927890,0.994193,1.126345,1.159692,0.833733,0.796250
10764116,0.0,1.296253,0.516149,-0.117317,0.636398,-0.455267,0.988075,0.144548,0.907119,2.173076,...,1.291248,1.573014,0.000000,0.901899,0.875995,0.975326,0.882177,1.255846,0.909731,0.763828
9530004,0.0,1.062368,-0.338021,1.577887,0.534542,-0.183894,1.081973,-1.038777,-0.608094,2.173076,...,0.164438,0.940969,0.000000,1.120079,0.906208,1.043838,0.997955,1.427019,0.901307,0.789998
5455217,0.0,1.355182,0.181104,0.414898,0.758879,1.102309,0.513184,-0.193120,1.042387,2.173076,...,0.516723,1.498656,3.101961,0.959572,0.989931,0.984073,0.778193,0.565135,0.838409,0.813519
6082563,0.0,0.659565,-1.691838,0.469274,0.860520,-0.639793,0.850298,-1.143741,1.576804,2.173076,...,-1.006511,-1.012426,0.000000,0.901754,1.060937,0.985931,0.842516,0.874739,0.762080,0.752349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521000,1.0,1.165585,1.037223,-1.130496,1.259846,-0.187387,0.397484,-0.461473,-0.793256,0.000000,...,0.586680,1.607974,3.101961,1.355938,1.000538,1.261941,1.162553,0.474319,0.771688,1.002877
1672790,1.0,0.706049,0.738214,1.374808,0.757621,-0.172691,0.392995,0.581240,0.354407,2.173076,...,-0.128715,-0.656173,0.000000,0.957407,1.225450,0.997393,0.571817,1.134243,0.831666,0.737647
7951892,1.0,0.741004,-0.879548,0.831599,0.560017,-0.590536,1.122463,-0.517916,-1.707975,0.000000,...,-0.693370,0.101387,3.101961,0.807574,1.053625,0.986507,0.659439,0.759948,0.887652,0.748781
10444720,1.0,1.245926,-0.610733,-0.588952,0.719146,0.448291,1.028657,0.864445,1.571815,2.173076,...,-0.700865,1.082472,3.101961,0.727640,0.912113,1.055416,0.650828,0.919102,0.956839,0.801805


In [6]:
df = df_downsampled.copy()
scaler = StandardScaler()
X, y = df.iloc[:,1:].to_numpy(), df.iloc[:,0].to_numpy()

## Hyperparameter Search & Experimentation

In [10]:
def experiment():
    pipeline1 = Pipeline((
    ('clf', RandomForestClassifier()),
    ))

    pipeline2 = Pipeline((
    ('clf', KNeighborsClassifier()),
    ))

    pipeline3 = Pipeline((
    ('clf', AdaBoostClassifier()),
    ))
    
    pipeline4 = Pipeline((
    ('clf', LogisticRegression()),
    ))
    
    pipeline5 = Pipeline((
    ('clf', MLPClassifier()),
    ))
    
    # Random Forest
    parameters1 = {
    'clf__n_estimators': [1024],
    'clf__max_features': [1, 2, 4, 6, 8, 12, 16, 20]
    }

    # KNN
    parameters2 = {
    'clf__n_neighbors': [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,65,69,73,77,81,85,89,93,97,101,105],
    'clf__weights': ['uniform', 'distance']
    }
    
    # AdaBoost (Boosted Decision Tree)
    parameters3 = {
        'clf__algorithm': ['SAMME.R'],
        'clf__n_estimators': [2,4,8,16,32,64,128,256,512,1024,2048],
        'clf__learning_rate': [1e-3, 1e-2, 1e-1, 1e0, 1e1, 2e1, 5e1]
    }
    
    # Logistic
    parameters4 = {
    'clf__penalty':['l1', 'l2', None],
    'clf__C':[1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0, 1e0, 1e1, 1e2, 1e3, 1e4],
    'clf__max_iter':[5000]
    }
    
    # Multi-layer Perceptron
    parameters5 = {
        'clf__hidden_layer_sizes':[(1,), (2,), (4,), (8,), (32,), (128,)],
        'clf__solver':['sgd'],
        'clf__activation':['relu'], 
        'clf__learning_rate':['constant', 'invscaling'], 
        'clf__learning_rate_init': [1e-3, 1e-2, 1e-1, 1e0],
        'clf__max_iter': [2, 4, 8, 16, 32, 64, 128, 256, 512]
    }
    
    pars = [parameters1, parameters2, parameters3, parameters4, parameters5]
    pips = [pipeline1, pipeline2, pipeline3, pipeline4, pipeline5]
    
    # List of dictionaries to hold the scores of the various metrics for each type of classifier
    best_clf_list = []
    trial_storage = {}
    training_storage = {}
    
    print("starting Gridsearch")
    for i in range(len(pars)):
        trial_averages = []
        train_performance = []
        for t in range(5):
            # split and scale data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/6, random_state=t)
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=t)
            X_train = scaler.fit_transform(X_train)
            X_val = scaler.transform(X_val)
            X_test = scaler.transform(X_test)
                
            clf = GridSearchCV(pips[i], pars[i], refit=False, n_jobs=8, cv=5, verbose=3, scoring=('accuracy', 'roc_auc', 'f1'))
            clf = clf.fit(X_val, y_val)
            
            print("finished Gridsearch trial " + str(t + 1) + " classifier " + str(i + 1))
            print("")
            print("")
            
            # find the best params for each metric in a given trial 
            best_index_acc = np.argmin(clf.cv_results_['rank_test_accuracy'])
            best_params_acc = clf.cv_results_['params'][best_index_acc]
            best_index_roc = np.argmin(clf.cv_results_['rank_test_roc_auc'])
            best_params_roc = clf.cv_results_['params'][best_index_roc]
            best_index_f1 = np.argmin(clf.cv_results_['rank_test_f1'])
            best_params_f1 = clf.cv_results_['params'][best_index_f1]
    
            # train and test models for given metric with their corresponding best parameter settings
            pipe = pips[i]
            clf_acc = pipe.set_params(**best_params_acc)
            clf_acc = clf_acc.fit(X_train, y_train)
            clf_roc = pipe.set_params(**best_params_roc)
            clf_roc = clf_roc.fit(X_train, y_train)
            clf_f1 = pipe.set_params(**best_params_f1)
            clf_f1 = clf_f1.fit(X_train, y_train)
            
            # get training set performance
            train_acc = accuracy_score(y_train, clf_acc.predict(X_train))
            train_roc = roc_auc_score(y_train, clf_roc.predict_proba(X_train)[:, 1])
            train_f1 = f1_score(y_train, clf_f1.predict(X_train))
            
            train_performance.append({
                'Model #': i + 1,
                'average': (train_f1 + train_acc + train_roc)/3,
                'accuracy': train_acc,
                'roc_auc_score': train_roc,
                'f1 score': train_f1
            })
            
            # get test set performances 
            trial_acc = clf_acc.score(X_test, y_test)
            trial_roc = roc_auc_score(y_test, clf_roc.predict_proba(X_test)[:, 1])
            trial_f1 = f1_score(y_test, clf_f1.predict(X_test))
            
            # store scores and their averages in list containing averages for each trial
            trial_averages.append({
                'Model #': i + 1, # model number corresponds to the numbers used in pipeline above (i.e. 1 = Random Forest)
                'average':(trial_acc + trial_roc + trial_f1) / 3,
                'accuracy': trial_acc,
                'roc_auc_score': trial_roc,
                'f1_score': trial_f1
            })
            
            train_performance.append({
                'Model #': i + 1, # model number corresponds to the numbers used in pipeline above (i.e. 1 = Random Forest)
                'average':(train_acc + train_roc + train_f1) / 3,
                'accuracy': train_acc,
                'roc_auc_score': train_roc,
                'f1_score': train_f1
            })
            
        # find the trial with the best average metric scores and append those scores as a dict to best clf list
        max_average = 0
        for trial in trial_averages:
            if trial['average'] > max_average:
                max_average = trial['average']
                best_trial = trial

        best_clf_list.append(best_trial)
        training_storage[str(i + 1)]=train_performance
        trial_storage[str(i + 1)]=trial_averages
    
    return best_clf_list, trial_storage, training_storage

In [11]:
%%capture --no-stdout --no-display
best_clf_list, trial_storage, training_perf = experiment()

starting Gridsearch
Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 1 classifier 1


Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 2 classifier 1


Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 3 classifier 1


Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 4 classifier 1


Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 5 classifier 1


Fitting 5 folds for each of 54 candidates, totalling 270 fits
finished Gridsearch trial 1 classifier 2


Fitting 5 folds for each of 54 candidates, totalling 270 fits
finished Gridsearch trial 2 classifier 2


Fitting 5 folds for each of 54 candidates, totalling 270 fits
finished Gridsearch trial 3 classifier 2


Fitting 5 folds for each of 54 candidates, totalling 270 fits
finished Gridsearch trial 4 classifier 2


Fitting 5 folds for each of 54 candidates, to

## Calculating and Organizing Results

In [22]:
print('Best Models On Average For Test Set:')
for element in best_clf_list:
    print(element)

print()

print('Train Set Data')
for i in range(len(training_perf)):
    print(training_perf[str(i + 1)])

print()

alg_avg = {}
alg_acc = {}
alg_roc = {}
alg_f1 = {}

for i in range(len(trial_storage)):
    alg_avg[str(i + 1)]=[]
    alg_acc[str(i + 1)]=[]
    alg_roc[str(i + 1)]=[]
    alg_f1[str(i + 1)]=[]
    for entry in trial_storage[str(i + 1)]:
        alg_avg[str(i + 1)].append(entry['average'])
        alg_acc[str(i + 1)].append(entry['accuracy'])
        alg_roc[str(i + 1)].append(entry['roc_auc_score'])
        alg_f1[str(i + 1)].append(entry['f1_score'])

print('set of averages of algorithms over 5 trials:')
print(alg_avg)
print()
print('set of acc values of algorithms over 5 trials')
print(alg_acc)
print()
print('set of roc values of algorithms over 5 trials')
print(alg_roc)
print()
print('set of f1 values of algorithms over 5 trials')
print(alg_f1)

Best Models On Average For Test Set:
{'Model #': 1, 'average': 0.7561565748411576, 'accuracy': 0.735, 'roc_auc_score': 0.8038778877887788, 'f1_score': 0.7295918367346939}
{'Model #': 2, 'average': 0.6618301953002318, 'accuracy': 0.627, 'roc_auc_score': 0.6856835683568356, 'f1_score': 0.6728070175438597}
{'Model #': 3, 'average': 0.7288081064743643, 'accuracy': 0.705, 'roc_auc_score': 0.7714931493149316, 'f1_score': 0.7099311701081613}
{'Model #': 4, 'average': 0.6415731580835212, 'accuracy': 0.62, 'roc_auc_score': 0.668008193179818, 'f1_score': 0.6367112810707457}
{'Model #': 5, 'average': 0.6889371606935969, 'accuracy': 0.671, 'roc_auc_score': 0.7281347144040231, 'f1_score': 0.6676767676767676}

Train Set Data
[{'Model #': 1, 'average': 1.0, 'accuracy': 1.0, 'roc_auc_score': 1.0, 'f1 score': 1.0}, {'Model #': 1, 'average': 1.0, 'accuracy': 1.0, 'roc_auc_score': 1.0, 'f1_score': 1.0}, {'Model #': 1, 'average': 1.0, 'accuracy': 1.0, 'roc_auc_score': 1.0, 'f1 score': 1.0}, {'Model #': 1,

In [23]:
# calculate average acc metric scores per algorithm over 5 trials
alg_acc_averages = {}
for i in range(len(alg_acc)):
    alg_acc_averages[str(i + 1)] = sum(alg_acc[str(i + 1)])/5

print(alg_acc_averages)

{'1': 0.7104999999999999, '2': 0.6085, '3': 0.6943, '4': 0.5891, '5': 0.6429}


In [14]:
# calculate average roc metric scores per algorithm over 5 trials
alg_roc_averages = {}
for i in range(len(alg_roc)):
    alg_roc_averages[str(i + 1)] = sum(alg_roc[str(i + 1)])/5

print(alg_roc_averages)

{'1': 0.7794622743489954, '2': 0.6649067821966286, '3': 0.7603644969664627, '4': 0.627141463605107, '5': 0.6950234356492659}


In [15]:
# calculate average f1 metric scores per algorithm over 5 trials
alg_f1_averages = {}
for i in range(len(alg_f1)):
    alg_f1_averages[str(i + 1)] = sum(alg_f1[str(i + 1)])/5

print(alg_f1_averages)

{'1': 0.7007262064637629, '2': 0.6507842311602793, '3': 0.6934850179150744, '4': 0.6059293300259198, '5': 0.6694721116385507}


In [16]:
averages = {}
for i in range(len(alg_acc_averages)):
    averages[str(i + 1)] = (alg_acc_averages[str(i + 1)] + alg_roc_averages[str(i + 1)] + alg_f1_averages[str(i + 1)])/3

print(averages)

{'1': 0.7302294936042527, '2': 0.6413970044523026, '3': 0.7160498382938458, '4': 0.6073902645436755, '5': 0.6691318490959389}


In [17]:
# t-test best against rest mean of metrics (RF against rest)
combined_metrics_1 = []
combined_metrics_2 = []
combined_metrics_3 = []
combined_metrics_4 = []
combined_metrics_5 = []

for item in alg_acc['1']:
    combined_metrics_1.append(item)
for item in alg_roc['1']:
    combined_metrics_1.append(item)
for item in alg_f1['1']:
    combined_metrics_1.append(item)
    
for item in alg_acc['2']:
    combined_metrics_2.append(item)
for item in alg_roc['2']:
    combined_metrics_2.append(item)
for item in alg_f1['2']:
    combined_metrics_2.append(item)

for item in alg_acc['3']:
    combined_metrics_3.append(item)
for item in alg_roc['3']:
    combined_metrics_3.append(item)
for item in alg_f1['3']:
    combined_metrics_3.append(item)
    
for item in alg_acc['4']:
    combined_metrics_4.append(item)
for item in alg_roc['4']:
    combined_metrics_4.append(item)
for item in alg_f1['4']:
    combined_metrics_4.append(item)

for item in alg_acc['5']:
    combined_metrics_5.append(item)
for item in alg_roc['5']:
    combined_metrics_5.append(item)
for item in alg_f1['5']:
    combined_metrics_5.append(item)
    
print(ttest_rel(combined_metrics_1, combined_metrics_2))
print(ttest_rel(combined_metrics_1, combined_metrics_3))
print(ttest_rel(combined_metrics_1, combined_metrics_4))
print(ttest_rel(combined_metrics_1, combined_metrics_5))

Ttest_relResult(statistic=10.85608253750064, pvalue=3.3462524215480636e-08)
Ttest_relResult(statistic=3.9539094923767744, pvalue=0.0014406524366184668)
Ttest_relResult(statistic=9.13710780210442, pvalue=2.822386600659466e-07)
Ttest_relResult(statistic=6.999851511777664, pvalue=6.249422937916668e-06)
