# HTRU2 Pulsar Dataset
See https://archive.ics.uci.edu/ml/datasets/HTRU2 for dataset information and feature descriptions.

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import csv
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
from scipy import stats
import pickle
import operator
import glob
from scipy.io.arff import loadarff 
from scipy.stats import ttest_rel

import seaborn as sns; sns.set_style('white')

from sklearn.utils import resample
from sklearn.metrics import accuracy_score, plot_confusion_matrix, f1_score, plot_roc_curve, roc_auc_score, make_scorer
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score

In [2]:
raw_data = loadarff('HTRU2/HTRU_2.arff')
df = pd.DataFrame(raw_data[0])
df['class'] = np.where(df['class']==b'1', 1, 0)
df['class'].value_counts()

0    16259
1     1639
Name: class, dtype: int64

In [3]:
# Separate majority and minority classes
df_majority = df[df['class']==0]
df_minority = df[df['class']==1]

# Downsample majority and minority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=6000,     # to match minority class
                                 random_state=123) # reproducible results

df_minority_downsampled = resample(df_minority, 
                                 replace=True,    # sample with replacement
                                 n_samples=6000,     # to match majority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority_downsampled])
 
# Display new class counts
df_downsampled['class'].value_counts()

1    6000
0    6000
Name: class, dtype: int64

In [4]:
df = df_downsampled.copy()
scaler = StandardScaler()
X, y = df.iloc[:,0:8].to_numpy(), df.iloc[:,8].to_numpy()

## Hyperparameter Search & Experimentation

In [5]:
def experiment():
    pipeline1 = Pipeline((
    ('clf', RandomForestClassifier()),
    ))

    pipeline2 = Pipeline((
    ('clf', KNeighborsClassifier()),
    ))

    pipeline3 = Pipeline((
    ('clf', AdaBoostClassifier()),
    ))
    
    pipeline4 = Pipeline((
    ('clf', LogisticRegression()),
    ))
    
    pipeline5 = Pipeline((
    ('clf', MLPClassifier()),
    ))
    
    # Random Forest
    parameters1 = {
    'clf__n_estimators': [1024],
    'clf__max_features': [1, 2, 4, 6, 8, 12, 16, 20]
    }

    # KNN
    parameters2 = {
    'clf__n_neighbors': [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,65,69,73,77,81,85,89,93,97,101,105],
    'clf__weights': ['uniform', 'distance']
    }
    
    # AdaBoost (Boosted Decision Tree)
    parameters3 = {
        'clf__algorithm': ['SAMME.R'],
        'clf__n_estimators': [2,4,8,16,32,64,128,256,512,1024,2048],
        'clf__learning_rate': [1e-3, 1e-2, 1e-1, 1e0, 1e1, 2e1, 5e1]
    }
    
    # Logistic
    parameters4 = {
    'clf__penalty':['l1', 'l2', None],
    'clf__C':[1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0, 1e0, 1e1, 1e2, 1e3, 1e4],
    'clf__max_iter':[5000]
    }
    
    # Multi-layer Perceptron
    parameters5 = {
        'clf__hidden_layer_sizes':[(1,), (2,), (4,), (8,), (32,), (128,)],
        'clf__solver':['sgd'],
        'clf__activation':['relu'], 
        'clf__learning_rate':['constant', 'invscaling'], 
        'clf__learning_rate_init': [1e-3, 1e-2, 1e-1, 1e0],
        'clf__max_iter': [2, 4, 8, 16, 32, 64, 128, 256, 512]
    }
    
    pars = [parameters1, parameters2, parameters3, parameters4, parameters5]
    pips = [pipeline1, pipeline2, pipeline3, pipeline4, pipeline5]
    
    # List of dictionaries to hold the scores of the various metrics for each type of classifier
    best_clf_list = []
    trial_storage = {}
    training_storage = {}
    
    print("starting Gridsearch")
    for i in range(len(pars)):
        trial_averages = []
        train_performance = []
        for t in range(5):
            # split and scale data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/6, random_state=t)
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=t)
            X_train = scaler.fit_transform(X_train)
            X_val = scaler.transform(X_val)
            X_test = scaler.transform(X_test)
                
            clf = GridSearchCV(pips[i], pars[i], refit=False, n_jobs=8, cv=5, verbose=3, scoring=('accuracy', 'roc_auc', 'f1'))
            clf = clf.fit(X_val, y_val)
            
            print("finished Gridsearch trial " + str(t + 1) + " classifier " + str(i + 1))
            print("")
            print("")
            
            # find the best params for each metric in a given trial 
            best_index_acc = np.argmin(clf.cv_results_['rank_test_accuracy'])
            best_params_acc = clf.cv_results_['params'][best_index_acc]
            best_index_roc = np.argmin(clf.cv_results_['rank_test_roc_auc'])
            best_params_roc = clf.cv_results_['params'][best_index_roc]
            best_index_f1 = np.argmin(clf.cv_results_['rank_test_f1'])
            best_params_f1 = clf.cv_results_['params'][best_index_f1]
    
            # train and test models for given metric with their corresponding best parameter settings
            pipe = pips[i]
            clf_acc = pipe.set_params(**best_params_acc)
            clf_acc = clf_acc.fit(X_train, y_train)
            clf_roc = pipe.set_params(**best_params_roc)
            clf_roc = clf_roc.fit(X_train, y_train)
            clf_f1 = pipe.set_params(**best_params_f1)
            clf_f1 = clf_f1.fit(X_train, y_train)
            
            # get training set performance
            train_acc = accuracy_score(y_train, clf_acc.predict(X_train))
            train_roc = roc_auc_score(y_train, clf_roc.predict_proba(X_train)[:, 1])
            train_f1 = f1_score(y_train, clf_f1.predict(X_train))
            
            train_performance.append({
                'Model #': i + 1,
                'average': (train_f1 + train_acc + train_roc)/3,
                'accuracy': train_acc,
                'roc_auc_score': train_roc,
                'f1 score': train_f1
            })
            
            # get test set performances 
            trial_acc = clf_acc.score(X_test, y_test)
            trial_roc = roc_auc_score(y_test, clf_roc.predict_proba(X_test)[:, 1])
            trial_f1 = f1_score(y_test, clf_f1.predict(X_test))
            
            # store scores and their averages in list containing averages for each trial
            trial_averages.append({
                'Model #': i + 1, # model number corresponds to the numbers used in pipeline above (i.e. 1 = Random Forest)
                'average':(trial_acc + trial_roc + trial_f1) / 3,
                'accuracy': trial_acc,
                'roc_auc_score': trial_roc,
                'f1_score': trial_f1
            })
            
            train_performance.append({
                'Model #': i + 1, # model number corresponds to the numbers used in pipeline above (i.e. 1 = Random Forest)
                'average':(train_acc + train_roc + train_f1) / 3,
                'accuracy': train_acc,
                'roc_auc_score': train_roc,
                'f1_score': train_f1
            })
            
        # find the trial with the best average metric scores and append those scores as a dict to best clf list
        max_average = 0
        for trial in trial_averages:
            if trial['average'] > max_average:
                max_average = trial['average']
                best_trial = trial

        best_clf_list.append(best_trial)
        training_storage[str(i + 1)]=train_performance
        trial_storage[str(i + 1)]=trial_averages
    
    return best_clf_list, trial_storage, training_storage

In [6]:
%%capture --no-stdout --no-display
best_clf_list, trial_storage, training_perf = experiment()

starting Gridsearch
Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 1 classifier 1


Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 2 classifier 1


Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 3 classifier 1


Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 4 classifier 1


Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 5 classifier 1


Fitting 5 folds for each of 54 candidates, totalling 270 fits
finished Gridsearch trial 1 classifier 2


Fitting 5 folds for each of 54 candidates, totalling 270 fits
finished Gridsearch trial 2 classifier 2


Fitting 5 folds for each of 54 candidates, totalling 270 fits
finished Gridsearch trial 3 classifier 2


Fitting 5 folds for each of 54 candidates, totalling 270 fits
finished Gridsearch trial 4 classifier 2


Fitting 5 folds for each of 54 candidates, to

## Calculating and Organizing Results

In [7]:
print('Best Models On Average For Test Set:')
for element in best_clf_list:
    print(element)

print()

print('Train Set Data')
for i in range(len(training_perf)):
    print(training_perf[str(i + 1)])

print()

alg_avg = {}
alg_acc = {}
alg_roc = {}
alg_f1 = {}

for i in range(len(trial_storage)):
    alg_avg[str(i + 1)]=[]
    alg_acc[str(i + 1)]=[]
    alg_roc[str(i + 1)]=[]
    alg_f1[str(i + 1)]=[]
    for entry in trial_storage[str(i + 1)]:
        alg_avg[str(i + 1)].append(entry['average'])
        alg_acc[str(i + 1)].append(entry['accuracy'])
        alg_roc[str(i + 1)].append(entry['roc_auc_score'])
        alg_f1[str(i + 1)].append(entry['f1_score'])

print('set of averages of algorithms over 5 trials:')
print(alg_avg)
print()
print('set of acc values of algorithms over 5 trials')
print(alg_acc)
print()
print('set of roc values of algorithms over 5 trials')
print(alg_roc)
print()
print('set of f1 values of algorithms over 5 trials')
print(alg_f1)

Best Models On Average For Test Set:
{'Model #': 1, 'average': 0.985648124035749, 'accuracy': 0.9815, 'roc_auc_score': 0.9944090056285179, 'f1_score': 0.9810353664787289}
{'Model #': 2, 'average': 0.9805553504954202, 'accuracy': 0.976, 'roc_auc_score': 0.9901558474046278, 'f1_score': 0.9755102040816327}
{'Model #': 3, 'average': 0.9770331745120471, 'accuracy': 0.9695, 'roc_auc_score': 0.9928655409631019, 'f1_score': 0.9687339825730394}
{'Model #': 4, 'average': 0.960472762341343, 'accuracy': 0.95, 'roc_auc_score': 0.9819744043142212, 'f1_score': 0.949443882709808}
{'Model #': 5, 'average': 0.9657670237746814, 'accuracy': 0.9565, 'roc_auc_score': 0.9846297866892832, 'f1_score': 0.9561712846347608}

Train Set Data
[{'Model #': 1, 'average': 1.0, 'accuracy': 1.0, 'roc_auc_score': 1.0, 'f1 score': 1.0}, {'Model #': 1, 'average': 1.0, 'accuracy': 1.0, 'roc_auc_score': 1.0, 'f1_score': 1.0}, {'Model #': 1, 'average': 1.0, 'accuracy': 1.0, 'roc_auc_score': 1.0, 'f1 score': 1.0}, {'Model #': 1

In [8]:
# calculate average acc metric scores per algorithm over 5 trials
alg_acc_averages = {}
for i in range(len(alg_acc)):
    alg_acc_averages[str(i + 1)] = sum(alg_acc[str(i + 1)])/5

print(alg_acc_averages)

{'1': 0.9785, '2': 0.9705999999999999, '3': 0.9664000000000001, '4': 0.9456, '5': 0.9488}


In [9]:
# calculate average roc metric scores per algorithm over 5 trials
alg_roc_averages = {}
for i in range(len(alg_roc)):
    alg_roc_averages[str(i + 1)] = sum(alg_roc[str(i + 1)])/5

print(alg_roc_averages)

{'1': 0.9948326985709499, '2': 0.9912145687881999, '3': 0.9918385096219566, '4': 0.9783811170287539, '5': 0.983436305831695}


In [10]:
# calculate average f1 metric scores per algorithm over 5 trials
alg_f1_averages = {}
for i in range(len(alg_f1)):
    alg_f1_averages[str(i + 1)] = sum(alg_f1[str(i + 1)])/5

print(alg_f1_averages)

{'1': 0.9781659979150834, '2': 0.9703545324353801, '3': 0.9658869112258381, '4': 0.9431128839246217, '5': 0.9470726210627618}


In [11]:
# calculate average of all 3 metric scores for each algorithm 
averages = {}
for i in range(len(alg_acc_averages)):
    averages[str(i + 1)] = (alg_acc_averages[str(i + 1)] + alg_roc_averages[str(i + 1)] + alg_f1_averages[str(i + 1)])/3

print(averages)

{'1': 0.9838328988286777, '2': 0.9773897004078599, '3': 0.9747084736159316, '4': 0.9556980003177918, '5': 0.9597696422981522}


In [12]:
# t-test best against rest mean of metrics (RF against rest)
combined_metrics_1 = []
combined_metrics_2 = []
combined_metrics_3 = []
combined_metrics_4 = []
combined_metrics_5 = []

for item in alg_acc['1']:
    combined_metrics_1.append(item)
for item in alg_roc['1']:
    combined_metrics_1.append(item)
for item in alg_f1['1']:
    combined_metrics_1.append(item)
    
for item in alg_acc['2']:
    combined_metrics_2.append(item)
for item in alg_roc['2']:
    combined_metrics_2.append(item)
for item in alg_f1['2']:
    combined_metrics_2.append(item)

for item in alg_acc['3']:
    combined_metrics_3.append(item)
for item in alg_roc['3']:
    combined_metrics_3.append(item)
for item in alg_f1['3']:
    combined_metrics_3.append(item)
    
for item in alg_acc['4']:
    combined_metrics_4.append(item)
for item in alg_roc['4']:
    combined_metrics_4.append(item)
for item in alg_f1['4']:
    combined_metrics_4.append(item)

for item in alg_acc['5']:
    combined_metrics_5.append(item)
for item in alg_roc['5']:
    combined_metrics_5.append(item)
for item in alg_f1['5']:
    combined_metrics_5.append(item)
    
print(ttest_rel(combined_metrics_1, combined_metrics_2))
print(ttest_rel(combined_metrics_1, combined_metrics_3))
print(ttest_rel(combined_metrics_1, combined_metrics_4))
print(ttest_rel(combined_metrics_1, combined_metrics_5))

Ttest_relResult(statistic=9.488391242469374, pvalue=1.783103793622407e-07)
Ttest_relResult(statistic=7.314255242662729, pvalue=3.82457095450185e-06)
Ttest_relResult(statistic=11.394423748363014, pvalue=1.812873909233341e-08)
Ttest_relResult(statistic=9.31379364409074, pvalue=2.2366945324128233e-07)
