In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [2]:
pd.set_option('display.float_format', lambda x: '%.6f' % x)

In [3]:
data=pd.read_csv('./processed_logs.csv',index_col=0)

In [4]:
data.shape

(575061, 157)

In [5]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,147,148,149,150,151,152,153,154,155,anomaly_label
0,3.0,1.0,3.0,3.0,3.0,4.0,2.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,3.0,1.0,3.0,3.0,3.0,0.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,3.0,1.0,3.0,3.0,3.0,4.0,2.0,3.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,3.0,1.0,3.0,3.0,3.0,1.0,0.0,3.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,3.0,1.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [6]:
X_train,X_test,y_train,y_test=model_selection.train_test_split(data.iloc[:,:-1],data.loc[:,'anomaly_label'],test_size=0.15,random_state=586)

In [7]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(488801, 156)
(488801,)
(86260, 156)
(86260,)


In [8]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,146,147,148,149,150,151,152,153,154,155
480256,3.0,1.0,3.0,3.0,3.0,6.0,3.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
108706,3.0,1.0,3.0,3.0,3.0,4.0,2.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
379236,3.0,1.0,3.0,3.0,3.0,0.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
573215,3.0,1.0,3.0,3.0,3.0,0.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
305262,3.0,1.0,3.0,3.0,3.0,0.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
print('Train percent anamolies:',str(np.round(y_train.mean(),4)))
print('Test percent anamolies:',str(np.round(y_test.mean(),4)))

Train percent anamolies: 0.0293
Test percent anamolies: 0.0291


## Standardization

In [10]:
## Perform standardization

# Fit StandardScaler on training set
scaler=preprocessing.StandardScaler().fit(X_train)

# Apply standardization to train/test sets
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

# Examine mean/std of first 5
print('Train mean:',X_train_scaled.mean(axis=0)[0:5])
print('Test mean:',X_test_scaled.mean(axis=0)[0:5])

print('Train std:',X_train_scaled.std(axis=0)[0:5])
print('Test std:',X_test_scaled.std(axis=0)[0:5])

Train mean: [-4.28388944e-17  0.00000000e+00 -6.02622737e-16 -4.51792616e-17
  2.19434859e-16]
Test mean: [-0.00519304  0.         -0.00121893 -0.00130376 -0.00438258]
Train std: [1. 0. 1. 1. 1.]
Test std: [0.98143849 0.         1.00334537 1.0063736  0.9804627 ]


## PCA (Dimensionality Reduction)

In [11]:
# Perform PCA on scaled data
pca=PCA()
pca.fit(X_train_scaled)

In [12]:
# Examine cumulative proportion of variance explained
print(pca.explained_variance_ratio_.cumsum())

[0.04912652 0.089005   0.11733082 0.14365539 0.16733301 0.18999414
 0.21150424 0.23293285 0.25436147 0.27579008 0.2972187  0.3186368
 0.33987786 0.35985766 0.37878781 0.39709499 0.4145823  0.43101974
 0.44718355 0.46286492 0.47833309 0.49347838 0.50780719 0.52171951
 0.53497629 0.54775877 0.55985099 0.57098809 0.58183423 0.59234806
 0.60205399 0.61166981 0.62085651 0.629801   0.63818631 0.64639744
 0.65423019 0.66203025 0.66969688 0.67718418 0.68457114 0.69190038
 0.69918576 0.70636176 0.71350754 0.72065071 0.7277936  0.73493649
 0.74207936 0.74922223 0.7563651  0.76350797 0.77065085 0.77779372
 0.78493659 0.79207946 0.79922233 0.8063652  0.81350808 0.82065095
 0.82779382 0.83493669 0.84207956 0.84922244 0.85636531 0.86350818
 0.87065037 0.87779083 0.88492879 0.89205189 0.89898001 0.90588547
 0.91271485 0.91949584 0.92575575 0.9317303  0.93688184 0.94171224
 0.94613692 0.95038088 0.95442041 0.95814992 0.96172137 0.96513302
 0.96819554 0.97119517 0.97406299 0.9767474  0.97932905 0.98185

In [13]:
num_components=np.argmax(pca.explained_variance_ratio_.cumsum()>=0.99)
print('# Components to explain 99% of variance:',num_components) 

# Components to explain 99% of variance: 94


In [14]:
# Get component scores for train/test sets
X_train_scores=pca.transform(X_train_scaled)[:,:num_components]
X_test_scores=pca.transform(X_test_scaled)[:,:num_components]
print(X_train_scores.shape)
print(X_test_scores.shape)

(488801, 94)
(86260, 94)


## Define weighted binary cross entropy loss function

In [15]:
def weighted_bce(y_true,y_pred):
    sample_weights=(y_true.values==1).astype(float)
    sample_weights=sample_weights*(1/np.mean(y_true))
    sample_weights=np.where(sample_weights==0,1,sample_weights)
    return metrics.log_loss(y_true=y_true,y_pred=y_pred,sample_weight=sample_weights)

## Logistic regression on first 90 PCs

In [16]:
# Perform logistic regression on component scores

In [20]:
# Define dictionary of metrics for CV
eval_metrics={'weighted_bce':metrics.make_scorer(weighted_bce),'accuracy':metrics.make_scorer(metrics.accuracy_score),'recall':metrics.make_scorer(metrics.recall_score),'precision':metrics.make_scorer(metrics.precision_score)}

In [21]:
# Get 5-fold CV estimates for logistic regression model
logistic_regressor_cv=model_selection.cross_validate(LogisticRegression(max_iter=1000,n_jobs=-1),X=X_train_scores,y=y_train,scoring=eval_metrics,n_jobs=-1,verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   14.4s remaining:   21.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.4s finished


In [22]:
logistic_regressor_cv

{'fit_time': array([ 8.33012748, 10.0600636 , 10.23353171, 10.72658706,  8.55110264]),
 'score_time': array([0.24874282, 0.16886234, 0.15146422, 0.08641124, 0.26470304]),
 'test_weighted_bce': array([0.10547463, 0.08635812, 0.08542267, 0.06870317, 0.09311618]),
 'test_accuracy': array([0.99965221, 0.9996829 , 0.99973404, 0.99963175, 0.99965221]),
 'test_recall': array([0.99441731, 0.99546248, 0.99546248, 0.9965096 , 0.99511344]),
 'test_precision': array([0.99372385, 0.99372822, 0.99546248, 0.99097536, 0.99303379])}

In [23]:
# Get avg error metrics from folds
{key:np.round(np.mean(val),5) for key,val in logistic_regressor_cv.items()}

{'fit_time': 9.58028,
 'score_time': 0.18404,
 'test_weighted_bce': 0.08781,
 'test_accuracy': 0.99967,
 'test_recall': 0.99539,
 'test_precision': 0.99338}

## QDA on first 90 PCs

In [237]:
qda_model=QuadraticDiscriminantAnalysis()
# Get 5-fold CV estimates for QDA
qda_model_cv=model_selection.cross_validate(qda_model,X=X_train_scores,y=y_train,scoring=eval_metrics,n_jobs=-1,verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   20.1s remaining:   30.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   20.3s finished


In [238]:
# Get avg error metrics from folds
{key:np.round(np.mean(val),5) for key,val in qda_model_cv.items()}

{'fit_time': 18.95564,
 'score_time': 0.354,
 'test_weighted_bce': 0.90334,
 'test_accuracy': 0.99653,
 'test_recall': 0.95048,
 'test_precision': 0.93374}

## KNN on first 90 PCs

In [315]:
# Create KNN model with default params
knn_model=KNeighborsClassifier(n_jobs=-1)

In [317]:
# Get 5-fold CV estimates for default KNN model
# Use 10% of training set (computationally expensive)
knn_model_cv=model_selection.cross_validate(knn_model,X=X_train_scores[:int(len(X_train_scores)*0.1),:],y=y_train[:int(len(X_train_scores)*0.1)],scoring=eval_metrics,n_jobs=-1,verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   25.7s remaining:   38.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   25.8s finished


In [318]:
# Get avg error metrics from folds
{key:np.round(np.mean(val),5) for key,val in knn_model_cv.items()}

{'fit_time': 0.01714,
 'score_time': 24.27379,
 'test_weighted_bce': 0.12314,
 'test_accuracy': 0.99959,
 'test_recall': 0.99318,
 'test_precision': 0.99319}

In [324]:
# Tune k hyperparameter
k_vals=[]
knn_cv_weighted_bce=[]
counter=0
max_k=50
for i in np.arange(1,max_k+1):
    counter+=1
    k_vals.append(i)
    # Define KNN model
    knn_model_tuner=KNeighborsClassifier(n_neighbors=i,n_jobs=-1)
    # Get CV estimate of weighted binary cross entropy loss
    knn_model_tuner_cv=model_selection.cross_validate(knn_model_tuner,X=X_train_scores[:int(len(X_train_scores)*0.1),:],y=y_train[:int(len(X_train_scores)*0.1)],scoring=eval_metrics,n_jobs=-1)
    knn_cv_weighted_bce.append(np.mean(knn_model_tuner_cv['test_weighted_bce']))
    print('Performed iteration '+str(counter)+' of '+str(max_k))
knn_grid_results=pd.DataFrame({'k_val':k_vals,'cv_weighted_bce':knn_cv_weighted_bce})

Performed iteration 1 of 50
Performed iteration 2 of 50
Performed iteration 3 of 50
Performed iteration 4 of 50




Performed iteration 5 of 50
Performed iteration 6 of 50
Performed iteration 7 of 50
Performed iteration 8 of 50
Performed iteration 9 of 50
Performed iteration 10 of 50
Performed iteration 11 of 50
Performed iteration 12 of 50
Performed iteration 13 of 50
Performed iteration 14 of 50
Performed iteration 15 of 50
Performed iteration 16 of 50
Performed iteration 17 of 50
Performed iteration 18 of 50
Performed iteration 19 of 50
Performed iteration 20 of 50
Performed iteration 21 of 50
Performed iteration 22 of 50
Performed iteration 23 of 50
Performed iteration 24 of 50
Performed iteration 25 of 50
Performed iteration 26 of 50
Performed iteration 27 of 50
Performed iteration 28 of 50
Performed iteration 29 of 50
Performed iteration 30 of 50
Performed iteration 31 of 50
Performed iteration 32 of 50
Performed iteration 33 of 50
Performed iteration 34 of 50




Performed iteration 35 of 50
Performed iteration 36 of 50
Performed iteration 37 of 50
Performed iteration 38 of 50
Performed iteration 39 of 50
Performed iteration 40 of 50
Performed iteration 41 of 50
Performed iteration 42 of 50
Performed iteration 43 of 50
Performed iteration 44 of 50
Performed iteration 45 of 50
Performed iteration 46 of 50
Performed iteration 47 of 50
Performed iteration 48 of 50
Performed iteration 49 of 50
Performed iteration 50 of 50


In [328]:
knn_grid_results.sort_values('cv_weighted_bce').head(10)

Unnamed: 0,k_val,cv_weighted_bce
0,1,0.051057
2,3,0.075311
47,48,0.094361
46,47,0.094361
48,49,0.09472
49,50,0.09472
1,2,0.121745
4,5,0.12314
6,7,0.135067
5,6,0.135067


In [331]:
# Create KNN model with tuned k param
tuned_knn_model=KNeighborsClassifier(n_neighbors=3,n_jobs=-1)

In [332]:
# Get 5-fold CV estimates for default KNN model on whole dataset
tuned_knn_model_cv=model_selection.cross_validate(tuned_knn_model,X=X_train_scores,y=y_train,scoring=eval_metrics,n_jobs=-1,verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 34.6min remaining: 51.9min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 34.7min finished


In [334]:
# Get avg error metrics from folds
{key:np.round(np.mean(val),5) for key,val in tuned_knn_model_cv.items()}

{'fit_time': 0.18673,
 'score_time': 2072.15979,
 'test_weighted_bce': 0.02739,
 'test_accuracy': 0.99986,
 'test_recall': 0.99854,
 'test_precision': 0.99666}

## Random forest on first 90 PCs

In [239]:
# Create random forest with default params
random_forest=RandomForestClassifier(n_jobs=-1,random_state=586)

In [241]:
# Get 5-fold CV estimates for default random forest
random_forest_cv=model_selection.cross_validate(random_forest,X=X_train_scores,y=y_train,scoring=eval_metrics,n_jobs=-1,verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   59.2s remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   59.6s finished


In [242]:
# Get avg error metrics from folds
{key:np.round(np.mean(val),5) for key,val in random_forest_cv.items()}

{'fit_time': 58.08247,
 'score_time': 0.3241,
 'test_weighted_bce': 0.01887,
 'test_accuracy': 0.99987,
 'test_recall': 0.99902,
 'test_precision': 0.99659}

In [243]:
## Tune random forest hyperparameters using OOB weighted binary cross entropy loss
rf_params={'criterion':['gini','entropy'],
           'max_features':['auto','sqrt','log2'],
           'max_depth':[30,40,50,60,None]}

In [245]:
# Implement random grid search
counter=0
num_iter=15
rf_grid_results=[]
for i in range(num_iter):
    counter+=1
    # Select random parameter set
    param_choice={key:np.random.choice(val) for key,val in rf_params.items()}
    # Define model with param choices
    random_forest_tuner=RandomForestClassifier(
                           criterion=param_choice['criterion'],
                           max_features=param_choice['max_features'],
                           max_depth=param_choice['max_depth'],
                           oob_score=True,
                           n_jobs=-1,
                           verbose=1,
                           random_state=586)
    # Fit model
    random_forest_tuner.fit(X_train_scores,y_train)
    # Compute OOB binary weighted cross entropy loss
    param_choice['oob_weighted_bce']=weighted_bce(y_train,random_forest_tuner.oob_decision_function_[:, 1])
    rf_grid_results.append(param_choice)
    print('Performed iteration '+str(counter)+' of '+str(num_iter))
rf_grid_results=pd.DataFrame(rf_grid_results)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   14.2s finished


Performed iteration 1 of 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.2s finished


Performed iteration 2 of 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   13.3s finished


Performed iteration 3 of 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   13.1s finished


Performed iteration 4 of 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   14.7s finished


Performed iteration 5 of 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.9s finished


Performed iteration 6 of 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   13.3s finished


Performed iteration 7 of 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   13.7s finished


Performed iteration 8 of 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   14.8s finished


Performed iteration 9 of 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   14.3s finished


Performed iteration 10 of 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.7s finished


Performed iteration 11 of 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   14.2s finished


Performed iteration 12 of 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   12.5s finished


Performed iteration 13 of 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   14.2s finished


Performed iteration 14 of 15


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   14.8s finished


Performed iteration 15 of 15


In [248]:
rf_grid_results.sort_values('oob_weighted_bce')

Unnamed: 0,criterion,max_features,max_depth,oob_weighted_bce
2,entropy,auto,40.0,0.003522
3,entropy,sqrt,30.0,0.003522
6,entropy,auto,,0.003522
7,entropy,sqrt,30.0,0.003522
12,entropy,sqrt,,0.003522
10,gini,log2,,0.003659
1,entropy,log2,30.0,0.0047
5,entropy,log2,,0.0047
0,gini,sqrt,30.0,0.004714
4,gini,sqrt,,0.004714


In [249]:
# Fit random forest using tuned hyperparameters
best_rf_params=rf_grid_results.sort_values('oob_weighted_bce').iloc[0,:]
tuned_random_forest=RandomForestClassifier(criterion=best_rf_params['criterion'],
                                           max_features=best_rf_params['max_features'],
                                           max_depth=best_rf_params['max_depth'],
                                           n_jobs=-1,
                                           random_state=586)

In [251]:
# Get 5-fold CV estimates for tuned random forest
tuned_random_forest_cv=model_selection.cross_validate(tuned_random_forest,X=X_train_scores,y=y_train,scoring=eval_metrics,n_jobs=-1,verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   50.0s remaining:  1.3min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   50.6s finished


In [252]:
# Get avg error metrics from folds
{key:np.round(np.mean(val),5) for key,val in tuned_random_forest_cv.items()}

{'fit_time': 48.84725,
 'score_time': 0.49096,
 'test_weighted_bce': 0.01769,
 'test_accuracy': 0.99987,
 'test_recall': 0.99909,
 'test_precision': 0.99652}

## Gradient boosting classifier on first 90 PCs

In [24]:
# Create gradient boosting classifier with default params
gradient_boosting=GradientBoostingClassifier(random_state=586)

In [25]:
# Get 5-fold CV estimates for gradient boosting classifier
gradient_boosting_cv=model_selection.cross_validate(gradient_boosting,X=X_train_scores,y=y_train,scoring=eval_metrics,n_jobs=-1,verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 21.3min remaining: 32.0min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 21.4min finished


In [26]:
# Get avg error metrics from folds
{key:np.round(np.mean(val),5) for key,val in gradient_boosting_cv.items()}

{'fit_time': 1275.63731,
 'score_time': 1.41368,
 'test_weighted_bce': 0.03101,
 'test_accuracy': 0.99986,
 'test_recall': 0.99839,
 'test_precision': 0.99693}

In [27]:
## Tune gradient boosting classifier hyperparameters
gb_params={'n_estimators':[100,200,300,400,500],
           'subsample':[0.4,0.6,0.8,1],
           'max_depth':[4,6,8,10]}

In [28]:
# Implement random grid search
# Using 10% sample of dataset -- too computationally expensive otherwise
counter=0
num_iter=15
gb_grid_results=[]
for i in range(num_iter):
    counter+=1
    # Select random parameter set
    param_choice={key:np.random.choice(val) for key,val in gb_params.items()}
    # Define model with param choices
    gradient_boosting_tuner=GradientBoostingClassifier(
                           n_estimators=param_choice['n_estimators'],
                           subsample=param_choice['subsample'],
                           max_depth=param_choice['max_depth'],
                           learning_rate=10/param_choice['n_estimators'],
                           random_state=586)
    # Get CV estimate of weighted binary cross entropy loss
    gradient_boosting_tuner_cv=model_selection.cross_validate(gradient_boosting_tuner,X=X_train_scores[:int(len(X_train_scores)*0.1),:],y=y_train[:int(len(X_train_scores)*0.1)],scoring=eval_metrics,n_jobs=-1,verbose=1)
    param_choice['learning_rate']=10/param_choice['n_estimators']
    param_choice['cv_weighted_bce']=np.mean(gradient_boosting_tuner_cv['test_weighted_bce'])
    gb_grid_results.append(param_choice)
    print('Performed iteration '+str(counter)+' of '+str(num_iter))
gb_grid_results=pd.DataFrame(gb_grid_results)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  5.0min remaining:  7.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  5.2min finished


Performed iteration 1 of 15


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  3.0min remaining:  4.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.0min finished


Performed iteration 2 of 15


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 

In [None]:
gb_grid_results.sort_values('cv_weighted_bce')

In [None]:
best_gb_params=gb_grid_results.sort_values('cv_weighted_bce').iloc[0,:]

In [None]:
gb_lr_params={'learning_rate':[0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15]}

In [None]:
# Implement exhaustive grid search to further tune learning rate
counter=0
gb_lr_grid_results=[]
for i in range(len(gb_lr_params['learning_rate'])):
    counter+=1
    # Select parameter
    param_choice={key:val[i] for key,val in gb_lr_params.items()}
    # Define model with param choices
    gradient_boosting_lr_tuner=GradientBoostingClassifier(
                           n_estimators=int(best_gb_params['n_estimators']),
                           subsample=best_gb_params['subsample'],
                           max_depth=int(best_gb_params['max_depth']),
                           learning_rate=param_choice['learning_rate'],
                           random_state=586)
    # Get CV estimate of weighted binary cross entropy loss
    gradient_boosting_lr_tuner_cv=model_selection.cross_validate(gradient_boosting_lr_tuner,X=X_train_scores[:int(len(X_train_scores)*0.1),:],y=y_train[:int(len(X_train_scores)*0.1)],scoring=eval_metrics,n_jobs=-1,verbose=1)
    param_choice['cv_weighted_bce']=np.mean(gradient_boosting_lr_tuner_cv['test_weighted_bce'])
    gb_lr_grid_results.append(param_choice)
    print('Performed iteration '+str(counter)+' of '+str(len(gb_lr_params['learning_rate'])))
gb_lr_grid_results=pd.DataFrame(gb_lr_grid_results)

In [None]:
gb_lr_grid_results.sort_values('cv_weighted_bce')

In [None]:
# Create gradient boosting classifier with tuned params
tuned_gradient_boosting=GradientBoostingClassifier(
                                                  n_estimators=100,
                                                  subsample=0.8,
                                                  max_depth=8,
                                                  learning_rate=0.1,
                                                  random_state=586)

In [None]:
# Get 5-fold CV estimates for tuned gradient boosting classifier
tuned_gradient_boosting_cv=model_selection.cross_validate(tuned_gradient_boosting,X=X_train_scores,y=y_train,scoring=eval_metrics,n_jobs=-1,verbose=1)

In [None]:
# Get avg error metrics from folds
{key:np.round(np.mean(val),5) for key,val in tuned_gradient_boosting_cv.items()}

## Comparison of results

In [376]:
all_results=[]
for model in [logistic_regressor_cv,qda_model_cv,knn_model_cv,tuned_knn_model_cv,random_forest_cv,tuned_random_forest_cv,gradient_boosting_cv,tuned_gradient_boosting_cv]:
    all_results.append({key:np.round(np.mean(val),5) for key,val in model.items()})
all_results_df=pd.DataFrame(all_results)
all_results_df['test_f1_score']=(2*all_results_df['test_precision']*all_results_df['test_recall'])/(all_results_df['test_precision']+all_results_df['test_recall'])
all_results_df['model']=['Logistic Regression','QDA','KNN','Tuned KNN','Random Forest','Tuned Random Forest','Gradient Boosting','Tuned Gradient Boosting']
all_results_df=all_results_df.rename(columns={'fit_time':'Avg Fit Time','score_time':'Avg Score Time','test_weighted_bce':'CV Binary Cross Entropy Loss','test_accuracy':'CV Accuracy','test_recall':'CV Recall','test_precision':'CV Precision','test_f1_score':'CV F1 Score','model':'Model'})
all_results_df=all_results_df.sort_values('CV F1 Score',ascending=False)[['Model','CV F1 Score','CV Binary Cross Entropy Loss','CV Accuracy','CV Recall','CV Precision','Avg Fit Time','Avg Score Time']]
all_results_df

Unnamed: 0,Model,CV F1 Score,CV Binary Cross Entropy Loss,CV Accuracy,CV Recall,CV Precision,Avg Fit Time,Avg Score Time
7,Tuned Gradient Boosting,0.998014,0.01628,0.99988,0.99916,0.99687,598.12489,0.29889
6,Gradient Boosting,0.997839,0.01884,0.99987,0.99902,0.99666,313.38383,0.28748
4,Random Forest,0.997804,0.01887,0.99987,0.99902,0.99659,58.08247,0.3241
5,Tuned Random Forest,0.997803,0.01769,0.99987,0.99909,0.99652,48.84725,0.49096
3,Tuned KNN,0.997599,0.02739,0.99986,0.99854,0.99666,0.18673,2072.15979
2,KNN,0.993185,0.12314,0.99959,0.99318,0.99319,0.01714,24.27379
0,Logistic Regression,0.989705,0.11033,0.99939,0.99414,0.98531,37.73917,0.07511
1,QDA,0.942036,0.90334,0.99653,0.95048,0.93374,18.95564,0.354


## Evaluate best model (tuned gradient boosting classifier) on test set

In [30]:
# Define gradient boosting model with tuned hyperparameters
tuned_gb=GradientBoostingClassifier(
                                   n_estimators=100,
                                   subsample=0.8,
                                   max_depth=8,
                                   learning_rate=0.1,
                                   random_state=586)

In [31]:
# Fit model
tuned_gb.fit(X_train_scores,y_train)

In [32]:
# Generate predictions+probabilities on test set
test_preds=tuned_gb.predict(X_test_scores)
test_probs=tuned_gb.predict_proba(X_test_scores)

# Compute error metrics
test_bce=weighted_bce(y_test,test_probs)
test_acc=metrics.accuracy_score(y_test,test_preds)
test_recall=metrics.recall_score(y_test,test_preds)
test_precision=metrics.precision_score(y_test,test_preds)

# Compile results into DataFrame
test_results=pd.DataFrame({'Weighted Binary Cross Entropy':test_bce,'Accuracy':test_acc,'Recall':test_recall,'Precision':test_precision},index=[0])
test_results['F1 Score']=(2*test_results['Precision']*test_results['Recall'])/(test_results['Precision']+test_results['Recall'])
test_results

Unnamed: 0,Weighted Binary Cross Entropy,Accuracy,Recall,Precision,F1 Score
0,0.001748,0.999815,0.999204,0.994453,0.996823


In [33]:
!pip install joblib



In [34]:
import joblib

joblib.dump(tuned_gb, 'tuned_gb_model.pkl')

['tuned_gb_model.pkl']

In [35]:
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(pca, 'pca.pkl')
print(f"Scaler saved to scaler.pkl, PCA model saved to pca.pkl")

Scaler saved to scaler.pkl, PCA model saved to pca.pkl


In [37]:
import sklearn
print(sklearn.__version__)

1.5.1
