In [1]:
# importing necessarily libraries for the binary classification task

# libraries imported for data processing and analysis
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

# libraries imported for learning algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import pipeline

# libraries imported for performance metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [2]:
# load 'Electrical Grid Stability' data and names into pandas dataframe

# load data by using read_csv from .data file
df = pd.read_csv("datasets/Grid_Stability/grid_stability.csv")

# clean data
# replace string label classifiers into binary values
df = df.replace(to_replace="stable", value=1)
df = df.replace(to_replace="unstable", value=0)
# drop all samples with NaN entries
df = df.dropna()

df

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.959060,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,0
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.781760,-0.005957,1
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.277210,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,0
3,0.716415,7.669600,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,0
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.797110,0.455450,0.656947,0.820923,0.049860,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,0.023892,0
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.285880,0.366120,-0.025803,1
9997,2.364034,2.842030,8.776391,1.008906,4.299976,-1.380719,-0.943884,-1.975373,0.487838,0.986505,0.149286,0.145984,-0.031810,1
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.966330,-0.649915,-0.898510,0.365246,0.587558,0.889118,0.818391,0.037789,0


In [3]:
# pre-declared values/arrays/functions to be used once inside the trial loop
# C values for logistic regression regularization in range of 10(-8) to 10(4)
Cvals = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
# K values for k-nearest neighbors in range of 1 to 105 in steps of 4
Kvals = np.linspace(1, 105, num=26, dtype=int).tolist()
# max feature values for random forest similar to CNM06
max_features = [1, 2, 4, 6, 8, 12, 16, 20]
# max depth values for decision trees (shallower = better)
max_depths = np.linspace(1, 5, num=5, dtype=int).tolist()
# array of performance metrics
scoring = ['accuracy', 'f1_micro', 'roc_auc_ovr']

# build parameter grids to be passed into GridSearchCV
logreg_pgrid = {'classifier__penalty': ['l1','l2'], 'classifier__C': Cvals, 'classifier__max_iter': [5000]}
knn_pgrid = {'classifier__weights': ['distance'], 'classifier__n_neighbors': Kvals}
rforest_pgrid = {'classifier__n_estimators': [1024], 'classifier__max_features': max_features}
dtree_pgrid = {'classifier__max_depth': max_depths}

# arrays + dictionaries to store scores
score_dict = [{}, {}, {}, {}, {}]

# loop through this entire trial FIVE (5) times
for i in range(5):
    # slice the dataframe to not include the binary classifier (label)
    # last column is the label (income>50K)
    X, y = df.iloc[:,:-1], df.iloc[:,-1]

    # randomly pick 5000 samples with replacement for training set
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)

    # make pipeline for each algorithms to condense model call
    logreg = pipeline.Pipeline([('scale', StandardScaler()), ('classifier', LogisticRegression(n_jobs=-1))])
    knn = pipeline.Pipeline([('scale', StandardScaler()), ('classifier', KNeighborsClassifier(n_jobs=-1))])
    rforest = pipeline.Pipeline([('scale', StandardScaler()), ('classifier', RandomForestClassifier(n_jobs=-1))])
    dtree = pipeline.Pipeline([('scale', StandardScaler()), ('classifier', DecisionTreeClassifier())])

    # 5-fold cross validation using Stratified KFold
    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    # GridSearchCV classifier for each algorithm
    logreg_clf = GridSearchCV(estimator=logreg, param_grid=logreg_pgrid, scoring=scoring, 
                                n_jobs=-1, cv=k_fold, verbose=2, refit=False)
    knn_clf = GridSearchCV(estimator=knn, param_grid=knn_pgrid, scoring=scoring, 
                                n_jobs=-1, cv=k_fold, verbose=2, refit=False)
    rforest_clf = GridSearchCV(estimator=rforest, param_grid=rforest_pgrid, scoring=scoring, 
                                n_jobs=-1, cv=k_fold, verbose=2, refit=False)
    dtree_clf = GridSearchCV(estimator=dtree, param_grid=dtree_pgrid, scoring=scoring, 
                                n_jobs=-1, cv=k_fold, verbose=2, refit=False)



    # for each classifier
    for clf, clf_name in zip([logreg_clf, knn_clf, rforest_clf, dtree_clf], 
                ['LogReg', 'KNN', 'Ran_For', 'Dec_Tree']):
        # fit to training data of 5000 samples
        clf.fit(X_train, y_train)

        # get parameters for each scoring metric's best
        best_acc_param = clf.cv_results_['params'][ np.argmin(clf.cv_results_['rank_test_accuracy']) ]
        best_f1_param = clf.cv_results_['params'][ np.argmin(clf.cv_results_['rank_test_f1_micro']) ]
        best_roc_param = clf.cv_results_['params'][ np.argmin(clf.cv_results_['rank_test_roc_auc_ovr']) ]

        # get pipeline based on current classifier
        if (clf_name == 'LogReg'):
            pipe = logreg
        elif (clf_name == 'KNN'):
            pipe = knn
        elif (clf_name == 'Ran_For'):
            pipe = rforest
        elif (clf_name == 'Dec_Tree'):
            pipe = dtree

        # set pipeline parameters to the parameters for best accuracy
        pipe.set_params(**best_acc_param)
        # fit classifier with training data and new parameters for scoring metric
        pipe.fit(X_train, y_train)
        # get predictions for both training and testing data
        y_train_pred = pipe.predict(X_train)
        y_test_pred = pipe.predict(X_test)

        # get scores for all metrics from both training and testing data
        acc_train = accuracy_score(y_train, y_train_pred)
        f1_train = f1_score(y_train, y_train_pred)
        roc_auc_train = roc_auc_score(y_train, y_train_pred)

        acc_test = accuracy_score(y_test, y_test_pred)
        f1_test = f1_score(y_test, y_test_pred)
        roc_auc_test = roc_auc_score(y_test, y_test_pred)

        # store all scores into a dictionary for accuracy metric
        acc_dict = {'acc_train': acc_train, 'f1_train': f1_train, 'roc_auc_train': roc_auc_train, 
                    'acc_test': acc_test, 'f1_test': f1_test, 'roc_auc_test': roc_auc_test}

        
        # do ^^^^^ all that for f1 score
        pipe.set_params(**best_f1_param)
        pipe.fit(X_train, y_train)
        y_train_pred = pipe.predict(X_train)
        y_test_pred = pipe.predict(X_test)

        acc_train = accuracy_score(y_train, y_train_pred)
        f1_train = f1_score(y_train, y_train_pred)
        roc_auc_train = roc_auc_score(y_train, y_train_pred)

        acc_test = accuracy_score(y_test, y_test_pred)
        f1_test = f1_score(y_test, y_test_pred)
        roc_auc_test = roc_auc_score(y_test, y_test_pred)

        f1_dict = {'acc_train': acc_train, 'f1_train': f1_train, 'roc_auc_train': roc_auc_train, 
                    'acc_test': acc_test, 'f1_test': f1_test, 'roc_auc_test': roc_auc_test}


        # do ^^^^^ all that for roc_auc score
        pipe.set_params(**best_roc_param)
        pipe.fit(X_train, y_train)
        y_train_pred = pipe.predict(X_train)
        y_test_pred = pipe.predict(X_test)

        acc_train = accuracy_score(y_train, y_train_pred)
        f1_train = f1_score(y_train, y_train_pred)
        roc_auc_train = roc_auc_score(y_train, y_train_pred)

        acc_test = accuracy_score(y_test, y_test_pred)
        f1_test = f1_score(y_test, y_test_pred)
        roc_auc_test = roc_auc_score(y_test, y_test_pred)

        roc_auc_dict = {'acc_train': acc_train, 'f1_train': f1_train, 'roc_auc_train': roc_auc_train, 
                    'acc_test': acc_test, 'f1_test': f1_test, 'roc_auc_test': roc_auc_test}

        # build final dictionary to store all scores from all three models and their best parameters
        score_dict[i][clf_name] = {'acc_dict': acc_dict, 'f1_dict': f1_dict, 'roc_auc_dict': roc_auc_dict}

Fitting 5 folds for each of 26 candidates, totalling 130 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 130 out of 130 | elapsed:    2.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Fitting 5 folds for each of 26 candidates, totalling 130 fits
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 130 out of 130 | elapsed:    9.8s finished
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   24.7s finished
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 26 candidates, totalling 130 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent 

In [4]:
print(score_dict)

[{'LogReg': {'acc_dict': {'acc_train': 0.9998, 'f1_train': 0.9997232216994187, 'roc_auc_train': 0.9997232982844494, 'acc_test': 0.9994, 'f1_test': 0.9991733259851198, 'roc_auc_test': 0.9995293379353625}, 'f1_dict': {'acc_train': 0.9998, 'f1_train': 0.9997232216994187, 'roc_auc_train': 0.9997232982844494, 'acc_test': 0.9994, 'f1_test': 0.9991733259851198, 'roc_auc_test': 0.9995293379353625}, 'roc_auc_dict': {'acc_train': 0.9998, 'f1_train': 0.9997232216994187, 'roc_auc_train': 0.9997232982844494, 'acc_test': 0.9994, 'f1_test': 0.9991733259851198, 'roc_auc_test': 0.9995293379353625}}, 'KNN': {'acc_dict': {'acc_train': 1.0, 'f1_train': 1.0, 'roc_auc_train': 1.0, 'acc_test': 0.9472, 'f1_test': 0.9227166276346604, 'roc_auc_test': 0.9304027617712677}, 'f1_dict': {'acc_train': 1.0, 'f1_train': 1.0, 'roc_auc_train': 1.0, 'acc_test': 0.9472, 'f1_test': 0.9227166276346604, 'roc_auc_test': 0.9304027617712677}, 'roc_auc_dict': {'acc_train': 1.0, 'f1_train': 1.0, 'roc_auc_train': 1.0, 'acc_test': 0