In [73]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay, mean_squared_error

import warnings

In [2]:
%store -r wp4
wp4 = wp4
 
%store -r wp8
wp8 = wp8

%store -r dwt4
dwt4 = dwt4

%store -r dwt8
dwt8 = dwt8

## Part A: Repeated Sampling Function

In [4]:
def log_reg_rep_samples(df, n_iter = 1000, random_state = None):
    """Estimate logistic-regression performance over repeated random hold-out splits.

    Every iteration draws a fresh 67/33 train/test split, standardises the
    features with statistics learned from the training rows only, fits a
    class-balanced ``LogisticRegression`` and scores it on the held-out rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'state'`` label column; all other columns are used as
        features. The metric look-ups below expect the labels to appear in the
        classification report under the keys ``'0'`` (negative class) and
        ``'1'`` (positive class).
    n_iter : int, default 1000
        Number of random splits to evaluate.
    random_state : int or None, default None
        Seed for the sequence of splits. ``None`` keeps the previous behaviour
        (splits are drawn from NumPy's global random state); an integer makes
        the whole run reproducible.

    Returns
    -------
    pandas.Series
        Mean ``accuracy``, ``specificity`` and ``sensitivity`` across all
        iterations (all NaN when ``n_iter`` is 0).
    """
    x = df.loc[:, df.columns != 'state'] #features
    y = df.loc[:, df.columns == 'state'] #target labels

    # A single generator is shared by all iterations so that one seed yields
    # n_iter *different* splits rather than the same split n_iter times.
    rng = None if random_state is None else np.random.RandomState(random_state)

    records = [] #one dict of metrics per iteration
    for _ in range(n_iter):
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, train_size=0.67, random_state=rng
        )

        # Fit the scaler on the training rows only to avoid leaking test statistics.
        scaler = StandardScaler()
        scaler.fit(x_train)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)

        log_model = LogisticRegression(class_weight='balanced')
        log_model.fit(x_train, y_train.values.ravel())
        y_pred = log_model.predict(x_test)

        # Build the report once and read all three metrics from it.
        report = classification_report(y_test, y_pred, output_dict=True)
        records.append({
            'accuracy': report['accuracy'],
            'specificity': report['0']['recall'], #recall of the negative class = specificity
            'sensitivity': report['1']['recall'], #recall of the positive class = sensitivity
        })

    eval_metrics = pd.DataFrame(
        records, columns=['accuracy', 'specificity', 'sensitivity'], dtype=float
    )

    return eval_metrics.mean(axis=0) #mean of each column

#### 1. Wavelet Packet / Wang, 4-3-02 data (wp4)

In [5]:
# Mean hold-out metrics for wp4 over 10,000 random train/test splits.
log_reg_rep_samples(df = wp4, n_iter = 10000)

accuracy       0.860645
specificity    0.849033
sensitivity    0.873112
dtype: float64

#### 2. Wavelet Packet / Wang, 8-7-02 data (wp8)

In [6]:
# Mean hold-out metrics for wp8 over 10,000 random train/test splits.
log_reg_rep_samples(df = wp8, n_iter = 10000)

accuracy       0.960526
specificity    0.956043
sensitivity    0.963150
dtype: float64

#### 3. Discrete Wavelet Transform, 4-3-02 data (dwt4)

In [7]:
# Mean hold-out metrics for dwt4 over 10,000 random train/test splits.
log_reg_rep_samples(df = dwt4, n_iter = 10000)

accuracy       0.879708
specificity    0.874583
sensitivity    0.885560
dtype: float64

#### 4. Discrete Wavelet Transform, 8-7-02 data (dwt8)

In [8]:
# Mean hold-out metrics for dwt8 over 10,000 random train/test splits.
log_reg_rep_samples(df = dwt8, n_iter = 10000)

accuracy       0.954730
specificity    0.953027
sensitivity    0.955852
dtype: float64

## Part B: Grid Search

##### Significant accuracy discrepancy between two otherwise identical grid-search runs that differ only in the train/test split seed (random_state = 0 vs. random_state = 1):

##### random_state = 0, with accuracy = 0.952

In [123]:
# Single hold-out evaluation of a grid-searched pipeline on dwt8, split seed = 0.
x = dwt8.loc[:, dwt8.columns != 'state'] #features
y = dwt8.loc[:, dwt8.columns == 'state'] #target labels

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.67, random_state=0)

scaler = StandardScaler()
log_reg_model = LogisticRegression(class_weight='balanced')

# Scaling is a pipeline step so it is re-fitted together with the model
# on whatever rows the grid search trains on.
pipeline = Pipeline(steps=[("scaler", scaler), ("log_reg", log_reg_model)])

# Candidate values for the logistic-regression C parameter.
param_grid = {
    'log_reg__C':[1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
}

grid_model = GridSearchCV(pipeline, param_grid, n_jobs=1)
grid_model.fit(x_train, y_train.values.ravel())

y_pred = grid_model.predict(x_test)

# Build the report once and read all three metrics from it.
report = classification_report(y_test, y_pred, output_dict=True)
accuracy = report['accuracy']
specificity = report['0']['recall'] #recall of the negative class = specificity
sensitivity = report['1']['recall'] #recall of the positive class = sensitivity

print('Accuracy: ', accuracy)
print('Specificity: ', specificity)
print('Sensitivity: ', sensitivity)


Accuracy:  0.9523809523809523
Specificity:  0.9666666666666667
Sensitivity:  0.9444444444444444


##### random_state = 1, with accuracy = 0.929

In [124]:
# Single hold-out evaluation of a grid-searched pipeline on dwt8, split seed = 1.
x = dwt8.loc[:, dwt8.columns != 'state'] #features
y = dwt8.loc[:, dwt8.columns == 'state'] #target labels

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.67, random_state=1)

scaler = StandardScaler()
log_reg_model = LogisticRegression(class_weight='balanced')

# Scaling is a pipeline step so it is re-fitted together with the model
# on whatever rows the grid search trains on.
pipeline = Pipeline(steps=[("scaler", scaler), ("log_reg", log_reg_model)])

# Candidate values for the logistic-regression C parameter.
param_grid = {
    'log_reg__C':[1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
}

grid_model = GridSearchCV(pipeline, param_grid, n_jobs=1)
grid_model.fit(x_train, y_train.values.ravel())

y_pred = grid_model.predict(x_test)

# Build the report once and read all three metrics from it.
report = classification_report(y_test, y_pred, output_dict=True)
accuracy = report['accuracy']
specificity = report['0']['recall'] #recall of the negative class = specificity
sensitivity = report['1']['recall'] #recall of the positive class = sensitivity

print('Accuracy: ', accuracy)
print('Specificity: ', specificity)
print('Sensitivity: ', sensitivity)


Accuracy:  0.9285714285714286
Specificity:  0.8823529411764706
Sensitivity:  0.96


## Part C: Repeated Sampling with Grid Search

In [125]:
def log_reg_grid_reps(df, n_iter = 100, random_state = None):
    """Repeated random hold-out evaluation of a grid-searched logistic regression.

    Every iteration draws a fresh 67/33 train/test split, runs ``GridSearchCV``
    over the ``C`` parameter of a scaler + class-balanced ``LogisticRegression``
    pipeline on the training rows, and scores the fitted search object on the
    held-out rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'state'`` label column; all other columns are used as
        features. The metric look-ups below expect the labels to appear in the
        classification report under the keys ``'0'`` (negative class) and
        ``'1'`` (positive class).
    n_iter : int, default 100
        Number of random splits to evaluate.
    random_state : int or None, default None
        Seed for the sequence of splits. ``None`` keeps the previous behaviour
        (splits are drawn from NumPy's global random state); an integer makes
        the whole run reproducible.

    Returns
    -------
    pandas.Series
        Mean ``accuracy``, ``specificity`` and ``sensitivity`` across all
        iterations (all NaN when ``n_iter`` is 0).
    """
    x = df.loc[:, df.columns != 'state'] #features
    y = df.loc[:, df.columns == 'state'] #target labels

    # The search space does not change between iterations, so define it once.
    param_grid = {
        'log_reg__C':[1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
    }

    # A single generator is shared by all iterations so that one seed yields
    # n_iter *different* splits rather than the same split n_iter times.
    rng = None if random_state is None else np.random.RandomState(random_state)

    records = [] #one dict of metrics per iteration
    for _ in range(n_iter):
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, train_size=0.67, random_state=rng
        )

        # Scaling is a pipeline step so it is re-fitted together with the
        # model on whatever rows the grid search trains on.
        scaler = StandardScaler()
        log_reg_model = LogisticRegression(class_weight='balanced')
        pipeline = Pipeline(steps=[("scaler", scaler), ("log_reg", log_reg_model)])

        grid_model = GridSearchCV(pipeline, param_grid, n_jobs=1)
        grid_model.fit(x_train, y_train.values.ravel())

        y_pred = grid_model.predict(x_test)

        # Build the report once and read all three metrics from it.
        report = classification_report(y_test, y_pred, output_dict=True)
        records.append({
            'accuracy': report['accuracy'],
            'specificity': report['0']['recall'], #recall of the negative class = specificity
            'sensitivity': report['1']['recall'], #recall of the positive class = sensitivity
        })

    eval_metrics = pd.DataFrame(
        records, columns=['accuracy', 'specificity', 'sensitivity'], dtype=float
    )

    return eval_metrics.mean(axis=0) #mean of each column

#### Wavelet Packet / Wang, 8-7-02 data (wp8)

In [140]:
# Mean hold-out metrics for wp8 over 1,000 random splits, each with its own grid search.
log_reg_grid_reps(df = wp8, n_iter=1000)

accuracy       0.955631
specificity    0.954018
sensitivity    0.956598
dtype: float64

#### Discrete Wavelet Transform, 8-7-02 data (dwt8)

In [141]:
# Mean hold-out metrics for dwt8 over 1,000 random splits, each with its own grid search.
log_reg_grid_reps(df = dwt8, n_iter=1000)

accuracy       0.951786
specificity    0.951311
sensitivity    0.952258
dtype: float64