In [1]:
#import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Suppress all warnings (not only FutureWarning) to keep cell output readable
import warnings
warnings.filterwarnings('ignore')

# Aviation Accident Capstone:
## *Part VI: Model Selection & Evaluation*
Created by: Katy Christensen <br>
Created on: September 26, 2022 <br>
Created for: BrainStation Data Science Bootcamp Capstone<br>
Notebook 6 of 6<br>

Previous Notebook: *Part V: Decision Tree Model* <br>

--------------

## Table of Contents
[1. Load Data](#Step-1) <br>
[2. Split Data](#Step-2) <br>
- [Train-Test Split](#Step-2) <br>
- [Train-Validation Split](#train-val)

[3. Model Selection](#Step-3) <br>
- [Fit Baseline Models](#Step-3)<br>
- [Optimize Models](#optimize)<br>
- [Select Model](#select)<br>
- [Fit Final Model](#fit-val)<br>

[4. Model Evaluation](#Step-4) <br>
- [Confusion Matrix](#Step-4)<br>
- [AUC/ROC Evaluation](#auc-roc)<br>

[5. Class Balance](#Step-5) <br>
- [Up Sampling](#Step-5)<br>
- [Down Sampling](#down-samp)<br>

[6. Results & Summary](#Step-6) 

--------
<a id='Step-1'></a>
## 1. Load Data
--------

In [2]:
# Read the modelling table from data/ntsb08_model.csv into a DataFrame
ntsb08 = pd.read_csv('data/ntsb08_model.csv')

In [3]:
# Pull the label column out first, then keep every remaining column as a feature
y = ntsb08['ev_highest_injury']
X = ntsb08.drop(columns=['ev_highest_injury'])

In [4]:
# Display the (rows, columns) dimensions of the feature matrix
X.shape

(64192, 10453)

--------
<a id='Step-2'></a>
## 2. Split Data
--------
### Train-Test Split

In [5]:
# Hold out 30% of the data as the test set and keep 70% for training;
# stratify on y so both partitions keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=9,
)

In [6]:
# Report the dimensions of the training and test partitions
for caption, frame in (('X_train shape:', X_train), ('X_test shape:', X_test)):
    print(caption, frame.shape)

X_train shape: (44934, 10453)
X_test shape: (19258, 10453)


<a id='train-val'></a>
### Train-Validation Split

In [7]:
# Carve a validation set (20%) out of the training partition, leaving 80%
# of it (X_train2) for fitting; stratified on the training labels
X_train2, X_val, y_train2, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=18,
)

In [8]:
# Report the dimensions of the fitting and validation subsets
for caption, frame in (('X_train2 shape:', X_train2), ('X_val shape:', X_val)):
    print(caption, frame.shape)

X_train2 shape: (35947, 10453)
X_val shape: (8987, 10453)


### Scale

In [9]:
# Standardise features using statistics learned from X_train2 only, then
# apply that same fitted scaler to every other partition
scaler = StandardScaler().fit(X_train2)
X_scale = scaler.transform(X_train2)    # fitting subset
X_sval = scaler.transform(X_val)        # validation subset
X_scale_t = scaler.transform(X_train)   # full training partition (X_train2 + X_val rows)
X_stest = scaler.transform(X_test)      # held-out test set

--------
<a id='Step-3'></a>
## 3. Model Selection
--------
### Fit Baseline Models

##### (A) Logistic Regression

In [10]:
# Baseline logistic regression: default hyperparameters, unscaled features
log_base = LogisticRegression(random_state=9).fit(X_train2, y_train2)

# Accuracy on the fitting subset and on the validation subset
# (the second label says "Test", but the data being scored is X_val / y_val)
print(f"Train Set Accuracy: {log_base.score(X_train2, y_train2)}")
print(f"Test Set Accuracy: {log_base.score(X_val, y_val)}")

Train Set Accuracy: 0.8116393579436393
Test Set Accuracy: 0.8118393234672304


##### (B) Decision Tree

In [11]:
# Baseline decision tree: default hyperparameters, seeded, unscaled features
dt_base = DecisionTreeClassifier(random_state=2).fit(X_train2, y_train2)

# Accuracy on the fitting subset and on the validation subset
# (the second label says "Test", but the data being scored is X_val / y_val)
print(f'Train Set Accuracy: {dt_base.score(X_train2, y_train2)}')
print(f'Test Set Accuracy: {dt_base.score(X_val, y_val)}')

Train Set Accuracy: 1.0
Test Set Accuracy: 0.9448091687993769


##### (C) KNN

In [None]:
# Baseline k-nearest-neighbours classifier with library defaults, unscaled features
KNN_base = KNeighborsClassifier().fit(X_train2, y_train2)

# Report k, then accuracy on the fitting subset and on the validation subset
# (the "Test accuracy" label reports X_val / y_val)
print("Number of neighbors:", KNN_base.n_neighbors)
print("Train accuracy:", KNN_base.score(X_train2, y_train2))
print("Test accuracy:", KNN_base.score(X_val, y_val))

Number of neighbors: 5


<a id='optimize'></a>
### Optimize Hyperparameters

##### KNN Model:
KNN models are computationally expensive. To reduce fitting time, the `n_neighbors` hyperparameter was explored on its own before running the machine learning (ML) pipeline for model selection. The pipeline is designed to optimize hyperparameters and will also output the best model for the dataset.

In [None]:
# Candidate k values: the odd numbers 1, 3, ..., 19 (a step of 2 skips even k)
neighbors = range(1, 21, 2)

train_acc = []
test_acc = []  # accuracy on the validation split; name kept because the next cell reads it

for n in neighbors:
    print(f"Working on my model with {n} neighbors...", end="\r")

    # Instantiate and fit on the standardised fitting subset
    KNN_model = KNeighborsClassifier(n_neighbors=n)
    KNN_model.fit(X_scale, y_train2)

    # Score in the same standardised feature space the model was fitted in.
    # The validation rows must therefore be X_sval (scaled), not the raw X_val:
    # KNN is distance-based, so mixing scaled and unscaled features makes the
    # validation accuracy meaningless.
    train_accuracy = KNN_model.score(X_scale, y_train2)
    test_accuracy = KNN_model.score(X_sval, y_val)

    # Record both accuracies for this k
    train_acc.append(train_accuracy)
    test_acc.append(test_accuracy)


In [None]:
# Position of the highest recorded accuracy in test_acc
index_of_max = np.asarray(test_acc).argmax()

# Translate that position back into the k value that produced it
best_k = neighbors[index_of_max]
print(f'Best KNN k value: {best_k}')

<a id='select'></a>
### Model Tuning: 
##### (A) Logistic Regression

In [None]:
# The grid searches both L1 and L2 penalties. LogisticRegression's default
# 'lbfgs' solver rejects 'l1', so every L1 candidate would fail; 'saga'
# accepts both penalties. max_iter is raised because saga can need more
# passes to converge, and random_state makes its shuffling reproducible.
logreg = LogisticRegression(solver='saga', max_iter=1000, random_state=9)

# Three-step pipeline: scale -> reduce dimensionality -> classify
estimators = [
    ('scaler', StandardScaler()),
    ('dim_reducer', PCA()),
    ('model', logreg)
]

model_pipe = Pipeline(estimators)

param_grid = [
    {
        # Compare standardised input against no scaling at all
        'scaler': [StandardScaler(), None],
        'dim_reducer': [PCA()],
        'model': [logreg],
        'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'model__penalty': ['l1', 'l2'],
        # Start at 1: with 0 components the classifier receives no features,
        # so those candidates can never be fitted
        'dim_reducer__n_components': [1, 2, 3, 4, 5, 6]
    }
]

# 5-fold cross-validated search over every combination in the grid
grid = GridSearchCV(model_pipe, param_grid, cv=5)
logreg_search = grid.fit(X_train2, y_train2)

In [None]:
# Show the parameter combination the logistic-regression grid search selected
logreg_search.best_params_

##### (B) Decision Tree

In [None]:
# Create the three pipeline steps: scaling, dimension reduction, and model.
# The model step holds the estimator that is actually being tuned (a decision
# tree), seeded like the baseline tree so repeated runs give the same result.
estimators = [
    ('scaler', StandardScaler()),
    ('dim_reducer', PCA()),
    ('model', DecisionTreeClassifier(random_state=2))
]

model_pipe = Pipeline(estimators)

param_grid = [
    {
        # Compare standardised input against no scaling at all
        'scaler': [StandardScaler(), None],
        'dim_reducer': [PCA()],
        'model': [DecisionTreeClassifier(random_state=2)],
        'model__max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
        'model__min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 10],
        # Start at 1: with 0 components the tree receives no features,
        # so those candidates can never be fitted
        'dim_reducer__n_components': [1, 2, 3, 4, 5, 6]
    }
]

# 5-fold cross-validated search over every combination in the grid
grid = GridSearchCV(model_pipe, param_grid, cv=5)
dt_search = grid.fit(X_train2, y_train2)

In [None]:
# Show the parameter combination the decision-tree grid search selected
dt_search.best_params_

##### (C) KNN

In [None]:
estimators = [
    ('scaler', StandardScaler()),
    ('dim_reducer', PCA()),
    ('model', LogisticRegression())
]

model_pipe = Pipeline(estimators)

param_grid = [
    {
        'scaler': [StandardScaler(), None],
        'dim_reducer':[PCA()],
        'model': [KNeighborsClassifier()],
        'model__n_neighbors': [7, 8, 9, 10, 11]
        'dim_reducer__n_components': [0, 2, 3, 4, 5, 6]
    }
]

grid = GridSearchCV(model_pipe, param_grid, cv=5)
knn_search = grid.fit(X_train2, y_train2)    

In [None]:
# Show the parameter combination the KNN grid search selected
# (the search result is stored in knn_search; `fittedsearch` is never defined)
knn_search.best_params_

<a id='fit-val'></a>
### Fit Optimized Model - Test on Validation Data
##### Validation Data

In [None]:
# Evaluate the tuned pipelines (rather than re-fitting the default-parameter
# baselines) on the validation split. GridSearchCV refits the best parameter
# combination on all of X_train2 by default, so each search object can be
# scored directly.
tuned_searches = {
    'Logistic Regression': logreg_search,
    'Decision Tree': dt_search,
    'KNN': knn_search,
}

for model_name, search in tuned_searches.items():
    # Accuracy on the rows used for fitting vs. the unseen validation rows
    print(f'{model_name} - Train Set Accuracy: {search.score(X_train2, y_train2)}')
    print(f'{model_name} - Validation Set Accuracy: {search.score(X_val, y_val)}')

##### Fit Optimized Model to Test Data

In [None]:
from sklearn.base import clone

# Refit each tuned pipeline on the full training partition (X_train, which
# contains both the X_train2 and X_val rows) and score it on the held-out
# test set. clone() returns an unfitted copy with the same parameters, so the
# fitted estimators stored inside the search objects are left untouched.
final_models = {}
for model_name, search in (
    ('Logistic Regression', logreg_search),
    ('Decision Tree', dt_search),
    ('KNN', knn_search),
):
    final_model = clone(search.best_estimator_).fit(X_train, y_train)
    final_models[model_name] = final_model

    print(f'{model_name} - Train Set Accuracy: {final_model.score(X_train, y_train)}')
    print(f'{model_name} - Test Set Accuracy: {final_model.score(X_test, y_test)}')

--------
<a id='Step-4'></a>
## 4. Model Evaluation
--------
#### Confusion Matrix

In [None]:
# Only confusion_matrix is needed here; plot_confusion_matrix was removed from
# scikit-learn in version 1.2, so importing it fails on current releases
from sklearn.metrics import confusion_matrix

# Get class predictions for the test set.
# `credit_logit` is never defined in this notebook; dt_base is used instead
# because it is the strongest baseline on the validation split.
# NOTE(review): swap in the final selected model once one is chosen.
y_pred = dt_base.predict(X_test)

# Generate confusion matrix (rows = true class, columns = predicted class)
cf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
# label rows and columns
cf_df = pd.DataFrame(
    cf_matrix, 
    columns=["Predicted Non-fraudulent", "Predicted Fraudulent"],
    index=["True Non-fraudulent", "True Fraudulent"]
)

display(cf_df)

<a id='auc-roc'></a>
#### AUC-ROC Evaluation

--------
<a id='Step-5'></a>
## 5. Class Balance
--------
As shown in the *Part III: Logistic Regression* notebook, the classes are imbalanced: most accidents are not fatal (approx. 81%). There are two lines of thinking:

1. Artificially balance the data through up sampling, down sampling, or the Synthetic Minority Oversampling Technique (SMOTE).
2. Leave the data as is, because artificially balancing it trains the model on a distribution that does not match reality and introduces bias, resulting in poor model performance on new data.

#### Up Sampling

<a id='down-samp'></a>
#### Down Sampling

--------
<a id='Step-6'></a>
## 6. Results & Summary
--------