In [1]:
#import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from scipy.stats import norm
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

# Aviation Accident Capstone:
## *Part IV: Decision Tree Model*
Created by: Katy Christensen <br>
Created on: September 26, 2022 <br>
Created for: BrainStation Data Science Bootcamp Capstone<br>
Notebook 4 of 6<br>

Previous Notebook: *Part III: Logistic Regression Model* <br>
Upcoming Notebook: *Part V: K-Nearest Neighbor Model* <br>

--------------
 

## Table of Contents
[1. Load Data & Review](#Step-1) <br>
[2. Splitting Data](#Step-2) <br>
- [Train-Test Split](#test-split) <br>
- [Train-Validation Split](#val-split)<br>

[3. Modeling](#Step-3) <br>
- [Baseline Decision Tree](#base-dt) <br>

[4. Hyperparameter Optimization](#Step-4) <br>
- [Max_Depth](#max_depth) <br>
- [Min_Samples_Leaf](#min-leaf) <br>
- [Refit Decision Tree](#refit)<br>

[5. Model Evaluation](#Step-5) <br>
[6. Results & Summary](#Step-6) <br>

--------
<a id='Step-1'></a>
### 1. Load Data & Review
--------
#### Load Data

In [None]:
# Read the modeling dataset into a DataFrame; the path is relative to the
# notebook's working directory.
# NOTE(review): presumably this CSV is the cleaned/encoded output of an earlier
# notebook in this series — confirm before re-running on a fresh checkout.
ntsb08 = pd.read_csv('data/ntsb08_model.csv')

In [None]:
# Separate the target column from the predictor columns.
target_col = 'ev_highest_injury'
y = ntsb08[target_col]
X = ntsb08.drop(columns=[target_col])

--------
<a id='Step-2'></a>
### 2. Splitting the Data
---------
<a id='test-split'></a>
#### Train-Test Split
The first split is the train/test split; 70% of the data is allocated to the training data and 30% to the final test data. The data is stratified to ensure similar distributions to the original data.

In [None]:
# Hold out 30% of the full dataset as the final test set and keep the other
# 70% for training/validation. Stratifying on y keeps the target's class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=5,
)

<a id='val-split'></a>
#### Train-Validation Split
The second split is the train/validation split; 80% of the data is allocated to the training data and 20% to the validation data. 

In [None]:
# Split the 70% training partition again: 80% stays as the fitting set
# (X_train2 / y_train2) and 20% becomes the validation set (X_val / y_val).
# Relative to the full dataset that is 56% train, 14% validation, 30% test.
X_train2, X_val, y_train2, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=6,
)

--------------
<a id='Step-3'></a>
### 3. Modeling
--------------
<a id='base-dt'></a>
#### Baseline Decision Tree

In [None]:
# Baseline decision tree: default hyperparameters, i.e. no pre-pruning.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split, so an unseeded tree can differ between runs.
ntsb_dt = DecisionTreeClassifier(random_state=0)

# Fit on the training split only; the validation split is used for scoring.
ntsb_dt.fit(X_train2, y_train2)

# Score. The second number is computed on the validation split (X_val), not
# on the held-out test set, so it is labelled accordingly.
print(f'Train Accuracy Score: {ntsb_dt.score(X_train2, y_train2)}')
print(f'Validation Accuracy Score: {ntsb_dt.score(X_val, y_val)}')

In [None]:
# Decision tree on PCA-reduced features.
# Keep only the first 10 principal components (n_components is set explicitly,
# so the "one PC per feature" default does not apply here).
# NOTE(review): PCA is variance-based; this assumes the features are already
# on comparable scales — confirm, or standardize before this step.
dt_PCA = PCA(n_components=10, random_state=0)

# Learn the projection from the training split only, so that no information
# from the validation split leaks into the components.
dt_PCA.fit(X_train2)

# Project both splits into the principal-component space.
dt_train_PCA = dt_PCA.transform(X_train2)
dt_val_PCA = dt_PCA.transform(X_val)

# The classifier gets its own name so the fitted PCA transformer above is not
# overwritten and can still be used to transform other data later.
ntsb_dt_pca = DecisionTreeClassifier(random_state=0)

# Fit on the PCA-transformed training data (not the raw features).
ntsb_dt_pca.fit(dt_train_PCA, y_train2)

# Score on the PCA-transformed splits; the second score is the validation split.
print(f"Train Set Accuracy: {ntsb_dt_pca.score(dt_train_PCA, y_train2)}")
print(f"Validation Set Accuracy: {ntsb_dt_pca.score(dt_val_PCA, y_val)}")

------------
<a id='Step-4'></a>
### 4. Hyperparameter Optimization
------------
Hyperparameter optimization is important because Decision Tree models tend to overfit: left unconstrained, they keep splitting until the training data is fit almost perfectly. Pre-pruning is an effort to curb this tendency by setting hyperparameter limits on:
1. `max_depth`: limits the number of consecutive splits the model can make, resulting in a "simplified tree" that limits how well the model can fit branches to a single data point.
2. `min_samples_leaf`: forces the model to fit around larger data regions, which prevents fitting around individual data points. 

In [None]:
# Sweep max_depth from 1 to 10 and record train/validation accuracy per depth.
m_depth = range(1, 11, 1)

# Accuracy per depth, in the same order as m_depth.
# test_accs holds *validation* accuracies; the name is kept because later
# cells read it.
train_accs = []
test_accs = []

for depth in m_depth:

    # 1. Instantiate (seeded so the sweep gives the same curve on every run)
    ntsb_tree = DecisionTreeClassifier(max_depth=depth, random_state=0)

    # 2. Fit on the training split only. X_train still contains the validation
    #    rows, so fitting on it would leak them into the model and inflate the
    #    validation accuracy.
    ntsb_tree.fit(X_train2, y_train2)

    # 3. Score on the training and validation splits
    train_accs.append(ntsb_tree.score(X_train2, y_train2))
    test_accs.append(ntsb_tree.score(X_val, y_val))

In [None]:
# Visualize train vs. validation accuracy across the max_depth sweep.
# The x values come from m_depth itself so they always match the number of
# recorded accuracies; a hard-coded range(1, 21) has 20 points against 10
# scores and makes matplotlib raise a shape-mismatch ValueError.
plt.figure()
plt.plot(m_depth, train_accs, label='train', marker='o')
plt.plot(m_depth, test_accs, label='validation', marker='o')
plt.xticks(list(m_depth))
plt.xlabel('Max Depth')
plt.ylabel('Accuracy Score')
plt.title('Decision Tree Accuracy by Max Depth')
plt.legend()
plt.show()

In [None]:
# Position of the highest validation accuracy recorded in the sweep
# (np.argmax returns the first position when several scores tie).
index_of_max = np.argmax(test_accs)

# Look up the max_depth value that produced that score.
best_md = m_depth[index_of_max]
print(f'Best max depth value: {best_md}')

**Comment:** The best max depth is at either 4 or 6. This can be determined by using a pipeline to test for specific parameter values.

In [None]:
# Sweep min_samples_leaf from 1 to 20 and record train/validation accuracy.
min_leaf = range(1, 21, 1)

# Accuracy per setting, in the same order as min_leaf.
# test_accs holds *validation* accuracies; the name is kept because the next
# cell reads it.
train_accs = []
test_accs = []

for leaves in min_leaf:

    # 1. Instantiate (seeded so the sweep gives the same result on every run)
    ntsb_tree = DecisionTreeClassifier(min_samples_leaf=leaves, random_state=0)

    # 2. Fit on the training split only. X_train still contains the validation
    #    rows, so fitting on it would leak them into the model and inflate the
    #    validation accuracy.
    ntsb_tree.fit(X_train2, y_train2)

    # 3. Score on the training and validation splits
    train_accs.append(ntsb_tree.score(X_train2, y_train2))
    test_accs.append(ntsb_tree.score(X_val, y_val))

In [None]:
# Position of the highest validation accuracy in the min_samples_leaf sweep
# (np.argmax returns the first position when several scores tie).
index_of_max = np.argmax(test_accs)

# The min_samples_leaf value that produced that score. The message names the
# hyperparameter actually being reported (it previously said "max depth").
best_ms = min_leaf[index_of_max]
print(f'Best min samples leaf value: {best_ms}')

In [None]:
# Grid search over max_depth and min_samples_leaf together, using 5-fold
# cross-validation on the training split.

# Pipeline steps are given as a list of (name, estimator) tuples; the step
# name 'model' is the prefix used in the parameter grid keys below.
# The tree is seeded so repeated searches select the same best parameters.
estimator = [('model', DecisionTreeClassifier(random_state=0))]

pipe = Pipeline(estimator)

# 'model' swaps in the estimator for the step; 'model__<param>' sets that
# estimator's hyperparameters. 4 depths x 3 leaf sizes = 12 candidates.
param_grid = [
        {'model': [DecisionTreeClassifier(random_state=0)],
         'model__max_depth': [7, 9, 13, 21],
         'model__min_samples_leaf': [1, 2, 3],
        }
    ]

grid = GridSearchCV(pipe, param_grid, cv=5)

# Fit on the training split only; the validation split stays untouched.
fittedgrid = grid.fit(X_train2, y_train2)

In [None]:
# Display the parameter combination that scored best in the grid search's
# cross-validation.
fittedgrid.best_params_

<a id='refit'></a>
#### Refitting Decision Tree Model

In [None]:
# Refit the decision tree with the pre-pruning hyperparameters chosen above.
# NOTE(review): max_depth=6 is not one of the depths in the grid search
# ([7, 9, 13, 21]) — confirm these values are the intended final choice.
# random_state is fixed so the refit model and its scores are reproducible.
ntsb_dt6 = DecisionTreeClassifier(max_depth=6, min_samples_leaf=2, random_state=0)

# Fit on the training split only.
ntsb_dt6.fit(X_train2, y_train2)

# Score on the training and validation splits.
print(f'Train Score: {ntsb_dt6.score(X_train2, y_train2)}')
print(f'Validation Score: {ntsb_dt6.score(X_val, y_val)}')

---
<a id='Step-6'></a>
## 6. Results & Summary 
---
*TODO: summarize the final model's results and key takeaways here.*

---
#### <div align = "right">Up Next:</div>
<div align = "right">Aviation Accident Capstone Part V: K-Nearest Neighbor</div>