<a href="https://colab.research.google.com/github/jacobdwatters/NIOSH-Project/blob/main/SIG_SUB_Regression_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%pip install scikeras[tensorflow]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikeras[tensorflow]
  Downloading scikeras-0.10.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.10.0


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf

from scikeras.wrappers import KerasRegressor

import scipy as sp
from scipy import stats

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
FEATURES = ['VIOLATION_OCCUR_DT', 'MINE_ID', 'MINE_TYPE', 'COAL_METAL_IND', 'SIG_SUB', 
            'INJ_ILLNESS', 'NO_AFFECTED', 'NEGLIGENCE', 'VIOLATOR_VIOLATION_CNT',
            'VIOLATOR_INSPECTION_DAY_CNT']
TARGETS = ['PROPOSED_PENALTY']

In [6]:
path = '/content/gdrive/My Drive/NIOSH Project/data/violations_processed_after_2010.csv'
violation_data = pd.read_csv(path)

print('Samples in dataset: ', len(violation_data))

Samples in dataset:  1429135


In [7]:
significant_data = violation_data[violation_data['SIG_SUB'] == 'Y']
print('Significant and substantial samples in dataset: ', len(significant_data))

Significant and substantial samples in dataset:  342474


## Prepare and Split Significant Data

In [8]:
FEATURES = ['MINE_TYPE', 'COAL_METAL_IND', 'LIKELIHOOD', 
            'INJ_ILLNESS', 'NO_AFFECTED', 'NEGLIGENCE', 'VIOLATOR_VIOLATION_CNT',
            'VIOLATOR_INSPECTION_DAY_CNT']
TARGETS = ['PROPOSED_PENALTY']

X = significant_data[FEATURES]
y = significant_data[TARGETS]

# Define which columns should be encoded vs scaled
columns_to_encode = ['MINE_TYPE', 'COAL_METAL_IND', 'LIKELIHOOD', 'INJ_ILLNESS', 'NEGLIGENCE']
columns_to_scale  = ['VIOLATOR_VIOLATION_CNT', 'NO_AFFECTED', 'VIOLATOR_INSPECTION_DAY_CNT']

# Instantiate encoder/scaler
scaler = StandardScaler()
ohe = OneHotEncoder(sparse=False)

# Scale and Encode Separate Columns
scaled_columns  = scaler.fit_transform(X[columns_to_scale])
encoded_columns = ohe.fit_transform(X[columns_to_encode])

# Concatenate (Column-Bind) Processed Columns Back Together
X_pre = np.concatenate([scaled_columns, encoded_columns], axis=1)
np.nan_to_num(X_pre, copy=False)

print('Features shape:', X_pre.shape)

X_train, X_test, y_train, y_test = train_test_split(X_pre, y, test_size = 0.25, random_state = 0)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_train shape:', y_test.shape)



Features shape: (342474, 22)
X_train shape: (256855, 22)
X_test shape: (85619, 22)
y_train shape: (256855, 1)
y_train shape: (85619, 1)


## Desicion Tree Model With AdaBoost
Apply hyper parameter tuning using cross-validation and gridsearch.

In [56]:
n_estimators_range = np.arange(2, 11, 2)
max_depth_range = [DecisionTreeRegressor(max_depth=x) for x in range(2, 17, 2)]
hyper_param_grid = dict(estimator=max_depth_range, n_estimators=n_estimators_range)

tree_regressor_adaboost = AdaBoostRegressor()

grid = GridSearchCV(tree_regressor_adaboost, hyper_param_grid, cv=5, return_train_score=False, refit=True, n_jobs=-1)
grid.fit(X_train, y_train.values.ravel())

tree_regressor_adaboost_optimal = grid.best_estimator_

### Cross Validation Results


In [57]:
print('Mean cross-validated r2 score +/- stdev: %.3f +/- %.2f' % (grid.best_score_, grid.cv_results_['std_test_score'][grid.best_index_]))
print('Best Parameters: ', grid.best_params_)

Mean cross-validated r2 score +/- stdev: 0.501 +/- 0.02
Best Parameters:  {'estimator': DecisionTreeRegressor(max_depth=12), 'n_estimators': 4}


#### Final Testing Scores

In [59]:
tree_regressor_adaboost_y_pred_train = tree_regressor_adaboost_optimal.predict(X_train)
tree_regressor_adaboost_y_pred_test = tree_regressor_adaboost_optimal.predict(X_test)

tree_regressor_adaboost_r_squared_train = r2_score(y_train, tree_regressor_adaboost_y_pred_train)
tree_regressor_adaboost_r_squared_test = r2_score(y_test, tree_regressor_adaboost_y_pred_test)

tree_regressor_adaboost_mae_train = mean_absolute_error(y_train, tree_regressor_adaboost_y_pred_train)
tree_regressor_adaboost_mae_test = mean_absolute_error(y_test, tree_regressor_adaboost_y_pred_test)

print('Decision Tree Regression with AdaBoost Scores:')
print('Training R^2 = %.3f' % tree_regressor_adaboost_r_squared_train)
print('Training MAE = $%.2f' % tree_regressor_adaboost_mae_train)

print('\nTesting R^2 = %.3f' % tree_regressor_adaboost_r_squared_test)
print('Testing MAE = $%.2f' % tree_regressor_adaboost_mae_test)

Decision Tree Regression with AdaBoost Scores:
Training R^2 = 0.736
Training MAE = $1084.82

Testing R^2 = 0.511
Testing MAE = $1243.93


### Save Final Model

In [None]:
# Save the model
filename = '/content/gdrive/My Drive/SIG_SUB_tree_regressor_adaboost_model_final.joblib'
dump(tree_regressor_adaboost_optimal, filename)

['/content/gdrive/My Drive/SIG_SUB_tree_regressor_adaboost_model_final.joblib']

## Train Neural Network

In [10]:
def scheduler(epoch, lr):
  if epoch < 85:
    return lr
  else: # Drop learning rate after the first 85 epochs
    return lr*np.exp(-0.05)

In [30]:
batch_size = 4096*2
epochs = 120
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

def build_keras_model():
  keras_model = keras.Sequential()

  keras_model.add(keras.Input(shape = (len(X_train[0]), ) ))
  keras_model.add(layers.Dense(64, activation="relu"))
  keras_model.add(layers.Dense(32, activation="relu"))
  keras_model.add(layers.Dense(16, activation="relu"))
  keras_model.add(layers.Dense(8, activation="relu"))
  keras_model.add(layers.Dense(8, activation="relu"))
  keras_model.add(layers.Dense(8, activation="relu"))
  keras_model.add(layers.Dense(1, activation="linear"))

  keras_model.compile(loss="mse")

  return keras_model

# Wrap in KerasRegressor object
keras_model = KerasRegressor(model = build_keras_model)

# Create hyper parameter grid.
learning_rate_grid = [0.1, 0.01, 0.001, 0.0001]
# beta_1_grid = [0.9, 0.85, 0.8]
# beta_2_grid = [0.99, 0.95, 0.9]
keras_param_grid = dict(optimizer__learning_rate = learning_rate_grid, 
                        batch_size = [batch_size], 
                        epochs = [epochs], 
                        callbacks = [callback])

# Apply grid search with cross-validation for hyper parameter optimization.
keras_grid = GridSearchCV(keras_model, keras_param_grid, cv=10, return_train_score=False, refit=True, n_jobs=-1)
keras_grid.fit(X_train, y_train.values.ravel())

# history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, 
#                     callbacks=[callback], validation_data=(X_test, y_test))



Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

In [52]:
keras_model_optimal = keras_grid.best_estimator_

print('Mean cross-validated r2 score +/- stdev: %.3f +/- %.2f' % (keras_grid.best_score_, keras_grid.cv_results_['std_test_score'][keras_grid.best_index_]))
print("Best Parameters:", keras_grid.best_params_)

Mean cross-validated r2 score +/- stdev: 0.553 +/- 0.03
Best Parameters: {'batch_size': 8192, 'callbacks': <keras.callbacks.LearningRateScheduler object at 0x7f4c132ef610>, 'epochs': 120, 'optimizer__learning_rate': 0.01}


#### Final Testing Score

In [39]:
y_pred = keras_model_optimal.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('Testing r2 = %.3f' % r2)
print('MAE = $%.2f' % mae)

Testing r2 = 0.579
MAE = $1149.39
