<a href="https://colab.research.google.com/github/jacobdwatters/NIOSH-Project/blob/main/DecisionTreeRegressors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports and Setup

In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

import scipy as sp
from scipy import stats

In [15]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Load and process data

In [4]:
path_after_2010 = '/content/gdrive/My Drive/School/Grad School/NIOSH Project/Data/violations_processed_after_2010.csv'
violation_data = pd.read_csv(path_after_2010)

  violation_data = pd.read_csv(path_after_2010)


In [5]:
FEATURES = ['MINE_TYPE', 'COAL_METAL_IND', 'SIG_SUB', 'LIKELIHOOD',
            'INJ_ILLNESS', 'NO_AFFECTED', 'NEGLIGENCE', 'VIOLATOR_VIOLATION_CNT',
            'VIOLATOR_INSPECTION_DAY_CNT']
TARGETS = ['PROPOSED_PENALTY']

X = violation_data[FEATURES]
y = violation_data[TARGETS]

In [6]:
# Define which columns should be encoded vs scaled
columns_to_encode = ['MINE_TYPE', 'COAL_METAL_IND', 'LIKELIHOOD', 'INJ_ILLNESS', 'SIG_SUB', 'NEGLIGENCE']
columns_to_scale  = ['VIOLATOR_VIOLATION_CNT', 'NO_AFFECTED', 'VIOLATOR_INSPECTION_DAY_CNT']

# Instantiate encoder/scaler
scaler = StandardScaler()
ohe = OneHotEncoder(sparse=False)

# Scale and Encode Separate Columns
scaled_columns  = scaler.fit_transform(X[columns_to_scale])
encoded_columns = ohe.fit_transform(X[columns_to_encode])

# Concatenate (Column-Bind) Processed Columns Back Together
X_pre = np.concatenate([scaled_columns, encoded_columns], axis=1)
np.nan_to_num(X_pre, copy=False)

print('Features shape:', X_pre.shape)



Features shape: (1530011, 24)


In [16]:
joblib.dump(scaler, '/content/gdrive/My Drive/School/Grad School/NIOSH Project/Models/PropPenalty-DecTree/PropPenalty-DecTree-scaler.pkl')
joblib.dump(ohe, '/content/gdrive/My Drive/School/Grad School/NIOSH Project/Models/PropPenalty-DecTree/PropPenalty-DecTree-encoder.pkl')

['/content/gdrive/My Drive/School/Grad School/NIOSH Project/Models/PropPenalty-DecTree/PropPenalty-DecTree-encoder.pkl']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_pre, y, test_size = 0.25, random_state = 0)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_train shape:', y_test.shape)

X_train shape: (1147508, 24)
X_test shape: (382503, 24)
y_train shape: (1147508, 1)
y_train shape: (382503, 1)


## Decision Tree Regressor

In [18]:
tree_regressor = DecisionTreeRegressor(max_depth=10, random_state=0).fit(X_train, y_train)

In [19]:
tree_regressor_y_pred_train = tree_regressor.predict(X_train)
tree_regressor_y_pred_test = tree_regressor.predict(X_test)

tree_regressor_r_squared_train = r2_score(y_train, tree_regressor_y_pred_train)
tree_regressor_r_squared_test = r2_score(y_test, tree_regressor_y_pred_test)

tree_regressor_mae_train = mean_absolute_error(y_train, tree_regressor_y_pred_train)
tree_regressor_mae_test = mean_absolute_error(y_test, tree_regressor_y_pred_test)

print('Decision Tree Regression Scores:')
print('Training R^2 = %.3f' % tree_regressor_r_squared_train)
print('Training MAE = %.3f' % tree_regressor_mae_train)

print('\nTesting R^2 = %.3f' % tree_regressor_r_squared_test)
print('Testing MAE = %.3f' % tree_regressor_mae_test)

Decision Tree Regression Scores:
Training R^2 = 0.628
Training MAE = 411.747

Testing R^2 = 0.523
Testing MAE = 427.553


## Decision Tree Regressor with AdaBoost

In [22]:
tree_regressor_adaboost = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=15), n_estimators=5, random_state=0
)
tree_regressor_adaboost.fit(X_train, np.ravel(y_train))


In [23]:
tree_regressor_adaboost_y_pred_train = tree_regressor_adaboost.predict(X_train)
tree_regressor_adaboost_y_pred_test = tree_regressor_adaboost.predict(X_test)

tree_regressor_adaboost_r_squared_train = r2_score(y_train, tree_regressor_adaboost_y_pred_train)
tree_regressor_adaboost_r_squared_test = r2_score(y_test, tree_regressor_adaboost_y_pred_test)

tree_regressor_adaboost_mae_train = mean_absolute_error(y_train, tree_regressor_adaboost_y_pred_train)
tree_regressor_adaboost_mae_test = mean_absolute_error(y_test, tree_regressor_adaboost_y_pred_test)

print('Decision Tree Regression with AdaBoost Scores:')
print('Training R^2 = %.3f' % tree_regressor_adaboost_r_squared_train)
print('Training MAE = %.3f' % tree_regressor_adaboost_mae_train)

print('\nTesting R^2 = %.3f' % tree_regressor_adaboost_r_squared_test)
print('Testing MAE = %.3f' % tree_regressor_adaboost_mae_test)

Decision Tree Regression with AdaBoost Scores:
Training R^2 = 0.799
Training MAE = 346.210

Testing R^2 = 0.490
Testing MAE = 415.585


Save Model

In [None]:
joblib.dump(tree_regressor_adaboost, 'filename.pkl')

## Prepare Inlier Data

In [None]:
MAX_VALUE = 1000
violations_cliped = violation_data[violation_data['PROPOSED_PENALTY'] <= MAX_VALUE]

print('Total samples = %d' % len(violation_data))
print('Total samples with PROPOSED_PENALTY <= %d: %d' % (MAX_VALUE, len(violations_cliped)))

Total samples = 1429135
Total samples with PROPOSED_PENALTY <= 1000: 1259626


In [None]:
X_inliers = violations_cliped[FEATURES]
y_inliers = violations_cliped[TARGETS]

# Instantiate encoder/scaler
scaler = StandardScaler()
ohe = OneHotEncoder(sparse=False)

# Scale and Encode Separate Columns
scaled_columns_inliers  = scaler.fit_transform(X_inliers[columns_to_scale])
encoded_columns_inliers = ohe.fit_transform(X_inliers[columns_to_encode])

# Concatenate (Column-Bind) Processed Columns Back Together
X_pre_inliers = np.concatenate([scaled_columns_inliers, encoded_columns_inliers], axis=1)
np.nan_to_num(X_pre_inliers, copy=False)

X_train_inliers, X_test_inliers, y_train_inliers, y_test_inliers = train_test_split(X_pre_inliers, y_inliers, test_size = 0.25, random_state = 0)

## Inlier Decision Tree Regressor with AdaBoost

In [None]:
tree_regressor_adaboost_inliers = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=18), n_estimators=4, random_state=0
)
tree_regressor_adaboost_inliers.fit(X_train_inliers, np.ravel(y_train_inliers))

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=18),
                  n_estimators=4, random_state=0)

In [None]:
tree_regressor_adaboost_inliers_y_pred_train = tree_regressor_adaboost_inliers.predict(X_train_inliers)
tree_regressor_adaboost_inliers_y_pred_test = tree_regressor_adaboost_inliers.predict(X_test_inliers)

tree_regressor_adaboost_inliers_r_squared_train = r2_score(y_train_inliers, tree_regressor_adaboost_inliers_y_pred_train)
tree_regressor_adaboost_inliers_r_squared_test = r2_score(y_test_inliers, tree_regressor_adaboost_inliers_y_pred_test)

tree_regressor_adaboost_inliers_mae_train = mean_absolute_error(y_train_inliers, tree_regressor_adaboost_inliers_y_pred_train)
tree_regressor_adaboost_inliers_mae_test = mean_absolute_error(y_test_inliers, tree_regressor_adaboost_inliers_y_pred_test)

print('Inlier Decision Tree Regression with AdaBoost Scores:')
print('Training R^2 = %.3f' % tree_regressor_adaboost_inliers_r_squared_train)
print('Training MAE = %.3f' % tree_regressor_adaboost_inliers_mae_train)

print('\nTesting R^2 = %.3f' % tree_regressor_adaboost_inliers_r_squared_test)
print('Testing MAE = %.3f' % tree_regressor_adaboost_inliers_mae_test)

Inlier Decision Tree Regression with AdaBoost Scores:
Training R^2 = 0.769
Training MAE = 58.629

Testing R^2 = 0.706
Testing MAE = 65.346


## Prepare Outlier Data

In [None]:
violations_outliers = violation_data[violation_data['PROPOSED_PENALTY'] > MAX_VALUE]

print('Total samples = %d' % len(violation_data))
print('Total samples with PROPOSED_PENALTY > %d: %d' % (MAX_VALUE, len(violations_outliers)))

Total samples = 1429135
Total samples with PROPOSED_PENALTY > 1000: 169509


In [None]:
X_outliers = violations_outliers[FEATURES]
y_outliers = violations_outliers[TARGETS]

# Instantiate encoder/scaler
scaler = StandardScaler()
ohe = OneHotEncoder(sparse=False)

# Scale and Encode Separate Columns
scaled_columns_outliers  = scaler.fit_transform(X_outliers[columns_to_scale])
encoded_columns_outliers = ohe.fit_transform(X_outliers[columns_to_encode])

# Concatenate (Column-Bind) Processed Columns Back Together
X_pre_outliers = np.concatenate([scaled_columns_outliers, encoded_columns_outliers], axis=1)
np.nan_to_num(X_pre_outliers, copy=False)

X_train_outliers, X_test_outliers, y_train_outliers, y_test_outliers = train_test_split(X_pre_outliers, y_outliers, test_size = 0.25, random_state = 0)

## Outlier Decision Tree Regression with AdaBoost

In [None]:
tree_regressor_adaboost_outliers = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=11), n_estimators=3, random_state=0
)
tree_regressor_adaboost_outliers.fit(X_train_outliers, np.ravel(y_train_outliers))

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=11),
                  n_estimators=3, random_state=0)

In [None]:
tree_regressor_adaboost_outliers_y_pred_train = tree_regressor_adaboost_outliers.predict(X_train_outliers)
tree_regressor_adaboost_outliers_y_pred_test = tree_regressor_adaboost_outliers.predict(X_test_outliers)

tree_regressor_adaboost_outliers_r_squared_train = r2_score(y_train_outliers, tree_regressor_adaboost_outliers_y_pred_train)
tree_regressor_adaboost_outliers_r_squared_test = r2_score(y_test_outliers, tree_regressor_adaboost_outliers_y_pred_test)

tree_regressor_adaboost_outliers_mae_train = mean_absolute_error(y_train_outliers, tree_regressor_adaboost_outliers_y_pred_train)
tree_regressor_adaboost_outliers_mae_test = mean_absolute_error(y_test_outliers, tree_regressor_adaboost_outliers_y_pred_test)

print('Outlier Decision Tree Regression with AdaBoost Scores:')
print('Training R^2 = %.3f' % tree_regressor_adaboost_outliers_r_squared_train)
print('Training MAE = %.3f' % tree_regressor_adaboost_outliers_mae_train)

print('\nTesting R^2 = %.3f' % tree_regressor_adaboost_outliers_r_squared_test)
print('Testing MAE = %.3f' % tree_regressor_adaboost_outliers_mae_test)

Outlier Decision Tree Regression with AdaBoost Scores:
Training R^2 = 0.630
Training MAE = 2495.198

Testing R^2 = 0.409
Testing MAE = 2776.637
