<a href="https://colab.research.google.com/github/jacobdwatters/NIOSH-Project/blob/main/LinearModels_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RANSACRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

import scipy as sp
from scipy import stats

In [3]:
# Mount Google Drive at /content/gdrive so later cells can read the
# project data stored there. NOTE(review): google.colab is presumably only
# importable inside a Colab runtime -- confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Load and process data

#### Load data from csv

In [4]:
# Location of the processed violations CSV on the mounted Google Drive
# (the file name indicates records after 2010).
path_after_2010 = (
    '/content/gdrive/My Drive/NIOSH Project/data/'
    'violations_processed_after_2010.csv'
)

# Read the whole CSV into a DataFrame with pandas' default parsing options.
violation_data = pd.read_csv(filepath_or_buffer=path_after_2010)

#### Split into features and targets

In [16]:
# Predictor columns taken from the violations table.
FEATURES = [
    'MINE_TYPE',
    'COAL_METAL_IND',
    'SIG_SUB',
    'LIKELIHOOD',
    'INJ_ILLNESS',
    'NO_AFFECTED',
    'NEGLIGENCE',
    'VIOLATOR_VIOLATION_CNT',
    'VIOLATOR_INSPECTION_DAY_CNT',
]

# Regression target, kept as a one-element list so `y` stays a
# single-column DataFrame rather than a Series.
TARGETS = ['PROPOSED_PENALTY']

# Select all rows and only the listed columns.
X = violation_data.loc[:, FEATURES]
y = violation_data.loc[:, TARGETS]

#### Encode and scale features

In [20]:
# Define which columns should be one-hot encoded vs standardized.
columns_to_encode = ['MINE_TYPE', 'COAL_METAL_IND', 'LIKELIHOOD', 'INJ_ILLNESS', 'SIG_SUB', 'NEGLIGENCE']
columns_to_scale  = ['VIOLATOR_VIOLATION_CNT', 'NO_AFFECTED', 'VIOLATOR_INSPECTION_DAY_CNT']

# Instantiate encoder/scaler.
scaler = StandardScaler()
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so `OneHotEncoder(sparse=False)` raises TypeError on current
# releases. Try the current keyword first and fall back for older installs;
# either way the encoder returns a dense array.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

# Scale and encode the two column groups separately.
# NOTE(review): both transformers are fit on the full dataset, i.e. before the
# train/test split, so test rows contribute to the scaling statistics and the
# learned category set. Consider fitting on the training split only.
scaled_columns  = scaler.fit_transform(X[columns_to_scale])
encoded_columns = ohe.fit_transform(X[columns_to_encode])

# Concatenate (column-bind) the processed columns back together:
# scaled numeric columns first, then the one-hot columns.
X_pre = np.concatenate([scaled_columns, encoded_columns], axis=1)
# Replace NaN with 0 (and +/-inf with large finite values) in place.
# For the standardized columns, 0 corresponds to the column mean.
np.nan_to_num(X_pre, copy=False)

print('Features shape:', X_pre.shape)

Features shape: (1429135, 24)


#### Split data into training and testing datasets

In [26]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_pre, y, test_size = 0.25, random_state = 0)

# Sanity-check the split sizes.
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
# Fixed label: this line reports y_test but was previously printed as 'y_train shape:'.
print('y_test shape:', y_test.shape)

X_train shape: (1071851, 24)
X_test shape: (357284, 24)
y_train shape: (1071851, 1)
y_train shape: (357284, 1)


## Define and Fit Simple Linear Regression Model

In [36]:
# Baseline: ordinary least-squares regression with scikit-learn's default
# settings, fitted on the training split. `fit` returns the estimator itself.
lin_reg = LinearRegression().fit(X=X_train, y=y_train)

In [58]:
# Predictions of the fitted linear model on each split.
lin_reg_y_pred_train = lin_reg.predict(X_train)
lin_reg_y_pred_test = lin_reg.predict(X_test)

# Training-split metrics: coefficient of determination and mean absolute error.
lin_reg_r_squared_train = r2_score(y_true=y_train, y_pred=lin_reg_y_pred_train)
lin_reg_mae_train = mean_absolute_error(y_true=y_train, y_pred=lin_reg_y_pred_train)

# The same two metrics on the held-out test split.
lin_reg_r_squared_test = r2_score(y_true=y_test, y_pred=lin_reg_y_pred_test)
lin_reg_mae_test = mean_absolute_error(y_true=y_test, y_pred=lin_reg_y_pred_test)

# Report: training scores first, then a blank line and the test scores.
print('Simple Linear Regression Scores:')
print('Training R^2 = %.3f' % lin_reg_r_squared_train)
print('Training MAE = %.3f' % lin_reg_mae_train)

print('\nTesting R^2 = %.3f' % lin_reg_r_squared_test)
print('Testing MAE = %.3f' % lin_reg_mae_test)

Simple Linear Regression Scores:
Training R^2 = 0.276
Training MAE = 841.728

Testing R^2 = 0.260
Testing MAE = 837.038


## Define and Fit RANSAC Regressor

In [47]:
# RANSAC repeatedly fits its base estimator on random subsamples, so an
# unseeded fit gives different models (and scores) on every run. Seed it for
# reproducibility; 0 matches the seed used for the train/test split.
# NOTE: scores recorded from an earlier unseeded run will differ after re-running.
ransac_reg = RANSACRegressor(random_state=0).fit(X_train, y_train)

In [59]:
# Predictions of the fitted RANSAC model on each split.
ransac_reg_y_pred_train = ransac_reg.predict(X_train)
ransac_reg_y_pred_test = ransac_reg.predict(X_test)

# Training-split metrics: coefficient of determination and mean absolute error.
ransac_reg_r_squared_train = r2_score(y_true=y_train, y_pred=ransac_reg_y_pred_train)
ransac_reg_mae_train = mean_absolute_error(y_true=y_train, y_pred=ransac_reg_y_pred_train)

# The same two metrics on the held-out test split.
ransac_reg_r_squared_test = r2_score(y_true=y_test, y_pred=ransac_reg_y_pred_test)
ransac_reg_mae_test = mean_absolute_error(y_true=y_test, y_pred=ransac_reg_y_pred_test)

# Report: training scores first, then a blank line and the test scores.
print('RANSAC Regression Scores:')
print('Training R^2 = %.3f' % ransac_reg_r_squared_train)
print('Training MAE = %.3f' % ransac_reg_mae_train)

print('\nTesting R^2 = %.3f' % ransac_reg_r_squared_test)
print('Testing MAE = %.3f' % ransac_reg_mae_test)

RANSAC Regression Scores:
Training R^2 = 0.043
Training MAE = 609.615

Testing R^2 = 0.045
Testing MAE = 600.971
