In [2]:
from importlib import reload
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC, RandomOverSampler
import numpy as np
import violation_common
# run this cell after updating violation_common
reload(violation_common)

<module 'violation_common' from '/home/zack/work/NIOSH-Project/violation_common.py'>

## Overview of data splits

First we split into a full training set and a test set. The test set will be used to get an unbiased measure of the final model's performance.

```
data -> train_full, test
```

Now we create a version (not a split) of train_full with SMOTE resampling. The original train_full will be used to train models that didn't make use of SMOTE training during hyperparameter selection. The SMOTE train_full will be used to train models that did (also after hyperparameter selection).

```
train_smote_full = SMOTE(train_full)
```

Now we split the train_full dataset into train_hp and validate_hp sets to train and validate different hyperparameter combinations. Keep in mind that neither of these sets is resampled.

```
train_full -> train_hp, validate_hp
```

Finally we make a version of train_hp that is SMOTE resampled so we can train hyperparams with SMOTE resampled data. Keep in mind we don't create a SMOTE hyperparam validation set since we want to validate hyperparams with the original data distribution.

```
train_hp_smote = SMOTE(train_hp)
```

In [3]:
# Load the processed violation records, keep only the rows whose YEAR_OCCUR is
# later than 2010, and persist that subset for the following cells.
data = violation_common.get_processed_violation_data()
occurred_after_2010 = data['YEAR_OCCUR'] > 2010
after_2010 = data[occurred_after_2010]
after_2010.to_csv('data/after_2010.csv')

In [4]:
# Reload the filtered records from disk so the splits are always derived from
# the saved CSV, keeping the train/test datasets consistent between models.
after_2010 = pd.read_csv('data/after_2010.csv', index_col=0)

# Both splits hold out 20% of their input and use the same fixed seed.
split_options = dict(test_size=0.2, random_state=0)

# full training set vs. final held-out test set
train, test = train_test_split(after_2010, **split_options)

# hyperparameter tuning sets, carved out of the full training set
train_hp, validate_hp = train_test_split(train, **split_options)

In [5]:
# Write each (non-resampled) split to its own CSV file.
splits_by_path = {
    'data/after_2010_test.csv': test,
    'data/after_2010_train_full.csv': train,
    'data/after_2010_train_hp.csv': train_hp,
    'data/after_2010_validate_hp.csv': validate_hp,
}
for _csv_path, _split in splits_by_path.items():
    _split.to_csv(_csv_path)

In [6]:
# SMOTE for full train dataset (to be used after hyperparameter tuning)
#
# The classes are rebalanced to a 50/50 split while keeping the total number
# of rows equal to len(train):
#   1. randomly undersample the majority class (non-S&S, SIG_SUB == "N")
#   2. oversample the minority class (S&S, SIG_SUB == "Y") with SMOTENC

n_samples = len(train)
n_sig_target = int(n_samples / 2)
n_non_sig_target = n_samples - n_sig_target
print(f'''N Samples in Train Dataset: {n_samples}
S&S samples after SMOTE: {n_sig_target}
Non-S&S samples after SMOTE: {n_non_sig_target}''')

# randomly drop necessary samples from majority class (non-S&S)
# A dedicated, seeded generator is used (instead of the unseeded global
# np.random state) so the same rows are dropped on every Restart & Run All.
non_sig_index = train.query('SIG_SUB == "N"').index
current_non_sig_n = len(non_sig_index)
num_to_drop = current_non_sig_n - n_non_sig_target
undersample_rng = np.random.RandomState(0)
drop_indices = undersample_rng.choice(non_sig_index, num_to_drop, replace=False)
train_undersampled = train.drop(drop_indices)

# oversample with SMOTE
categorical_features = ['VIOLATOR_TYPE_CD', 'MINE_TYPE', 'COAL_METAL_IND', 'PRIMARY_OR_MILL']
numerical_features = ['VIOLATOR_VIOLATION_CNT', 'VIOLATOR_INSPECTION_DAY_CNT', 'YEAR_OCCUR']
target = ['SIG_SUB']

X = train_undersampled[categorical_features + numerical_features].to_numpy()
# Selecting with a list yields an (n, 1) column; flatten it to the 1-D label
# vector that fit_resample expects.
y = train_undersampled[target].to_numpy().ravel()

# The categorical columns come first in X, so their positions are 0..k-1.
categorical_indices = list(range(len(categorical_features)))

smotenc_sampler = SMOTENC(random_state=0,
                          categorical_features=categorical_indices,
                          sampling_strategy={'Y': n_sig_target, 'N': n_non_sig_target})

print('Performing SMOTENC')
X_res, y_res = smotenc_sampler.fit_resample(X, y)

N Samples in Train Dataset: 1023606
S&S samples after SMOTE: 511803
Non-S&S samples after SMOTE: 511803
Performing SMOTENC


In [7]:
# Rebuild a labelled DataFrame from the resampled arrays (feature columns in
# the same order they were passed to the sampler, plus the target) and save it.
feature_columns = categorical_features + numerical_features
train_res = pd.DataFrame(X_res, columns=feature_columns).assign(SIG_SUB=y_res)
train_res.to_csv('data/after_2010_train_smote_full.csv')

In [9]:
# stats for resampled data vs original data

original_full_train_data = pd.read_csv('data/after_2010_train_full.csv', index_col=0)
resampled_full_train_data = pd.read_csv('data/after_2010_train_smote_full.csv', index_col=0)

# Look the counts up by class label instead of unpacking value_counts()
# positionally: value_counts() orders rows by frequency, so positional
# unpacking silently assumes "N" is the more frequent class, and for the
# balanced (tied) resampled data the row order is not determined by the label.
orig_counts = original_full_train_data['SIG_SUB'].value_counts()
res_counts = resampled_full_train_data['SIG_SUB'].value_counts()

orig_sig, orig_not_sig = orig_counts['Y'], orig_counts['N']
res_sig, res_not_sig = res_counts['Y'], res_counts['N']

print(f'''Original Full Train Data:
S&S: {orig_sig}
Non-S&S: {orig_not_sig}
Ratio Non-S&S / S&S: {orig_not_sig / orig_sig : .3f}''')

print()

print(f'''SMOTE Resampled Full Train Data:
S&S: {res_sig}
Non-S&S: {res_not_sig}
Ratio Non-S&S / S&S: {res_not_sig / res_sig : .3f}''')

Original Full Train Data:
S&S: 237324
Non-S&S: 786282
Ratio Non-S&S / S&S:  3.313

SMOTE Resampled Full Train Data:
S&S: 511803
Non-S&S: 511803
Ratio Non-S&S / S&S:  1.000


In [10]:
# SMOTE for hyperparam train dataset
#
# The classes are rebalanced to a 50/50 split while keeping the total number
# of rows equal to len(train_hp):
#   1. randomly undersample the majority class (non-S&S, SIG_SUB == "N")
#   2. oversample the minority class (S&S, SIG_SUB == "Y") with SMOTENC

n_samples_hp = len(train_hp)
n_sig_target_hp = int(n_samples_hp / 2)
n_non_sig_target_hp = n_samples_hp - n_sig_target_hp
print(f'''N Samples in Hyperparam Train Dataset: {n_samples_hp}
S&S samples after SMOTE: {n_sig_target_hp}
Non-S&S samples after SMOTE: {n_non_sig_target_hp}''')

# randomly drop necessary samples from majority class (non-S&S)
# A dedicated, seeded generator is used (instead of the unseeded global
# np.random state) so the same rows are dropped on every Restart & Run All.
non_sig_index_hp = train_hp.query('SIG_SUB == "N"').index
current_non_sig_n_hp = len(non_sig_index_hp)
num_to_drop_hp = current_non_sig_n_hp - n_non_sig_target_hp
undersample_rng_hp = np.random.RandomState(0)
drop_indices_hp = undersample_rng_hp.choice(non_sig_index_hp, num_to_drop_hp, replace=False)
train_undersampled_hp = train_hp.drop(drop_indices_hp)

# oversample with SMOTE
categorical_features = ['VIOLATOR_TYPE_CD', 'MINE_TYPE', 'COAL_METAL_IND', 'PRIMARY_OR_MILL']
numerical_features = ['VIOLATOR_VIOLATION_CNT', 'VIOLATOR_INSPECTION_DAY_CNT', 'YEAR_OCCUR']
target = ['SIG_SUB']

X_hp = train_undersampled_hp[categorical_features + numerical_features].to_numpy()
# Selecting with a list yields an (n, 1) column; flatten it to the 1-D label
# vector that fit_resample expects.
y_hp = train_undersampled_hp[target].to_numpy().ravel()

# The categorical columns come first in X_hp, so their positions are 0..k-1.
categorical_indices = list(range(len(categorical_features)))

smotenc_sampler_hp = SMOTENC(random_state=0,
                          categorical_features=categorical_indices,
                          sampling_strategy={'Y': n_sig_target_hp, 'N': n_non_sig_target_hp})

print('Performing SMOTENC for hyperparameter train set')
X_res_hp, y_res_hp = smotenc_sampler_hp.fit_resample(X_hp, y_hp)

N Samples in Hyperparam Train Dataset: 818884
S&S samples after SMOTE: 409442
Non-S&S samples after SMOTE: 409442
Performing SMOTENC for hyperparameter train set


In [11]:
# Rebuild a labelled DataFrame from the resampled hyperparameter-train arrays
# (feature columns in the order they were passed to the sampler, plus the
# target) and save it.
feature_columns = categorical_features + numerical_features
train_res_hp = pd.DataFrame(X_res_hp, columns=feature_columns).assign(SIG_SUB=y_res_hp)
train_res_hp.to_csv('data/after_2010_train_smote_hp.csv')

In [12]:
# stats for resampled data vs original data (hyperparam train set)

original_hp_train_data = pd.read_csv('data/after_2010_train_hp.csv', index_col=0)
resampled_hp_train_data = pd.read_csv('data/after_2010_train_smote_hp.csv', index_col=0)

# Look the counts up by class label instead of unpacking value_counts()
# positionally: value_counts() orders rows by frequency, so positional
# unpacking silently assumes "N" is the more frequent class, and for the
# balanced (tied) resampled data the row order is not determined by the label.
orig_counts = original_hp_train_data['SIG_SUB'].value_counts()
res_counts = resampled_hp_train_data['SIG_SUB'].value_counts()

orig_sig, orig_not_sig = orig_counts['Y'], orig_counts['N']
res_sig, res_not_sig = res_counts['Y'], res_counts['N']

print(f'''Original Hyperparam Train Data:
S&S: {orig_sig}
Non-S&S: {orig_not_sig}
Ratio Non-S&S / S&S: {orig_not_sig / orig_sig : .3f}''')

print()

print(f'''SMOTE Resampled Hyperparam Train Data:
S&S: {res_sig}
Non-S&S: {res_not_sig}
Ratio Non-S&S / S&S: {res_not_sig / res_sig : .3f}''')

Original Hyperparam Train Data:
S&S: 189768
Non-S&S: 629116
Ratio Non-S&S / S&S:  3.315

SMOTE Resampled Hyperparam Train Data:
S&S: 409442
Non-S&S: 409442
Ratio Non-S&S / S&S:  1.000
