In [3]:
import violation_common
from copy import deepcopy
from importlib import reload
from scipy.stats import chi2_contingency, pointbiserialr, pearsonr
import pandas as pd
import numpy as np
# run this cell after updating violation_common
reload(violation_common)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

In [15]:
# Load the processed violation records through the project helper module and
# keep them in `data` for the cells below.
data = violation_common.get_processed_violation_data()

In [17]:
# Show the loaded records as this cell's output (the notebook renders a
# truncated preview of the first and last rows).
data

Unnamed: 0,VIOLATOR_TYPE_CD,MINE_TYPE,COAL_METAL_IND,SIG_SUB,PRIMARY_OR_MILL,PROPOSED_PENALTY,VIOLATOR_VIOLATION_CNT,VIOLATOR_INSPECTION_DAY_CNT,YEAR_OCCUR
0,Operator,Surface,M,N,Primary,55.0,0.0,0.0,2000
1,Operator,Surface,M,N,Primary,55.0,0.0,0.0,2000
2,Operator,Surface,M,N,Primary,55.0,0.0,0.0,2000
3,Operator,Surface,M,Y,Primary,196.0,0.0,0.0,2000
4,Operator,Surface,M,N,Primary,55.0,0.0,0.0,2000
...,...,...,...,...,...,...,...,...,...
2809262,Operator,Surface,M,Y,Primary,2282.0,14.0,5.0,2012
2809263,Operator,Surface,M,N,Primary,460.0,14.0,5.0,2012
2809264,Operator,Surface,M,N,Primary,207.0,14.0,6.0,2012
2809265,Operator,Surface,M,N,Primary,127.0,14.0,6.0,2012


In [19]:
# Keep only the rows whose YEAR_OCCUR is strictly greater than 2010.
after_2010 = data.query('YEAR_OCCUR > 2010')
# Cache the subset as CSV so it can be reloaded without rebuilding `data`.
# The row index is written as the first column (pandas default).
after_2010.to_csv('data/after_2010.csv')

In [4]:
# Reload the cached post-2010 subset. index_col=0 turns the first CSV column
# back into the row index instead of reading it as a data column.
after_2010 = pd.read_csv('data/after_2010.csv', index_col=0)

In [17]:
# Coefficient of variation of PROPOSED_PENALTY: population standard deviation
# (ddof=0, matching np.std's default) divided by the mean.
after_2010['PROPOSED_PENALTY'].std(ddof=0) / after_2010['PROPOSED_PENALTY'].mean()

4.721410093785144

In [34]:
# Summary statistics for PROPOSED_PENALTY and the class balance of SIG_SUB.

# Count each SIG_SUB label by name. Unpacking np.unique(...) positionally raises
# unless the column holds exactly two distinct values, and it depends on sort
# order to decide which count belongs to which label.
sig_sub_counts = after_2010['SIG_SUB'].value_counts()
num_sig = int(sig_sub_counts.get('Y', 0))
num_non_sig = int(sig_sub_counts.get('N', 0))
num_samples = len(after_2010)

pp_mean = np.mean(after_2010['PROPOSED_PENALTY'])
pp_median = np.median(after_2010['PROPOSED_PENALTY'])
pp_std = np.std(after_2010['PROPOSED_PENALTY'])
# Coefficient of variation: spread relative to the mean.
pp_cv = pp_std / pp_mean

# Class frequencies are taken over all rows, so they only sum to 1 when every
# row is labelled 'Y' or 'N'.
freq_sig = num_sig / num_samples
freq_non_sig = num_non_sig / num_samples

print('Proposed Penalty Stats:')
print(f'''n: {num_samples}
Mean: {pp_mean:.3f}
Median: {pp_median}
Std. Dev: {pp_std:.3f}
CV: {pp_cv:.3f}

S&S Stats:
S&S samples: {num_sig} ({freq_sig:.3f})
Non-S&S samples: {num_non_sig} ({freq_non_sig:.3f})''')

Proposed Pentalty Stats:
n: 1279508
Mean: 684.474
Median: 138.0
Std. Dev: 3224.218
CV: 4.711

S&S Stats:
S&S samples: 296136 (0.231)
Non-S&S samples: 983372 (0.769)


In [43]:
# Association between SIG_SUB and each categorical column: chi-square test of
# independence plus Cramér's V as an effect size.

categorical_cols = ['PRIMARY_OR_MILL', 'COAL_METAL_IND', 'MINE_TYPE', 'VIOLATOR_TYPE_CD']


def _chi2_with_cramers_v(target, feature):
    """Return the chi-square p-value, Cramér's V and statistic for two label columns.

    V = sqrt(chi2 / (n * (min(rows, cols) - 1))), where n is the total count in
    the contingency table of `target` against `feature`.
    """
    table = pd.crosstab(target, feature)
    # chi2_contingency is called with its defaults.
    # NOTE(review): per the scipy docs the default applies Yates' continuity
    # correction to 2x2 tables, so V for binary columns uses the corrected
    # statistic — confirm that is intended.
    stat, p_value, _dof, _expected = chi2_contingency(table)
    total = table.to_numpy().sum()
    smallest_dim = min(table.shape)
    cramers_v = np.sqrt(stat / (total * (smallest_dim - 1)))
    return {'p-value': p_value, 'V': cramers_v, 'stat': stat}


chi2_results_ss = {
    col: _chi2_with_cramers_v(after_2010['SIG_SUB'], after_2010[col])
    for col in categorical_cols
}

print('Chi Square Results SIG_SUB')
print(pd.DataFrame.from_dict(chi2_results_ss, orient='index'))

Chi Square Results
                        p-value         V         stat
PRIMARY_OR_MILL   2.932445e-139  0.022329   637.966983
COAL_METAL_IND    2.394633e-108  0.019549   488.963979
MINE_TYPE          6.158087e-53  0.013708   240.438488
VIOLATOR_TYPE_CD   0.000000e+00  0.047323  2865.400331


In [50]:
# Comparing SIG_SUB to the numerical columns with the point-biserial
# correlation; SIG_SUB is turned into a boolean indicator ('Y' -> True).

numerical_cols = ['VIOLATOR_INSPECTION_DAY_CNT', 'VIOLATOR_VIOLATION_CNT', 'YEAR_OCCUR']
point_biserial_results_ss = {
    col: dict(zip(
        ('R', 'p-value'),
        pointbiserialr(after_2010['SIG_SUB'] == 'Y', after_2010[col]),
    ))
    for col in numerical_cols
}

print("Point Biserial Results SIG_SUB")
print(pd.DataFrame.from_dict(point_biserial_results_ss, orient='index'))

Point Biserial Results SIG_SUB
                                    R       p-value
VIOLATOR_INSPECTION_DAY_CNT -0.011480  1.459256e-38
VIOLATOR_VIOLATION_CNT      -0.007927  3.054101e-19
YEAR_OCCUR                  -0.077026  0.000000e+00


In [52]:
# comparing PROPOSED_PENALTY to categorical columns
# TODO: this cell only defines the column list; no comparison against
# PROPOSED_PENALTY is computed here yet.

categorical_cols = ['PRIMARY_OR_MILL', 'COAL_METAL_IND', 'MINE_TYPE', 'VIOLATOR_TYPE_CD', 'SIG_SUB']

In [53]:
# Linear (Pearson) correlation between PROPOSED_PENALTY and each numerical column.
# NOTE(review): the summary statistics printed earlier show a mean far above the
# median for PROPOSED_PENALTY, so r may be driven by a few large penalties —
# consider confirming with a rank correlation.

numerical_cols = ['VIOLATOR_INSPECTION_DAY_CNT', 'VIOLATOR_VIOLATION_CNT', 'YEAR_OCCUR']
pearson_results_pp = {
    col: dict(zip(
        ('R', 'p-value'),
        pearsonr(after_2010['PROPOSED_PENALTY'], after_2010[col]),
    ))
    for col in numerical_cols
}

print("Pearson Correlation Results PROPOSED_PENALTY")
print(pd.DataFrame.from_dict(pearson_results_pp, orient='index'))

Pearson Correlation Results PROPOSED_PENALTY
                                    R  p-value
VIOLATOR_INSPECTION_DAY_CNT  0.061799      0.0
VIOLATOR_VIOLATION_CNT       0.103475      0.0
YEAR_OCCUR                  -0.037152      0.0


In [24]:
# Feature ranking for SIG_SUB by mutual information.

categorical_cols = ['PRIMARY_OR_MILL', 'COAL_METAL_IND', 'MINE_TYPE', 'VIOLATOR_TYPE_CD']
numerical_cols = ['VIOLATOR_INSPECTION_DAY_CNT', 'VIOLATOR_VIOLATION_CNT', 'YEAR_OCCUR']

# Build the feature frame with the categorical columns integer-encoded.
# assign() returns a new frame, so after_2010 itself is left untouched.
X = after_2010[categorical_cols + numerical_cols].assign(
    **{col: after_2010[col].factorize()[0] for col in categorical_cols}
)

# Integer-encode the target labels.
y = after_2010['SIG_SUB'].factorize()[0]

# Look the discrete columns up by label rather than assuming they occupy the
# first len(categorical_cols) positions, so reordering X cannot mislabel them.
# NOTE(review): the numerical columns (including YEAR_OCCUR) are passed as
# continuous features; the scikit-learn docs warn that treating a discrete
# feature as continuous usually gives incorrect estimates — confirm intended.
discrete_indices = [X.columns.get_loc(col) for col in categorical_cols]
mutual_info = mutual_info_classif(X, y, discrete_features=discrete_indices, random_state=0)

# Rank features from most to least informative without mutating in place.
mutual_info_series = pd.Series(mutual_info, index=X.columns).sort_values(ascending=False)
print(mutual_info_series)

YEAR_OCCUR                     0.023600
VIOLATOR_INSPECTION_DAY_CNT    0.001881
VIOLATOR_VIOLATION_CNT         0.001658
VIOLATOR_TYPE_CD               0.001054
PRIMARY_OR_MILL                0.000249
COAL_METAL_IND                 0.000191
MINE_TYPE                      0.000094
dtype: float64


In [5]:
# Feature ranking for PROPOSED_PENALTY by mutual information.

categorical_cols = ['PRIMARY_OR_MILL', 'COAL_METAL_IND', 'MINE_TYPE', 'VIOLATOR_TYPE_CD', 'SIG_SUB']
numerical_cols = ['VIOLATOR_INSPECTION_DAY_CNT', 'VIOLATOR_VIOLATION_CNT', 'YEAR_OCCUR']

# Build the feature frame with the categorical columns integer-encoded.
# assign() returns a new frame, so after_2010 itself is left untouched.
X = after_2010[categorical_cols + numerical_cols].assign(
    **{col: after_2010[col].factorize()[0] for col in categorical_cols}
)

# Continuous regression target, copied so later edits to y cannot touch after_2010.
y = after_2010['PROPOSED_PENALTY'].copy()

# Look the discrete columns up by label rather than assuming they occupy the
# first len(categorical_cols) positions, so reordering X cannot mislabel them.
# NOTE(review): the numerical columns (including YEAR_OCCUR) are passed as
# continuous features; the scikit-learn docs warn that treating a discrete
# feature as continuous usually gives incorrect estimates — confirm intended.
discrete_indices = [X.columns.get_loc(col) for col in categorical_cols]
mutual_info = mutual_info_regression(X, y, discrete_features=discrete_indices, random_state=0)

# Rank features from most to least informative without mutating in place.
mutual_info_series = pd.Series(mutual_info, index=X.columns).sort_values(ascending=False)
print(mutual_info_series)

YEAR_OCCUR                     1.437295
VIOLATOR_VIOLATION_CNT         0.318062
VIOLATOR_INSPECTION_DAY_CNT    0.205223
SIG_SUB                        0.155703
MINE_TYPE                      0.048345
PRIMARY_OR_MILL                0.047417
COAL_METAL_IND                 0.037030
VIOLATOR_TYPE_CD               0.008511
dtype: float64


In [14]:
# Feature names in the order of the ranked mutual-information series.
list(mutual_info_series.index)

['YEAR_OCCUR',
 'VIOLATOR_VIOLATION_CNT',
 'VIOLATOR_INSPECTION_DAY_CNT',
 'SIG_SUB',
 'MINE_TYPE',
 'PRIMARY_OR_MILL',
 'COAL_METAL_IND',
 'VIOLATOR_TYPE_CD']