In [3]:
import sqlite3
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split


In [4]:
db = r'C:/Users/jonat/UO_Bootcamp/Group_project/git_Water_Quality_Analysis/Water_Quality_Analysis/Database/database.sqlite3'
# Connect to SQLite database
conn = sqlite3.connect(db)
  
# Create cursor object
cursor = conn.cursor()

#Read in the Data from the DB
df = pd.read_sql_query("SELECT * FROM Census_Data INNER JOIN Contaminant_Summary on Census_Data.county_FIPS = Contaminant_Summary.county_FIPS",conn)

In [5]:
#Get the target binary data from the .csv file that was generated in the Priority_algo_dev.ipynb
target = pd.read_csv('data_with_binary_priority.csv', usecols=['Priority'])

In [6]:
#The problem is the imbalanced data and this will need to be addressed
target.Priority.value_counts()

0    616
1    266
Name: Priority, dtype: int64

In [7]:
df.sample(20)

Unnamed: 0,county_FIPS,Geographic_Area_Name,County,GEOID,Total_Population,White,Black,Native,Asian,Pacific_Islander,...,Shannon_Race_DI,Shannon_Ethnic_DI,Gini_Index,County_FIPS,Num_Contaminants,Sum_Population_Served,Sum_ContaminantFactor,Min_Contaminant_Factor,Max_Contaminant_Factor,Avg_Contaminant_Factor
559,26133,"Osceola County, Michigan",Osceola County,0500000US26133,22891,21414,166,151,35,7,...,0.378641,0.088039,0.4086,26133,3,5719,1507,266,856,502.33
500,26007,"Alpena County, Michigan",Alpena County,0500000US26007,28907,27177,100,123,120,13,...,0.348201,0.075467,0.4332,26007,3,15930,3315,1109,804,1105.0
663,29221,"Washington County, Missouri",Washington County,0500000US29221,23514,21465,611,80,41,2,...,0.431321,0.054405,0.4966,29221,8,6463,1762,0,5,220.25
527,26065,"Ingham County, Michigan",Ingham County,0500000US26065,284900,198552,35581,1536,16523,125,...,1.231899,0.292525,0.4737,26065,5,44333,1503,148,808,300.6
275,55087,"Outagamie County, Wisconsin",Outagamie County,0500000US55087,190705,164009,3054,3144,6619,113,...,0.765223,0.196778,0.4149,55087,7,134452,5730,372,1541,818.57
422,17155,"Putnam County, Illinois",Putnam County,0500000US17155,5637,5175,27,14,19,5,...,0.550279,0.231715,0.3815,17155,5,2910,3754,110,444,750.8
38,38101,"Ward County, North Dakota",Ward County,0500000US38101,69919,57038,3025,1707,1121,134,...,0.927601,0.242232,0.4314,38101,7,3248,7497,557,1978,1071.0
730,41039,"Lane County, Oregon",Lane County,0500000US41039,382971,309194,4661,4675,9621,1016,...,0.975918,0.322106,0.462,41039,3,12901,468,202,46,156.0
584,29015,"Benton County, Missouri",Benton County,0500000US29015,19394,18031,77,109,58,3,...,0.386758,0.081969,0.4961,29015,21,4070,419,0,97,19.95
462,20073,"Greenwood County, Kansas",Greenwood County,0500000US20073,6016,5494,29,36,37,2,...,0.50146,0.14808,0.4675,20073,7,3955,10918,1386,918,1559.71


In [8]:
df.dtypes

county_FIPS                 int64
Geographic_Area_Name       object
County                     object
GEOID                      object
Total_Population            int64
White                       int64
Black                       int64
Native                      int64
Asian                       int64
Pacific_Islander            int64
Other                       int64
Two_or_more_Races           int64
Hispanic                    int64
Not_Hispanic                int64
Not_White                   int64
pct_White                 float64
pct_Black                 float64
pct_Native                float64
pct_Asian                 float64
pct_Pacific_Islander      float64
pct_Other                 float64
pct_Not_White             float64
pct_Hispanic              float64
pct_Not_Hispanic          float64
pct_Two_or_more_Races     float64
Simpson_Race_DI           float64
Simpson_Ethnic_DI         float64
Shannon_Race_DI           float64
Shannon_Ethnic_DI         float64
Gini_Index    

In [9]:
df.columns

Index(['county_FIPS', 'Geographic_Area_Name', 'County', 'GEOID',
       'Total_Population', 'White', 'Black', 'Native', 'Asian',
       'Pacific_Islander', 'Other', 'Two_or_more_Races', 'Hispanic',
       'Not_Hispanic', 'Not_White', 'pct_White', 'pct_Black', 'pct_Native',
       'pct_Asian', 'pct_Pacific_Islander', 'pct_Other', 'pct_Not_White',
       'pct_Hispanic', 'pct_Not_Hispanic', 'pct_Two_or_more_Races',
       'Simpson_Race_DI', 'Simpson_Ethnic_DI', 'Shannon_Race_DI',
       'Shannon_Ethnic_DI', 'Gini_Index', 'County_FIPS', 'Num_Contaminants',
       'Sum_Population_Served', 'Sum_ContaminantFactor',
       'Min_Contaminant_Factor', 'Max_Contaminant_Factor',
       'Avg_Contaminant_Factor'],
      dtype='object')

## Feature Selection

In [10]:
df_model = df.drop(columns=['county_FIPS', 
                            'Geographic_Area_Name', 
                            'County', 'GEOID',
                            'Total_Population',
                            'White', 
                            'Black', 
                            'Native', 
                            'Asian',
                            'Pacific_Islander', 
                            'Other', 
                            'Two_or_more_Races', 
                            'Hispanic',
                            'Not_Hispanic', 
                            'Not_White',
                            'County_FIPS',
                            'Sum_Population_Served',
                            'Min_Contaminant_Factor', 
                            'Max_Contaminant_Factor',
                            ])

In [11]:
#Check to make sure the data types don't need fixing
df_model.dtypes

pct_White                 float64
pct_Black                 float64
pct_Native                float64
pct_Asian                 float64
pct_Pacific_Islander      float64
pct_Other                 float64
pct_Not_White             float64
pct_Hispanic              float64
pct_Not_Hispanic          float64
pct_Two_or_more_Races     float64
Simpson_Race_DI           float64
Simpson_Ethnic_DI         float64
Shannon_Race_DI           float64
Shannon_Ethnic_DI         float64
Gini_Index                float64
Num_Contaminants            int64
Sum_ContaminantFactor       int64
Avg_Contaminant_Factor    float64
dtype: object

In [12]:
#Check for Nan even though cleaning scripts should have excluded them by this stage
df_model.isna().sum()

pct_White                 0
pct_Black                 0
pct_Native                0
pct_Asian                 0
pct_Pacific_Islander      0
pct_Other                 0
pct_Not_White             0
pct_Hispanic              0
pct_Not_Hispanic          0
pct_Two_or_more_Races     0
Simpson_Race_DI           0
Simpson_Ethnic_DI         0
Shannon_Race_DI           0
Shannon_Ethnic_DI         0
Gini_Index                0
Num_Contaminants          0
Sum_ContaminantFactor     0
Avg_Contaminant_Factor    0
dtype: int64

## Split the data into training and test data

In [13]:
# Create our features
X = df_model
# Create our target
y = target

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [15]:
#Check the imbalance in the training set
y_train.value_counts()

Priority
0           460
1           201
dtype: int64

## Ensemble Learners

### Balanced Random Forest Classifier

In [16]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1) 
brf_model.fit(X_train,y_train)

BalancedRandomForestClassifier(random_state=1)

In [17]:
# Calculated the balanced accuracy score
y_pred = brf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9717948717948718

In [18]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index=["Actual High-Priority", "Actual Low-Priority"],
    columns=["Predicted High-Priority", "Predicted Low-Priority"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted High-Priority,Predicted Low-Priority
Actual High-Priority,152,4
Actual Low-Priority,2,63


In [19]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.97      0.97      0.98      0.97      0.94       156
          1       0.94      0.97      0.97      0.95      0.97      0.94        65

avg / total       0.97      0.97      0.97      0.97      0.97      0.94       221



In [20]:
# List the features sorted in descending order by feature importance
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.21577329250084878, 'pct_Hispanic'),
 (0.18270331734030493, 'Simpson_Ethnic_DI'),
 (0.16933527790291714, 'Shannon_Ethnic_DI'),
 (0.14741969379439726, 'pct_Not_Hispanic'),
 (0.07618300294105276, 'pct_Other'),
 (0.06788945084338857, 'Shannon_Race_DI'),
 (0.04540258450048268, 'pct_Not_White'),
 (0.02858899205530442, 'Simpson_Race_DI'),
 (0.019062721712183664, 'pct_White'),
 (0.013419859566826287, 'pct_Asian'),
 (0.010463765501046053, 'pct_Two_or_more_Races'),
 (0.00537416069877027, 'pct_Black'),
 (0.005029362553335992, 'Sum_ContaminantFactor'),
 (0.003813001607423072, 'pct_Native'),
 (0.003033713296187055, 'Gini_Index'),
 (0.002441597706863405, 'Num_Contaminants'),
 (0.002387526716818439, 'Avg_Contaminant_Factor'),
 (0.0016786787618492632, 'pct_Pacific_Islander')]

### Easy Ensemble AdaBoost Classifier

In [21]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier 
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train,y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [22]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9685897435897436

In [23]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index=["Actual High-Priority", "Actual Low-Priority"],
    columns=["Predicted High-Priority", "Predicted Low-Priority"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted High-Priority,Predicted Low-Priority
Actual High-Priority,151,5
Actual Low-Priority,2,63


In [24]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.97      0.97      0.98      0.97      0.94       156
          1       0.93      0.97      0.97      0.95      0.97      0.94        65

avg / total       0.97      0.97      0.97      0.97      0.97      0.94       221



### Naive Random Oversampling

In [25]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
# Resample the training data with the RandomOversampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

y_resampled.value_counts()

Priority
0           460
1           460
dtype: int64

In [26]:
from sklearn.linear_model import LogisticRegression
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [27]:
# make predictions
y_pred = model.predict(X_test)

In [28]:
from sklearn.metrics import balanced_accuracy_score
#Calculate the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.8987179487179486

In [29]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index=["Actual High-Priority", "Actual Low-Priority"],
    columns=["Predicted High-Priority", "Predicted Low-Priority"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted High-Priority,Predicted Low-Priority
Actual High-Priority,134,22
Actual Low-Priority,4,61


In [30]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.86      0.94      0.91      0.90      0.80       156
          1       0.73      0.94      0.86      0.82      0.90      0.81        65

avg / total       0.90      0.88      0.92      0.89      0.90      0.80       221



### SMOTE Oversampling

In [31]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
y_resampled.value_counts()

Priority
0           460
1           460
dtype: int64

In [32]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [33]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9019230769230769

In [34]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index=["Actual High-Priority", "Actual Low-Priority"],
    columns=["Predicted High-Priority", "Predicted Low-Priority"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted High-Priority,Predicted Low-Priority
Actual High-Priority,135,21
Actual Low-Priority,4,61


In [35]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.87      0.94      0.92      0.90      0.81       156
          1       0.74      0.94      0.87      0.83      0.90      0.82        65

avg / total       0.90      0.89      0.92      0.89      0.90      0.81       221



## Undersampling


In [36]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'Priority': 1})

In [37]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [38]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5935897435897436

In [39]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index=["Actual High-Priority", "Actual Low-Priority"],
    columns=["Predicted High-Priority", "Predicted Low-Priority"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted High-Priority,Predicted Low-Priority
Actual High-Priority,118,38
Actual Low-Priority,37,28


In [40]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.76      0.76      0.43      0.76      0.57      0.34       156
          1       0.42      0.43      0.76      0.43      0.57      0.32        65

avg / total       0.66      0.66      0.53      0.66      0.57      0.33       221



## Combination (Over and Under) Sampling


In [41]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
y_resampled.value_counts()

Priority
1           192
0           158
dtype: int64

In [42]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [43]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


0.8865384615384615

In [44]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index=["Actual High-Priority", "Actual Low-Priority"],
    columns=["Predicted High-Priority", "Predicted Low-Priority"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted High-Priority,Predicted Low-Priority
Actual High-Priority,135,21
Actual Low-Priority,6,59


In [45]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.96      0.87      0.91      0.91      0.89      0.78       156
          1       0.74      0.91      0.87      0.81      0.89      0.79        65

avg / total       0.89      0.88      0.90      0.88      0.89      0.78       221



## Naive Bayes Classifier

In [46]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

GaussianNB()

In [47]:
# Calculated the balanced accuracy score
y_pred = gnb.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5698717948717948

## XGBoost

In [48]:
from xgboost import XGBClassifier
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=6,
              max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [49]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9814102564102565

In [50]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index=["Actual High-Priority", "Actual Low-Priority"],
    columns=["Predicted High-Priority", "Predicted Low-Priority"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted High-Priority,Predicted Low-Priority
Actual High-Priority,155,1
Actual Low-Priority,2,63
