Import Relevant Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as SNS

from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, \
                            plot_confusion_matrix, plot_roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier

import xgboost

In [None]:
# Read in the cleaned data file, using the first CSV column as the row index
df = pd.read_csv('final_data.csv', index_col=0)
# Preview the first rows to sanity-check the load
df.head()

In [None]:
# Separate the target column (status_group) from the predictor columns
y = df['status_group']
X = df.drop(columns='status_group')

In [None]:
# Hold out a test set. test_size=0.25 is scikit-learn's default, written out
# here so the 75/25 split is explicit; the fixed seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Preview the first rows of the training predictors
X_train.head()

In [None]:
# Confirm that our training data set is 3x larger than the testing data set
print(X_train.shape, X_test.shape)

# Modeling

### Dummy Model

First, we will create a dummy model that always predicts the most frequent class. In this case, our dummy model will predict that all of the water wells are functional.

In [None]:
# Instantiate our dummy classifier, which always predicts the most frequent class
dummy = DummyClassifier(strategy='most_frequent')

In [None]:
# Fit the dummy model on our training data
dummy.fit(X_train, y_train)

In [None]:
# Inspect the accuracy of our initial dummy model on the training data
dummy.score(X_train, y_train)

Our dummy model accuracy of 54.6% reflects the distribution of our target. Thus, if we always predict that a water well is functional, we will achieve 54.6% accuracy.

### Set Up Column Transformer

In [None]:
# Sub-pipelines for the two kinds of columns:
#   numeric     -> standardized with StandardScaler
#   categorical -> one-hot encoded as a dense array, ignoring unknown categories
numeric_steps = [('ss', StandardScaler())]
categorical_steps = [('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))]

subpipe_num = Pipeline(steps=numeric_steps)
subpipe_cat = Pipeline(steps=categorical_steps)

In [None]:
# Route columns (by positional index) to the matching sub-pipeline:
# numeric columns are scaled, categorical columns are one-hot encoded.
numeric_col_idx = [0, 2, 17]
categorical_col_idx = [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

CT = ColumnTransformer(
    transformers=[
        ('subpipe_num', subpipe_num, numeric_col_idx),
        ('subpipe_cat', subpipe_cat, categorical_col_idx),
    ]
)

### Create Pipeline for Simple Models

In [None]:
# One pipeline per baseline model; each applies the shared column transformer
# before handing the transformed features to its estimator.
logreg_pipe = Pipeline([
    ('CT', CT),
    ('logreg', LogisticRegression(random_state=42)),
])

knn_pipe = Pipeline([
    ('CT', CT),
    ('knn', KNeighborsClassifier()),
])

dtc_pipe = Pipeline([
    ('CT', CT),
    ('dtc', DecisionTreeClassifier(random_state=42)),
])


In [None]:
# Fit each baseline pipeline (logistic regression, KNN, decision tree) to our training data
logreg_pipe.fit(X_train, y_train)
knn_pipe.fit(X_train, y_train)
dtc_pipe.fit(X_train, y_train)

In [None]:
# Get initial accuracy scores for each of our simple classifier models.
# Note: these are computed on the training data the models were fit on.
logreg_score, knn_score, dtc_score = (
    pipe.score(X_train, y_train)
    for pipe in (logreg_pipe, knn_pipe, dtc_pipe)
)

In [None]:
# Report each baseline accuracy as a percentage rounded to two decimals
for model_label, model_score in [('Logistic Regression', logreg_score),
                                 ('KNearestNeighbors', knn_score),
                                 ('DecisionTree', dtc_score)]:
    print(f'{model_label} Accuracy: {round(model_score * 100, 2)}%')

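Our initial numbers appear to be relatively good. Note, however, that these accuracy scores are computed on the same training data the models were fit on, so they are optimistic. In particular, we expect our decision tree classifier to be overfit given its high accuracy score. We will dive deeper into more complex models below to maximize our accuracy and recall. 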

### Random Forest

In [None]:
# Random forest pipeline: shared column transformer followed by the classifier
rfc_steps = [('CT', CT), ('rfc', RandomForestClassifier(random_state=42))]
rfc_pipe = Pipeline(steps=rfc_steps)

In [None]:
# Fit our random forest pipeline (default hyperparameters) to the training data
rfc_pipe.fit(X_train, y_train)

In [None]:
# Accuracy of the random forest on the same training data it was fit on
rfc_pipe.score(X_train, y_train)

In [None]:
# 5-fold cross-validation scores on the training data, for comparison with the training score above
cross_val_score(estimator=rfc_pipe, X=X_train, y=y_train, cv=5)

Given that our cross-validation scores were significantly lower than the training accuracy of our random forest classifier, we can say that this model is overfit to the training data. We will perform a grid search in order to optimize the hyperparameters of this classifier.

In [None]:
# Hyperparameter grid for the first random forest search; the 'rfc__' prefix
# targets the 'rfc' step of the pipeline.
params = dict(
    rfc__n_estimators=[50, 100, 150],
    rfc__min_samples_split=[2, 10, 50],
    rfc__max_depth=[5, 10, 15],
)

# 5-fold cross-validated grid search over the random forest pipeline
grid_rfc = GridSearchCV(estimator=rfc_pipe, param_grid=params, cv=5)

In [None]:
# Fit our first random forest grid search object to the training data
grid_rfc.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found by the first grid search
grid_rfc.best_params_

In [None]:
# Cross-validated score of the best combination from the first grid search
grid_rfc.best_score_

From our initial grid search, we attained an accuracy of 78.3% using a max_depth of 15, a min_samples_split of 2, and an n_estimators of 100. 

In [None]:
# Tabulate the results of every parameter combination tried in the first grid search
pd.DataFrame(grid_rfc.cv_results_)

Based on our initial grid search results, we can infer that a larger max depth is needed to improve accuracy. 

In [None]:
# Second grid: keep the same tree counts but explore deeper trees
params2 = dict(
    rfc__n_estimators=[50, 100, 150],
    rfc__max_depth=[15, 25, 50],
)

# 5-fold cross-validated grid search over the random forest pipeline
grid_rfc2 = GridSearchCV(estimator=rfc_pipe, param_grid=params2, cv=5)

In [None]:
# Fit our second random forest grid search (updated parameter grid) to the training data
grid_rfc2.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found by the second grid search
grid_rfc2.best_params_

In [None]:
# Cross-validated score of the best combination from the second grid search
grid_rfc2.best_score_

From our second grid search, we attained an accuracy of 79.0% using a max_depth of 25 and an n_estimators of 150. 

In [None]:
# Tabulate the results of every parameter combination tried in the second grid search
pd.DataFrame(grid_rfc2.cv_results_)

Based on our new grid search, we can see that a maximum depth greater than 15 but less than 50 is ideal. We will also try running the grid search with a higher number of trees in the random forest to improve accuracy.

In [None]:
# Third grid: narrow max_depth to 20-30 and try larger forests
params3 = dict(
    rfc__n_estimators=[100, 150, 200],
    rfc__max_depth=[20, 25, 30],
)

# 5-fold cross-validated grid search over the random forest pipeline
grid_rfc3 = GridSearchCV(estimator=rfc_pipe, param_grid=params3, cv=5)

In [None]:
# Fit the third grid search object to our training data, using the updated
# grid values for n_estimators and max_depth
grid_rfc3.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found by the third grid search
grid_rfc3.best_params_

In [None]:
# Cross-validated score of the best combination from the third grid search
grid_rfc3.best_score_

From our third grid search, we attained an accuracy of 79.3% using a max_depth of 20 and an n_estimators of 150. 

In [None]:
# Tabulate the results of every parameter combination tried in the third grid search
pd.DataFrame(grid_rfc3.cv_results_)

In [None]:
# Fourth grid: fine-tune max_depth around 20 and vary max_features.
# 'auto' is intentionally not listed: for RandomForestClassifier it is an alias
# of 'sqrt', so including both fits identical models twice (and 'auto' is
# deprecated in newer scikit-learn releases).
params4 = {'rfc__n_estimators': [100, 150],
           'rfc__max_depth': [18, 20, 22],
           'rfc__max_features': ['sqrt', 0.2]}

# set up GridSearchCV object (5-fold cross-validation over the random forest pipeline)
grid_rfc4 = GridSearchCV(rfc_pipe, param_grid=params4, cv=5)

In [None]:
# Fit the fourth random forest grid search object to the training data
grid_rfc4.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found by the fourth grid search
grid_rfc4.best_params_

In [None]:
# Cross-validated score of the best combination from the fourth grid search
grid_rfc4.best_score_

From our fourth grid search, we attained an accuracy of 79.4% using a max_depth of 20, a max_features of 0.2, and an n_estimators of 150. 

In [None]:
# Tabulate the results of every parameter combination tried in the fourth grid search
pd.DataFrame(grid_rfc4.cv_results_)

### XGBoost


Here, we run an initial XGBoost model.

In [None]:
from xgboost import XGBClassifier

In [None]:
# Instantiate the XGBoost pipeline on top of the shared column transformer.
# random_state is fixed, matching the other seeded models in this notebook, so
# that runs stay reproducible when the grid search below uses subsample < 1.
XGB_pipe = Pipeline(steps=[('CT', CT),
                           ('XGB', XGBClassifier(random_state=42))])

In [None]:
#Fit XGBClassifier

In [None]:
# Fit the XGBoost pipeline to the training data
XGB_pipe.fit(X_train, y_train)

In [None]:
# Predict on the training set with the fitted XGBoost pipeline
training_preds = XGB_pipe.predict(X_train)

# Training accuracy and per-class recall (average=None yields one value per class)
training_accuracy = accuracy_score(y_true=y_train, y_pred=training_preds)
training_recall = recall_score(y_true=y_train, y_pred=training_preds, average=None)

print(f'Training Accuracy: {training_accuracy * 100:.4}%')
print(training_recall)

In [None]:
# Large hyperparameter grid for the XGBoost step; the 'XGB__' prefix targets the
# 'XGB' step of the pipeline. The ranges expand to max_depth 3/5/7/9 and
# min_child_weight 1/3/5/7.
paramsXGB = dict(
    XGB__learning_rate=[0.1, 0.2],
    XGB__max_depth=range(3, 10, 2),
    XGB__min_child_weight=range(1, 8, 2),
    XGB__gamma=[0, .1, .2],
    XGB__subsample=[.5, .75, 1],
)

In [None]:
# Set up the 5-fold grid search over the XGBoost pipeline
grid_XGB = GridSearchCV(XGB_pipe, param_grid=paramsXGB, cv=5)

# NOTE: the grid search object is deliberately not fitted in this cell.
# Fitting it to our training data to check the hyperparameters did not
# complete because it took too long to run.

In [None]:
# Report the best hyperparameters found by the XGBoost grid search and the
# tuned model's performance on the training data.
# The grid search is not fitted earlier in the notebook (it took too long), so
# only report results when grid_XGB has actually been fitted; otherwise
# best_params_ does not exist and this cell would raise.
if hasattr(grid_XGB, 'best_params_'):
    best_parameters = grid_XGB.best_params_

    print('Grid Search found the following optimal parameters: ')
    for param_name in sorted(best_parameters.keys()):
        print('%s: %r' % (param_name, best_parameters[param_name]))

    # Predict with the fitted grid search object (previously referenced an
    # undefined name, grid_clf)
    training_preds = grid_XGB.predict(X_train)
    training_accuracy = accuracy_score(y_train, training_preds)
    # The target has more than two classes, so the default binary averaging
    # would raise; request per-class recall as in the earlier XGBoost cell
    training_recall = recall_score(y_train, training_preds, average=None)

    print('')
    print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
    print(training_recall)
else:
    print('grid_XGB has not been fitted yet; run grid_XGB.fit(X_train, y_train) first.')


In [None]:
# Ran some recall scores; realized we may need to adjust the data because the
# "functional but needs repair" class is under-represented

In [None]:
# Number of training rows in each target class
y_train.value_counts()

In [None]:
# Total number of training rows
y_train.shape

In [None]:
#Current breakdown of our percentages

In [None]:
# Share of the least frequent target class in the training data, computed from
# y_train rather than from hard-coded counts (previously 3225/44271)
y_train.value_counts(normalize=True).iloc[-1]

In [None]:
# Share of the second most frequent target class in the training data, computed
# from y_train rather than from hard-coded counts (previously 16887/44271)
y_train.value_counts(normalize=True).iloc[1]

In [None]:
# Share of the most frequent target class in the training data, computed from
# y_train rather than from hard-coded counts (previously 24159/44271)
y_train.value_counts(normalize=True).iloc[0]