Import Relevant Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as SNS

from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, \
                            plot_confusion_matrix, plot_roc_curve, roc_auc_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

In [2]:
# Read in the cleaned data file; index_col=0 uses the first CSV column as the row index
df = pd.read_csv('final_data.csv', index_col=0)
# Preview the first rows to confirm the load
df.head()

Unnamed: 0,amount_tsh,funder,gps_height,installer,basin,region_code,district_code,public_meeting,scheme_management,permit,extraction_type_class,management,payment,water_quality,quantity,source,waterpoint_type,status_group,age
0,6000.0,Other,1390,Other,Lake Nyasa,11,5,True,VWC,False,gravity,vwc,pay annually,soft,enough,spring,communal standpipe,functional,23
1,0.0,Other,1399,Other,Lake Victoria,20,2,Unknown,Other,True,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,functional,12
2,25.0,Other,686,Other,Pangani,21,4,True,VWC,True,gravity,vwc,pay per bucket,soft,enough,dam,communal standpipe multiple,functional,13
3,0.0,Unicef,263,Other,Ruvuma / Southern Coast,90,63,True,VWC,True,submersible,vwc,never pay,soft,dry,machine dbh,communal standpipe multiple,non functional,36
4,0.0,Other,0,Other,Lake Victoria,18,1,True,Unknown,True,gravity,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,functional,22


In [3]:
# Separate the target column (status_group) from the predictor columns
y = df['status_group']
X = df.drop(columns='status_group')

In [4]:
# Split the data into training and testing sets.
# No test_size is passed, so scikit-learn's default 75/25 split applies;
# random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
X_train.head()

Unnamed: 0,amount_tsh,funder,gps_height,installer,basin,region_code,district_code,public_meeting,scheme_management,permit,extraction_type_class,management,payment,water_quality,quantity,source,waterpoint_type,age
34904,5.0,Germany Republi,1376,CES,Pangani,3,5,True,Water Board,True,gravity,water board,pay per bucket,soft,enough,spring,communal standpipe,16
4442,0.0,Other,947,Other,Ruvuma / Southern Coast,10,2,True,VWC,True,gravity,vwc,never pay,soft,enough,river,communal standpipe,35
11884,0.0,Other,227,Other,Ruvuma / Southern Coast,8,3,True,VWC,False,gravity,vwc,never pay,unknown,dry,rainwater harvesting,communal standpipe,20
38675,0.0,Hesawa,1152,DWE,Lake Victoria,19,1,True,Unknown,True,other,vwc,never pay,soft,insufficient,shallow well,other,14
34585,50.0,Government Of Tanzania,248,DWE,Wami / Ruvu,6,3,True,Water Board,False,gravity,water board,pay per bucket,salty,insufficient,dam,communal standpipe,14


In [6]:
# Confirm that our training data set is 3x larger than the testing data set
print(X_train.shape, X_test.shape)

(44271, 18) (14757, 18)


In [7]:
y.value_counts(normalize=True)

functional                 0.543030
non functional             0.383852
functional needs repair    0.073118
Name: status_group, dtype: float64

It appears that our distribution of target values is relatively uneven; only 7% of our data represents water wells that are functional but need repair. We will run baseline models to determine if we need to use SMOTE to oversample these observations.

# Modeling

### Dummy Model

First, we will create a dummy model that always predicts the most frequent class. In this case, our dummy model will predict that all of the water wells are functional.

In [8]:
# Instantiate our dummy classifier, which always predicts the most frequent class
dummy = DummyClassifier(strategy='most_frequent')

In [9]:
# Fit the model on our training data
dummy.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

In [10]:
# Inspect the accuracy of our initial dummy model
dummy.score(X_train, y_train)

0.5450294775360846

Our dummy model accuracy of 54.5% reflects the distribution of our target. Thus, if we always predict that a water well is functional, we will achieve 54.5% accuracy.

### Set Up Column Transformer

In [11]:
# Create our sub pipelines for numeric and categorical data
# Numeric features: standardize with StandardScaler
subpipe_num = Pipeline(steps=[('ss', StandardScaler())])
# Nominal features: one-hot encode to a dense array; categories unseen during fit are ignored
subpipe_cat = Pipeline(steps=[('ohe', OneHotEncoder(sparse=False,
                                                    handle_unknown='ignore'))])
# Feature(s) to be ordinal-encoded
subpipe_ord = Pipeline(steps=[('ord', OrdinalEncoder())])
# SMOTE oversampler (a sampler, not a sub pipeline despite the name); seeded for reproducibility
subpipe_smote = SMOTE(sampling_strategy='auto', random_state=42)

In [12]:
# Include our sub pipelines in a column transformer to OHE our categorical variables and scale our numerical variables.
# Columns are selected by position in X (column order as displayed by X_train.head()):
#   [0, 2, 17]                  -> amount_tsh, gps_height, age   (scaled)
#   [9]                         -> permit                        (ordinal-encoded)
#   remaining positions 1..16   -> all other columns             (one-hot encoded)
# NOTE(review): positional selection will pick the wrong columns if the column order of the
# input file ever changes -- confirm against the CSV before reuse.
CT = ColumnTransformer(transformers=[('subpipe_num', subpipe_num, [0,2,17]), 
                                     ('subpipe_cat', subpipe_cat, [1,3,4,5,6,7,8,10,11,12,13,14,15,16]), 
                                     ('subpipe_ord', subpipe_ord, [9])])

### Create Pipeline for Simple Models

In [None]:
# Baseline pipelines: each runs the shared column transformer, then one simple classifier
logreg_pipe = Pipeline(steps=[
    ('CT', CT),
    ('logreg', LogisticRegression(random_state=42)),
])

knn_pipe = Pipeline(steps=[
    ('CT', CT),
    ('knn', KNeighborsClassifier()),
])

dtc_pipe = Pipeline(steps=[
    ('CT', CT),
    ('dtc', DecisionTreeClassifier(random_state=42)),
])


In [None]:
# Fit each baseline pipeline (preprocessing + classifier) to our training data
logreg_pipe.fit(X_train, y_train)
knn_pipe.fit(X_train, y_train)
dtc_pipe.fit(X_train, y_train)

In [None]:
# Get initial accuracy scores for each of our simple classifier models.
# These are computed on the same training data the models were fit on, so they are
# optimistic and say nothing yet about performance on unseen data.
logreg_score = logreg_pipe.score(X_train, y_train)
knn_score = knn_pipe.score(X_train, y_train)
dtc_score = dtc_pipe.score(X_train, y_train)

In [None]:
# Print out initial (training-set) accuracy scores as percentages rounded to two decimals
print(f'Logistic Regression Accuracy: {round(logreg_score * 100, 2)}%')
print(f'KNearestNeighbors Accuracy: {round(knn_score * 100, 2)}%')
print(f'DecisionTree Accuracy: {round(dtc_score * 100, 2)}%')

Our initial numbers appear to be relatively good. We expect our decision tree classifier to be overfit given the high accuracy score. We will dive deeper into more complex models below to maximize our accuracy and recall. 

### Random Forest

In [None]:
# Create our random forest pipeline: the column transformer followed by a
# RandomForestClassifier (seeded with random_state=42 for reproducibility)
rfc_pipe = Pipeline(steps=[('CT', CT),
                           ('rfc', RandomForestClassifier(random_state=42))])

In [None]:
# Fit our random forest classifier to the training data
rfc_pipe.fit(X_train, y_train)

In [None]:
rfc_pipe.score(X_train, y_train)

In [None]:
# 5-fold cross-validation of the untuned random forest pipeline on the training data
cross_val_score(estimator=rfc_pipe, X=X_train, y=y_train, cv=5)

Given that our cross validation scores were significantly lower than the accuracy score from our single random forest classifier, we can say that this model is overfit to the training data. We will perform a grid search in order to optimize the hyper parameters of this classifier.

In [None]:
# create our grid for the grid search parameters
# (keys use the '<step name>__<parameter>' form to reach the 'rfc' step of the pipeline)
params = {'rfc__n_estimators': [50, 100, 150],
          'rfc__min_samples_split': [2, 10, 50],
          'rfc__max_depth': [5, 10, 15]}

# set up GridSearchCV object: 3 x 3 x 3 = 27 combinations, each scored with 5-fold CV
grid_rfc = GridSearchCV(rfc_pipe, param_grid=params, cv=5)

In [None]:
# fit our grid object for random forests to the training data
grid_rfc.fit(X_train, y_train)

In [None]:
grid_rfc.best_params_

In [None]:
grid_rfc.best_score_

**From our initial grid search, we attained an accuracy of 78.0% using a max_depth of 15, a min_samples_split of 2, and an n_estimators of 150.**


In [None]:
pd.DataFrame(grid_rfc.cv_results_)

Based on our initial grid search results, we can infer that a larger max depth is needed to improve accuracy. 

In [None]:
# create our grid for the grid search parameters
# (second pass: explores deeper trees, starting from the previous upper bound of 15)
params2 = {'rfc__n_estimators': [50, 100, 150],
           'rfc__max_depth': [15, 25, 50]}

# set up GridSearchCV object: 3 x 3 = 9 combinations, each scored with 5-fold CV
grid_rfc2 = GridSearchCV(rfc_pipe, param_grid=params2, cv=5)

In [None]:
# fit our grid object with parameter updates for random forests to the training data
grid_rfc2.fit(X_train, y_train)

In [None]:
grid_rfc2.best_params_

In [None]:
grid_rfc2.best_score_

**From our second grid search, we attained an accuracy of 78.9% using a max_depth of 25 and an n_estimators of 150.**

In [None]:
pd.DataFrame(grid_rfc2.cv_results_)

Based on our new grid search, we can see that a maximum depth greater than 15 but less than 50 is ideal. We will also try running the grid search with a higher number of trees in the random forest to improve accuracy.

In [None]:
# create our grid for the grid search parameters
# (third pass: narrows max_depth to 20-30 and raises the upper bound on the number of trees)
params3 = {'rfc__n_estimators': [100, 150, 200],
           'rfc__max_depth': [20, 25, 30]}

# set up GridSearchCV object: 3 x 3 = 9 combinations, each scored with 5-fold CV
grid_rfc3 = GridSearchCV(rfc_pipe, param_grid=params3, cv=5)

In [None]:
# Fit the grid object to our training data with updated grid params for n_estimators and max_depth
grid_rfc3.fit(X_train, y_train)

In [None]:
grid_rfc3.best_params_

In [None]:
grid_rfc3.best_score_

**From our third grid search, we attained an accuracy of 79.3% using a max_depth of 20 and an n_estimators of 200.** 

In [None]:
pd.DataFrame(grid_rfc3.cv_results_)

In [None]:
# create our grid for the grid search parameters
# NOTE: 'auto' was removed from max_features. For RandomForestClassifier it is an alias of
# 'sqrt', so listing both refit identical models (6 of the 18 candidates). 'auto' is also
# deprecated in scikit-learn 1.1 and removed in 1.3.
params4 = {'rfc__n_estimators': [100, 150],
           'rfc__max_depth': [18, 20, 22],
           'rfc__max_features': ['sqrt', 0.2]}

# set up GridSearchCV object: 2 x 3 x 2 = 12 combinations, each scored with 5-fold CV
grid_rfc4 = GridSearchCV(rfc_pipe, param_grid=params4, cv=5)

In [None]:
grid_rfc4.fit(X_train, y_train)

In [None]:
grid_rfc4.best_params_

In [None]:
grid_rfc4.best_score_

**From our fourth grid search, we attained an accuracy of 79.3% using a max_depth of 20, a max_features of 0.2, and an n_estimators of 150.** 

In [None]:
pd.DataFrame(grid_rfc4.cv_results_)

In [None]:
# create our massive grid for the grid search parameters
# (3 * 3 * 4 * 3 * 5 * 3 = 1,620 combinations, i.e. 8,100 fits with cv=5)
params5 = {'rfc__n_estimators': [125, 150, 175],
           'rfc__max_depth': [15, 20, 25],
           'rfc__max_features': ['sqrt', 0.1, 0.2, 0.5],
           'rfc__min_samples_leaf': [25, 50, 100],
           'rfc__min_impurity_decrease': [0, 0.025, 0.05, 0.075, 0.1],
           'rfc__criterion': ['gini', 'entropy', 'log_loss']}

# set up GridSearchCV object
# (this search is left commented out, so grid_rfc5 does not exist when the notebook is re-run)
#grid_rfc5 = GridSearchCV(rfc_pipe, param_grid=params5, cv=5)

In [None]:
# Fit the grid search object to our training data to check numerous hyperparameters.
#grid_rfc5.fit(X_train, y_train)

In [None]:
#grid_rfc5.best_params_

In [None]:
#grid_rfc5.best_score_

**From our fifth grid search, we attained an accuracy of 77.5% using a max_depth of 25, a max_features of 0.5, an entropy criterion, a minimum impurity decrease of 0, a minimum samples per leaf of 25, and an n_estimators of 175.** 

In [None]:
#pd.DataFrame(grid_rfc5.cv_results_)

In [None]:
# create our grid for the grid search parameters
# (sixth pass: max_depth 18-22 with fractional max_features values only)
params6 = {'rfc__n_estimators': [100, 150, 200],
           'rfc__max_depth': [18, 20, 22],
           'rfc__max_features': [0.1, 0.2, 0.5]}

# set up GridSearchCV object: 3 x 3 x 3 = 27 combinations, each scored with 5-fold CV
grid_rfc6 = GridSearchCV(rfc_pipe, param_grid=params6, cv=5)

In [None]:
# Fit our sixth grid search on the training data
grid_rfc6.fit(X_train, y_train)

In [None]:
grid_rfc6.best_params_

In [None]:
grid_rfc6.best_score_

In [None]:
rf_preds6 = grid_rfc6.predict(X_train)

In [None]:
grid_rfc6.score(X_train, y_train)

In [None]:
plot_confusion_matrix(grid_rfc6, X_train, y_train);

In [None]:
print(classification_report(y_train, rf_preds6))

**From our sixth grid search, we attained an accuracy of 79.3% using a max_depth of 20, a max_features of 0.1, and an n_estimators of 200.** 

In [None]:
pd.DataFrame(grid_rfc6.cv_results_)

### Random Forest with SMOTE

In [13]:
# Create our random forest pipeline with SMOTE, using the imblearn Pipeline (ImPipeline)
# so the sampler can sit between the column transformer and the classifier
rfc_pipe_sm = ImPipeline(steps=[('CT', CT),
                                ('sm', subpipe_smote),
                                ('rfc', RandomForestClassifier(random_state=42))])

In [None]:
# Fit training data to our new random forest pipe using SMOTE to over sample minority classes
rfc_pipe_sm.fit(X_train, y_train)

In [None]:
rfc_pipe_sm.score(X_train, y_train)

In [None]:
cross_val_score(rfc_pipe_sm, X_train, y_train)

In [None]:
plot_confusion_matrix(rfc_pipe_sm, X_train, y_train);

In [None]:
print(classification_report(y_train, rfc_pipe_sm.predict(X_train)))

In [14]:
# create our grid for the grid search parameters
# (same values as the first non-SMOTE search; note this reassigns the existing `params` name)
params = {'rfc__n_estimators': [50, 100, 150],
          'rfc__min_samples_split': [2, 10, 50],
          'rfc__max_depth': [5, 10, 15]}

# set up GridSearchCV object on the SMOTE pipeline: 27 combinations, 5-fold CV
grid_rfc_sm = GridSearchCV(rfc_pipe_sm, param_grid=params, cv=5)

In [19]:
grid_rfc_sm.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('CT',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         [0, 2,
                                                                          17]),
                                                                        ('subpipe_cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                            

In [25]:
grid_rfc_sm.best_params_

{'rfc__max_depth': 15, 'rfc__min_samples_split': 2, 'rfc__n_estimators': 150}

In [27]:
grid_rfc_sm.best_score_

0.7430147736982279

In [29]:
print(classification_report(y_train, grid_rfc_sm.predict(X_train)))

                         precision    recall  f1-score   support

             functional       0.85      0.83      0.84     24129
functional needs repair       0.36      0.78      0.50      3230
         non functional       0.91      0.75      0.82     16912

               accuracy                           0.79     44271
              macro avg       0.71      0.79      0.72     44271
           weighted avg       0.84      0.79      0.81     44271



In [15]:
# create our grid for the grid search parameters
# (same values as the second non-SMOTE search; note this reassigns the existing `params2` name)
params2 = {'rfc__n_estimators': [50, 100, 150],
           'rfc__max_depth': [15, 25, 50]}

# set up GridSearchCV object on the SMOTE pipeline: 9 combinations, 5-fold CV
grid_rfc_sm2 = GridSearchCV(rfc_pipe_sm, param_grid=params2, cv=5)

In [20]:
grid_rfc_sm2.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('CT',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         [0, 2,
                                                                          17]),
                                                                        ('subpipe_cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                            

In [30]:
grid_rfc_sm2.best_params_

{'rfc__max_depth': 25, 'rfc__n_estimators': 150}

In [31]:
grid_rfc_sm2.best_score_

0.7654899679434892

In [32]:
print(classification_report(y_train, grid_rfc_sm2.predict(X_train)))

                         precision    recall  f1-score   support

             functional       0.94      0.92      0.93     24129
functional needs repair       0.59      0.88      0.71      3230
         non functional       0.96      0.90      0.93     16912

               accuracy                           0.91     44271
              macro avg       0.83      0.90      0.85     44271
           weighted avg       0.92      0.91      0.91     44271



In [16]:
# create our grid for the grid search parameters
# (same values as the third non-SMOTE search; note this reassigns the existing `params3` name)
params3 = {'rfc__n_estimators': [100, 150, 200],
           'rfc__max_depth': [20, 25, 30]}

# set up GridSearchCV object on the SMOTE pipeline: 9 combinations, 5-fold CV
grid_rfc_sm3 = GridSearchCV(rfc_pipe_sm, param_grid=params3, cv=5)

In [21]:
grid_rfc_sm3.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('CT',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         [0, 2,
                                                                          17]),
                                                                        ('subpipe_cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                            

In [33]:
grid_rfc_sm3.best_params_

{'rfc__max_depth': 25, 'rfc__n_estimators': 200}

In [34]:
grid_rfc_sm3.best_score_

0.765828782545177

In [36]:
print(classification_report(y_train, grid_rfc_sm3.predict(X_train)))

                         precision    recall  f1-score   support

             functional       0.94      0.92      0.93     24129
functional needs repair       0.59      0.89      0.71      3230
         non functional       0.96      0.90      0.93     16912

               accuracy                           0.91     44271
              macro avg       0.83      0.90      0.85     44271
           weighted avg       0.92      0.91      0.91     44271



In [17]:
# create our grid for the grid search parameters
# NOTE: 'auto' was removed from max_features. For RandomForestClassifier it is an alias of
# 'sqrt', so listing both refit identical models (6 of the 18 candidates). 'auto' is also
# deprecated in scikit-learn 1.1 and removed in 1.3.
params4 = {'rfc__n_estimators': [100, 150],
           'rfc__max_depth': [18, 20, 22],
           'rfc__max_features': ['sqrt', 0.2]}

# set up GridSearchCV object on the SMOTE pipeline: 2 x 3 x 2 = 12 combinations, 5-fold CV
grid_rfc_sm4 = GridSearchCV(rfc_pipe_sm, param_grid=params4, cv=5)

In [22]:
grid_rfc_sm4.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('CT',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         [0, 2,
                                                                          17]),
                                                                        ('subpipe_cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                            

In [37]:
grid_rfc_sm4.best_params_

{'rfc__max_depth': 22, 'rfc__max_features': 0.2, 'rfc__n_estimators': 150}

In [38]:
grid_rfc_sm4.best_score_

0.7668451982897923

In [39]:
print(classification_report(y_train, grid_rfc_sm4.predict(X_train)))

                         precision    recall  f1-score   support

             functional       0.92      0.91      0.91     24129
functional needs repair       0.56      0.86      0.68      3230
         non functional       0.95      0.87      0.91     16912

               accuracy                           0.89     44271
              macro avg       0.81      0.88      0.83     44271
           weighted avg       0.91      0.89      0.90     44271



In [18]:
# create our grid for the grid search parameters
# NOTE: this reuses the name `params5`, which earlier in the notebook held a much larger,
# different grid; the values here are the ones used for the sixth non-SMOTE search (params6).
params5 = {'rfc__n_estimators': [100, 150, 200],
           'rfc__max_depth': [18, 20, 22],
           'rfc__max_features': [0.1, 0.2, 0.5]}

# set up GridSearchCV object on the SMOTE pipeline: 27 combinations, 5-fold CV
# (the fit of this search was interrupted by hand, so no results are available for it)
grid_rfc_sm5 = GridSearchCV(rfc_pipe_sm, param_grid=params5, cv=5)

In [23]:
grid_rfc_sm5.fit(X_train, y_train)

KeyboardInterrupt: 

### XGBoost Prequel

In [None]:
# Instantiate our XGB pipeline: column transformer -> SMOTE oversampling -> XGBoost classifier.
# The sampler step uses `subpipe_smote`, the SMOTE object created with the sub pipelines;
# the name `sm` used here previously is not defined in the cells above and raised a NameError.
xgb_pipe = ImPipeline(steps=[('CT', CT),
                             ('sm', subpipe_smote),
                             ('XGB', XGBClassifier())])


In [None]:
# Fit the xgb pipeline to our training data
xgb_pipe.fit(X_train, y_train)

In [None]:
training_preds = xgb_pipe.predict(X_train)

In [None]:
print(classification_report(y_train, training_preds))

In [None]:
plot_confusion_matrix(xgb_pipe, X_train, y_train);

In [None]:
xgb_pipe.score(X_train, y_train)

In [None]:
cross_val_score(xgb_pipe, X_train, y_train)

### XGBoost


Here, we run an initial XGBoost model.

In [None]:
from xgboost import XGBClassifier

In [None]:
# Instantiate a plain (no SMOTE) pipeline: column transformer followed by a default XGBClassifier.
# Note the capitalised name: `XGB_pipe` is a different object from the SMOTE-based `xgb_pipe`.
XGB_pipe = Pipeline(steps=[('CT', CT), 
                              ('XGB', XGBClassifier())])

In [None]:
#Fit XGBClassifier

In [None]:
XGB_pipe.fit(X_train, y_train)

In [None]:
# Predict on the training set (no test-set predictions are made in this cell)
training_preds = XGB_pipe.predict(X_train)

# Training accuracy, plus recall computed separately for each class (average=None)
training_accuracy = accuracy_score(y_train, training_preds)
training_recall = recall_score(y_train, training_preds, average= None)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
print (training_recall)

In [None]:
# create our massive grid for the grid search parameters
# max_depth covers 3, 5, 7, 9 and min_child_weight covers 1, 3, 5, 7, giving
# 2 * 4 * 4 * 3 * 3 = 288 parameter combinations in total
paramsXGB = {
    'XGB__learning_rate': [0.1, 0.2],
    'XGB__max_depth': range(3, 10, 2),
    'XGB__min_child_weight': range(1, 8, 2),
    'XGB__gamma': [0, .1, .2],
    'XGB__subsample': [.5, .75, 1]
}

In [None]:
# Set up the grid search over the XGBoost pipeline with 5-fold cross-validation
grid_XGB = GridSearchCV(XGB_pipe, param_grid=paramsXGB, cv=5)

# Fit grid search object to our training data to check the hyper parameters
# NOTE: the fit call itself is absent here -- it did not complete (took too long to run),
# so grid_XGB is constructed but never fitted in this notebook.

In [None]:
# Report the grid search results for the XGBoost pipeline.
# Fixes over the previous version of this cell:
#   * predictions come from `grid_XGB` (the old `grid_clf` name was never defined -> NameError)
#   * recall_score is given average=None, because its default average='binary' raises a
#     ValueError on this three-class target
#   * grid_XGB is only constructed above (its fit was skipped for runtime), so the report is
#     guarded instead of failing on the missing `best_params_` attribute
if hasattr(grid_XGB, 'best_params_'):
    best_parameters = grid_XGB.best_params_

    print('Grid Search found the following optimal parameters: ')
    for param_name in sorted(best_parameters.keys()):
        print('%s: %r' % (param_name, best_parameters[param_name]))

    # Training-set predictions from the refit best estimator
    training_preds = grid_XGB.predict(X_train)
    training_accuracy = accuracy_score(y_train, training_preds)
    # One recall value per class, in sorted label order
    training_recall = recall_score(y_train, training_preds, average=None)

    print('')
    print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
    print(training_recall)
else:
    print('grid_XGB has not been fitted yet; run grid_XGB.fit(X_train, y_train) first.')


In [None]:
# Ran some recall scores; we may need to rebalance the data because the
# 'functional needs repair' class is under-represented

In [None]:
y_train.value_counts()

In [None]:
y_train.shape

In [None]:
#Current breakdown of our percentages

In [None]:
# Share of training wells labelled 'functional needs repair', computed from y_train.
# The hard-coded 3225/44271 no longer matched the class count shown in the
# classification reports for this split (3230).
(y_train == 'functional needs repair').mean()

In [None]:
# Share of training wells labelled 'non functional', computed from y_train.
# The hard-coded 16887/44271 no longer matched the class count shown in the
# classification reports for this split (16912).
(y_train == 'non functional').mean()

In [None]:
# Share of training wells labelled 'functional', computed from y_train.
# The hard-coded 24159/44271 no longer matched the class count shown in the
# classification reports for this split (24129).
(y_train == 'functional').mean()