# Import Necessary Libraries

In [3]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn import tree
from imblearn.over_sampling import SMOTE

# Import File Saved From EDA Notebook

In [4]:
# Import file and check columns

clean_df = pd.read_csv('clean_training_set')
clean_df.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,basin,region,region_code,district_code,lga,population,...,management,payment,water_quality,quantity,source,waterpoint_type,well_age,status_group,top_funded,top_installers
0,6000.0,1390,34.938093,-9.856322,Lake Nyasa,Iringa,11,5,Ludewa,109,...,vwc,pay annually,soft,enough,spring,communal standpipe,12,functional,Roman,other
1,0.0,1399,34.698766,-2.147466,Lake Victoria,Mara,20,2,Serengeti,280,...,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,3,functional,other,other
2,25.0,686,37.460664,-3.821329,Pangani,Manyara,21,4,Simanjiro,250,...,vwc,pay per bucket,soft,enough,dam,communal standpipe multiple,4,functional,other,World Vision
3,0.0,263,38.486161,-11.155298,Ruvuma / Southern Coast,Mtwara,90,63,Nanyumbu,58,...,vwc,never pay,soft,dry,machine dbh,communal standpipe multiple,27,non functional,Unicef,UNICEF
4,0.0,0,31.130847,-1.825359,Lake Victoria,Kagera,18,1,Karagwe,0,...,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,12,functional,other,other


In [5]:
clean_df.shape

(59400, 22)

In [6]:
# Check all column data types to ensure they are prepared to be modeled

clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 22 columns):
amount_tsh               59400 non-null float64
gps_height               59400 non-null int64
longitude                59400 non-null float64
latitude                 59400 non-null float64
basin                    59400 non-null object
region                   59400 non-null object
region_code              59400 non-null int64
district_code            59400 non-null int64
lga                      59400 non-null object
population               59400 non-null int64
scheme_management        59400 non-null object
extraction_type_class    59400 non-null object
management               59400 non-null object
payment                  59400 non-null object
water_quality            59400 non-null object
quantity                 59400 non-null object
source                   59400 non-null object
waterpoint_type          59400 non-null object
well_age                 59400 non-null int64


In [7]:
# Change data type of district_code and region_code to 'string' in order to be one-hot encoded

clean_df['district_code'] = clean_df['district_code'].astype('str')
clean_df['region_code'] = clean_df['region_code'].astype('str')

In [8]:
# Check for null values before model preparation

clean_df.isnull().sum()

amount_tsh               0
gps_height               0
longitude                0
latitude                 0
basin                    0
region                   0
region_code              0
district_code            0
lga                      0
population               0
scheme_management        0
extraction_type_class    0
management               0
payment                  0
water_quality            0
quantity                 0
source                   0
waterpoint_type          0
well_age                 0
status_group             0
top_funded               0
top_installers           0
dtype: int64

In [9]:
# Import test data to use to generate predictions 

# Model Preparation

In [10]:
# Isolate continuous variables 

clean_df_cont = clean_df[['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population', 'well_age']]

In [11]:
# Isolate categorical variables 

clean_df_cat = clean_df.drop(columns = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population',
                                        'well_age', 'status_group'], axis = 1)

In [12]:
# One-hot encode categorical variables using pd.get_dummies

cat_dummies = pd.get_dummies(clean_df_cat, drop_first = True)
cat_dummies.head()

Unnamed: 0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,region_Dodoma,...,top_installers_TASAF,top_installers_TCRS,top_installers_TWESA,top_installers_UNICEF,top_installers_Villagers,top_installers_WEDECO,top_installers_WU,top_installers_WVT,top_installers_World Vision,top_installers_other
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# Convert the outcome column 'status_group' into numeric values

clean_df['status_group'].replace('functional', 1, inplace = True)
clean_df['status_group'].replace('non functional', 0, inplace = True)
clean_df['status_group'].replace('functional needs repair', 2, inplace = True)

In [14]:
# The model outcome is 

clean_df['status_group'].value_counts()

1    32259
0    22824
2     4317
Name: status_group, dtype: int64

In [15]:
clean_df_cont.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,population,well_age
0,6000.0,1390,34.938093,-9.856322,109,12
1,0.0,1399,34.698766,-2.147466,280,3
2,25.0,686,37.460664,-3.821329,250,4
3,0.0,263,38.486161,-11.155298,58,27
4,0.0,0,31.130847,-1.825359,0,12


In [16]:
# Concatenate the OHE categorical and continuous variables and the target variable back together 

processed_df = pd.concat([cat_dummies, clean_df_cont, clean_df['status_group']], axis = 1)
processed_df.head()

Unnamed: 0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,region_Dodoma,...,top_installers_WVT,top_installers_World Vision,top_installers_other,amount_tsh,gps_height,longitude,latitude,population,well_age,status_group
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,6000.0,1390,34.938093,-9.856322,109,12,1
1,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0.0,1399,34.698766,-2.147466,280,3,1
2,0,0,0,0,1,0,0,0,0,0,...,0,1,0,25.0,686,37.460664,-3.821329,250,4,1
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0.0,263,38.486161,-11.155298,58,27,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0.0,0,31.130847,-1.825359,0,12,1


In [17]:
processed_df.shape

(59400, 347)

In [18]:
# Save changes to separate file for model experimentation 

processed_df.to_csv('processed_training_set', index = False)

# Modeling

## Partitioning

In [14]:
# Split data into target (y) and predictors (X)

X = processed_df.drop(['status_group'], axis = 1)
y = processed_df['status_group']

In [15]:
# Split data using train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

## Decision Trees

In [17]:
# Instantiate a Decision Tree Classifier and fit it to the data

classifier = DecisionTreeClassifier(random_state = 42)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [18]:
# Make predictions for test data

y_pred = classifier.predict(X_test)

In [19]:
# Calculate accuracy 

acc = accuracy_score(y_test, y_pred) * 100
print('Accuracy is :{0}'.format(acc))

Accuracy is :75.47138047138047


In [20]:
# Confusion matrix and classification report

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[3489  898  185]
 [ 918 5150  389]
 [ 147  377  327]]
              precision    recall  f1-score   support

           0       0.77      0.76      0.76      4572
           1       0.80      0.80      0.80      6457
           2       0.36      0.38      0.37       851

   micro avg       0.75      0.75      0.75     11880
   macro avg       0.64      0.65      0.65     11880
weighted avg       0.76      0.75      0.76     11880



## Bagged Tree

In [21]:
# Instantiate a bagged tree classifier

bagged_tree = BaggingClassifier(DecisionTreeClassifier(criterion = 'gini'), n_estimators = 20)

In [22]:
# Fit the model

bagged_tree.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=20, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [23]:
# Check the score of the training data

bagged_tree.score(X_train, y_train)

0.9912457912457913

In [24]:
# Check the score of the test data

bagged_tree.score(X_test, y_test)

0.7930134680134681

## Random Forest

In [25]:
# Instantiate and fit a RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100, max_depth = 5)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
# Training accuracy score

forest.score(X_train, y_train)

0.6746001683501683

In [27]:
# Test accuracy score

forest.score(X_test, y_test)

0.6739057239057239

In [28]:
# Define a function to plot feature importances

def plot_feature_importances(model):
    n_features = X_train.shape[1]
    plt.figure(figsize=(8,8))
    plt.barh(range(n_features), model.feature_importances_, align='center') 
    plt.yticks(np.arange(n_features), X_train.columns.values) 
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

## XGBoost

In [16]:
# Import XGBoost

from xgboost import XGBClassifier

In [20]:
# Instantiate XGBClassifier
clf = XGBClassifier()

# Fit XGBClassifier
clf.fit(X_train, y_train)

# Predict on training and test sets
training_preds = clf.predict(X_train)
test_preds = clf.predict(X_test)

# Accuracy of training and test sets
training_accuracy = accuracy_score(y_train, training_preds)
test_accuracy = accuracy_score(y_test, test_preds)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy * 100))

Training Accuracy: 74.92%
Validation accuracy: 74.71%


## GridSearchCV

In [31]:
# Define a parameter grid to iteratively run the classifier through different hyperparameters

param_grid = {
    'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
    'max_depth': [4, 5, 6, 7, 8],
    'min_child_weight': [1, 2],
    'subsample': [0.5, 0.7],
    'n_estimators': [25, 50, 100, 200],
}

In [17]:
param_grid2 = {
    'learning_rate': [0.1, 0.3, 0.5],
    'max_depth': [2, 3, 5, 7, 9],
    'subsample': [0.5, 1],
    'n_estimators': [250, 500, 750],
}

In [34]:
random_clf = RandomizedSearchCV(clf, param_grid, scoring = 'accuracy', refit = True, cv = 5)
random_clf.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5], 'max_depth': [4, 5, 6, 7, 8], 'min_child_weight': [1, 2], 'subsample': [0.5, 0.7], 'n_estimators': [25, 50, 100, 200]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=0)

In [36]:
random_clf.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.4, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

In [35]:
# Predict on training and test sets
training_preds = random_clf.predict(X_train)
test_preds = random_clf.predict(X_test)

# Accuracy of training and test sets
training_accuracy = accuracy_score(y_train, training_preds)
test_accuracy = accuracy_score(y_test, test_preds)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy * 100))

Training Accuracy: 85.64%
Validation accuracy: 80.21%


In [38]:
# Instantiate XGBClassifier
clf2 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.4, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

# Fit XGBClassifier
clf2.fit(X_train, y_train)

# Predict on training and test sets
training_preds = clf2.predict(X_train)
test_preds = clf2.predict(X_test)

# Accuracy of training and test sets
training_accuracy = accuracy_score(y_train, training_preds)
test_accuracy = accuracy_score(y_test, test_preds)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy * 100))

Training Accuracy: 87.66%
Validation accuracy: 80.7%


In [40]:
# Instantiate XGBClassifier
clf3 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.4, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

# Fit XGBClassifier
clf3.fit(X_train, y_train)

# Predict on training and test sets
training_preds = clf3.predict(X_train)
test_preds = clf3.predict(X_test)

# Accuracy of training and test sets
training_accuracy = accuracy_score(y_train, training_preds)
test_accuracy = accuracy_score(y_test, test_preds)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy * 100))

Training Accuracy: 90.85%
Validation accuracy: 80.72%


In [19]:
# Instantiate a classifier
clf = XGBClassifier()
random_clf2 = RandomizedSearchCV(clf, param_grid2, n_iter = 100, scoring = 'accuracy', cv = 3, verbose = 2, n_jobs = -1)

# Fit classifier
random_clf2.fit(X_train, y_train)

# Make predictions
training_preds = random_clf2.predict(X_train)
test_preds = random_clf2.predict(X_test)

# Accuracy of training and test sets
training_accuracy = accuracy_score(y_train, training_preds)
test_accuracy = accuracy_score(y_test, test_preds)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy * 100))

Fitting 3 folds for each of 90 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 380.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 1532.9min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 2309.5min finished


Training Accuracy: 88.79%
Validation accuracy: 81.06%


In [20]:
random_clf2.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=None, n_estimators=250,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

In [21]:
#plot confusion matrix
plot_confusion_matrix(random_clf2, X_test, y_test, cmap = plt.cm.Blues)

NameError: name 'plot_confusion_matrix' is not defined