In [24]:
# Import our dependencies
import pandas as pd
import numpy as np
import matplotlib as plt
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder, \
    MinMaxScaler,LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px

from tensorflow.keras import Input, Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, concatenate
from tensorflow.keras.optimizers import SGD, Adagrad, RMSprop, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau


# Extract and Transform

In [25]:
import sqlite3

# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect("Tanzania_Water_Pump.db")

train_values_df = pd.read_sql_query("SELECT * from Water_Pump_Status_Complete", con)
train_values_df = train_values_df.set_index("id").drop(['index'],axis=1)

train_labels_df = pd.DataFrame(train_values_df['status_group'])
train_values_df = train_values_df.drop(['status_group','Unnamed: 0'],axis=1)

con.close()

# Verify that result of SQL query is stored in the dataframe
train_values_df.head()

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,source_class,waterpoint_type,waterpoint_type_group,geo_loc,distance1,population1,distance2,population2,distance3,population3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,2012-11-13,Tasaf,0,TASAF,33.125828,-5.118154,Mratibu,0,Lake Tanganyika,...,groundwater,hand pump,hand pump,"[-5.11815407, 33.1258283]",35.033967,145292.0,43.859263,32900.0,65.659579,15320.0
1,0.0,2011-03-05,Shipo,1978,SHIPO,34.770717,-9.395642,none,0,Rufiji,...,groundwater,hand pump,hand pump,"[-9.39564152, 34.77071669]",5.169916,46724.0,42.274055,19040.0,51.33921,15168.0
2,0.0,2011-03-27,Lvia,0,LVIA,36.115056,-6.279268,Bombani,0,Wami / Ruvu,...,groundwater,communal standpipe multiple,communal standpipe,"[-6.27926803, 36.11505595]",27.054749,5527.0,34.494481,11840.0,37.47747,10000.0
3,10.0,2013-06-03,Germany Republi,1639,CES,37.147432,-3.187555,Area 7 Namba 5,0,Pangani,...,groundwater,communal standpipe,communal standpipe,"[-3.18755455, 37.14743219]",21.108603,22839.0,27.433508,156959.0,38.571507,18726.0
4,0.0,2011-03-22,Cmsr,0,CMSR,36.164893,-6.099289,Ezeleda,0,Wami / Ruvu,...,groundwater,hand pump,hand pump,"[-6.09928949, 36.16489341]",15.549644,5527.0,30.011861,11840.0,42.023175,10000.0


In [26]:
# load training and test datasets
### HIDE - reading from SQLlite in above cell

# train_values_df = pd.read_csv('water_pump_closest_cities.csv').set_index("id")

# train_labels_df = pd.DataFrame(train_values_df['status_group'])
# train_values_df = train_values_df.drop(['status_group','Unnamed: 0'],axis=1)

# train_values_df.head()

In [27]:
#create an integer lookup for the status group
dict = {
    "functional":0,
    "functional needs repair":1,
    "non functional":2
}

# train_labels_df['status_group_int'] = train_labels_df['status_group'].map(dict)
# train_labels_df.head()

In [28]:
# convert dates from object to datetime
train_values_df['date_recorded'] = pd.to_datetime(train_values_df['date_recorded'])

In [29]:
# for testing, fillNA with "OTHER" or zero
for col in train_values_df.columns:
    print(f"{col} : {train_values_df[col].dtype}")
    
    if train_values_df[col].dtype == "object":
            train_values_df[col] = train_values_df[col].fillna("Other")
            
    elif train_values_df[col].dtype == "int64":
        train_values_df[col] = train_values_df[col].fillna("0").astype('int64')
        
    elif train_values_df[col].dtype == "float64":
        train_values_df[col] = train_values_df[col].fillna("0").astype('float64')
        

amount_tsh : float64
date_recorded : datetime64[ns]
funder : object
gps_height : int64
installer : object
longitude : float64
latitude : float64
wpt_name : object
num_private : int64
basin : object
subvillage : object
region : object
region_code : int64
district_code : int64
lga : object
ward : object
population : int64
public_meeting : float64
recorded_by : object
scheme_management : object
scheme_name : object
permit : float64
construction_year : int64
extraction_type : object
extraction_type_group : object
extraction_type_class : object
management : object
management_group : object
payment : object
payment_type : object
water_quality : object
quality_group : object
quantity : object
quantity_group : object
source : object
source_type : object
source_class : object
waterpoint_type : object
waterpoint_type_group : object
geo_loc : object
distance1 : float64
population1 : float64
distance2 : float64
population2 : float64
distance3 : float64
population3 : float64


In [30]:
# Generate categorical variable list
txt_cols = train_values_df.dtypes[train_values_df.dtypes == "object"].index.tolist()

In [31]:
# Check the number of unique values in each column
train_values_df[txt_cols].nunique()

funder                    1898
installer                 2146
wpt_name                 37400
basin                        9
subvillage               19288
region                      21
lga                        125
ward                      2092
recorded_by                  1
scheme_management           12
scheme_name               2697
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_group                6
quantity                     5
quantity_group               5
source                      10
source_type                  7
source_class                 3
waterpoint_type              7
waterpoint_type_group        6
geo_loc                  57520
dtype: int64

In [34]:
many_uniques = ['funder','installer',
                'subvillage','lga','ward', 'scheme_name']

# Determine which values to replace
bucket_df = train_values_df.copy()

for i in many_uniques:
    list_value_counts = bucket_df[i].value_counts()
#     print(list_value_counts)
    
    filter_value = list_value_counts.quantile(0.95)
#     print(filter_value)
    
    replace_list = list(list_value_counts[list_value_counts < 100].index)
    
#   bucket_df.loc[bucket_df[i].isin(replace),i] = "Other"
    for j in replace_list:
        bucket_df[i] = bucket_df[i].replace(j,"Other")
    
    print(f"completed {i}")

completed funder
completed installer
completed subvillage
completed lga
completed ward
completed scheme_name


In [35]:
# Check the number of unique values in each column after bucketing
bucket_df[txt_cols].nunique()

funder                      92
installer                   85
wpt_name                 37400
basin                        9
subvillage                  23
region                      21
lga                        116
ward                        59
recorded_by                  1
scheme_management           12
scheme_name                 33
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_group                6
quantity                     5
quantity_group               5
source                      10
source_type                  7
source_class                 3
waterpoint_type              7
waterpoint_type_group        6
geo_loc                  57520
dtype: int64

In [36]:
# drop name and duplicate columns
cleaned_df = bucket_df.drop(['wpt_name','payment_type','quality_group',
                            'quantity_group','source_type',
                            'waterpoint_type_group','date_recorded','geo_loc'], axis=1)

In [37]:
# encode all categorical variables with OneHotEncoders
enc = OneHotEncoder(sparse=False)
# enc = LabelEncoder()

# Generate categorical variable list
txt_cols = cleaned_df.dtypes[cleaned_df.dtypes == "object"].index.tolist()
print(txt_cols)

# fit and then produce the encoder
encode_df = pd.DataFrame(enc.fit_transform(cleaned_df[txt_cols]), 
                        index=cleaned_df.index)

# Rename encoded columns
encode_df.columns = enc.get_feature_names(txt_cols)
encode_df.head()

['funder', 'installer', 'basin', 'subvillage', 'region', 'lga', 'ward', 'recorded_by', 'scheme_management', 'scheme_name', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'water_quality', 'quantity', 'source', 'source_class', 'waterpoint_type']


Unnamed: 0_level_0,funder_0,funder_Adb,funder_Adra,funder_African,funder_Amref,funder_Anglican Church,funder_Bsf,funder_Ces (gmbh),funder_Ces(gmbh),funder_Co,...,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [38]:
# create an X dataset for unsupervised learning
combined_df = cleaned_df.join(encode_df)
combined_df = combined_df.drop(txt_cols,axis=1)
X = combined_df.copy()
y = train_labels_df['status_group'].map(dict)

dict = {
    "functional":0,
    "functional needs repair":1,
    "non functional":1
}

y_bin = train_labels_df['status_group'].map(dict)

In [39]:
from collections import Counter
Counter(y)

Counter({2: 22824, 0: 32259, 1: 4317})

In [41]:
# Test dropping the longitude with 0
X_filter = X[X['longitude']>0]
y_filter = y[X['longitude']>0]
y_bin_filter = y_bin[X['longitude']>0]

# Test dropping population of
# X_filter = X_filter[X['population']>100]
# y_filter = y_filter[X['population']>100]

X_filter.sample(10)

Unnamed: 0_level_0,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,public_meeting,permit,...,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32914,0.0,1369,34.49638,-2.004379,0,20,2,20,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
36033,500.0,1065,34.227195,-4.208027,0,13,1,183,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2659,1000.0,832,36.139496,-10.461513,0,10,5,250,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
28457,0.0,1267,34.250567,-1.720605,0,20,2,90,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
46773,0.0,0,32.804237,-4.256681,0,14,1,0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
49564,2000.0,1586,34.949976,-8.906287,0,11,4,80,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48668,0.0,0,33.903783,-9.460872,0,12,3,0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
43933,0.0,1103,30.000646,-4.987863,0,16,3,500,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
60932,0.0,1399,34.69213,-2.14696,5,20,2,100,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
33003,0.0,1386,34.425656,-1.784081,0,20,2,300,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Random Forest with Cleaned and Filter Data

In [42]:
# split training and test
X_train, X_test, y_train, y_test = train_test_split(X_filter,y_filter, random_state=24)

In [43]:
# scale all the encoded data for both train and test sets
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [44]:
# Random Forest with filter
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=256, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest filter predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")


 Random forest filter predictive accuracy: 0.808


In [45]:
from sklearn.metrics import classification_report
print("Classification Report Filtered")
print(classification_report(y_test, y_pred))

Classification Report Filtered
              precision    recall  f1-score   support

           0       0.79      0.92      0.85      7853
           1       0.57      0.22      0.32       938
           2       0.87      0.75      0.80      5606

    accuracy                           0.81     14397
   macro avg       0.74      0.63      0.66     14397
weighted avg       0.80      0.81      0.80     14397



In [46]:
# List the features sorted in descending order by feature importance
sorted(zip(rf_model.feature_importances_, X_filter.columns), reverse=True)

[(0.06269708590132154, 'longitude'),
 (0.06162167360607101, 'latitude'),
 (0.05731666329080277, 'distance2'),
 (0.057181118835348634, 'distance3'),
 (0.05665338620887007, 'distance1'),
 (0.05533848397552239, 'quantity_dry'),
 (0.036865951269379534, 'gps_height'),
 (0.031000544728006136, 'construction_year'),
 (0.02761378611226464, 'population'),
 (0.02377793164341337, 'quantity_enough'),
 (0.02310716029715721, 'population3'),
 (0.021011943535547146, 'population2'),
 (0.01893061528569598, 'population1'),
 (0.017407287856267342, 'amount_tsh'),
 (0.017213068966022295, 'waterpoint_type_other'),
 (0.012819981362950055, 'extraction_type_other'),
 (0.012127504616050637, 'extraction_type_class_other'),
 (0.010761796294880136, 'quantity_insufficient'),
 (0.010737190316720132, 'extraction_type_group_other'),
 (0.010728023542138377, 'district_code'),
 (0.010241747511682064, 'region_code'),
 (0.00933906645848687, 'payment_never pay'),
 (0.00831231703624866, 'waterpoint_type_communal standpipe'),
 

# Deep Learning Multi-class

In [47]:
## define the model
base = len(X_train_scaled[0])
num_input = base
n_branchA = base / 1.5
n_branchB = 10
n_layer1 = base
n_layer2 = base / 2
n_layer3 = base

# Start creating the model inputs, layers and outputs

inputs = Input(shape=X_train_scaled[0].shape)
branchA = Dense(n_branchA, activation="relu")(inputs)
# branchB = Dense(n_branchB, activation="sigmoid")(inputs)

# combined = concatenate([branchA,branchB])

hidden3 = Dense(n_layer2, activation="relu")(branchA)
# hidden3 = Dense(n_layer2, activation="relu")(combined)

output = Dense(3)(hidden3)

nn = Model(inputs=inputs,outputs=output)

nn.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 563)]             0         
_________________________________________________________________
dense (Dense)                (None, 375)               211500    
_________________________________________________________________
dense_1 (Dense)              (None, 281)               105656    
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 846       
Total params: 318,002
Trainable params: 318,002
Non-trainable params: 0
_________________________________________________________________


In [48]:
# test changing different hyperparameters

opt = SGD(lr=0.01, momentum=0.9, decay=0.01)
# opt = Adam()

# The patience parameter is the amount of epochs to check for improvement
early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5,
                                             min_delta=0.005)

rlrop = ReduceLROnPlateau(monitor='loss',factor=0.2,patience=3,
                         min_lr=0.001)

In [49]:
# Compile the Sequential model together and customize metrics
nn.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
           optimizer=opt, metrics=["accuracy"])

# Train the model
# fit_model = nn.fit(X_train_scaled, y_train, epochs=50)
fit_model = nn.fit(X_train_scaled, y_train, epochs=200, callbacks=[rlrop,early_stop])

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Train on 43191 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
14397/14397 - 1s - loss: 0.5359 - accuracy: 0.7787
Loss: 0.5358896318836296, Accuracy: 0.7787038683891296


# Not Great: Log Reg and Non-filter Random Forest

In [50]:
from sklearn.linear_model import LogisticRegression

# split training and test
X_train, X_test, y_train, y_test = train_test_split(X_filter,y_bin_filter, random_state=24)
# scale all the encoded data for both train and test sets
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

#Run and create the model
model = LogisticRegression(max_iter=10000)
model = model.fit(X_train_scaled, y_train)

y_pred_bin = model.predict(X_test_scaled)
print(f"Logistic regression predictive accuracy: {accuracy_score(y_test,y_pred_bin):.3f}")

Logistic regression predictive accuracy: 0.775


# Random Forest with Binary

In [51]:
# split training and test with filter and
X_train, X_test, y_train, y_test = train_test_split(X_filter,y_bin_filter, random_state=24)

In [52]:
# scale all the encoded data for both train and test sets
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [53]:
# test random forest
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

#Create a random forest from SMOTE oversample
rf_model2 = RandomForestClassifier(n_estimators=1000, random_state=78)

rf_model2 = rf_model2.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred2 = rf_model2.predict(X_test_scaled)
print(f" Random forest binary predictive accuracy: {accuracy_score(y_test,y_pred2):.3f}")


 Random forest binary predictive accuracy: 0.826


In [54]:
from sklearn.metrics import classification_report
print("Classification Report Non-Filtered")
print(classification_report(y_test, y_pred2))

Classification Report Non-Filtered
              precision    recall  f1-score   support

           0       0.81      0.89      0.85      7853
           1       0.85      0.75      0.80      6544

    accuracy                           0.83     14397
   macro avg       0.83      0.82      0.82     14397
weighted avg       0.83      0.83      0.82     14397

