# Import Libraries and Files

In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix


from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn import tree
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [2]:
# Import test values

df_test = pd.read_csv('testsetvalues.csv')
df_test.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,...,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
3,45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,...,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
4,49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,...,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


In [3]:
# Import submission format file

model_submit = pd.read_csv('SubmissionFormat.csv')
model_submit.head()

Unnamed: 0,id,status_group
0,50785,predicted label
1,51630,predicted label
2,17168,predicted label
3,45559,predicted label
4,49871,predicted label


In [4]:
model_submit.drop(['status_group'], axis = 1, inplace = True)
model_submit.head()

Unnamed: 0,id
0,50785
1,51630
2,17168
3,45559
4,49871


## Inspect and Clean Data

In [5]:
# Inspect test data column data types

df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 40 columns):
id                       14850 non-null int64
amount_tsh               14850 non-null float64
date_recorded            14850 non-null object
funder                   13981 non-null object
gps_height               14850 non-null int64
installer                13973 non-null object
longitude                14850 non-null float64
latitude                 14850 non-null float64
wpt_name                 14850 non-null object
num_private              14850 non-null int64
basin                    14850 non-null object
subvillage               14751 non-null object
region                   14850 non-null object
region_code              14850 non-null int64
district_code            14850 non-null int64
lga                      14850 non-null object
ward                     14850 non-null object
population               14850 non-null int64
public_meeting           14029 non-null object
r

In [6]:
# Check for missing/null values

df_test.isnull().sum()

id                          0
amount_tsh                  0
date_recorded               0
funder                    869
gps_height                  0
installer                 877
longitude                   0
latitude                    0
wpt_name                    0
num_private                 0
basin                       0
subvillage                 99
region                      0
region_code                 0
district_code               0
lga                         0
ward                        0
population                  0
public_meeting            821
recorded_by                 0
scheme_management         969
scheme_name              7092
permit                    737
construction_year           0
extraction_type             0
extraction_type_group       0
extraction_type_class       0
management                  0
management_group            0
payment                     0
payment_type                0
water_quality               0
quality_group               0
quantity  

In [7]:
df_test['construction_year'].value_counts()

0       5260
2010     669
2009     663
2008     630
2000     487
2006     421
2007     373
2011     335
2004     294
2003     293
1995     269
2002     268
2005     264
2012     263
1999     243
1985     232
1978     230
1998     224
1990     222
1996     209
1994     202
1980     194
1984     191
1972     184
1982     182
1997     177
1992     167
2001     140
1974     138
1993     137
1988     136
1975     124
1986     119
1976     111
1983     106
1991      83
1970      82
1989      80
1987      68
1981      53
1979      53
1977      45
1973      43
2013      33
1971      32
1963      22
1960      22
1969      18
1967      18
1968      16
1964       8
1961       7
1962       6
1966       2
1965       2
Name: construction_year, dtype: int64

In [8]:
# Engineer 'well_age' column out of 'construction_year' and 'date_recorded'

df_test['year_recorded'] = pd.to_datetime(df_test['date_recorded']).dt.year
print(df_test['construction_year'].mean)
#df_test['construction_year'] = df_test['construction_year'].replace({0:1999})
#df_test['well_age'] = df_test['year_recorded'] - df_test['construction_year']
#df_test['well_age'] = df_test['well_age'].replace({-1:12, -2:12, -3:12, -4:12, -5:12, -7:12})
#df_test.drop(columns = ['date_recorded', 'year_recorded', 'construction_year'], axis = 1, inplace = True)

<bound method Series.mean of 0        2012
1        2000
2        2010
3        1987
4        2000
5        1990
6        2007
7        1982
8        1997
9        2003
10       2006
11       2002
12          0
13          0
14       1984
15       1989
16          0
17       1978
18          0
19       1982
20       2010
21       2008
22       2005
23       1970
24          0
25       2008
26       2009
27          0
28          0
29       2002
         ... 
14820    2001
14821    2007
14822    1997
14823       0
14824    2008
14825    1978
14826       0
14827       0
14828       0
14829    2005
14830    1996
14831       0
14832    2001
14833    2003
14834       0
14835       0
14836    2009
14837    2010
14838    1986
14839    2005
14840    2009
14841       0
14842    2009
14843    1995
14844       0
14845    1988
14846    1994
14847    2010
14848    2009
14849    2008
Name: construction_year, Length: 14850, dtype: int64>


In [9]:
df_test['year_recorded'].value_counts()

2011    7234
2013    5939
2012    1665
2004      11
2001       1
Name: year_recorded, dtype: int64

In [10]:
print(df_test['year_recorded'].mean)

<bound method Series.mean of 0        2013
1        2013
2        2013
3        2013
4        2013
5        2013
6        2011
7        2013
8        2013
9        2013
10       2013
11       2013
12       2012
13       2011
14       2011
15       2013
16       2011
17       2011
18       2012
19       2013
20       2013
21       2011
22       2013
23       2013
24       2012
25       2011
26       2013
27       2011
28       2013
29       2013
         ... 
14820    2013
14821    2013
14822    2013
14823    2011
14824    2011
14825    2013
14826    2012
14827    2011
14828    2011
14829    2013
14830    2013
14831    2012
14832    2013
14833    2011
14834    2011
14835    2013
14836    2011
14837    2013
14838    2011
14839    2011
14840    2013
14841    2012
14842    2013
14843    2013
14844    2012
14845    2011
14846    2011
14847    2013
14848    2013
14849    2013
Name: year_recorded, Length: 14850, dtype: int64>


In [11]:
df_test['construction_year'] = df_test['construction_year'].replace({0:2012})
df_test['well_age'] = df_test['year_recorded'] - df_test['construction_year']
df_test['well_age'].value_counts()

-1     3359
 0     1489
 1     1149
 3      650
 2      594
 5      510
 13     436
 4      434
 7      405
 6      351
 11     324
 14     293
 8      280
 33     255
 15     249
 16     237
 23     203
 19     203
 9      192
 10     192
 27     187
 31     186
 18     184
 28     182
 17     164
 25     161
 21     149
 29     147
 12     147
 35     140
 41     133
 26     133
 37     131
 20     130
 39     127
 38      90
 36      86
 30      78
 22      77
 43      70
 40      62
 24      61
 34      49
 32      42
 42      23
 50      21
 53      20
 45      14
 44      11
 48       7
 51       7
 46       7
 52       5
-8        4
 47       4
 49       3
-7        1
-4        1
-2        1
Name: well_age, dtype: int64

In [12]:
df_test['well_age'] = df_test['well_age'].replace({-1:0, -2:0, -4:0, -7:0, -8:0})
df_test.drop(columns = ['date_recorded', 'year_recorded', 'construction_year'], axis = 1, inplace = True)

In [13]:
df_test['well_age'].value_counts()

0     4855
1     1149
3      650
2      594
5      510
13     436
4      434
7      405
6      351
11     324
14     293
8      280
33     255
15     249
16     237
23     203
19     203
10     192
9      192
27     187
31     186
18     184
28     182
17     164
25     161
21     149
29     147
12     147
35     140
41     133
26     133
37     131
20     130
39     127
38      90
36      86
30      78
22      77
43      70
40      62
24      61
34      49
32      42
42      23
50      21
53      20
45      14
44      11
51       7
48       7
46       7
52       5
47       4
49       3
Name: well_age, dtype: int64

In [14]:
# Deal with null/missing values

df_test.drop(columns = ['scheme_name', 'subvillage', 'public_meeting'], axis = 1, inplace = True);
df_test['funder'].fillna('other', inplace = True)
df_test['installer'].fillna('other', inplace = True)
df_test['scheme_management'].fillna('other', inplace = True)
df_test['permit'].fillna('other', inplace = True)

In [15]:
# Check work on removing null values

df_test.isnull().sum()

id                       0
amount_tsh               0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
wpt_name                 0
num_private              0
basin                    0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
recorded_by              0
scheme_management        0
permit                   0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_type              0
source_class             0
waterpoint_type          0
waterpoint_type_group    0
well_age                 0
dtype: int64

In [16]:
# Drop columns that were determined to be dropped from training data

df_test.drop(columns = ['num_private', 'wpt_name', 'ward', 'recorded_by', 'permit', 
                       'management_group', 'payment_type', 'quality_group',
                       'quantity_group', 'source_type', 'source_class',
                       'waterpoint_type_group', 'extraction_type_group', 'extraction_type'], axis = 1, inplace = True)

In [17]:
df_test['funder'].value_counts(normalize = True)

Government Of Tanzania        0.149158
other                         0.058519
Danida                        0.053401
Hesawa                        0.039057
World Bank                    0.023704
Kkkt                          0.022626
Rwssp                         0.022155
World Vision                  0.021279
Unicef                        0.017980
Tasaf                         0.017441
Dhv                           0.015892
Private Individual            0.014007
0                             0.013670
Dwsp                          0.013535
District Council              0.013064
Norad                         0.012391
Water                         0.010505
Germany Republi               0.010438
Ministry Of Water             0.009293
Tcrs                          0.008956
Hifab                         0.008552
Netherlands                   0.008215
Dwe                           0.008148
Lga                           0.007003
Adb                           0.006936
Amref                    

In [18]:
#Clean 'Funder' column and reduce values counts

def top_funders(var):
    if var in df_test['funder'].value_counts(normalize = True).head(25).index:
        return var
    else:
        return 'other'
    
df_test['top_funded'] = df_test['funder'].map(top_funders)

In [19]:
# Inspect changes to value counts

df_test['top_funded'].value_counts()

other                     7018
Government Of Tanzania    2215
Danida                     793
Hesawa                     580
World Bank                 352
Kkkt                       336
Rwssp                      329
World Vision               316
Unicef                     267
Tasaf                      259
Dhv                        236
Private Individual         208
0                          203
Dwsp                       201
District Council           194
Norad                      184
Water                      156
Germany Republi            155
Ministry Of Water          138
Tcrs                       133
Hifab                      127
Netherlands                122
Dwe                        121
Lga                        104
Adb                        103
Name: top_funded, dtype: int64

In [20]:
# Correct data entry grammatical errors

df_test['top_funded'].replace('0', 'other', inplace = True)
#df_test['top_funded'].replace('Finw', 'Fini Water', inplace = True)
df_test['top_funded'].replace('Germany Republi', 'Germany Republic', inplace = True)
#df_test['top_funded'].replace('Private', 'Private Individual', inplace = True)
#df_test['top_funded'].replace('Jica', 'Jaica', inplace = True)
df_test['top_funded'].replace('Water', 'Ministry Of Water', inplace = True)
#df_test['top_funded'].replace('Ministry of Water', 'Ministry Of Water', inplace = True)
#df_test['top_funded'].replace('Rural Water Supply And Sanitat', 'Rural Water Supply And Sanitation', inplace = True)

In [21]:
# Clean 'Installer' column and reduce value counts

def top_installer(var):
    if var in df_test['installer'].value_counts(normalize = True).head(25).index:
        return var
    else:
        return 'other'
    
df_test['top_installers'] = df_test['installer'].map(top_installer)

In [22]:
# Inspect changes to value counts

df_test['top_installers'].value_counts()

other                 6723
DWE                   4349
Government             457
RWE                    292
Commu                  287
DANIDA                 255
Hesawa                 230
KKKT                   222
0                      203
TCRS                   180
CES                    155
Central government     142
HESAWA                 140
DANID                  138
Community              134
District Council       112
World vision           109
TASAF                  108
Gover                  100
WEDECO                  99
District council        98
LGA                     93
TWESA                   79
WU                      76
Dmdd                    69
Name: top_installers, dtype: int64

In [23]:
# Correct data entry grammatical errors

df_test['top_installers'].replace('0', 'other', inplace = True)
df_test['top_installers'].replace('District council', 'District Council', inplace = True)
df_test['top_installers'].replace('Gover', 'Government', inplace = True)
df_test['top_installers'].replace('Commu', 'Community', inplace = True)
df_test['top_installers'].replace('World vision', 'World Vision', inplace = True)
df_test['top_installers'].replace('HESAWA', 'Hesawa', inplace = True)
df_test['top_installers'].replace('DANID', 'DANIDA', inplace = True)
#df_test['top_installers'].replace('Centr', 'Central government', inplace = True)
#df_test['top_installers'].replace('Gove', 'Government', inplace = True)
#df_test['top_installers'].replace('Distri', 'District Council', inplace = True)
#df_test['top_installers'].replace('KKKT _ Konde and DWE', 'KKKT', inplace = True)
#df_test['top_installers'].replace('Da', 'DANIDA', inplace = True)
#df_test['top_installers'].replace('Magadini-Makiwaru wa', 'Magadini-Makiwaru Water Supply', inplace = True)
#df_test['top_installers'].replace('Lawatefuka water sup', 'Lawatefuka Water Supply', inplace = True)
#df_test['top_installers'].replace('Handeni Trunk Main(', 'Handeni Trunk Main', inplace = True)

In [24]:
# Clean 'lga' column and reduce value counts

def top_lgas(var):
    if var in df_test['lga'].value_counts(normalize = True).head(25).index:
        return var
    else:
        return 'other'
    
df_test['top_lga'] = df_test['lga'].map(top_lgas)

In [25]:
# Inspect changes to value counts

df_test['top_lga'].value_counts()

other            8649
Njombe            625
Moshi Rural       315
Bariadi           308
Kasulu            275
Rungwe            275
Kilosa            274
Arusha Rural      269
Bagamoyo          266
Mbozi             252
Kilombero         248
Meru              235
Same              229
Kibondo           227
Magu              225
Kahama            222
Maswa             215
Kyela             211
Singida Rural     207
Karagwe           196
Mbinga            193
Kigoma Rural      191
Serengeti         189
Iringa Rural      186
Ngara             185
Songea Rural      183
Name: top_lga, dtype: int64

In [26]:
# Drop id because it is unnecessary, drop funder and installer because new features were added with their clean data.

df_test.drop(columns = ['id', 'funder', 'installer', 'lga'], axis = 1, inplace = True)

In [27]:
# Change data type of district_code and region_code to 'string' in order to be one-hot encoded

df_test['district_code'] = df_test['district_code'].astype('str')
df_test['region_code'] = df_test['region_code'].astype('str')

In [28]:
# Inspect remaining columns for any null values. 

df_test.isnull().sum()

amount_tsh               0
gps_height               0
longitude                0
latitude                 0
basin                    0
region                   0
region_code              0
district_code            0
population               0
scheme_management        0
extraction_type_class    0
management               0
payment                  0
water_quality            0
quantity                 0
source                   0
waterpoint_type          0
well_age                 0
top_funded               0
top_installers           0
top_lga                  0
dtype: int64

In [29]:
# Check all column data types to ensure they are prepared to be modeled

df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 21 columns):
amount_tsh               14850 non-null float64
gps_height               14850 non-null int64
longitude                14850 non-null float64
latitude                 14850 non-null float64
basin                    14850 non-null object
region                   14850 non-null object
region_code              14850 non-null object
district_code            14850 non-null object
population               14850 non-null int64
scheme_management        14850 non-null object
extraction_type_class    14850 non-null object
management               14850 non-null object
payment                  14850 non-null object
water_quality            14850 non-null object
quantity                 14850 non-null object
source                   14850 non-null object
waterpoint_type          14850 non-null object
well_age                 14850 non-null int64
top_funded               14850 non-null objec

In [30]:
df_test['top_lga'].replace('Songea Rural', 'other', inplace = True)

In [31]:
df_test['top_lga'].replace('Ngara', 'other', inplace = True)

In [32]:
df_test['top_installers'].replace('Dmdd', 'other', inplace = True)

## Model Preparation 

In [33]:
# Isolate continuous variables 

test_cont = df_test[['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population', 'well_age']]

In [34]:
# Isolate categorical variables 

test_cat = df_test.drop(columns = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population',
                                        'well_age'], axis = 1)

In [35]:
# One-hot encode categorical variables using pd.get_dummies

test_dummies = pd.get_dummies(test_cat, drop_first = True)
test_dummies.head()

Unnamed: 0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,region_Dodoma,...,top_lga_Mbinga,top_lga_Mbozi,top_lga_Meru,top_lga_Moshi Rural,top_lga_Njombe,top_lga_Rungwe,top_lga_Same,top_lga_Serengeti,top_lga_Singida Rural,top_lga_other
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [36]:
# Concatenate the OHE categorical and continuous variables back together 

processed_test = pd.concat([test_dummies, test_cont], axis = 1)
processed_test.head()

Unnamed: 0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,region_Dodoma,...,top_lga_Same,top_lga_Serengeti,top_lga_Singida Rural,top_lga_other,amount_tsh,gps_height,longitude,latitude,population,well_age
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0.0,1996,35.290799,-4.059696,321,1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0.0,1569,36.656709,-3.309214,300,13
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0.0,1567,34.767863,-5.004344,500,3
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0.0,267,38.058046,-9.418672,250,26
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,500.0,1260,35.006123,-10.950412,60,13


In [37]:
# Save processed test data for use in modeling

processed_test.to_csv('preprocessed_test_data', index = False)

In [67]:
# Import processed test data

processed_test2 = pd.read_csv('preprocessed_test_data')
processed_test2.head()

Unnamed: 0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,region_Dodoma,...,top_lga_Same,top_lga_Serengeti,top_lga_Singida Rural,top_lga_other,amount_tsh,gps_height,longitude,latitude,population,well_age
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0.0,1996,35.290799,-4.059696,321,1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0.0,1569,36.656709,-3.309214,300,13
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0.0,1567,34.767863,-5.004344,500,3
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0.0,267,38.058046,-9.418672,250,26
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,500.0,1260,35.006123,-10.950412,60,13


# Training Data Prep

In [68]:
# Import training data 

df_train = pd.read_csv('experimental2_training_set')
df_train.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,basin,region,region_code,district_code,population,scheme_management,...,payment,water_quality,quantity,source,waterpoint_type,well_age,status_group,top_funded,top_installers,top_lga
0,6000.0,1390,34.938093,-9.856322,Lake Nyasa,Iringa,11,5,109,VWC,...,pay annually,soft,enough,spring,communal standpipe,12,functional,other,other,other
1,0.0,1399,34.698766,-2.147466,Lake Victoria,Mara,20,2,280,Other,...,never pay,soft,insufficient,rainwater harvesting,communal standpipe,3,functional,other,other,Serengeti
2,25.0,686,37.460664,-3.821329,Pangani,Manyara,21,4,250,VWC,...,pay per bucket,soft,enough,dam,communal standpipe multiple,4,functional,other,World Vision,other
3,0.0,263,38.486161,-11.155298,Ruvuma / Southern Coast,Mtwara,90,63,58,VWC,...,never pay,soft,dry,machine dbh,communal standpipe multiple,27,non functional,Unicef,other,other
4,0.0,0,31.130847,-1.825359,Lake Victoria,Kagera,18,1,0,other,...,never pay,soft,seasonal,rainwater harvesting,communal standpipe,12,functional,other,other,Karagwe


In [69]:
# Change data type of district_code and region_code to 'string' in order to be one-hot encoded

df_train['district_code'] = df_test['district_code'].astype('str')
df_train['region_code'] = df_test['region_code'].astype('str')

In [70]:
train_cont = df_train[['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population', 'well_age']]

train_cat = df_train.drop(columns = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population',
                                        'well_age', 'status_group'], axis = 1)

train_dummies = pd.get_dummies(train_cat, drop_first = True)

processed_train = pd.concat([train_dummies, train_cont], axis = 1)
processed_train.head()

Unnamed: 0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,region_Dodoma,...,top_lga_Same,top_lga_Serengeti,top_lga_Singida Rural,top_lga_other,amount_tsh,gps_height,longitude,latitude,population,well_age
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,6000.0,1390,34.938093,-9.856322,109,12
1,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0.0,1399,34.698766,-2.147466,280,3
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,25.0,686,37.460664,-3.821329,250,4
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0.0,263,38.486161,-11.155298,58,27
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,0,31.130847,-1.825359,0,12


In [71]:
#df_train['region_code_40'].value_counts()

In [72]:
#['scheme_management_None'].value_counts()

In [None]:
#df_train.drop(columns = ['region_code_40', 'scheme_management_None'], axis =1, inplace = True)

In [41]:
for col in processed_train.columns:
    print(col)

basin_Lake Nyasa
basin_Lake Rukwa
basin_Lake Tanganyika
basin_Lake Victoria
basin_Pangani
basin_Rufiji
basin_Ruvuma / Southern Coast
basin_Wami / Ruvu
region_Dar es Salaam
region_Dodoma
region_Iringa
region_Kagera
region_Kigoma
region_Kilimanjaro
region_Lindi
region_Manyara
region_Mara
region_Mbeya
region_Morogoro
region_Mtwara
region_Mwanza
region_Pwani
region_Rukwa
region_Ruvuma
region_Shinyanga
region_Singida
region_Tabora
region_Tanga
region_code_10
region_code_11
region_code_12
region_code_13
region_code_14
region_code_15
region_code_16
region_code_17
region_code_18
region_code_19
region_code_2
region_code_20
region_code_21
region_code_24
region_code_3
region_code_4
region_code_5
region_code_6
region_code_60
region_code_7
region_code_8
region_code_80
region_code_9
region_code_90
region_code_99
district_code_1
district_code_13
district_code_2
district_code_23
district_code_3
district_code_30
district_code_33
district_code_4
district_code_43
district_code_5
district_code_53
district

In [42]:
for col in processed_test.columns:
    print(col)

basin_Lake Nyasa
basin_Lake Rukwa
basin_Lake Tanganyika
basin_Lake Victoria
basin_Pangani
basin_Rufiji
basin_Ruvuma / Southern Coast
basin_Wami / Ruvu
region_Dar es Salaam
region_Dodoma
region_Iringa
region_Kagera
region_Kigoma
region_Kilimanjaro
region_Lindi
region_Manyara
region_Mara
region_Mbeya
region_Morogoro
region_Mtwara
region_Mwanza
region_Pwani
region_Rukwa
region_Ruvuma
region_Shinyanga
region_Singida
region_Tabora
region_Tanga
region_code_10
region_code_11
region_code_12
region_code_13
region_code_14
region_code_15
region_code_16
region_code_17
region_code_18
region_code_19
region_code_2
region_code_20
region_code_21
region_code_24
region_code_3
region_code_4
region_code_5
region_code_6
region_code_60
region_code_7
region_code_8
region_code_80
region_code_9
region_code_90
region_code_99
district_code_1
district_code_13
district_code_2
district_code_23
district_code_3
district_code_30
district_code_33
district_code_4
district_code_43
district_code_5
district_code_53
district

In [73]:
processed_train['top_lga_Namtumbo'].value_counts()

0    58706
1      694
Name: top_lga_Namtumbo, dtype: int64

In [74]:
processed_train.drop(['top_lga_Namtumbo'], axis = 1, inplace = True)

In [75]:
processed_train.head()

Unnamed: 0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,region_Dodoma,...,top_lga_Same,top_lga_Serengeti,top_lga_Singida Rural,top_lga_other,amount_tsh,gps_height,longitude,latitude,population,well_age
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,6000.0,1390,34.938093,-9.856322,109,12
1,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0.0,1399,34.698766,-2.147466,280,3
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,25.0,686,37.460664,-3.821329,250,4
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0.0,263,38.486161,-11.155298,58,27
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,0,31.130847,-1.825359,0,12


In [None]:
#df_train['status_group'].value_counts()

In [None]:
# region_code_40
# dftrain scheme_management none - both scheme management other / Other

In [76]:
# Split data into target (y) and predictors (X)

X = processed_train
y = df_train['status_group']

# Split data using train_test_split

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = None, random_state = 42)

In [77]:
X.shape

(59400, 200)

In [78]:
y.shape

(59400,)

In [50]:
# Scale training data to make sure that all of our data is represented at the same scale
scale = MinMaxScaler()

# Fit
X_train = scale.fit_transform(X) 
X_test = scale.transform(processed_test) 

In [51]:
X_train.shape

(59400, 200)

In [52]:
X_test.shape

(14850, 200)

In [80]:
# Instantiate XGBClassifier
clf = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=None, n_estimators=250,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

# Fit XGBClassifier
clf.fit(X, y)

XGBClassifier(max_depth=9, n_estimators=250, n_jobs=-1,
              objective='multi:softprob', subsample=0.5)

In [81]:
# Predict on training and test sets

test_preds2 = clf.predict(processed_test2)

In [82]:
pd.DataFrame(test_preds2)[0].value_counts()

functional                 9714
non functional             4841
functional needs repair     295
Name: 0, dtype: int64

In [83]:
# Accuracy of training and test sets

training_accuracy = clf.score(X, y)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))

Training Accuracy: 88.9%


In [None]:
plot_confusion_matrix(clf, X_test, y, cmap = plt.cm.Blues)

In [84]:
model_submit.head()

Unnamed: 0,id
0,50785
1,51630
2,17168
3,45559
4,49871


In [91]:
# Generate label predictions on the test data 

#test_pred = clf.predict(scaled_test)

# Format the test predictions into a pandas dataframe

test_pred_df2 = pd.DataFrame(data = test_preds2)

# Name the outcome label column

test_pred_df2 = test_pred_df2.rename(columns = {0: 'status_group'})

# Concatenate the test labels and submission format file

test_pred_df2 = pd.concat([model_submit, test_pred_df2], axis = 1)

In [92]:
test_pred_df2['status_group'].value_counts()

functional                 9714
non functional             4841
functional needs repair     295
Name: status_group, dtype: int64

In [60]:
test_pred_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 2 columns):
id              14850 non-null int64
status_group    14850 non-null object
dtypes: int64(1), object(1)
memory usage: 232.1+ KB


In [93]:
test_pred_df2['status_group'] = test_pred_df2['status_group'].astype(str)

In [94]:
# Convert the outcome column 'status_group' into numeric values

test_pred_df2['status_group'].replace('1', 'functional', inplace = True)
test_pred_df2['status_group'].replace('0', 'non_functional', inplace = True)
test_pred_df2['status_group'].replace('2', 'functional_needs_repair', inplace = True)

In [95]:
test_pred_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 2 columns):
id              14850 non-null int64
status_group    14850 non-null object
dtypes: int64(1), object(1)
memory usage: 232.1+ KB


In [96]:
test_pred_df2['status_group'].value_counts()

functional                 9714
non functional             4841
functional needs repair     295
Name: status_group, dtype: int64

In [None]:
# Pre-Column Correction Model: Score = 0.5057

#test_pred_df.to_csv('test_set_predictions_3', index = False)

In [65]:
# MinMaxScaled Model: Score = 0.8003

#test_pred_df.to_csv('test_set_predictions_4', index = False)

In [99]:
# Non MinMaxScaled Model: Score = 0.7996

#test_pred_df2.to_csv('test_set_predictions_5', index = False)