In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

from sklearn import tree

In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [3]:
# Load the training labels and preview the first five rows.
pump_target = pd.read_csv(filepath_or_buffer='data/Training_set_labels.csv')
pump_target.head(n=5)

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [4]:
# Load the training feature values and preview the first five rows.
pump_features = pd.read_csv(filepath_or_buffer='data/Training_set_values.csv')
pump_features.head(n=5)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [5]:
# Join the feature rows to their labels on the shared `id` key (inner join, the
# pandas default). The key is named explicitly so the join cannot silently change
# if the two files ever share another column, and validate='one_to_one' raises a
# MergeError if an id is duplicated on either side instead of multiplying rows.
pumps = pump_features.merge(pump_target, on='id', validate='one_to_one')
pumps.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [6]:
# Target: the pump status label.
y = pumps['status_group']

# Predictors: everything except the row id, the target itself, and the columns
# that are not modelled in this notebook.
X = pumps.drop(columns=['id', 'status_group', 'date_recorded', 'funder', 'installer', 'wpt_name',
                        'subvillage', 'ward', 'scheme_name', 'lga'])

# Stratify on the label so both splits keep the same class proportions;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [7]:
# Numeric features from the data set.
numeric_features = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'region_code', 
                    'district_code', 'population', 'construction_year']

# Boolean features from the data set.
boolean_features = ['public_meeting', 'permit']

# Categorical features from the data set.
categorical_features = ['basin', 'region', 'recorded_by', 'scheme_management', 'extraction_type', 
                        'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 
                        'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 
                        'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group']

# Not listed above: 'funder', 'installer', 'wpt_name', 'subvillage', 'ward',
# 'scheme_name' and 'lga'. They are dropped from X when the predictors are
# built, so they are intentionally absent from the feature lists.

In [8]:
# Set up the numeric, boolean, and categorical transformers.

# Numeric columns: fill missing values (SimpleImputer's default strategy is the
# column mean), then standardize to zero mean and unit variance.
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer()), 
                                      ('scaler', StandardScaler())])

# Boolean columns: a missing flag is treated as False.
boolean_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=False))]) 

# Categorical columns: missing values become their own 'Unknown' category, then
# every category is mapped to an integer code. Categories that were not seen
# while fitting (e.g. a rare level that only occurs in the test split or in new
# data) are encoded as -1 instead of making transform() raise a ValueError.
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')), 
                                          ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                     unknown_value=-1))])

# Use ColumnTransformer to apply each transformer to its own group of columns.
# Fitting it on the training split only keeps test-set statistics out of the
# preprocessing step (no data leakage). Columns not listed in any group are
# dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(transformers=[('numeric', numeric_transformer, numeric_features), 
                                               ('boolean', boolean_transformer, boolean_features),
                                               ('categorical', categorical_transformer, categorical_features)])

In [9]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformers on the test split. The tuple elements are evaluated left to
# right, so the fit always happens before the test split is transformed.
X_train_transformed, X_test_transformed = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)

In [10]:
# Train a decision tree on the preprocessed training data; fit() returns the
# estimator itself, and the fixed seed keeps the fitted tree reproducible.
clf = DecisionTreeClassifier(random_state=42).fit(X_train_transformed, y_train)
clf

DecisionTreeClassifier(random_state=42)

In [11]:
# Mean accuracy of the fitted tree on the held-out test split.
clf.score(X=X_test_transformed, y=y_test)

0.7562289562289563

In [12]:
# Put the column names back on the transformed training array. The order must
# match the order of the transformer groups in the preprocessor:
# numeric, then boolean, then categorical.
pd.DataFrame(
    X_train_transformed,
    columns=[*numeric_features, *boolean_features, *categorical_features],
)

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,public_meeting,permit,basin,region,recorded_by,scheme_management,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,-0.0380649,1.56901,0.0775246,-1.38406,-0.0354544,-0.242416,-0.0632746,-0.22649,0.734822,True,False,1,3,0,8,3,1,0,7,4,3,1,6,2,1,1,8,6,0,1,1
1,-0.0990184,1.43482,-0.356983,-1.03893,-0.0354544,-0.014388,-0.37587,0.26031,0.727462,True,False,2,15,0,8,8,5,1,7,4,0,2,6,2,1,1,7,5,0,4,3
2,-0.0990184,-0.966202,-0.0240003,-1.28622,-0.0354544,-0.185409,-0.271672,-0.388756,-1.36893,True,True,1,10,0,8,3,1,0,7,4,3,1,6,2,1,1,8,6,0,1,1
3,-0.0990184,1.24435,0.112828,-1.83338,-0.0354544,-0.299423,-0.271672,0.173768,0.739027,False,True,1,16,0,10,3,1,0,11,4,6,6,6,2,1,1,8,6,0,1,1
4,-0.08378,-1.00516,0.734974,-0.250245,-0.0354544,-0.527452,-0.480069,-0.345485,0.743233,True,True,8,14,0,0,6,10,5,4,0,4,5,6,2,1,1,6,4,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44545,-0.0990184,-0.612687,0.450978,-0.702345,-0.0354544,-0.584459,-0.271672,0.368488,0.716949,True,True,6,11,0,8,3,1,0,7,4,0,2,6,2,2,2,6,4,1,1,1
44546,-0.0380649,-0.966202,0.439193,-0.144481,-0.0354544,-0.584459,-0.480069,-0.172401,0.740079,True,True,8,11,0,8,6,10,5,7,4,4,5,4,4,1,1,3,0,0,1,1
44547,-0.0929231,0.105887,0.784682,-1.63202,-0.0354544,4.26114,2.85429,5.0872,-1.36893,True,False,7,12,0,8,14,10,5,7,4,4,5,6,2,2,2,3,0,0,2,1
44548,-0.0898754,-0.351518,0.803291,-1.67473,-0.0354544,4.26114,2.85429,0.303581,0.721154,True,False,7,12,0,8,14,10,5,7,4,4,5,6,2,2,2,3,0,0,2,1


In [13]:
# Count how often each distinct value of `recorded_by` occurs.
pumps.loc[:, 'recorded_by'].value_counts()

GeoData Consultants Ltd    59400
Name: recorded_by, dtype: int64