In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

from sklearn import tree

In [2]:
# Lift the column cap so wide frames are rendered in full instead of being elided.
pd.options.display.max_columns = None

In [3]:
# Target labels: the preview below shows one `status_group` value per pump `id`.
LABELS_CSV = 'data/Training_set_labels.csv'
pump_target = pd.read_csv(LABELS_CSV)
pump_target.head(n=5)

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [4]:
# Feature table, keyed by the same `id` column as the label file.
FEATURES_CSV = 'data/Training_set_values.csv'
pump_features = pd.read_csv(FEATURES_CSV)
pump_features.head(n=5)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [5]:
# Attach each pump's label to its feature row.
# The join key is named explicitly rather than relying on merge's default of
# "every column name the two frames share", so an accidental extra shared
# column cannot silently change the join. `validate` makes pandas raise a
# MergeError if `id` is duplicated on either side instead of quietly
# multiplying rows.
pumps = pump_features.merge(
    pump_target,
    on='id',
    how='inner',
    validate='one_to_one',
)
pumps.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [6]:
# Keep only rows with no missing value in any column.
# Rebinding (instead of inplace=True) avoids mutating the object that the
# previous cell displayed.
# NOTE(review): this filter is aggressive - the frame goes from 59400 rows
# (see .info() above) to 27813 (see the value_counts output below), and the
# check includes columns that are later excluded from X.
pumps = pumps.dropna()

In [7]:
# Plain Python list of every column name, in frame order.
pumps_columns = pumps.columns.tolist()

In [8]:
# Print the value distribution of every column, each under a dashed banner
# carrying the column name.
banner = '-----------'
for column in pumps_columns:
    print(banner, column, banner, pumps[column].value_counts(), sep='\n')

-----------
id
-----------
10235    1
5544     1
19843    1
13708    1
11663    1
        ..
51756    1
8753     1
12851    1
565      1
4098     1
Name: id, Length: 27813, dtype: int64
-----------
amount_tsh
-----------
0.0        15679
500.0       1993
50.0        1540
20.0        1027
1000.0       886
           ...  
6300.0         1
60000.0        1
900.0          1
3500.0         1
45000.0        1
Name: amount_tsh, Length: 85, dtype: int64
-----------
date_recorded
-----------
2011-03-17    383
2011-03-15    379
2011-03-14    341
2011-03-09    305
2011-03-18    278
             ... 
2011-07-03      1
2004-05-01      1
2013-01-13      1
2013-12-01      1
2012-10-01      1
Name: date_recorded, Length: 304, dtype: int64
-----------
funder
-----------
Government Of Tanzania          6050
Danida                          1908
World Bank                       763
Unicef                           689
Norad                            653
                                ... 
D Ct         

Name: quantity, dtype: int64
-----------
quantity_group
-----------
enough          16862
insufficient     7067
dry              2875
seasonal          886
unknown           123
Name: quantity_group, dtype: int64
-----------
source
-----------
spring                  13121
river                    7576
machine dbh              4461
shallow well             1089
lake                      544
dam                       459
rainwater harvesting      291
other                     139
hand dtw                  117
unknown                    16
Name: source, dtype: int64
-----------
source_type
-----------
spring                  13121
river/lake               8120
borehole                 4578
shallow well             1089
dam                       459
rainwater harvesting      291
other                     155
Name: source_type, dtype: int64
-----------
source_class
-----------
groundwater    18788
surface         8870
unknown          155
Name: source_class, dtype: int64
-----------
waterp

In [9]:
# Column groups used to assemble the model inputs further down.

# Columns excluded from the feature matrix.
no_good = [
    'id',
    'date_recorded',
    'recorded_by',
]

# Numeric columns.
num = [
    'amount_tsh',
    'gps_height',
    'longitude',
    'latitude',
    'num_private',
    'population',
    'construction_year',
]

# Integer-coded columns that are really category codes.
cat_num = [
    'region_code',
    'district_code',
]

# String-valued categorical columns (dropped from X in the baseline below).
cat = [
    'funder',
    'installer',
    'wpt_name',
    'basin',
    'subvillage',
    'region',
    'lga',
    'ward',
    'scheme_management',
    'scheme_name',
    'extraction_type',
    'extraction_type_group',
    'extraction_type_class',
    'management',
    'management_group',
    'payment',
    'payment_type',
    'water_quality',
    'quality_group',
    'quantity',
    'quantity_group',
    'source',
    'source_type',
    'source_class',
    'waterpoint_type',
    'waterpoint_type_group',
]

# True/False flag columns.
boolean = [
    'public_meeting',
    'permit',
]

# Label column, kept as a one-element list so it concatenates with the others.
target = [
    'status_group',
]

In [10]:
# Baseline feature matrix: everything except the excluded, string-categorical
# and label columns (so the numeric, coded and boolean groups remain).
excluded_columns = no_good + cat + target
X = pumps.drop(columns=excluded_columns)
# `target` is a list, so y is a single-column DataFrame rather than a Series.
y = pumps[target]

In [11]:
# Hold-out split, stratified on the label so both parts keep the same class
# mix; the seed is fixed for repeatability and the test size is left at the
# library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=pumps['status_group'],
    random_state=42,
)

In [12]:
# Baseline model: a decision tree with default hyper-parameters and a fixed seed.
# NOTE(review): this rebinds `tree`, hiding the `sklearn.tree` module imported
# in the first cell; from here on `tree` is the fitted classifier. The name is
# kept because the scoring cells below call `tree.score(...)`.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X=X_train, y=y_train)

DecisionTreeClassifier(random_state=42)

In [13]:
# Mean accuracy on the data the tree was fitted on.
tree.score(X=X_train, y=y_train)

0.9944388513351551

In [14]:
# Mean accuracy on the held-out split; compare with the training score above
# to gauge how much the unpruned tree overfits.
tree.score(X=X_test, y=y_test)

0.725050330744895