# Data Cleaning Pipeline

In [6]:
# Import Libraries

import pandas as pd
from pathlib import Path
import re

## Load Data

In [7]:
pump_data_val = pd.read_csv("data/training_set_values.csv")
pump_data_lab = pd.read_csv("data/training_set_labels.csv")

print("Pump Data Values Shape: \n")
print(pump_data_val.shape)

print("\nPump Data Labels Shape: \n")
print(pump_data_lab.shape)

Pump Data Values Shape: 

(59400, 40)

Pump Data Labels Shape: 

(59400, 2)


In [8]:
pump_data_val.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [9]:
pump_data_lab.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [10]:
print("Check class Imbalance: ")
print(pump_data_lab.status_group.value_counts())

Check class Imbalance: 
functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64


## Data Cleaning

In [45]:
df = pump_data_val.iloc[:, 20:]

In [47]:
# clean all object/text columns. lower and strip

# df_obj = df.select_dtypes(['object'])

def trim_all_columns(df):
    """
    Trim whitespace from ends & lower each value across all series in dataframe
    """
    trim_strings = lambda x: x.lower().strip() if isinstance(x, str) else x
    return df.applymap(trim_strings)

In [48]:
df = trim_all_columns(df)

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   scheme_management      55523 non-null  object
 1   scheme_name            31234 non-null  object
 2   permit                 56344 non-null  object
 3   construction_year      59400 non-null  int64 
 4   extraction_type        59400 non-null  object
 5   extraction_type_group  59400 non-null  object
 6   extraction_type_class  59400 non-null  object
 7   management             59400 non-null  object
 8   management_group       59400 non-null  object
 9   payment                59400 non-null  object
 10  payment_type           59400 non-null  object
 11  water_quality          59400 non-null  object
 12  quality_group          59400 non-null  object
 13  quantity               59400 non-null  object
 14  quantity_group         59400 non-null  object
 15  source             

In [50]:
# check for missing values
df.isna().sum()

scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity                     0
quantity_group               0
source                       0
source_type                  0
source_class                 0
waterpoint_type              0
waterpoint_type_group        0
dtype: int64

In [52]:
df.head()

Unnamed: 0,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,vwc,roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,vwc,nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,vwc,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


#### Year

In [51]:
# more than 30% of construction year is 0, we will probably drop this column
df.construction_year.value_counts()[:10]/df.shape[0]

0       0.348636
2010    0.044529
2008    0.043990
2009    0.042643
2000    0.035202
2007    0.026717
2006    0.024764
2003    0.021650
2011    0.021145
2004    0.018906
Name: construction_year, dtype: float64

In [121]:
# there are around 53 years since the earliest pump was made.
df.construction_year.value_counts().reset_index().sort_values(by='index')

Unnamed: 0,index,construction_year
0,0,20709
45,1960,102
52,1961,21
51,1962,30
47,1963,85
50,1964,40
53,1965,19
54,1966,17
46,1967,88
48,1968,77


In [133]:
years = df.construction_year.to_list()
years = [3000 if i == 0 else i for i in years]

In [146]:
import pandas as pd

years = years
bin_edges = [1960, 1975, 1990, 2013, max(years) + 1]  # Customize the bin edges
labels = ["Group 1", "Group 2", "Group 3", "Others"]  # Customize the labels

# Perform the cut and assign labels
categories = pd.cut(years, bins=bin_edges, labels=labels, retbins=False)

print(categories, '-------')


Group 3 -------


In [147]:
df['year_bins'] = categories

In [148]:
df[['construction_year', 'year_bins']]

Unnamed: 0,construction_year,year_bins
0,1999,Group 3
1,2010,Group 3
2,2009,Group 3
3,1986,Group 2
4,0,Others
...,...,...
59395,1999,Group 3
59396,1996,Group 3
59397,0,Others
59398,0,Others


#### Scheme Management

In [58]:
df.loc[:, ['scheme_management', 'scheme_name']].value_counts()

scheme_management  scheme_name                
vwc                k                              574
wua                chalinze wate                  405
vwc                danida                         378
                   m                              331
wug                none                           325
                                                 ... 
water authority    center                           1
                   community                        1
vwc                ikungi                           1
                   ikuna gravity water project      1
wug                windmili system                  1
Length: 2953, dtype: int64

In [66]:
# most of scheme_management is null when scheme_name is null.
df.loc[df.loc[:, 'scheme_management'].isna(), ['scheme_management', 'scheme_name']].isna().sum()

scheme_management    3877
scheme_name          3571
dtype: int64

#### Check if we can get scheme_management from scheme_name

In [69]:
scheme = df.loc[~df.loc[:, 'scheme_name'].isna() & df.loc[:, 'scheme_management'].isna(),
       ['scheme_management', 'scheme_name']]

In [73]:
scheme['token'] = scheme.scheme_name.str.split()

In [75]:
df.scheme_management.value_counts()

vwc                 36793
wug                  5206
water authority      3153
wua                  2883
water board          2748
parastatal           1680
private operator     1063
company              1061
other                 766
swc                    97
trust                  72
none                    1
Name: scheme_management, dtype: int64

In [84]:
from collections import Counter
from itertools import chain

In [91]:
scheme_name_tokens = list(chain(*scheme.token.to_list()))

In [94]:
# tokens don't seems to have common elements with scheme_management, not much to impute
Counter(scheme_name_tokens)

Counter({'b': 16,
         'tassaf': 2,
         'mwigumbi': 1,
         'piped': 5,
         'scheme': 58,
         'hesawa': 1,
         'sofi': 13,
         'maj': 13,
         'mkutimango': 3,
         'water': 81,
         'supply': 61,
         'mshewa': 1,
         'machumba': 1,
         'estate': 1,
         'pipe': 64,
         'line': 10,
         'tank': 1,
         'refu': 1,
         'mtakuja': 1,
         'shaba': 1,
         'segese': 15,
         'nasula': 1,
         'gravity': 23,
         'olchoronyokye': 2,
         'projec': 2,
         'migoli': 14,
         'muhalala': 1,
         "ng'au": 18,
         'msaginya': 1,
         'mwendakulima': 4,
         'borehole': 3,
         'msitu': 1,
         'wa': 22,
         'tembo': 1,
         'k': 8,
         'chela': 13,
         'mnyawi': 2,
         'vulue': 9,
         'zo': 8,
         'world': 1,
         'bank': 1,
         'lake': 19,
         'victoria': 19,
         'ichonde': 2,
         'adra': 1,
        

#### Permit

In [96]:
df.loc[:, 'permit'].value_counts()

True     38852
False    17492
Name: permit, dtype: int64

In [103]:
# change datatype to bool
df['permit'] = df.permit.apply(bool)

In [104]:
df.permit

0        False
1         True
2         True
3         True
4         True
         ...  
59395     True
59396     True
59397    False
59398     True
59399     True
Name: permit, Length: 59400, dtype: bool

#### Extraction Type

In [111]:
# These columns don't need much cleaning. Will be interesting to see how it relates with our labels
df.loc[:, ['extraction_type_class', 'extraction_type_group', 'extraction_type']].value_counts()

extraction_type_class  extraction_type_group  extraction_type          
gravity                gravity                gravity                      26780
handpump               nira/tanira            nira/tanira                   8154
other                  other                  other                         6430
submersible            submersible            submersible                   4764
handpump               swn 80                 swn 80                        3670
motorpump              mono                   mono                          2865
handpump               india mark ii          india mark ii                 2400
                       afridev                afridev                       1770
submersible            submersible            ksb                           1415
rope pump              rope pump              other - rope pump              451
handpump               other handpump         other - swn 81                 229
wind-powered           wind-powered  

#### Management

In [113]:
df.columns

Index(['scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group'],
      dtype='object')

In [114]:
# management and scheme management seem to be similar, so we can drop scheme_management as it had missing values too
df.loc[:, ['management', 'management_group']].value_counts()

management        management_group
vwc               user-group          40507
wug               user-group           6515
water board       user-group           2933
wua               user-group           2535
private operator  commercial           1971
parastatal        parastatal           1768
water authority   commercial            904
other             other                 844
company           commercial            685
unknown           unknown               561
other - school    other                  99
trust             commercial             78
dtype: int64

In [152]:
df.columns

Index(['scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group', 'year_bins'],
      dtype='object')

In [116]:
df.scheme_management.value_counts()

vwc                 36793
wug                  5206
water authority      3153
wua                  2883
water board          2748
parastatal           1680
private operator     1063
company              1061
other                 766
swc                    97
trust                  72
none                    1
Name: scheme_management, dtype: int64

#### Payment

In [151]:
# drop payment type
df.loc[:, ['payment', 'payment_type']].value_counts()

payment                payment_type
never pay              never pay       25348
pay per bucket         per bucket       8985
pay monthly            monthly          8300
unknown                unknown          8157
pay when scheme fails  on failure       3914
pay annually           annually         3642
other                  other            1054
dtype: int64

#### Water 

In [156]:
# largely have the same values apart from flouride (which is a very small number). Drop quality_group
df.loc[:, ['quality_group', 'water_quality']].value_counts()

quality_group  water_quality     
good           soft                  50818
salty          salty                  4856
unknown        unknown                1876
milky          milky                   804
colored        coloured                490
salty          salty abandoned         339
fluoride       fluoride                200
               fluoride abandoned       17
dtype: int64

#### Quantity

In [159]:
# for same reasons above, drop quantity_group
df.loc[:, ['quantity', 'quantity_group']].value_counts()

quantity      quantity_group
enough        enough            33186
insufficient  insufficient      15129
dry           dry                6246
seasonal      seasonal           4050
unknown       unknown             789
dtype: int64

#### Source

In [162]:
# we can drop source, since it's information is covered largely in source and source_class.
df.loc[:, ['source_class', 'source_type', 'source']].value_counts()

source_class  source_type           source              
groundwater   spring                spring                  17021
              shallow well          shallow well            16824
              borehole              machine dbh             11075
surface       river/lake            river                    9612
              rainwater harvesting  rainwater harvesting     2295
groundwater   borehole              hand dtw                  874
surface       river/lake            lake                      765
              dam                   dam                       656
unknown       other                 other                     212
                                    unknown                    66
dtype: int64

#### Waterpoint

In [165]:
# largely the same, drop waterpoint_type_group
df.loc[:, ['waterpoint_type','waterpoint_type_group']].value_counts()

waterpoint_type              waterpoint_type_group
communal standpipe           communal standpipe       28522
hand pump                    hand pump                17488
other                        other                     6380
communal standpipe multiple  communal standpipe        6103
improved spring              improved spring            784
cattle trough                cattle trough              116
dam                          dam                          7
dtype: int64

In [174]:
drop_columns = ['scheme_management', 'scheme_name', 'payment', 'construction_year', 'quality_group',
               'quantity_group', 'source_type', 'waterpoint_type_group']

In [178]:
final = df.drop(columns=drop_columns, axis=1)

In [None]:
final.to_csv('./data/cleaned.csv', index=False)