# Clean dataset before injecting into pipeline
- In this notebook, we analyze each individual column to determine the most useful features to include in the training of our final model

In [1]:
import numpy as np
import pandas as pd

import matplotlib as plt
import seaborn as sns
%matplotlib inline

pd.options.display.max_columns = 50

## Load in data:

In [2]:
features = pd.read_csv('data/x_train.csv')
labels = pd.read_csv('data/y_train.csv')

print(f'Total wells in original dataset: {features.shape[0]}\nNumber of features: {features.shape[1]}')

Total wells in original dataset: 59400
Number of features: 40


### Dataset:
- Almost 60,000 total wells to analyze
- 39 features (not including id)

In [3]:
# look at dtype of each col
features.dtypes.to_frame().T

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,int64,float64,object,object,int64,object,float64,float64,object,int64,object,object,object,int64,int64,object,object,int64,object,object,object,object,object,int64,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object


# Null values:

In [4]:
features.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [5]:
null_cols = ['funder', 'installer', 'subvillage', 'public_meeting', 'scheme_management', 'scheme_name', 'permit']

- Most columns don't have null values
- Even the columns with nulls generally don't have many
- Going to leave the handling of the null values to my Pipeline. Will also include MissingIndicator for these values

## Add age in years since well was built:
- Adding this feature because I'm predicting that age of the well will be important

In [6]:
# check date_recorded for missing values
features[features['date_recorded'] == '0'].shape

(0, 40)

In [7]:
# check construction_year for missing values
features[features['construction_year'] == 0].shape

(20709, 40)

- Over 20,000 wells missing construction_year (1/3 of the dataset)
- If I want to use the age of the well as a feature, I need to fill these values with mean or median

In [8]:
features[['construction_year']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
construction_year,59400.0,1300.652475,951.620547,0.0,0.0,1986.0,2004.0,2013.0


- Mean (year 1300) is unrealistic. Median (1986) is a much better fill value.

In [9]:
# mask for missing construction_year
mask = features['construction_year'].astype(int) == 0

# set construction_year to median (1986) if missing
features.loc[mask, 'construction_year'] = features['construction_year'].median()

# calculate age in years
features['age_years'] = features['date_recorded'].str[:4].astype(int) - features['construction_year']

In [10]:
features.age_years

0        12.0
1         3.0
2         4.0
3        27.0
4        25.0
         ... 
59395    14.0
59396    15.0
59397    25.0
59398    25.0
59399     9.0
Name: age_years, Length: 59400, dtype: float64

# Individual column analysis:

In [11]:
features.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,age_years
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999.0,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,12.0
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010.0,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,3.0
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,4.0
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986.0,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,27.0
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,1986.0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,25.0


## total static head:
- Change in elevatin between water source and discharge (where water is released). The distance that the water must travel to get from source to output.

In [12]:
features.amount_tsh.to_frame().describe()

Unnamed: 0,amount_tsh
count,59400.0
mean,317.650385
std,2997.574558
min,0.0
25%,0.0
50%,0.0
75%,20.0
max,350000.0


- This is likely a very important column
- lots of TSH of 0. Even the median (50%) value is 0. This could be due to missing values or different pump types
- Standard deviation is massive (almost 10x the mean)

## funder:

In [45]:
features.funder.nunique()

1897

- almost 1900 unique funders
- This may be too many columns to encode

In [72]:
# get counts of num wells funded for each funder
funders = features.funder.value_counts()

# funders that have funded at least 100 wells
top_funders = funders[funders >= 100].index.tolist()
vals = funders[funders >= 100].values.tolist()

In [73]:
num_funders = len(top_funders)
num_funders

91

- Only 91 funders have built more than 100 wells

In [75]:
print(f'''The top {num_funders / features.funder.nunique() * 100 :.1f}% of funders make up \
{sum(vals) / features.shape[0] * 100 :.1f}% of the data''')

The top 4.8% of funders make up 73.0% of the data


- Going to take funders with 100+ wells funded in dataset
- This will result in less One-hot encoded columns and make training process smoother

In [76]:
# replace other funders with "other"
features['funder'] = features['funder'].where(features['funder'].isin(top_funders), 'other')

## gps_height:

In [68]:
features[['gps_height']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
gps_height,59400.0,668.297239,693.11635,-90.0,0.0,369.0,1319.25,2770.0


- indicates the altitude of the well, presumably compared to sea level
- the minimum is -90, indicating a well located below sea level
- max is 2770, indicating a well in an elevated region

## installer:

In [79]:
features.installer.nunique()

2145

- over 2100 unique installers
- Once again, too many columns to encode

In [87]:
# get counts of num wells funded for each funder
installers = features.installer.value_counts()

# installers that have installed at least 100 wells
top_installers = installers[installers >= 100].index.tolist()
vals = installers[installers >= 100].values.tolist()

In [88]:
num_installers = len(top_installers)
num_installers

84

- Only 84 installers have built more than 100 wells

In [90]:
print(f'''The top {num_installers / features.installer.nunique() * 100 :.1f}% of installers make up \
{sum(vals) / features.shape[0] * 100 :.1f}% of the data''')

The top 3.9% of installers make up 71.0% of the data


- Going to take installers with 100+ wells installed in dataset

In [91]:
# replace other installers with "other"
features['installer'] = features['installer'].where(features['installer'].isin(top_installers), 'other')

## longitude and latitude:

- Tanzania's longitude spans from roughly 29 to 41 degrees
- Tanzania's latitude spans from roughly -12 to -1 degrees
- All vals should fall in between these ranges

In [101]:
features[['longitude']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
longitude,59400.0,34.077427,6.567432,0.0,33.090347,34.908743,37.178387,40.345193


- Seems like several of the longitude vals are below 29 degrees. None are over 41 degrees

In [117]:
features[features['longitude'] < 29]['longitude'].to_frame().T

Unnamed: 0,21,53,168,177,253,256,285,301,306,321,323,326,346,370,433,659,678,697,720,733,753,755,798,839,911,...,58545,58550,58555,58607,58678,58735,58771,58837,58859,58969,58997,59017,59033,59111,59120,59135,59158,59162,59166,59184,59189,59208,59295,59324,59374
longitude,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- 1812 rows where longitude is invalid (vals are 0)

In [119]:
# set the above indices to nan
indices = features[features['longitude'] < 29].index
features.loc[indices, 'longitude'] = np.nan

In [120]:
# show that all <29 rows are gone
features[features['longitude'] < 29]['longitude'].to_frame().T

longitude


In [102]:
features[['latitude']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
latitude,59400.0,-5.706033,2.946019,-11.64944,-8.540621,-5.021597,-3.326156,-2e-08


- Several of the latitude vals are greater than -1 degrees. None are below -12 degrees

In [134]:
# using -0.9 instead of -1 as there were 7 valid values in this range
features[features['latitude'] > -0.9]['latitude'].to_frame().T

Unnamed: 0,21,53,168,177,253,256,285,301,306,321,323,326,346,370,433,659,678,697,720,733,753,755,798,839,911,...,58545,58550,58555,58607,58678,58735,58771,58837,58859,58969,58997,59017,59033,59111,59120,59135,59158,59162,59166,59184,59189,59208,59295,59324,59374
latitude,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,...,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08,-2e-08


- Once again, 1812 rows with invalid latitude

In [138]:
# set the above indices to nan
indices = features[features['latitude'] > -0.9].index
features.loc[indices, 'latitude'] = np.nan

In [139]:
# show that all >-0.9 rows are gone
features[features['latitude'] > -0.9]['latitude'].to_frame().T

latitude


## wpt_name:

In [94]:
features.wpt_name.nunique()

37400

- This column is for well name. Don't think it has impact on pump functionality, so we're going to drop

## num_private:

In [22]:
features['num_private'].describe()

count    59400.000000
mean         0.474141
std         12.236230
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max       1776.000000
Name: num_private, dtype: float64

In [23]:
features['num_private'].value_counts(normalize=True).head(3)

0    0.987256
6    0.001364
1    0.001229
Name: num_private, dtype: float64

- Going to drop this col, as it has an unclear meaning and the majority (98.7%) of vals are 0.

## basin and subvillage:

In [98]:
features.basin.nunique(), features.basin.unique()

(9,
 array(['Lake Nyasa', 'Lake Victoria', 'Pangani',
        'Ruvuma / Southern Coast', 'Internal', 'Lake Tanganyika',
        'Wami / Ruvu', 'Rufiji', 'Lake Rukwa'], dtype=object))

In [97]:
features.subvillage.nunique()

19287

- Going to drop subvillage. Region code, latitude, longitude, and basin should capture enough locational information

## region and region_code:

In [24]:
features.region.nunique()

21

In [25]:
features.region_code.nunique()

27

In [26]:
features.groupby(['region', 'region_code']).groups.keys()

dict_keys([('Arusha', 2), ('Arusha', 24), ('Dar es Salaam', 7), ('Dodoma', 1), ('Iringa', 11), ('Kagera', 18), ('Kigoma', 16), ('Kilimanjaro', 3), ('Lindi', 8), ('Lindi', 18), ('Lindi', 80), ('Manyara', 21), ('Mara', 20), ('Mbeya', 12), ('Morogoro', 5), ('Mtwara', 9), ('Mtwara', 90), ('Mtwara', 99), ('Mwanza', 17), ('Mwanza', 19), ('Pwani', 6), ('Pwani', 40), ('Pwani', 60), ('Rukwa', 15), ('Ruvuma', 10), ('Shinyanga', 11), ('Shinyanga', 14), ('Shinyanga', 17), ('Singida', 13), ('Tabora', 14), ('Tanga', 4), ('Tanga', 5)])

- 7 regions have multiple codes
- Going to drop "region" when training model as it is redundant and less specific than region code

In [92]:
features.region_code.dtypes

dtype('int64')

- Need to convert the region_code to 'object' to ensure it's treated as a category

In [93]:
# convert from int to str (Object)
features['region_code'] = features['region_code'].astype(str)
features.region_code.dtypes

dtype('O')

## district_code

In [140]:
features.district_code.nunique()

20

In [141]:
features.district_code.unique()

array([ 5,  2,  4, 63,  1,  8,  3,  6, 43,  7, 23, 33, 53, 62, 60, 30, 13,
        0, 80, 67], dtype=int64)

- Like region_code, need to convert to categorical column

In [142]:
# convert from int to str (Object)
features['district_code'] = features['district_code'].astype(str)
features.district_code.dtypes

dtype('O')

- May already have enough geographical info with latitude, longitude, basin, and region_code. Still going to keep for first model

## lga and ward
- Local Government Area: an administrative division within the country
- Ward: a smaller division of an LGA

In [143]:
features.lga.nunique()

125

In [152]:
features.ward.nunique()

2092

- Going to drop these two for now, as i think I have captured enough geographical data

## population

In [163]:
features.population.describe().to_frame().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
population,59400.0,179.909983,471.482176,0.0,0.0,25.0,215.0,30500.0


- min and 25th percentile both show populations of 0

In [173]:
features[features.population == 0].shape

(21381, 41)

- over 21,000 rows with a population of 0. Could be a missing value
- could also just be wells located in remote areas with populations of 0

In [174]:
features[features.population == 1].shape

(7025, 41)

- 7,000 rows with a population of 1. This may indicate that the population values are accurate.
- Going to keep the column unmodified on first model

## scheme_management, management , and management_group

In [27]:
features['scheme_management'].nunique(), features['scheme_management'].unique()

(12,
 array(['VWC', 'Other', nan, 'Private operator', 'WUG', 'Water Board',
        'WUA', 'Water authority', 'Company', 'Parastatal', 'Trust', 'SWC',
        'None'], dtype=object))

In [28]:
features['scheme_management'].value_counts()

VWC                 36793
WUG                  5206
Water authority      3153
WUA                  2883
Water Board          2748
Parastatal           1680
Private operator     1063
Company              1061
Other                 766
SWC                    97
Trust                  72
None                    1
Name: scheme_management, dtype: int64

In [29]:
features['management'].nunique(), features['management'].unique()

(12,
 array(['vwc', 'wug', 'other', 'private operator', 'water board', 'wua',
        'company', 'water authority', 'parastatal', 'unknown',
        'other - school', 'trust'], dtype=object))

In [30]:
features['management'].value_counts()

vwc                 40507
wug                  6515
water board          2933
wua                  2535
private operator     1971
parastatal           1768
water authority       904
other                 844
company               685
unknown               561
other - school         99
trust                  78
Name: management, dtype: int64

In [31]:
features['management_group'].nunique(), features['management_group'].unique()

(5,
 array(['user-group', 'other', 'commercial', 'parastatal', 'unknown'],
       dtype=object))

In [32]:
features['management_group'].value_counts()

user-group    52490
commercial     3638
parastatal     1768
other           943
unknown         561
Name: management_group, dtype: int64

- scheme_management and management seem to capture the same data. Going to use management as it has less unknown values
- Dropping scheme_management

## permit

In [33]:
features['permit'].nunique(), features['permit'].unique()

(2, array([False, True, nan], dtype=object))

In [34]:
features['permit'].value_counts(normalize=True)

True     0.68955
False    0.31045
Name: permit, dtype: float64

- permit is a boolean column that holds whether the builder of the well had a permit
- column does have around 3000 nulls. These will be addressed in the pipeline implementation

## extraction_type, extraction_type_group, extraction_type_class

In [35]:
features['extraction_type'].nunique(), features['extraction_type'].unique()

(18,
 array(['gravity', 'submersible', 'swn 80', 'nira/tanira', 'india mark ii',
        'other', 'ksb', 'mono', 'windmill', 'afridev', 'other - rope pump',
        'india mark iii', 'other - swn 81', 'other - play pump', 'cemo',
        'climax', 'walimi', 'other - mkulima/shinyanga'], dtype=object))

In [36]:
features['extraction_type_class'].nunique(), features['extraction_type_class'].unique()

(7,
 array(['gravity', 'submersible', 'handpump', 'other', 'motorpump',
        'wind-powered', 'rope pump'], dtype=object))

In [37]:
features['extraction_type_group'].nunique(), features['extraction_type_group'].unique()

(13,
 array(['gravity', 'submersible', 'swn 80', 'nira/tanira', 'india mark ii',
        'other', 'mono', 'wind-powered', 'afridev', 'rope pump',
        'india mark iii', 'other handpump', 'other motorpump'],
       dtype=object))

In [38]:
features.groupby(['extraction_type_class', 'extraction_type']).groups.keys()

dict_keys([('gravity', 'gravity'), ('handpump', 'afridev'), ('handpump', 'india mark ii'), ('handpump', 'india mark iii'), ('handpump', 'nira/tanira'), ('handpump', 'other - mkulima/shinyanga'), ('handpump', 'other - play pump'), ('handpump', 'other - swn 81'), ('handpump', 'swn 80'), ('handpump', 'walimi'), ('motorpump', 'cemo'), ('motorpump', 'climax'), ('motorpump', 'mono'), ('other', 'other'), ('rope pump', 'other - rope pump'), ('submersible', 'ksb'), ('submersible', 'submersible'), ('wind-powered', 'windmill')])

- 3 columns describe the type of pump used
- extraction_type seems to hold the most detail with 18 unique values
- dropping the other two may lose a small amount of info, but will reduce collinearity and make model training smoother
- choosing to drop both extraction_type_class and extraction_type_group

## payment, payment_type

In [39]:
features['payment'].nunique(), features['payment'].unique()

(7,
 array(['pay annually', 'never pay', 'pay per bucket', 'unknown',
        'pay when scheme fails', 'other', 'pay monthly'], dtype=object))

In [40]:
features['payment'].value_counts()

never pay                25348
pay per bucket            8985
pay monthly               8300
unknown                   8157
pay when scheme fails     3914
pay annually              3642
other                     1054
Name: payment, dtype: int64

In [41]:
features['payment_type'].nunique(), features['payment_type'].unique()

(7,
 array(['annually', 'never pay', 'per bucket', 'unknown', 'on failure',
        'other', 'monthly'], dtype=object))

In [42]:
features['payment_type'].value_counts()

never pay     25348
per bucket     8985
monthly        8300
unknown        8157
on failure     3914
annually       3642
other          1054
Name: payment_type, dtype: int64

- Duplicate cols. dropping "payment" as it is less succinct

## water_quality, quality_group

In [43]:
features['water_quality'].nunique(), features['water_quality'].unique()

(8,
 array(['soft', 'salty', 'milky', 'unknown', 'fluoride', 'coloured',
        'salty abandoned', 'fluoride abandoned'], dtype=object))

In [44]:
features['water_quality'].value_counts()

soft                  50818
salty                  4856
unknown                1876
milky                   804
coloured                490
salty abandoned         339
fluoride                200
fluoride abandoned       17
Name: water_quality, dtype: int64

In [45]:
features['quality_group'].nunique(), features['quality_group'].unique()

(6,
 array(['good', 'salty', 'milky', 'unknown', 'fluoride', 'colored'],
       dtype=object))

In [46]:
features['quality_group'].value_counts()

good        50818
salty        5195
unknown      1876
milky         804
colored       490
fluoride      217
Name: quality_group, dtype: int64

- water_quality seems to be more granular. Dropping quality_group

## quantity, quantity_group

In [47]:
features['quantity'].nunique(), features['quantity'].unique()

(5,
 array(['enough', 'insufficient', 'dry', 'seasonal', 'unknown'],
       dtype=object))

In [48]:
features['quantity'].value_counts()

enough          33186
insufficient    15129
dry              6246
seasonal         4050
unknown           789
Name: quantity, dtype: int64

In [49]:
features['quantity_group'].nunique(), features['quantity_group'].unique()

(5,
 array(['enough', 'insufficient', 'dry', 'seasonal', 'unknown'],
       dtype=object))

In [50]:
features['quantity_group'].value_counts()

enough          33186
insufficient    15129
dry              6246
seasonal         4050
unknown           789
Name: quantity_group, dtype: int64

- Duplicate columns. Dropping quantity_group

## source, source_type, source_class	

In [51]:
features['source'].nunique(), features['source'].unique()

(10,
 array(['spring', 'rainwater harvesting', 'dam', 'machine dbh', 'other',
        'shallow well', 'river', 'hand dtw', 'lake', 'unknown'],
       dtype=object))

In [52]:
features['source'].value_counts()

spring                  17021
shallow well            16824
machine dbh             11075
river                    9612
rainwater harvesting     2295
hand dtw                  874
lake                      765
dam                       656
other                     212
unknown                    66
Name: source, dtype: int64

In [53]:
features['source_type'].nunique(), features['source_type'].unique()

(7,
 array(['spring', 'rainwater harvesting', 'dam', 'borehole', 'other',
        'shallow well', 'river/lake'], dtype=object))

In [54]:
features['source_type'].value_counts()

spring                  17021
shallow well            16824
borehole                11949
river/lake              10377
rainwater harvesting     2295
dam                       656
other                     278
Name: source_type, dtype: int64

In [55]:
features['source_class'].nunique(), features['source_class'].unique()

(3, array(['groundwater', 'surface', 'unknown'], dtype=object))

In [56]:
features['source_class'].value_counts()

groundwater    45794
surface        13328
unknown          278
Name: source_class, dtype: int64

- Dropping source_type. "source" is more specific
- going to keep source_class as it distinguishes above and below ground water sources

## waterpoint_type, waterpoint_type_group

In [57]:
features['waterpoint_type'].nunique(), features['waterpoint_type'].unique()

(7,
 array(['communal standpipe', 'communal standpipe multiple', 'hand pump',
        'other', 'improved spring', 'cattle trough', 'dam'], dtype=object))

In [58]:
features['waterpoint_type'].value_counts()

communal standpipe             28522
hand pump                      17488
other                           6380
communal standpipe multiple     6103
improved spring                  784
cattle trough                    116
dam                                7
Name: waterpoint_type, dtype: int64

In [59]:
features['waterpoint_type_group'].nunique(), features['waterpoint_type_group'].unique()

(6,
 array(['communal standpipe', 'hand pump', 'other', 'improved spring',
        'cattle trough', 'dam'], dtype=object))

In [60]:
features['waterpoint_type_group'].value_counts()

communal standpipe    34625
hand pump             17488
other                  6380
improved spring         784
cattle trough           116
dam                       7
Name: waterpoint_type_group, dtype: int64

- Dropping waterpoint_type_group as it is slightly less specific than waterpoint_type. It combines the two communal standpipe cats

### Drop x redundant columns:

In [288]:
features = features.copy().drop(['date_recorded', 'wpt_name', 'num_private', 'subvillage', 'region', 'lga', 'ward', 
                                 'scheme_management', 
                                 'extraction_type_class', 
                                 'extraction_type_group', 'payment', 'quality_group', 'quantity_group', 'source_type', 
                                 'waterpoint_type_group'], axis=1)

In [289]:
features.shape

(59400, 37)

- id column
- x features

## Check for null values in labels:

In [11]:
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            59400 non-null  int64 
 1   status_group  59400 non-null  object
dtypes: int64(1), object(1)
memory usage: 928.2+ KB


- No nulls

In [12]:
labels.status_group.unique().tolist()

['functional', 'non functional', 'functional needs repair']

### 3 potential targets for water well condition:
- Functional
- Functional, needs repair
- Non-functional

In [13]:
labels.status_group.value_counts(normalize=True)

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

### Class proportions are imbalanced:
- "functional" and "non-functional" make up good chunks with 54% and 38% respectively
- the "needs repair" category only makes up 7% of the dataset
- will use SMOTE to upsample this class (in pipeline)

## Combine features with labels to get final df:

In [62]:
# adds 'status_group' col
df = features.merge(labels, on='id')

In [63]:
df.shape

(59400, 42)

In [None]:
df.to_csv('data/training_data.csv')