In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm


pd.options.display.max_rows = 35 
pd.options.display.max_columns = None

In [3]:
col_ignore = ['scheme_name', 'wpt_name', 'source_class', 'quality_group', 'quantity_group', 'recorded_by', 
             'extraction_type_group', 'extraction_type_class','management_group', 'payment_type', 'region_code',
              'district_code', 'subvillage', 'scheme_management', 'source_type', 'waterpoint_type_group', 
              'num_private', 'funder']

water_values = pd.read_csv('../data/raw/WaterValuesOriginal.csv')
water_values = water_values[[i for i in water_values.columns if i not in col_ignore]]
water_labels = pd.read_csv('../data/raw/WaterLabelsOriginal.csv')

water_labels['target'] = water_labels.status_group.values  #change target column
water_labels.drop('status_group', axis = 1, inplace = True) # drop the original column

water_values = pd.merge(water_values, water_labels, on = 'id') #merge target and features
water_values['date_recorded_date'] = pd.to_datetime(water_values.date_recorded)
water_values['year_recorded'] = pd.DatetimeIndex(water_values.date_recorded_date).year
print(water_values.shape)
print(len(water_values[water_values.year_recorded < 2011]))

#target distribution is roughly the same as the population distribution so we decided to drop them because there
#are not many entried and because the recorded was earlier than construction (all in 2004)

water_values = water_values[water_values.year_recorded >= 2011]


(59400, 25)
31


In [4]:
#fixing dates 
median_con = water_values[water_values.construction_year != 0].construction_year.median()
water_values.construction_year.replace(0, median_con, inplace = True)
cons_year = water_values.construction_year.values 
rec_date = water_values.date_recorded.values 
def get_time_since_built(cons, rec): 
    time_array = []
    for c, r in zip(cons, rec): 
        r = int(r.split('-')[0])
        c = int(c)
        diff = r -c
        time_array.append(diff)
    return time_array

t_array = get_time_since_built(cons_year, rec_date)
water_values['time_passed'] = t_array
water_values.drop(['construction_year', 'date_recorded_date', 'year_recorded', 'date_recorded'], 
                  axis = 1, inplace = True)

In [5]:
water_values.head(2)

Unnamed: 0,id,amount_tsh,gps_height,installer,longitude,latitude,basin,region,lga,ward,population,public_meeting,permit,extraction_type,management,payment,water_quality,quantity,source,waterpoint_type,target,time_passed
0,69572,6000.0,1390,Roman,34.938093,-9.856322,Lake Nyasa,Iringa,Ludewa,Mundindi,109,True,False,gravity,vwc,pay annually,soft,enough,spring,communal standpipe,functional,12
1,8776,0.0,1399,GRUMETI,34.698766,-2.147466,Lake Victoria,Mara,Serengeti,Natta,280,,True,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,functional,3


In [6]:
#changes binary true-false to 0-1
water_values.public_meeting.fillna(False, inplace = True)
water_values.public_meeting = water_values.public_meeting.astype(int)
water_values.permit.fillna(False, inplace = True)
water_values.permit = water_values.permit.astype(int)
water_values.head(2)

Unnamed: 0,id,amount_tsh,gps_height,installer,longitude,latitude,basin,region,lga,ward,population,public_meeting,permit,extraction_type,management,payment,water_quality,quantity,source,waterpoint_type,target,time_passed
0,69572,6000.0,1390,Roman,34.938093,-9.856322,Lake Nyasa,Iringa,Ludewa,Mundindi,109,1,0,gravity,vwc,pay annually,soft,enough,spring,communal standpipe,functional,12
1,8776,0.0,1399,GRUMETI,34.698766,-2.147466,Lake Victoria,Mara,Serengeti,Natta,280,0,1,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,functional,3


In [7]:
## Combining 'other' in extract 
def fix_extract_other(x): 
    if 'other' in x: 
        return 'extract_other'
    else: 
        return x

water_values['extraction_type'] = water_values.extraction_type.map(fix_extract_other)
water_values.head()

Unnamed: 0,id,amount_tsh,gps_height,installer,longitude,latitude,basin,region,lga,ward,population,public_meeting,permit,extraction_type,management,payment,water_quality,quantity,source,waterpoint_type,target,time_passed
0,69572,6000.0,1390,Roman,34.938093,-9.856322,Lake Nyasa,Iringa,Ludewa,Mundindi,109,1,0,gravity,vwc,pay annually,soft,enough,spring,communal standpipe,functional,12
1,8776,0.0,1399,GRUMETI,34.698766,-2.147466,Lake Victoria,Mara,Serengeti,Natta,280,0,1,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,functional,3
2,34310,25.0,686,World vision,37.460664,-3.821329,Pangani,Manyara,Simanjiro,Ngorika,250,1,1,gravity,vwc,pay per bucket,soft,enough,dam,communal standpipe multiple,functional,4
3,67743,0.0,263,UNICEF,38.486161,-11.155298,Ruvuma / Southern Coast,Mtwara,Nanyumbu,Nanyumbu,58,1,1,submersible,vwc,never pay,soft,dry,machine dbh,communal standpipe multiple,non functional,27
4,19728,0.0,0,Artisan,31.130847,-1.825359,Lake Victoria,Kagera,Karagwe,Nyakasimbi,0,1,1,gravity,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,functional,11


In [8]:
#fixing population to replace value with mdeian population within its basin 
def fix_population_basin(df): 
    new_df = df
    for idx in tqdm(range(len(new_df))): 
        pop = new_df.iloc[idx].population 
        if pop != 0: 
            continue 
        basin_value = new_df.iloc[idx].basin
        basin_median_pop = new_df[(new_df.basin == basin_value) & (new_df.population > 0)].population.median() 
        new_df.loc[idx, 'population'] = basin_median_pop
        
    return new_df     

water_values = fix_population_basin(water_values)


100%|██████████| 59369/59369 [02:36<00:00, 380.01it/s]


In [9]:
water_values.head()

Unnamed: 0,id,amount_tsh,gps_height,installer,longitude,latitude,basin,region,lga,ward,population,public_meeting,permit,extraction_type,management,payment,water_quality,quantity,source,waterpoint_type,target,time_passed
0,69572.0,6000.0,1390.0,Roman,34.938093,-9.856322,Lake Nyasa,Iringa,Ludewa,Mundindi,109.0,1.0,0.0,gravity,vwc,pay annually,soft,enough,spring,communal standpipe,functional,12.0
1,8776.0,0.0,1399.0,GRUMETI,34.698766,-2.147466,Lake Victoria,Mara,Serengeti,Natta,280.0,0.0,1.0,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,functional,3.0
2,34310.0,25.0,686.0,World vision,37.460664,-3.821329,Pangani,Manyara,Simanjiro,Ngorika,250.0,1.0,1.0,gravity,vwc,pay per bucket,soft,enough,dam,communal standpipe multiple,functional,4.0
3,67743.0,0.0,263.0,UNICEF,38.486161,-11.155298,Ruvuma / Southern Coast,Mtwara,Nanyumbu,Nanyumbu,58.0,1.0,1.0,submersible,vwc,never pay,soft,dry,machine dbh,communal standpipe multiple,non functional,27.0
4,19728.0,0.0,0.0,Artisan,31.130847,-1.825359,Lake Victoria,Kagera,Karagwe,Nyakasimbi,320.0,1.0,1.0,gravity,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,functional,11.0


In [10]:
water_values.to_csv('../data/interim/WaterUpdated.csv', index = False)