# EDA across three parts and all data manipulation (to produce cleaned training/testing data)

## Import packages

In [1]:
import numpy as np
import pandas as pd
import os

## Read and merge the data

In [2]:
# Load the raw CSV files and attach the labels to the training features.
path = '../../data/raw'
training_values = pd.read_csv(os.path.join(path, 'TrainingSetValues.csv'))
training_labels = pd.read_csv(os.path.join(path, 'TrainingSetLabels.csv'))
testing = pd.read_csv(os.path.join(path, 'TestSetValues.csv'))

# Inner join on `id`. `validate='one_to_one'` makes pandas raise a MergeError
# if `id` is duplicated on either side, instead of silently multiplying rows.
training = pd.merge(
    training_values,
    training_labels,
    how='inner',
    on='id',
    validate='one_to_one',
)
print(list(training.columns.values))  # column names of df
training.head(5)

['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer', 'longitude', 'latitude', 'wpt_name', 'num_private', 'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga', 'ward', 'population', 'public_meeting', 'recorded_by', 'scheme_management', 'scheme_name', 'permit', 'construction_year', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group', 'status_group']


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


## Columns to delete

In [None]:
# Columns slated for deletion, grouped by the EDA part that flagged them.
cols_to_eliminate = [
    # from EDA_1
    'funder',
    'installer',
    'wpt_name',
    'num_private',
    'subvillage',
    'region_code',
    'lga',
    'ward',
    'district_code',
    # from EDA_2
    'recorded_by',
    'scheme_management',
    'scheme_name',
]

## Columns to check (gps_height, construction_year, population, region, basin)

### gps_height (currently keep)

In [14]:
# Compare rows with non-positive gps_height against rows with positive
# gps_height, each sorted by gps_height, showing the same four columns.
gps_view_cols = ['construction_year', 'gps_height', 'population', 'status_group']
height_not_positive = training['gps_height'] <= 0
height_positive = training['gps_height'] > 0

print(training.loc[height_not_positive, gps_view_cols].sort_values(by='gps_height'))
print('----------------------------------')
print(training.loc[height_positive, gps_view_cols].sort_values(by='gps_height'))

       construction_year  gps_height  population             status_group
58934                  0         -90         550               functional
50296               2007         -63         353               functional
15193               2000         -63         150               functional
14285               1970         -59          90           non functional
41165               1974         -57           1               functional
35046               2007         -55          50           non functional
3045                1970         -54         140           non functional
39076               2008         -53           1               functional
48241               2000         -52          50               functional
54063               1986         -52          56               functional
27679               1975         -51         100           non functional
9713                2006         -51         150  functional needs repair
29127               1970         -50  

### construction year

In [9]:
# Rows whose construction_year is not 0, ordered by construction_year,
# shown next to their status_group.
has_construction_year = training['construction_year'] != 0
training.loc[has_construction_year, ['construction_year', 'status_group']].sort_values(by='construction_year')

Unnamed: 0,construction_year,status_group
4562,1960,non functional
252,1960,non functional
54434,1960,non functional
29002,1960,non functional
51754,1960,functional
1407,1960,non functional
56544,1960,non functional
42937,1960,non functional
18629,1960,functional needs repair
24663,1960,non functional


#### It seems that more recently constructed waterpoints tend to be functional. Change construction_year = 0 to NA.

### Population vs Region and Basin

In [22]:
# Row counts per region, then per basin, first for population == 0 and then
# for population > 0; a separator line follows every table.
population_is_zero = training['population'] == 0
population_is_positive = training['population'] > 0

for grouping_col in ('region', 'basin'):
    for row_mask in (population_is_zero, population_is_positive):
        print(training[row_mask].groupby(grouping_col).size())
        print('--------------')

region
Arusha          1
Dodoma       2201
Iringa       1164
Kagera       3316
Kigoma         31
Mbeya        4639
Mwanza       2735
Pwani           1
Rukwa          47
Ruvuma        467
Shinyanga    4818
Singida         2
Tabora       1959
dtype: int64
--------------
region
Arusha           3349
Dar es Salaam     805
Iringa           4130
Kigoma           2785
Kilimanjaro      4379
Lindi            1546
Manyara          1583
Mara             1969
Morogoro         4006
Mtwara           1730
Mwanza            367
Pwani            2634
Rukwa            1761
Ruvuma           2173
Shinyanga         164
Singida          2091
Tanga            2547
dtype: int64
--------------
basin
Internal                   3111
Lake Nyasa                 3186
Lake Rukwa                 1466
Lake Tanganyika            2865
Lake Victoria              7716
Pangani                       1
Rufiji                     1765
Ruvuma / Southern Coast     256
Wami / Ruvu                1015
dtype: int64
--------------


#### Population and Region are related, but Population and Basin are not

### Check Population and Construction Year

In [32]:
# How often are population and construction_year recorded as 0, separately
# and together?
zero_population = training['population'] == 0
zero_year = training['construction_year'] == 0
both_zero = zero_year & zero_population

print(training[zero_population].shape)  # 21381 rows with population = 0
print(training[zero_year].shape)  # 20709 rows with construction year = 0
print(training[both_zero].shape)  # 20034 rows with both = 0

# Regional breakdown of the rows where both values are 0.
training[both_zero].groupby('region').size()

(21381, 41)
(20709, 41)
(20034, 41)


region
Dodoma       2201
Iringa        365
Kagera       3316
Mbeya        4639
Mwanza       2735
Pwani           1
Shinyanga    4818
Tabora       1959
dtype: int64

In [33]:
# Summary statistics of population over rows with population > 0 (right-skewed).
training.loc[training['population'] > 0, 'population'].describe()

count    38019.000000
mean       281.087167
std        564.687660
min          1.000000
25%         40.000000
50%        150.000000
75%        324.000000
max      30500.000000
Name: population, dtype: float64

#### Current idea: population = 0 is fine as is; just change construction_year = 0 to NA

## Columns to change

### 1. Construction year 0 to NA

### 2. Create a new variable "Age" based on Construction Year

### 3. Management Group None to Other category (later after first regression model and decide whether this feature is valuable or not)

## Archived code

In [None]:
# correlation:
# Each of the three columns is replaced by its pd.factorize integer codes and
# the pairwise Pearson correlation of those codes is printed.
# NOTE(review): factorize codes follow order of first appearance, so they look
# arbitrary for nominal columns (and population loses its numeric scale);
# confirm that Pearson on these codes is actually meaningful before reusing.
encoded_cols = training[['population', 'region', 'basin']].apply(
    lambda col: pd.factorize(col)[0]
)
cor = encoded_cols.corr(method='pearson', min_periods=1)
print(cor)