# Preprocess test data into the format corresponding to preprocessing_final

## Import packages

In [1]:
import numpy as np
import pandas as pd
import os

## Read test data

In [3]:
# Directory of the raw input files, relative to this notebook.
path = '../../data/raw'
# Build the full file location first, then load the test-set values from it.
test_values_csv = path + '/TestSetValues.csv'
testing = pd.read_csv(test_values_csv)

## Columns to delete

In [4]:
# Columns removed from the test set, grouped by the EDA notebook they were
# attributed to in the original "from EDA_n" notes.
dropped_from_eda_1 = ['funder', 'installer', 'wpt_name', 'num_private', 'subvillage',
                      'region_code', 'lga', 'ward', 'district_code']
dropped_from_eda_2 = ['recorded_by', 'scheme_management', 'scheme_name']
dropped_from_eda_3 = ['payment_type', 'water_quality', 'quantity', 'waterpoint_type_group']

cols_to_eliminate = dropped_from_eda_1 + dropped_from_eda_2 + dropped_from_eda_3
testing = testing.drop(columns=cols_to_eliminate)

In [5]:
# Sanity check: (rows, columns) remaining after the column drop above.
testing.shape

(14850, 24)

## Change columns

### 1. Construction year 0 to NA

In [7]:
# A construction year of 0 is treated as missing.
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed
# (accessing it raises AttributeError on current NumPy).
testing['construction_year'] = testing['construction_year'].replace(0, np.nan)

# `include` is ignored when describing a Series, so a plain describe()
# prints the same summary as before.
print(testing['construction_year'].describe())

# Number of rows whose construction year is now missing (vectorised count,
# converted to a plain int so the cell output is displayed as before).
int(testing['construction_year'].isna().sum())

count    9590.000000
mean     1997.097914
std        12.310565
min      1960.000000
25%      1988.000000
50%      2000.000000
75%      2008.000000
max      2013.000000
Name: construction_year, dtype: float64


5260

### 2. Create a new variable "Age" based on Construction Year

In [8]:
# Age in years relative to the fixed reference year 2013; rows with a missing
# construction year keep a missing age.
reference_year = 2013
testing['age'] = reference_year - testing['construction_year']

age_summary = testing['age'].describe(include='All')
print(age_summary)

# Number of rows with no age value.
int(testing['age'].isna().sum())

count    9590.000000
mean       15.902086
std        12.310565
min         0.000000
25%         5.000000
50%        13.000000
75%        25.000000
max        53.000000
Name: age, dtype: float64


5260

### 3. Management Group 'unknown' to 'other' category (revisit after the first regression model to decide whether this feature is valuable or not)

In [9]:
# Fold the 'unknown' management group into 'other'; every other value is
# carried over unchanged.
management_group = testing['management_group']
is_unknown_group = management_group.eq('unknown')
testing['management_group_new'] = np.where(is_unknown_group, 'other', management_group)

# Row count per recoded group.
testing.groupby('management_group_new').size()

management_group_new
commercial      953
other           388
parastatal      461
user-group    13048
dtype: int64

### 4. quality_group 'salty', 'milky', 'colored', 'fluoride' to bad

In [10]:
# Collapse the individual poor-quality labels into a single 'bad' category;
# all other labels are kept as they are.
bad_quality_labels = ['salty', 'milky', 'colored', 'fluoride']
is_bad_quality = testing['quality_group'].isin(bad_quality_labels)
testing['quality_group_new'] = testing['quality_group'].mask(is_bad_quality, 'bad')

# Row count per recoded category.
testing['quality_group_new'].value_counts()

good       12687
bad         1694
unknown      469
Name: quality_group_new, dtype: int64

### 5. waterpoint_type 'other','improved spring', 'cattle trough', 'dam' to 'other' (insufficient size)

In [13]:
# Merge the listed waterpoint types into a single 'other' bucket
# ('other' itself is in the list and therefore stays 'other').
merged_waterpoint_types = ['other', 'improved spring', 'cattle trough', 'dam']
is_merged_type = testing['waterpoint_type'].isin(merged_waterpoint_types)
testing['waterpoint_type_new'] = testing['waterpoint_type'].mask(is_merged_type, 'other')

# Row count per recoded waterpoint type.
testing['waterpoint_type_new'].value_counts()

communal standpipe             7106
hand pump                      4396
other                          1840
communal standpipe multiple    1508
Name: waterpoint_type_new, dtype: int64

### 6. Split date recorded to three columns (year_recorded, month_recorded, day_recorded) (Corresponding to original 7)

In [14]:
# Convert the recording date to datetime once, then derive the calendar parts
# from that single parsed series.
recorded_on = testing['date_recorded'].astype('datetime64[ns]')
recorded_parts = recorded_on.dt

testing['date_recorded'] = recorded_on
testing['year_recorded'], testing['month_recorded'], testing['day_recorded'] = (
    recorded_parts.year,
    recorded_parts.month,
    recorded_parts.day,
)

# Preview the parsed date next to its three derived columns.
preview_columns = ['date_recorded', 'year_recorded', 'month_recorded', 'day_recorded']
testing[preview_columns].head(5)

Unnamed: 0,date_recorded,year_recorded,month_recorded,day_recorded
0,2013-02-04,2013,2,4
1,2013-02-04,2013,2,4
2,2013-02-01,2013,2,1
3,2013-01-22,2013,1,22
4,2013-03-27,2013,3,27


## Final testing set

In [15]:
# Columns left out of the final frame: the coordinates plus the raw columns
# that now have derived counterparts (age, *_new, *_recorded).
cols_not_in_final = ['longitude', 'latitude', 'construction_year',
                     'management_group', 'quality_group', 'waterpoint_type', 'date_recorded']
testing_final = testing.drop(columns=cols_not_in_final)

print(testing_final.shape)
testing_final.head(5)

(14850, 24)


Unnamed: 0,id,amount_tsh,gps_height,basin,region,population,public_meeting,permit,extraction_type,extraction_type_group,...,source,source_type,source_class,age,management_group_new,quality_group_new,waterpoint_type_new,year_recorded,month_recorded,day_recorded
0,50785,0.0,1996,Internal,Manyara,321,True,True,other,other,...,rainwater harvesting,rainwater harvesting,surface,1.0,parastatal,good,other,2013,2,4
1,51630,0.0,1569,Pangani,Arusha,300,True,True,gravity,gravity,...,spring,spring,groundwater,13.0,user-group,good,communal standpipe,2013,2,4
2,17168,0.0,1567,Internal,Singida,500,True,,other,other,...,rainwater harvesting,rainwater harvesting,surface,3.0,user-group,good,other,2013,2,1
3,45559,0.0,267,Ruvuma / Southern Coast,Lindi,250,,True,other,other,...,shallow well,shallow well,groundwater,26.0,user-group,good,other,2013,1,22
4,49871,500.0,1260,Ruvuma / Southern Coast,Ruvuma,60,,True,gravity,gravity,...,spring,spring,groundwater,13.0,user-group,good,communal standpipe,2013,3,27


In [17]:
# Column order of the exported file.
# NOTE(review): presumably this matches the layout produced by
# preprocessing_final for the training data; confirm against that notebook.
output_columns = [
    'id', 'amount_tsh', 'year_recorded', 'month_recorded', 'day_recorded',
    'gps_height', 'basin', 'region', 'population', 'public_meeting', 'permit',
    'age', 'extraction_type', 'extraction_type_group', 'extraction_type_class',
    'management', 'management_group_new', 'payment', 'quantity_group',
    'quality_group_new', 'source', 'source_type', 'source_class',
    'waterpoint_type_new',
]
testing_final_out = testing_final[output_columns]

testing_final_out.shape

(14850, 24)

### write out csv

In [18]:
# Directory for cleaned outputs, relative to this notebook.
path_out = '../../data/cleaned'
# Write the cleaned test set without the DataFrame index column.
cleaned_csv = path_out + '/testing_cleaned_v1.csv'
testing_final_out.to_csv(cleaned_csv, index=False)