# Preprocess test data into the format corresponding to preprocessing_final

## Import packages

In [2]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

## Read test data

In [3]:
# Directory holding the raw CSV inputs, relative to the notebook's working directory.
path = '../../data/raw'
# Load the test-set values file that the remaining cells transform.
testing = pd.read_csv(f'{path}/TestSetValues.csv')

## Columns to delete

In [4]:
# Columns removed from the test set, grouped by the EDA step noted for each group.
cols_to_eliminate = [
    # from EDA_1
    'funder', 'installer', 'wpt_name', 'num_private', 'subvillage',
    'region_code', 'lga', 'ward', 'district_code',
    # from EDA_2
    'recorded_by', 'scheme_management', 'scheme_name',
    # from EDA_3
    'payment_type', 'water_quality', 'quantity', 'waterpoint_type_group',
]
testing = testing.drop(columns=cols_to_eliminate)

In [5]:
# Sanity check: (rows, columns) of the test frame at this point in the notebook.
testing.shape

(14850, 24)

## Change columns

### 1. Construction year 0 to NA

In [6]:
# A construction year of 0 is treated as "not recorded", so it becomes a missing value.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0
# (and matches the `np.nan` spelling used later in this notebook).
testing['construction_year'] = testing['construction_year'].replace(0, np.nan)
print(testing['construction_year'].describe(include='All'))
# Number of rows left without a construction year.
int(testing['construction_year'].isna().sum())

count    9590.000000
mean     1997.097914
std        12.310565
min      1960.000000
25%      1988.000000
50%      2000.000000
75%      2008.000000
max      2013.000000
Name: construction_year, dtype: float64


5260

### 2. Create a new variable "Age" based on Construction Year

In [7]:
# Age in years measured against 2013; rows whose construction year is missing
# stay missing in `age` as well.
testing['age'] = 2013 - testing['construction_year']
age_summary = testing['age'].describe(include='All')
print(age_summary)
# Count of rows for which no age could be computed.
sum(testing['age'].isnull())

count    9590.000000
mean       15.902086
std        12.310565
min         0.000000
25%         5.000000
50%        13.000000
75%        25.000000
max        53.000000
Name: age, dtype: float64


5260

### 3. Management Group 'unknown' to 'other' category (revisit after the first regression model to decide whether this feature is valuable)

In [8]:
mgmt_group = testing['management_group']
# Fold the 'unknown' label into 'other'; every other label is carried over unchanged.
testing['management_group_new'] = mgmt_group.mask(mgmt_group == 'unknown', 'other')
# Row count per resulting label.
testing.groupby('management_group_new').size()

management_group_new
commercial      953
other           388
parastatal      461
user-group    13048
dtype: int64

### 4. quality_group 'salty', 'milky', 'colored', 'fluoride' to bad

In [9]:
quality = testing['quality_group']
poor_quality_labels = ['salty', 'milky', 'colored', 'fluoride']
# Collapse the individual poor-quality labels into a single 'bad' bucket.
testing['quality_group_new'] = quality.mask(quality.isin(poor_quality_labels), 'bad')
testing['quality_group_new'].value_counts()

good       12687
bad         1694
unknown      469
Name: quality_group_new, dtype: int64

### 5. waterpoint_type 'other','improved spring', 'cattle trough', 'dam' to 'other' (insufficient size)

In [10]:
waterpoint = testing['waterpoint_type']
small_waterpoint_labels = ['other', 'improved spring', 'cattle trough', 'dam']
# Merge the low-count waterpoint types into the existing 'other' label.
testing['waterpoint_type_new'] = waterpoint.mask(waterpoint.isin(small_waterpoint_labels), 'other')
testing['waterpoint_type_new'].value_counts()

communal standpipe             7106
hand pump                      4396
other                          1840
communal standpipe multiple    1508
Name: waterpoint_type_new, dtype: int64

### 6. Split date recorded to three columns (year_recorded, month_recorded, day_recorded) (Corresponding to original 7)

In [11]:
# Parse the recorded date once, then derive the year/month/day columns from it.
recorded_on = testing['date_recorded'].astype('datetime64[ns]')
testing['date_recorded'] = recorded_on
for date_part in ('year', 'month', 'day'):
    testing[f'{date_part}_recorded'] = getattr(recorded_on.dt, date_part)

# Preview the parsed date next to its derived parts.
date_preview_cols = ['date_recorded', 'year_recorded', 'month_recorded', 'day_recorded']
testing[date_preview_cols].head(5)

Unnamed: 0,date_recorded,year_recorded,month_recorded,day_recorded
0,2013-02-04,2013,2,4
1,2013-02-04,2013,2,4
2,2013-02-01,2013,2,1
3,2013-01-22,2013,1,22
4,2013-03-27,2013,3,27


### 7. create numerical encodings for more non-na category vars (basin, region, extraction (3 cols), management (2 cols), payment, quantity_group, source (3 cols), quality_group_new, waterpoint_type_new)

In [12]:
# Single LabelEncoder instance that the following cells re-fit for each column.
# NOTE(review): the encoder is fit on the test set itself, so the integer codes only
# line up with the training notebook if both sets contain the same category values
# for every encoded column — confirm, or reuse encoders fitted on the training data.
le = preprocessing.LabelEncoder()

#### basin

In [13]:
# Integer-code the basin labels (fit and transform in one call).
testing['basin_encoded'] = le.fit_transform(testing['basin'])

#### region

In [14]:
# Integer-code the region labels (fit and transform in one call).
testing['region_encoded'] = le.fit_transform(testing['region'])

#### extraction (3 cols)

In [15]:
# Encode each extraction column in turn; the shared encoder is re-fit per column.
for extraction_col in ('extraction_type', 'extraction_type_group', 'extraction_type_class'):
    testing[f'{extraction_col}_encoded'] = le.fit_transform(testing[extraction_col])

#### management (2 cols)

In [16]:
# Encode both management columns; the shared encoder is re-fit per column.
for management_col in ('management', 'management_group_new'):
    testing[f'{management_col}_encoded'] = le.fit_transform(testing[management_col])

#### payment

In [17]:
# Integer-code the payment labels (fit and transform in one call).
testing['payment_encoded'] = le.fit_transform(testing['payment'])

#### quantity_group

In [18]:
# Integer-code the quantity_group labels (fit and transform in one call).
testing['quantity_group_encoded'] = le.fit_transform(testing['quantity_group'])

#### source (3 cols)

In [19]:
# Encode each source column in turn; the shared encoder is re-fit per column.
for source_col in ('source', 'source_type', 'source_class'):
    testing[f'{source_col}_encoded'] = le.fit_transform(testing[source_col])

#### quality_group_new

In [20]:
# Integer-code the merged quality labels (fit and transform in one call).
testing['quality_group_new_encoded'] = le.fit_transform(testing['quality_group_new'])

#### waterpoint_type_new

In [21]:
# Integer-code the merged waterpoint labels (fit and transform in one call).
testing['waterpoint_type_new_encoded'] = le.fit_transform(testing['waterpoint_type_new'])

### 8. create numerical encodings for more na-containing category vars (public meeting, permit)

In [22]:
# Turn each True/False/missing flag into 1/0/NaN and store it as a categorical
# column named '<flag>_new'.
for flag_col in ('public_meeting', 'permit'):
    flag = testing[flag_col]
    numeric_col = f'{flag_col}_new'
    testing[numeric_col] = np.where(flag.eq(True), 1,
                                    np.where(flag.eq(False), 0, np.nan))
    testing[numeric_col] = testing[numeric_col].astype('category')

## Final testing set

In [23]:
# Columns left out of the final test frame.
cols_not_in_final = [
    'longitude', 'latitude', 'construction_year',
    'management_group', 'quality_group', 'waterpoint_type', 'date_recorded',
    'public_meeting', 'permit',
]
testing_final = testing.drop(columns=cols_not_in_final)
print(testing_final.shape)
testing_final.head(5)

(14850, 38)


Unnamed: 0,id,amount_tsh,gps_height,basin,region,population,extraction_type,extraction_type_group,extraction_type_class,management,...,management_group_new_encoded,payment_encoded,quantity_group_encoded,source_encoded,source_type_encoded,source_class_encoded,quality_group_new_encoded,waterpoint_type_new_encoded,public_meeting_new,permit_new
0,50785,0.0,1996,Internal,Manyara,321,other,other,other,parastatal,...,2,0,3,5,3,1,1,3,1.0,1.0
1,51630,0.0,1569,Pangani,Arusha,300,gravity,gravity,gravity,vwc,...,3,0,2,8,6,0,1,0,1.0,1.0
2,17168,0.0,1567,Internal,Singida,500,other,other,other,vwc,...,3,0,2,5,3,1,1,3,1.0,
3,45559,0.0,267,Ruvuma / Southern Coast,Lindi,250,other,other,other,vwc,...,3,6,0,7,5,0,1,3,,1.0
4,49871,500.0,1260,Ruvuma / Southern Coast,Ruvuma,60,gravity,gravity,gravity,water board,...,3,3,1,8,6,0,1,0,,1.0


In [25]:
# Output column order: each categorical column is followed by its encoded twin.
output_columns = [
    'id', 'amount_tsh',
    'year_recorded', 'month_recorded', 'day_recorded',
    'gps_height',
    'basin', 'basin_encoded',
    'region', 'region_encoded',
    'population', 'public_meeting_new', 'permit_new', 'age',
    'extraction_type', 'extraction_type_encoded',
    'extraction_type_group', 'extraction_type_group_encoded',
    'extraction_type_class', 'extraction_type_class_encoded',
    'management', 'management_encoded',
    'management_group_new', 'management_group_new_encoded',
    'payment', 'payment_encoded',
    'quantity_group', 'quantity_group_encoded',
    'quality_group_new', 'quality_group_new_encoded',
    'source', 'source_encoded',
    'source_type', 'source_type_encoded',
    'source_class', 'source_class_encoded',
    'waterpoint_type_new', 'waterpoint_type_new_encoded',
]
testing_final_out = testing_final[output_columns]

# Confirm the reordered frame's shape and column order.
print(testing_final_out.shape)
print(testing_final_out.columns)

(14850, 38)
Index(['id', 'amount_tsh', 'year_recorded', 'month_recorded', 'day_recorded',
       'gps_height', 'basin', 'basin_encoded', 'region', 'region_encoded',
       'population', 'public_meeting_new', 'permit_new', 'age',
       'extraction_type', 'extraction_type_encoded', 'extraction_type_group',
       'extraction_type_group_encoded', 'extraction_type_class',
       'extraction_type_class_encoded', 'management', 'management_encoded',
       'management_group_new', 'management_group_new_encoded', 'payment',
       'payment_encoded', 'quantity_group', 'quantity_group_encoded',
       'quality_group_new', 'quality_group_new_encoded', 'source',
       'source_encoded', 'source_type', 'source_type_encoded', 'source_class',
       'source_class_encoded', 'waterpoint_type_new',
       'waterpoint_type_new_encoded'],
      dtype='object')


### write out csv

In [18]:
# Destination directory for cleaned data, relative to the notebook's working directory.
path_out = '../../data/cleaned'
# Write the reordered test frame without the DataFrame index column.
testing_final_out.to_csv(f'{path_out}/testing_cleaned_v2.csv', index=False)