In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Notebook display settings: cap rendered rows at 35 and never truncate columns.
pd.set_option('display.max_rows', 35)
pd.set_option('display.max_columns', None)

In [2]:
# Load the water-point table and discard the `id` column.
df = pd.read_csv('WaterData/WaterUpdated.csv').drop(columns='id')

# Binarise the label: 0 = 'functional', 1 = 'non functional' or
# 'functional needs repair'.
# The result is assigned back to the frame rather than calling
# `df.target.replace(..., inplace=True)`: that form mutates a Series obtained
# from the frame (chained assignment), which raises a FutureWarning in
# pandas 2.x and leaves `df` untouched once Copy-on-Write is enabled.
df['target'] = df['target'].replace(
    {'functional': 0, 'non functional': 1, 'functional needs repair': 1}
)
df.head(2)

Unnamed: 0,amount_tsh,gps_height,installer,basin,region,lga,ward,population,public_meeting,permit,extraction_type,management,payment,water_quality,quantity,source,waterpoint_type,target,time_passed
0,6000.0,1390,Roman,Lake Nyasa,Iringa,Ludewa,Mundindi,109.0,1,0,gravity,vwc,pay annually,soft,enough,spring,communal standpipe,0,12
1,0.0,1399,GRUMETI,Lake Victoria,Mara,Serengeti,Natta,280.0,0,1,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,0,3


In [11]:
def _dump_pickle(obj, path):
    """Pickle `obj` to `path`, closing the file handle when done.

    The `with` block guarantees the handle is flushed and released even if
    serialisation fails, unlike a bare `pickle.dump(obj, open(path, 'wb'))`.
    """
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


# --- One-hot encode the categorical predictors ------------------------------
# Source column -> prefix given to its dummy columns.
dummy_prefixes = {
    'basin': 'basin',
    'extraction_type': 'extract',
    'payment': 'payment',
    'quantity': 'quantity',
    'water_quality': 'quality',
    'source': 'source',
    'waterpoint_type': 'waterpoint',
}
new_df = df
for column, prefix in dummy_prefixes.items():
    new_df = new_df.join(pd.get_dummies(df[column], prefix=prefix))

# Dummy columns kept as features. 'unknown' (and 'other' for payment) levels
# are left out for the columns filtered below.
unique_basin = [f'basin_{i}' for i in df.basin.unique()]
unique_extract = [f'extract_{i}' for i in df.extraction_type.unique()]
unique_waterpoint = [f'waterpoint_{i}' for i in df.waterpoint_type.unique()]
unique_source = [f'source_{i}' for i in df.source.unique() if i != 'unknown']
unique_quality = [f'quality_{i}' for i in df.water_quality.unique() if i != 'unknown']
unique_quantity = [f'quantity_{i}' for i in df.quantity.unique() if i != 'unknown']
unique_payment = [f'payment_{i}' for i in df.payment.unique() if i not in ['other', 'unknown']]

# --- Assemble the modelling frame: numeric columns, label, then dummies -----
col = ['amount_tsh', 'gps_height', 'population', 'permit', 'time_passed', 'target']
col = col + unique_basin + unique_extract + unique_waterpoint + unique_source + unique_quality + unique_quantity + unique_payment
new_df = new_df[col]
func_df = new_df[new_df.target == 0]
repair_df = new_df[new_df.target == 1]

print(len(func_df), len(repair_df))

# Draw the repair rows WITHOUT replacement. With n_samples == len(repair_df)
# the default replace=True is a bootstrap: it leaves the class balance
# unchanged but drops some rows and duplicates others, and those duplicates
# can then land on both sides of the train/test split below (leakage).
# replace=False keeps every repair row exactly once, in shuffled order.
# NOTE(review): if up-sampling the minority class is actually wanted, apply it
# to the training rows only, after train_test_split.
resamp_repair = resample(repair_df, replace=False, n_samples=len(repair_df), random_state=10)

resampled_concat = pd.concat([func_df, resamp_repair])
# NOTE(review): X still carries the `target` column (it is used for
# stratification below), so consumers of pickles/X.p must drop it before
# fitting a model.
X = resampled_concat
y = resampled_concat[['target']]

print(len(X), len(new_df))

_dump_pickle(X, 'pickles/X.p')
_dump_pickle(y, 'pickles/y.p')

# --- Stratified 85/15 split; the label is removed from the feature frames ---
x_train, x_test, y_train, y_test = train_test_split(
    X, y, stratify=X.target, random_state=10, train_size=.85
)
print(y_train.target.value_counts(), y_test.target.value_counts())
x_train = x_train.drop('target', axis=1)
x_test = x_test.drop('target', axis=1)

_dump_pickle(y_test, 'pickles/y_test.p')
_dump_pickle(y_train, 'pickles/y_train.p')
_dump_pickle(x_train, 'pickles/x_train.p')
_dump_pickle(x_test, 'pickles/x_test.p')

32248 27121
59369 59369
0    27410
1    23053
Name: target, dtype: int64 0    4838
1    4068
Name: target, dtype: int64
