In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
%matplotlib inline

In [2]:
# Load the training features (parsing `date_recorded` as datetimes) and the training labels.
x = pd.read_csv('./Data/TrainingValues.csv', parse_dates=['date_recorded'])
y = pd.read_csv('./Data/TrainingLabels.csv')

In [3]:
# Show column dtypes and non-null counts for the training features.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     59400 non-null  int64         
 1   amount_tsh             59400 non-null  float64       
 2   date_recorded          59400 non-null  datetime64[ns]
 3   funder                 55765 non-null  object        
 4   gps_height             59400 non-null  int64         
 5   installer              55745 non-null  object        
 6   longitude              59400 non-null  float64       
 7   latitude               59400 non-null  float64       
 8   wpt_name               59400 non-null  object        
 9   num_private            59400 non-null  int64         
 10  basin                  59400 non-null  object        
 11  subvillage             59029 non-null  object        
 12  region                 59400 non-null  object        
 13  r

In [4]:
# Summary statistics for the numeric columns of the training features.
x.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.131768,317.650385,668.297239,34.077427,-5.706033,0.474141,15.297003,5.629747,179.909983,1300.652475
std,21453.128371,2997.574558,693.11635,6.567432,2.946019,12.23623,17.587406,9.633649,471.482176,951.620547
min,0.0,0.0,-90.0,0.0,-11.64944,0.0,1.0,0.0,0.0,0.0
25%,18519.75,0.0,0.0,33.090347,-8.540621,0.0,5.0,2.0,0.0,0.0
50%,37061.5,0.0,369.0,34.908743,-5.021597,0.0,12.0,3.0,25.0,1986.0
75%,55656.5,20.0,1319.25,37.178387,-3.326156,0.0,17.0,5.0,215.0,2004.0
max,74247.0,350000.0,2770.0,40.345193,-2e-08,1776.0,99.0,80.0,30500.0,2013.0


In [5]:
# List every column name in the training features.
x.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group'],
      dtype='object')

In [6]:
# Percentage of missing values per column of the training features.
percent_missing = x.isna().sum() * 100 / len(x)
missing_value_df = pd.DataFrame(
    {'column_name': x.columns, 'percent_missing': percent_missing}
)
missing_value_df

Unnamed: 0,column_name,percent_missing
id,id,0.0
amount_tsh,amount_tsh,0.0
date_recorded,date_recorded,0.0
funder,funder,6.119529
gps_height,gps_height,0.0
installer,installer,6.153199
longitude,longitude,0.0
latitude,latitude,0.0
wpt_name,wpt_name,0.0
num_private,num_private,0.0


In [7]:
# Summarise the recording dates as numeric values (mean, min, quartiles, max).
# NOTE(review): `datetime_is_numeric` appears to have been removed in pandas 2.0 —
# confirm the pandas version this notebook is pinned to before upgrading.
x.date_recorded.describe(datetime_is_numeric=True)

count                            59400
mean     2012-03-29 09:11:33.818181888
min                2002-10-14 00:00:00
25%                2011-04-01 00:00:00
50%                2012-10-10 00:00:00
75%                2013-02-09 00:00:00
max                2013-12-03 00:00:00
Name: date_recorded, dtype: object

In [8]:
def selecting_top(df, column, num):
    """Return the values of ``df[column]`` that occur strictly more than ``num`` times.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    column : str
        Name of the column whose values are counted.
    num : int
        Frequency threshold; only values with a count greater than this are kept.

    Returns
    -------
    numpy.ndarray
        The qualifying values, in the order produced by ``value_counts()``.
    """
    counts = df[column].value_counts()
    frequent = counts[counts > num]
    return frequent.index.values

In [9]:
def apply_top(df, column, values):
    """Collapse every entry of ``df[column]`` that is not in ``values`` to ``'Other'``.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    column : str
        Name of the column to rewrite.
    values : list-like
        Values to keep unchanged. Anything else, including missing entries,
        is replaced by the string ``'Other'``.
    """
    # A vectorised membership test replaces a per-row Python ``x in values``
    # check, which scanned the whole array of kept values for every row.
    keep = df[column].isin(values)
    df[column] = df[column].where(keep, 'Other')

In [10]:
def select_apply(df, column, num):
    """Keep the frequent values of ``df[column]`` and relabel the rest, in place.

    Values occurring more than ``num`` times are found with ``selecting_top``;
    ``apply_top`` then replaces every other entry with ``'Other'``.
    """
    apply_top(df, column, selecting_top(df, column, num))

In [11]:
def filling_na(x, column):
    """Replace missing entries of ``x[column]`` with the string ``'Unknown'``, in place.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to modify.
    column : str
        Name of the column whose missing values are filled.
    """
    filled = x[column].fillna(value='Unknown')
    x[column] = filled

In [12]:
def day_update(x, reference_date=dt.datetime(2013, 12, 3)):
    """Add a ``days_since_recorded`` column to ``x``, in place.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a datetime ``date_recorded`` column.
    reference_date : datetime.datetime, optional
        Date the elapsed days are measured from. The default, 2013-12-03,
        is the latest ``date_recorded`` in the training data, so existing
        callers get the same values as before.
    """
    elapsed = reference_date - x['date_recorded']
    # Whole days only; the integer cast fails if any date is missing.
    x['days_since_recorded'] = elapsed.dt.days.astype(int)

In [13]:
def year_update(x):
    """Bin ``x['construction_year']`` into decade labels, in place.

    Years are cut into left-closed intervals: everything below 1960
    (including the placeholder 0) becomes ``'Unknown'``, then ``'60s'``
    through ``'10s'`` for 1960-2019. Years of 2020 or later fall outside
    the bins and become missing.

    The function does nothing when the column is no longer numeric, so
    calling it a second time on an already-binned frame is harmless.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a ``construction_year`` column.
    """
    # Check the dtype rather than peeking at the element labelled 2: that
    # lookup raises KeyError when the index has no label 2 (for example a
    # frame with fewer than three rows, or one that has been filtered).
    if not pd.api.types.is_numeric_dtype(x['construction_year']):
        return

    bins = [-1, 1960, 1970, 1980, 1990, 2000, 2010, 2020]
    labels = ['Unknown', '60s', '70s', '80s', '90s', '00s', '10s']
    x['construction_year'] = pd.cut(x['construction_year'],
                                    bins=bins,
                                    labels=labels,
                                    right=False)

In [14]:
# Add `days_since_recorded` to the training frame and check its distribution.
day_update(x)
x['days_since_recorded'].describe()

count    59400.000000
mean       613.616970
std        334.216374
min          0.000000
25%        297.000000
50%        419.000000
75%        977.000000
max       4068.000000
Name: days_since_recorded, dtype: float64

In [15]:
# Keep funders that appear more than 300 times; every other entry becomes 'Other'.
select_apply(x, 'funder', 300)

In [16]:
# Count the gps_height values, then remove the column from the training frame.
x.gps_height.value_counts()
x.drop(columns=['gps_height'], inplace=True)

In [17]:
# Frequency of each installer name.
x.installer.value_counts()

DWE                        17402
Government                  1825
RWE                         1206
Commu                       1060
DANIDA                      1050
                           ...  
DIMON                          1
TANEDAPS Society               1
LWI &CENTRAL GOVERNMENT        1
MOSQUE                         1
MP Mloka                       1
Name: installer, Length: 2145, dtype: int64

In [18]:
# Keep installers that appear more than 200 times; every other entry becomes 'Other'.
select_apply(x, 'installer', 200)

In [19]:
# Remove six columns from the training frame in one call.
x.drop(
    columns=['latitude', 'longitude', 'wpt_name',
             'num_private', 'region_code', 'district_code'],
    inplace=True,
)

In [20]:
# Frequency of each basin.
x.basin.value_counts()

Lake Victoria              10248
Pangani                     8940
Rufiji                      7976
Internal                    7785
Lake Tanganyika             6432
Wami / Ruvu                 5987
Lake Nyasa                  5085
Ruvuma / Southern Coast     4493
Lake Rukwa                  2454
Name: basin, dtype: int64

In [21]:
# NOTE(review): this value_counts() result is not displayed because it is not
# the last expression of the cell.
x.subvillage.value_counts()
# Replace missing subvillage entries with 'Unknown'.
filling_na(x, 'subvillage')

In [22]:
# Frequency of each region.
x.region.value_counts()

Iringa           5294
Shinyanga        4982
Mbeya            4639
Kilimanjaro      4379
Morogoro         4006
Arusha           3350
Kagera           3316
Mwanza           3102
Kigoma           2816
Ruvuma           2640
Pwani            2635
Tanga            2547
Dodoma           2201
Singida          2093
Mara             1969
Tabora           1959
Rukwa            1808
Mtwara           1730
Manyara          1583
Lindi            1546
Dar es Salaam     805
Name: region, dtype: int64

In [23]:
# Frequency of each ward.
x.ward.value_counts()

Igosi        307
Imalinyi     252
Siha Kati    232
Mdandu       231
Nduruma      217
            ... 
Themi          1
Sungwisi       1
Thawi          1
Kitete         1
Simbay         1
Name: ward, Length: 2092, dtype: int64

In [24]:
# Replace missing public_meeting entries with 'Unknown'.
filling_na(x, 'public_meeting')

In [25]:
# Frequency of each scheme_management value.
x.scheme_management.value_counts()

VWC                 36793
WUG                  5206
Water authority      3153
WUA                  2883
Water Board          2748
Parastatal           1680
Private operator     1063
Company              1061
Other                 766
SWC                    97
Trust                  72
None                    1
Name: scheme_management, dtype: int64

In [26]:
# Keep scheme_management values that appear more than 100 times; the rest become 'Other'.
select_apply(x, 'scheme_management', 100)

In [27]:
# Replace missing permit entries with 'Unknown'.
filling_na(x, 'permit')

In [28]:
# Frequency of each scheme_name value.
x.scheme_name.value_counts()

K                              682
None                           644
Borehole                       546
Chalinze wate                  405
M                              400
                              ... 
Chusuro water sup                1
Maswa Water supplier supply      1
water supply Katungulu           1
Iwil                             1
Mfinga Water Supply              1
Name: scheme_name, Length: 2696, dtype: int64

In [29]:
# Drop scheme_name from the training frame (the counts above show 2,696 distinct values).
x.drop(['scheme_name'], axis = 1, inplace=True)

In [30]:
# Five most frequent construction_year values.
x.construction_year.value_counts().head()

0       20709
2010     2645
2008     2613
2009     2533
2000     2091
Name: construction_year, dtype: int64

In [31]:
# Bin construction_year into decade labels and show the resulting counts.
year_update(x)
x.construction_year.value_counts()

Unknown    20709
00s        15330
90s         7678
80s         5578
10s         5161
70s         4406
60s          538
Name: construction_year, dtype: int64

In [32]:
# Drop eight categorical columns from the training frame in a single call.
x.drop(
    columns=[
        'extraction_type_group', 'extraction_type_class',
        'management', 'management_group',
        'payment_type',
        'quality_group', 'quantity_group', 'source_type',
    ],
    inplace=True,
)

In [33]:
# Join the labels onto the cleaned features by `id`, then discard the key column.
train = x.merge(y, on='id').drop(columns='id')

In [34]:
# Write the cleaned training set to disk without the index column.
train.to_csv('./Data/TrainCleaned.csv', index = False)

In [1]:
# Clean the test features with the same steps used for the training frame `x`.
test = pd.read_csv('./Data/TestValues.csv',parse_dates=['date_recorded'])

test = test.drop(['gps_height','latitude', 'longitude', 'wpt_name','num_private',
                  'region_code', 'district_code','scheme_name','extraction_type_group', 
                  'extraction_type_class','management', 'management_group',
                  'payment_type','quality_group', 'quantity_group', 'source_type','id'],axis = 1)
day_update(test)
# Reuse the category levels already fixed on the training frame instead of
# re-selecting them from the test set. The count thresholds (300 / 100 / 200)
# were chosen for the 59,400 training rows; applied to the 14,850 test rows
# they keep a different set of values, so train and test would not share the
# same categories. `x[column]` already holds only the kept values plus 'Other'.
for column in ('funder', 'scheme_management', 'installer'):
    apply_top(test, column, x[column].unique())
year_update(test)
filling_na(test, 'public_meeting')
filling_na(test, 'permit')
filling_na(test, 'subvillage')

test.to_csv('./Data/TestValuesCleaned.csv', index = False)

NameError: name 'pd' is not defined

In [36]:
# Rows and columns of the cleaned test set.
test.shape

(14850, 24)

In [37]:
# Rows and columns of the cleaned training set (features plus label).
train.shape

(59400, 25)

In [38]:
# Percentage of missing values per column of the cleaned training set.
percent_missing = train.isna().sum() * 100 / len(train)
missing_value_df = pd.DataFrame(
    {'column_name': train.columns, 'percent_missing': percent_missing}
)
missing_value_df

Unnamed: 0,column_name,percent_missing
amount_tsh,amount_tsh,0.0
date_recorded,date_recorded,0.0
funder,funder,0.0
installer,installer,0.0
basin,basin,0.0
subvillage,subvillage,0.0
region,region,0.0
lga,lga,0.0
ward,ward,0.0
population,population,0.0
