In [88]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix, precision_score, recall_score, precision_recall_curve, f1_score

## Define the data path - Customize for your own data

In [42]:
PATH = "/Users/iremn/PythonClass/Spiced/gradient_garlic-code_Work/Week_02/Day_5"
FEATURES = "pumps.csv"
TARGET = "pumps_y.csv"

In [43]:
#Join
FEATURES = os.path.join(PATH, FEATURES)
TARGET = os.path.join(PATH, TARGET)

## Read the Pumps Data

In [44]:
pumps_f = pd.read_csv(FEATURES, index_col = 0)
pumps_y = pd.read_csv(TARGET, index_col = 0)


## Train_Test Split

In [45]:
X_train, X_test, y_train, y_test = train_test_split(pumps_f, pumps_y, test_size = 0.2, random_state=42)

## Data Peek

In [46]:
X_train.head()

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
454,50.0,2013-02-27,Dmdd,2092,DMDD,35.42602,-4.227446,Narmo,0,Internal,...,per bucket,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
510,0.0,2011-03-17,Cmsr,0,Gove,35.510074,-5.724555,Lukali,0,Internal,...,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
14146,0.0,2011-07-10,Kkkt,0,KKKT,32.499866,-9.081222,Mahakama,0,Lake Rukwa,...,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,other,other
47410,0.0,2011-04-12,,0,,34.060484,-8.830208,Shule Ya Msingi Chosi A,0,Rufiji,...,monthly,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe
1288,300.0,2011-04-05,Ki,1023,Ki,37.03269,-6.040787,Kwa Mjowe,0,Wami / Ruvu,...,on failure,salty,salty,enough,enough,shallow well,shallow well,groundwater,other,other


In [47]:
X_train.shape

(47520, 39)

In [48]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47520 entries, 454 to 23812
Data columns (total 39 columns):
amount_tsh               47520 non-null float64
date_recorded            47520 non-null object
funder                   44644 non-null object
gps_height               47520 non-null int64
installer                44631 non-null object
longitude                47520 non-null float64
latitude                 47520 non-null float64
wpt_name                 47520 non-null object
num_private              47520 non-null int64
basin                    47520 non-null object
subvillage               47224 non-null object
region                   47520 non-null object
region_code              47520 non-null int64
district_code            47520 non-null int64
lga                      47520 non-null object
ward                     47520 non-null object
population               47520 non-null int64
public_meeting           44831 non-null object
recorded_by              47520 non-null objec

In [49]:
#Check the unique vals within those columns
X_train.nunique()

amount_tsh                  96
date_recorded              351
funder                    1698
gps_height                2401
installer                 1923
longitude                46043
latitude                 46044
wpt_name                 30742
num_private                 59
basin                        9
subvillage               17232
region                      21
region_code                 27
district_code               20
lga                        125
ward                      2076
population                 971
public_meeting               2
recorded_by                  1
scheme_management           12
scheme_name               2541
permit                       2
construction_year           55
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_group                6
quantity

* Seems like there are some redundant columns such as "quantity" and "quantity_group" and so on. One can drop those columns
* Split the categorical and numerical features
* Do EDA, wrangle data but don't forget to do all with test data too

In [50]:
pumps_num = X_train.select_dtypes(exclude ='object') 
pumps_cat = X_train.select_dtypes(include = 'object')

In [51]:
pumps_num.head()

Unnamed: 0_level_0,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
454,50.0,2092,35.42602,-4.227446,0,21,1,160,1998
510,0.0,0,35.510074,-5.724555,0,1,6,0,0
14146,0.0,0,32.499866,-9.081222,0,12,6,0,0
47410,0.0,0,34.060484,-8.830208,0,12,7,0,0
1288,300.0,1023,37.03269,-6.040787,0,5,1,120,1997


In [52]:
pumps_cat.head()

Unnamed: 0_level_0,date_recorded,funder,installer,wpt_name,basin,subvillage,region,lga,ward,public_meeting,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
454,2013-02-27,Dmdd,DMDD,Narmo,Internal,Bashnet Kati,Manyara,Babati,Bashinet,True,...,per bucket,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
510,2011-03-17,Cmsr,Gove,Lukali,Internal,Lukali,Dodoma,Bahi,Lamaiti,True,...,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
14146,2011-07-10,Kkkt,KKKT,Mahakama,Lake Rukwa,Chawalikozi,Mbeya,Mbozi,Ndalambo,True,...,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,other,other
47410,2011-04-12,,,Shule Ya Msingi Chosi A,Rufiji,Shuleni,Mbeya,Mbarali,Chimala,True,...,monthly,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe
1288,2011-04-05,Ki,Ki,Kwa Mjowe,Wami / Ruvu,Ngholong,Morogoro,Kilosa,Chakwale,True,...,on failure,salty,salty,enough,enough,shallow well,shallow well,groundwater,other,other


In [53]:
pumps_num['date_recorded'] = pumps_cat['date_recorded']
pumps_cat = pumps_cat.drop('date_recorded', axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


### Work on the categorical columns

In [54]:
pumps_cat.head()

Unnamed: 0_level_0,funder,installer,wpt_name,basin,subvillage,region,lga,ward,public_meeting,recorded_by,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
454,Dmdd,DMDD,Narmo,Internal,Bashnet Kati,Manyara,Babati,Bashinet,True,GeoData Consultants Ltd,...,per bucket,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
510,Cmsr,Gove,Lukali,Internal,Lukali,Dodoma,Bahi,Lamaiti,True,GeoData Consultants Ltd,...,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump
14146,Kkkt,KKKT,Mahakama,Lake Rukwa,Chawalikozi,Mbeya,Mbozi,Ndalambo,True,GeoData Consultants Ltd,...,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,other,other
47410,,,Shule Ya Msingi Chosi A,Rufiji,Shuleni,Mbeya,Mbarali,Chimala,True,GeoData Consultants Ltd,...,monthly,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe
1288,Ki,Ki,Kwa Mjowe,Wami / Ruvu,Ngholong,Morogoro,Kilosa,Chakwale,True,GeoData Consultants Ltd,...,on failure,salty,salty,enough,enough,shallow well,shallow well,groundwater,other,other


In [55]:
pumps_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47520 entries, 454 to 23812
Data columns (total 29 columns):
funder                   44644 non-null object
installer                44631 non-null object
wpt_name                 47520 non-null object
basin                    47520 non-null object
subvillage               47224 non-null object
region                   47520 non-null object
lga                      47520 non-null object
ward                     47520 non-null object
public_meeting           44831 non-null object
recorded_by              47520 non-null object
scheme_management        44418 non-null object
scheme_name              24997 non-null object
permit                   45081 non-null object
extraction_type          47520 non-null object
extraction_type_group    47520 non-null object
extraction_type_class    47520 non-null object
management               47520 non-null object
management_group         47520 non-null object
payment                  47520 non-null obj

In [56]:
pumps_cat.nunique()

funder                    1698
installer                 1923
wpt_name                 30742
basin                        9
subvillage               17232
region                      21
lga                        125
ward                      2076
public_meeting               2
recorded_by                  1
scheme_management           12
scheme_name               2541
permit                       2
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_group                6
quantity                     5
quantity_group               5
source                      10
source_type                  7
source_class                 3
waterpoint_type              7
waterpoint_type_group        6
dtype: int64

Looks like "recorded by" has a single value - drop it

In [57]:
pumps_cat = pumps_cat.drop('recorded_by', axis=1)

### Fill cat NaNs with "not available"

In [69]:
pumps_cat = pumps_cat.fillna('not available')

In [74]:
pumps_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47520 entries, 454 to 23812
Data columns (total 28 columns):
funder                   47520 non-null object
installer                47520 non-null object
wpt_name                 47520 non-null object
basin                    47520 non-null object
subvillage               47520 non-null object
region                   47520 non-null object
lga                      47520 non-null object
ward                     47520 non-null object
public_meeting           47520 non-null object
scheme_management        47520 non-null object
scheme_name              47520 non-null object
permit                   47520 non-null object
extraction_type          47520 non-null object
extraction_type_group    47520 non-null object
extraction_type_class    47520 non-null object
management               47520 non-null object
management_group         47520 non-null object
payment                  47520 non-null object
payment_type             47520 non-null obj

In [75]:
pumps_cat.nunique()

funder                    1699
installer                 1924
wpt_name                 30742
basin                        9
subvillage               17233
region                      21
lga                        125
ward                      2076
public_meeting               3
scheme_management           13
scheme_name               2542
permit                       3
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_group                6
quantity                     5
quantity_group               5
source                      10
source_type                  7
source_class                 3
waterpoint_type              7
waterpoint_type_group        6
dtype: int64

### Remove Unnecessary Redundant Cols

In [76]:
#Remove some categorical vars
garbage_cat = ['extraction_type_group','extraction_type_class',
               'waterpoint_type_group','source_type',
              'payment_type','quality_group','quantity_group']

In [77]:
pumps_cat = pumps_cat.drop(columns=garbage_cat)

In [78]:
pumps_cat.columns

Index(['funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region',
       'lga', 'ward', 'public_meeting', 'scheme_management', 'scheme_name',
       'permit', 'extraction_type', 'management', 'management_group',
       'payment', 'water_quality', 'quantity', 'source', 'source_class',
       'waterpoint_type'],
      dtype='object')

## Handle Numerical data

In [80]:
pumps_num.head()

Unnamed: 0_level_0,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,date_recorded
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
454,50.0,2092,35.42602,-4.227446,0,21,1,160,1998,2013-02-27
510,0.0,0,35.510074,-5.724555,0,1,6,0,0,2011-03-17
14146,0.0,0,32.499866,-9.081222,0,12,6,0,0,2011-07-10
47410,0.0,0,34.060484,-8.830208,0,12,7,0,0,2011-04-12
1288,300.0,1023,37.03269,-6.040787,0,5,1,120,1997,2011-04-05


In [81]:
pumps_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47520 entries, 454 to 23812
Data columns (total 10 columns):
amount_tsh           47520 non-null float64
gps_height           47520 non-null int64
longitude            47520 non-null float64
latitude             47520 non-null float64
num_private          47520 non-null int64
region_code          47520 non-null int64
district_code        47520 non-null int64
population           47520 non-null int64
construction_year    47520 non-null int64
date_recorded        47520 non-null object
dtypes: float64(3), int64(6), object(1)
memory usage: 4.0+ MB


In [83]:
#Convert date recorded to datetime
pumps_num['recorded'] = pd.to_datetime(pumps_num['date_recorded'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [85]:
pumps_num['recorded_year'] = pumps_num['recorded'].dt.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [86]:
pumps_num.head()

Unnamed: 0_level_0,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,date_recorded,recorded,recorded_year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
454,50.0,2092,35.42602,-4.227446,0,21,1,160,1998,2013-02-27,2013-02-27,2013
510,0.0,0,35.510074,-5.724555,0,1,6,0,0,2011-03-17,2011-03-17,2011
14146,0.0,0,32.499866,-9.081222,0,12,6,0,0,2011-07-10,2011-07-10,2011
47410,0.0,0,34.060484,-8.830208,0,12,7,0,0,2011-04-12,2011-04-12,2011
1288,300.0,1023,37.03269,-6.040787,0,5,1,120,1997,2011-04-05,2011-04-05,2011


In [93]:
#Impute mean construction_year
mean_imputer = SimpleImputer(missing_values=0, strategy='mean')
pumps_num['construction_year_imp'] = mean_imputer.fit(pumps_num['construction_year'].values)

ValueError: Expected 2D array, got 1D array instead:
array=[1998.    0.    0. ... 2005.    0. 1976.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [68]:
#Check if the target variables are balanced - ADDRESS this in the end but DO NOT Apply to TEST
y_train['status_group'].value_counts()

functional                 25802
non functional             18252
functional needs repair     3466
Name: status_group, dtype: int64