- Do the random forest on the selected categorical data and try variations

## Notes of what I have done in the prev version ##


In [294]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix, roc_auc_score, roc_curve

In [295]:
#Read two data sets and put them into two different DFs
df_p = pd.read_csv('pumps.csv', index_col = 0)
df_py = pd.read_csv('pumps_y.csv', index_col = 0)

In [296]:
#Check shape
df_p.shape, df_py.shape

((59400, 39), (59400, 1))

In [297]:
#Merging pumps_y as a new column on pumps
df_p['status_group'] = df_py['status_group']

In [298]:
#Do train/test split
Xtrain, Xtest, ytrain, ytest = train_test_split(df_p.loc[:,'amount_tsh':'waterpoint_type_group'], df_p.loc[:,'status_group'], test_size = 0.2, random_state = 42)

# This is the point where Feature Engineering Starts. 
# After model building, replace Xtrain by Xtest and so on for y

In [299]:
#Check the sizes
Xtrain.shape, ytrain.shape

((47520, 39), (47520,))

In [300]:
#Merge the training data back together
df_p = pd.concat([Xtrain, ytrain], axis = 1)

In [301]:
df_p.head(20)

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
454,50.0,2013-02-27,Dmdd,2092,DMDD,35.42602,-4.227446,Narmo,0,Internal,...,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,functional
510,0.0,2011-03-17,Cmsr,0,Gove,35.510074,-5.724555,Lukali,0,Internal,...,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional
14146,0.0,2011-07-10,Kkkt,0,KKKT,32.499866,-9.081222,Mahakama,0,Lake Rukwa,...,soft,good,enough,enough,shallow well,shallow well,groundwater,other,other,non functional
47410,0.0,2011-04-12,,0,,34.060484,-8.830208,Shule Ya Msingi Chosi A,0,Rufiji,...,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe,non functional
1288,300.0,2011-04-05,Ki,1023,Ki,37.03269,-6.040787,Kwa Mjowe,0,Wami / Ruvu,...,salty,salty,enough,enough,shallow well,shallow well,groundwater,other,other,non functional
13095,0.0,2011-08-08,Hesawa,0,DWE,33.509112,-2.648505,Kwa Mudaba,0,Lake Victoria,...,salty,salty,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional
558,0.0,2013-03-01,World Vision,0,World vision,33.731347,-3.284633,Mwamagulya,0,Internal,...,soft,good,seasonal,seasonal,shallow well,shallow well,groundwater,hand pump,hand pump,functional
35626,0.0,2011-03-21,Selous G,298,Selous G,36.864072,-7.935517,Kwamligo,0,Rufiji,...,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,functional
8696,0.0,2011-08-02,Government Of Tanzania,0,Government,33.423658,-2.606991,Kwa Nuhu,0,Lake Victoria,...,soft,good,enough,enough,machine dbh,borehole,groundwater,communal standpipe,communal standpipe,non functional
48650,0.0,2013-01-22,Government Of Tanzania,1141,DWE,30.381136,-4.640729,Msebei,0,Lake Tanganyika,...,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional


In [302]:
#Replace "functional needs repair" by "non functional"
#The rationale behind it is simply we don't want a bad shape pump go under the radar

df_p['status_group'] = df_p['status_group'].str.replace('functional needs repair', 'non functional')
df_p['status_group'].value_counts()

functional        25802
non functional    21718
Name: status_group, dtype: int64

In [303]:
#Missing Values
#Notice when you split train/test this has changed
df_p.isnull().sum() # number of missing values 

amount_tsh                   0
date_recorded                0
funder                    2876
gps_height                   0
installer                 2889
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 296
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            2689
recorded_by                  0
scheme_management         3102
scheme_name              22523
permit                    2439
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity

In [304]:
#Remove scheme_name and date_recorded because 
#scheme_name is mostly empty and date_recorded is something that cant be correlated

df_p = df_p.drop(['scheme_name', 'date_recorded'], axis=1)

In [305]:
#Divide df into numeric and categorical
#Numeric df
#Get rid of num_private

df_num = df_p[['amount_tsh', 'gps_height', 'longitude', 'latitude', 'region_code', 'district_code', 'population', 'construction_year']]

In [306]:
#Categorical df

df_cat = df_p[['funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region', 'lga', 'ward', 'public_meeting', 'recorded_by', 'scheme_management', 'permit', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group', 'status_group']]

In [307]:
#Imputation with mean, median and most frequent

#Construction_Year
#year_to_replace_with = df_num['construction_year'].value_counts()
#year_to_replace_with.index[1]
#df_num['construction_year'].replace(0, year_to_replace_with.index[1], inplace = True)

#Population
#population_to_replace_with = df_num['population'].median()
#df_num['population'].replace(0, population_to_replace_with, inplace = True)

#amount_tsh
#amount_to_replace_with = round(df_num['amount_tsh'].mean())
#df_num['amount_tsh'].replace(0, amount_to_replace_with, inplace = True)

In [308]:
#df_num

In [309]:
#df_num.shape, df_cat.shape

In [310]:
df_cat.nunique()

funder                    1698
installer                 1923
wpt_name                 30742
basin                        9
subvillage               17232
region                      21
lga                        125
ward                      2076
public_meeting               2
recorded_by                  1
scheme_management           12
permit                       2
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_group                6
quantity                     5
quantity_group               5
source                      10
source_type                  7
source_class                 3
waterpoint_type              7
waterpoint_type_group        6
status_group                 2
dtype: int64

In [311]:
#Fill all NaNs with 'no data'
df_cat_fillna = df_cat.fillna('not available')

In [312]:
df_cat_fillna.head(20)

Unnamed: 0_level_0,funder,installer,wpt_name,basin,subvillage,region,lga,ward,public_meeting,recorded_by,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
454,Dmdd,DMDD,Narmo,Internal,Bashnet Kati,Manyara,Babati,Bashinet,True,GeoData Consultants Ltd,...,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,functional
510,Cmsr,Gove,Lukali,Internal,Lukali,Dodoma,Bahi,Lamaiti,True,GeoData Consultants Ltd,...,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional
14146,Kkkt,KKKT,Mahakama,Lake Rukwa,Chawalikozi,Mbeya,Mbozi,Ndalambo,True,GeoData Consultants Ltd,...,soft,good,enough,enough,shallow well,shallow well,groundwater,other,other,non functional
47410,not available,not available,Shule Ya Msingi Chosi A,Rufiji,Shuleni,Mbeya,Mbarali,Chimala,True,GeoData Consultants Ltd,...,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe,non functional
1288,Ki,Ki,Kwa Mjowe,Wami / Ruvu,Ngholong,Morogoro,Kilosa,Chakwale,True,GeoData Consultants Ltd,...,salty,salty,enough,enough,shallow well,shallow well,groundwater,other,other,non functional
13095,Hesawa,DWE,Kwa Mudaba,Lake Victoria,Lumeji,Mwanza,Magu,Sukuma,True,GeoData Consultants Ltd,...,salty,salty,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional
558,World Vision,World vision,Mwamagulya,Internal,Ngomeni,Shinyanga,Maswa,Busilili,True,GeoData Consultants Ltd,...,soft,good,seasonal,seasonal,shallow well,shallow well,groundwater,hand pump,hand pump,functional
35626,Selous G,Selous G,Kwamligo,Rufiji,Namisatu,Morogoro,Kilombero,Kiberege,True,GeoData Consultants Ltd,...,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,functional
8696,Government Of Tanzania,Government,Kwa Nuhu,Lake Victoria,Nyamiselya,Mwanza,Magu,Nyigogo,True,GeoData Consultants Ltd,...,soft,good,enough,enough,machine dbh,borehole,groundwater,communal standpipe,communal standpipe,non functional
48650,Government Of Tanzania,DWE,Msebei,Lake Tanganyika,Msebei,Kigoma,Kasulu,Ruhita,True,GeoData Consultants Ltd,...,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional


In [313]:
#Check NaNs in here
#Check the NaNs - num
df_cat_fillna.isnull().sum()

funder                   0
installer                0
wpt_name                 0
basin                    0
subvillage               0
region                   0
lga                      0
ward                     0
public_meeting           0
recorded_by              0
scheme_management        0
permit                   0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_type              0
source_class             0
waterpoint_type          0
waterpoint_type_group    0
status_group             0
dtype: int64

In [314]:
#Dummify the target 
dummy_target_var = pd.get_dummies(df_cat_fillna['status_group'])

In [315]:
#Check the dummy
dummy_target_var.head()

Unnamed: 0_level_0,functional,non functional
id,Unnamed: 1_level_1,Unnamed: 2_level_1
454,1,0
510,1,0
14146,0,1
47410,0,1
1288,0,1


In [316]:
#Concat original cat df and dummified target

df_cat_fillna_dummy_target = pd.concat([df_cat_fillna, dummy_target_var], axis = 1)
df_cat_fillna_dummy_target.head(3)

Unnamed: 0_level_0,funder,installer,wpt_name,basin,subvillage,region,lga,ward,public_meeting,recorded_by,...,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,functional,non functional
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
454,Dmdd,DMDD,Narmo,Internal,Bashnet Kati,Manyara,Babati,Bashinet,True,GeoData Consultants Ltd,...,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,functional,1,0
510,Cmsr,Gove,Lukali,Internal,Lukali,Dodoma,Bahi,Lamaiti,True,GeoData Consultants Ltd,...,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional,1,0
14146,Kkkt,KKKT,Mahakama,Lake Rukwa,Chawalikozi,Mbeya,Mbozi,Ndalambo,True,GeoData Consultants Ltd,...,enough,enough,shallow well,shallow well,groundwater,other,other,non functional,0,1


In [317]:
#Dummify everything
column_list_to_dummify = ['basin', 'public_meeting', 'recorded_by', 'permit', 'extraction_type', 'management', 'payment', 'water_quality', 'quantity', 'source', 'waterpoint_type']


#Dummify everything
dummy_cat_var = pd.get_dummies(df_cat_fillna[column_list_to_dummify], drop_first = True )
dummy_cat_var.head()

Unnamed: 0_level_0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,public_meeting_True,public_meeting_not available,...,source_river,source_shallow well,source_spring,source_unknown,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
454,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,1,0,0,0,0,0
510,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
14146,0,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
47410,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
1288,0,0,0,0,0,0,0,1,1,0,...,0,1,0,0,0,0,0,0,0,1


In [251]:
#Target Encoding a Subset of columns

#all_cols = ['funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region', 'lga', 'ward', 'public_meeting', 'recorded_by', 'scheme_management', 'permit', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group', 'status_group']

#column_list_to_target_encode = ['basin', 'public_meeting', 'recorded_by', 'permit', 'extraction_type', 'management', 'payment_type', 'water_quality', 'quantity_group', 'source', 'waterpoint_type']
#for column in column_list_to_target_encode:
#    target_means = df_cat_fillna_dummy_target.groupby(column).mean()
#    df_cat_fillna_dummy_target[f'{column}_func'] = df_cat_fillna_dummy_target[column].replace(target_means['functional'])
#    df_cat_fillna_dummy_target[f'{column}_nonfunc'] = df_cat_fillna_dummy_target[column].replace(target_means['non functional'])
#df['cat_nonf'] = df['cat'].replace(target_means['nonf'])
#df_cat_fillna_dummy_target.head()

Unnamed: 0_level_0,funder,installer,wpt_name,basin,subvillage,region,lga,ward,public_meeting,recorded_by,...,water_quality_func,water_quality_nonfunc,quantity_group_func,quantity_group_nonfunc,source_func,source_nonfunc,source_class_func,source_class_nonfunc,waterpoint_type_func,waterpoint_type_nonfunc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
454,Dmdd,DMDD,Narmo,Internal,Bashnet Kati,Manyara,Babati,Bashinet,True,GeoData Consultants Ltd,...,0.565821,0.434179,0.524537,0.475463,0.624363,0.375637,0.543124,0.456876,0.622646,0.377354
510,Cmsr,Gove,Lukali,Internal,Lukali,Dodoma,Bahi,Lamaiti,True,GeoData Consultants Ltd,...,0.565821,0.434179,0.652461,0.347539,0.494018,0.505982,0.543124,0.456876,0.615505,0.384495
14146,Kkkt,KKKT,Mahakama,Lake Rukwa,Chawalikozi,Mbeya,Mbozi,Ndalambo,True,GeoData Consultants Ltd,...,0.565821,0.434179,0.652461,0.347539,0.494018,0.505982,0.543124,0.456876,0.132013,0.867987
47410,not available,not available,Shule Ya Msingi Chosi A,Rufiji,Shuleni,Mbeya,Mbarali,Chimala,True,GeoData Consultants Ltd,...,0.565821,0.434179,0.524537,0.475463,0.563804,0.436196,0.541889,0.458111,0.622646,0.377354
1288,Ki,Ki,Kwa Mjowe,Wami / Ruvu,Ngholong,Morogoro,Kilosa,Chakwale,True,GeoData Consultants Ltd,...,0.455127,0.544873,0.652461,0.347539,0.494018,0.505982,0.543124,0.456876,0.132013,0.867987


In [237]:
#Check the column string to make sure that everything is there
#df_cat_fillna_dummy_target.columns
#['basin', 'extraction_type_class', 'management_group', 
#                                                       'payment_type', 'quality_group', 'quantity_group', 'source_class', 
#                                                       'waterpoint_type_group']

In [318]:
# DO NOTmerge with numerical df, just merge with the y dummies
num_dumm_cat = pd.concat([dummy_cat_var,dummy_target_var], axis = 1)

In [319]:
num_dumm_cat.head()

Unnamed: 0_level_0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,public_meeting_True,public_meeting_not available,...,source_spring,source_unknown,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,functional,non functional
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
454,0,0,0,0,0,0,0,0,1,0,...,1,0,1,0,0,0,0,0,1,0
510,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
14146,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
47410,0,0,0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
1288,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,1


In [320]:
num_dumm_cat.columns

Index(['basin_Lake Nyasa', 'basin_Lake Rukwa', 'basin_Lake Tanganyika',
       'basin_Lake Victoria', 'basin_Pangani', 'basin_Rufiji',
       'basin_Ruvuma / Southern Coast', 'basin_Wami / Ruvu',
       'public_meeting_True', 'public_meeting_not available', 'permit_True',
       'permit_not available', 'extraction_type_cemo',
       'extraction_type_climax', 'extraction_type_gravity',
       'extraction_type_india mark ii', 'extraction_type_india mark iii',
       'extraction_type_ksb', 'extraction_type_mono',
       'extraction_type_nira/tanira', 'extraction_type_other',
       'extraction_type_other - mkulima/shinyanga',
       'extraction_type_other - play pump',
       'extraction_type_other - rope pump', 'extraction_type_other - swn 81',
       'extraction_type_submersible', 'extraction_type_swn 80',
       'extraction_type_walimi', 'extraction_type_windmill',
       'management_other', 'management_other - school',
       'management_parastatal', 'management_private operator',
   

In [341]:
#Define data and train sets

X = num_dumm_cat.loc[:,'basin_Lake Nyasa':'waterpoint_type_other']
y_func = num_dumm_cat['functional']
#y_nonfunc = num_dumm_cat.loc[:, 'non functional']
#y_repair = num_dumm_cat.loc[:, 'functional needs repair']

In [342]:
#Check the arrays!!
y_func

id
454      1
510      1
14146    0
47410    0
1288     0
13095    1
558      1
35626    1
8696     0
48650    1
61055    0
27298    1
52899    0
61616    0
54295    0
18191    0
22057    1
39797    0
20856    1
59340    1
52944    1
8638     1
6390     1
39662    0
69518    0
15222    0
37453    1
9535     1
51552    1
3903     1
        ..
51183    0
31998    1
69563    1
44588    1
70949    1
34371    1
50958    1
27804    1
62422    1
48262    0
71861    1
9438     0
72282    0
63836    1
50530    0
48237    1
5457     1
29653    1
205      1
73111    1
45990    1
47858    1
15504    1
46009    0
51225    1
68525    1
11980    1
35778    0
49444    1
23812    0
Name: functional, Length: 47520, dtype: uint8

In [345]:

rf = RandomForestClassifier(n_estimators=100, max_depth=5)
m_rf=rf.fit(X,y_func)
y_pred_rf=m_rf.predict(X)
accuracy_score(y_func,y_pred_rf)

0.7075547138047138

In [346]:
print('Accuracy for functional pump prediction is', accuracy_score(y_func,y_pred_rf)) # calculates the accuracy (% of correct points) for non_func


Accuracy for functional pump prediction is 0.7075547138047138


In [347]:
import numpy as np
#model = svm.SVC(kernel='linear', C=1.0, probability=True)

accuracy_func = cross_val_score(m_rf, X_func, y_func, cv=5, scoring='accuracy')
#accuracy_nonfunc = cross_val_score(m_nf, X_nonfunc, y_nonfunc, cv=5, scoring='accuracy')

mean_func = np.mean(accuracy_func)
std_func = np.std(accuracy_func)
#mean_nf = np.mean(accuracy_nonfunc)
#std_nf = np.std(accuracy_nonfunc)

print(
"Mean cross-validation score for functional-pumps:", mean_func, '\n',
"St.dev of cross-validation score for functional-pumps:", std_func)#,   
#    '\n',
#"Mean cross-validation score for nonfunctional-pumps:", mean_nf, 
#    '\n',
#"St.dev of cross-validation score for nonfunctional-pumps:", std_nf,   
 #   '\n',)

Mean cross-validation score for functional-pumps: 0.7357114564685657 
 St.dev of cross-validation score for functional-pumps: 0.0060988479962742655


In [348]:
feature_label = X.columns
feature_importance =m_rf.feature_importances_

print(feature_label,feature_importance)

Index(['basin_Lake Nyasa', 'basin_Lake Rukwa', 'basin_Lake Tanganyika',
       'basin_Lake Victoria', 'basin_Pangani', 'basin_Rufiji',
       'basin_Ruvuma / Southern Coast', 'basin_Wami / Ruvu',
       'public_meeting_True', 'public_meeting_not available', 'permit_True',
       'permit_not available', 'extraction_type_cemo',
       'extraction_type_climax', 'extraction_type_gravity',
       'extraction_type_india mark ii', 'extraction_type_india mark iii',
       'extraction_type_ksb', 'extraction_type_mono',
       'extraction_type_nira/tanira', 'extraction_type_other',
       'extraction_type_other - mkulima/shinyanga',
       'extraction_type_other - play pump',
       'extraction_type_other - rope pump', 'extraction_type_other - swn 81',
       'extraction_type_submersible', 'extraction_type_swn 80',
       'extraction_type_walimi', 'extraction_type_windmill',
       'management_other', 'management_other - school',
       'management_parastatal', 'management_private operator',
   