In [68]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing

In [2]:
path = '../../data/cleaned'
training = pd.read_csv(path + '/training_cleaned_v2.csv')

In [11]:
training.shape

(59400, 40)

In [10]:
sum((training.age).isnull())

20709

In [14]:
training_age_null = training[training.age.isnull()]
training_age_null.head(5)

Unnamed: 0,id,amount_tsh,year_recorded,month_recorded,day_recorded,gps_height,basin,basin_encoded,region,region_encoded,...,source,source_encoded,source_type,source_type_encoded,source_class,source_class_encoded,waterpoint_type_new,waterpoint_type_new_encoded,status_group,status_group_encoded
4,19728,0.0,2011,7,13,0,Lake Victoria,4,Kagera,4,...,rainwater harvesting,5,rainwater harvesting,3,surface,1,communal standpipe,0,functional,3
6,19816,0.0,2012,10,1,0,Internal,0,Shinyanga,17,...,machine dbh,3,borehole,0,groundwater,0,hand pump,2,non functional,1
7,54551,0.0,2012,10,9,0,Lake Tanganyika,3,Shinyanga,17,...,shallow well,7,shallow well,5,groundwater,0,hand pump,2,non functional,1
8,53934,0.0,2012,11,3,0,Lake Tanganyika,3,Tabora,19,...,machine dbh,3,borehole,0,groundwater,0,hand pump,2,non functional,1
9,46144,0.0,2011,8,3,0,Lake Victoria,4,Kagera,4,...,shallow well,7,shallow well,5,groundwater,0,hand pump,2,functional,3


In [15]:
training_age_null.shape

(20709, 40)

In [18]:
training_age_not_null = training[~training.age.isnull()]
training_age_not_null.head(5)
training_age_not_null.shape

(38691, 40)

In [20]:
training_age_not_null.dtypes

id                                 int64
amount_tsh                       float64
year_recorded                      int64
month_recorded                     int64
day_recorded                       int64
gps_height                         int64
basin                             object
basin_encoded                      int64
region                            object
region_encoded                     int64
population                         int64
public_meeting_new               float64
permit_new                       float64
age                              float64
extraction_type                   object
extraction_type_encoded            int64
extraction_type_group             object
extraction_type_group_encoded      int64
extraction_type_class             object
extraction_type_class_encoded      int64
management                        object
management_encoded                 int64
management_group_new              object
management_group_new_encoded       int64
payment         

In [56]:
df_train_not_null = training_age_not_null.select_dtypes(exclude = 'object')
print(df_train_not_null.columns)
df_train_not_null = df_train_not_null.drop(['id','year_recorded', 'month_recorded', 'day_recorded','public_meeting_new', 'permit_new'], axis = 1)
df_train_not_null.columns

Index(['id', 'amount_tsh', 'year_recorded', 'month_recorded', 'day_recorded',
       'gps_height', 'basin_encoded', 'region_encoded', 'population',
       'public_meeting_new', 'permit_new', 'age', 'extraction_type_encoded',
       'extraction_type_group_encoded', 'extraction_type_class_encoded',
       'management_encoded', 'management_group_new_encoded', 'payment_encoded',
       'quantity_group_encoded', 'quality_group_new_encoded', 'source_encoded',
       'source_type_encoded', 'source_class_encoded',
       'waterpoint_type_new_encoded', 'status_group_encoded'],
      dtype='object')


Index(['amount_tsh', 'gps_height', 'basin_encoded', 'region_encoded',
       'population', 'age', 'extraction_type_encoded',
       'extraction_type_group_encoded', 'extraction_type_class_encoded',
       'management_encoded', 'management_group_new_encoded', 'payment_encoded',
       'quantity_group_encoded', 'quality_group_new_encoded', 'source_encoded',
       'source_type_encoded', 'source_class_encoded',
       'waterpoint_type_new_encoded', 'status_group_encoded'],
      dtype='object')

In [57]:
X_train_not_null = df_train_not_null.iloc[:,:-1]
y_train_not_null = df_train_not_null['status_group_encoded']
X_train_not_null.columns

Index(['amount_tsh', 'gps_height', 'basin_encoded', 'region_encoded',
       'population', 'age', 'extraction_type_encoded',
       'extraction_type_group_encoded', 'extraction_type_class_encoded',
       'management_encoded', 'management_group_new_encoded', 'payment_encoded',
       'quantity_group_encoded', 'quality_group_new_encoded', 'source_encoded',
       'source_type_encoded', 'source_class_encoded',
       'waterpoint_type_new_encoded'],
      dtype='object')

In [59]:
# fit with 10 fold 2-replicate cross validation
num_replicates = 2
min_samples_leaf = list([1,5,10,50,100]) # minimum number of observations in leaf (node size)
max_features = list(range(2,5)) # number of features to consider at each split (mtry)
cur_best_nsize = -1
cur_best_mtry = -1
cur_best_cv = -1

for node_size in min_samples_leaf:
    for mtry in max_features: 
        rf = RandomForestClassifier(random_state=0, min_samples_leaf=node_size, max_features=mtry)
        rf.fit(X_train_not_null, y_train_not_null)
        rep_cvs = [] # to store cv of replicates
        for i in range(num_replicates):
            cur_cv = np.mean(cross_val_score(rf, X_train_not_null, y_train_not_null, cv=KFold(n_splits=10, shuffle=True, random_state=i)))
            rep_cvs.append(cur_cv)
        avg_rep_cv = np.mean(rep_cvs) # average cv of replicates
        print (node_size)
        print (mtry)
        print ("current avg rep cv: " + str(avg_rep_cv))
        
        if avg_rep_cv > cur_best_cv:
            cur_best_cv = avg_rep_cv
            cur_best_nsize = node_size
            cur_best_mtry = mtry
            
print ('-----------------------------------------------------------------')
print (cur_best_cv)
print (cur_best_nsize)
print (cur_best_mtry)

1
2
current avg rep cv: 0.7890982319543873
1
3
current avg rep cv: 0.7915665633475656
1
4
current avg rep cv: 0.791799074736376
5
2
current avg rep cv: 0.7951980093541522
5
3
current avg rep cv: 0.8011296845060752
5
4
current avg rep cv: 0.8046057177471762
10
2
current avg rep cv: 0.7860226954731273
10
3
current avg rep cv: 0.7921482158253874
10
4
current avg rep cv: 0.7963481372841703
50
2
current avg rep cv: 0.7580704473309678
50
3
current avg rep cv: 0.7663281413314472
50
4
current avg rep cv: 0.7704376502284441
100
2
current avg rep cv: 0.7460262585461994
100
3
current avg rep cv: 0.7548009621299097
100
4
current avg rep cv: 0.7595824358863903
-----------------------------------------------------------------
0.8046057177471762
5
4


In [50]:
df_train_null = training_age_null.select_dtypes(exclude = 'object')
print(df_train_null.columns)
df_train_null = df_train_null.drop(['age','id','year_recorded', 'month_recorded', 'day_recorded','public_meeting_new', 'permit_new'], axis = 1)
df_train_null.columns

Index(['id', 'amount_tsh', 'year_recorded', 'month_recorded', 'day_recorded',
       'gps_height', 'basin_encoded', 'region_encoded', 'population',
       'public_meeting_new', 'permit_new', 'age', 'extraction_type_encoded',
       'extraction_type_group_encoded', 'extraction_type_class_encoded',
       'management_encoded', 'management_group_new_encoded', 'payment_encoded',
       'quantity_group_encoded', 'quality_group_new_encoded', 'source_encoded',
       'source_type_encoded', 'source_class_encoded',
       'waterpoint_type_new_encoded', 'status_group_encoded'],
      dtype='object')


Index(['amount_tsh', 'gps_height', 'basin_encoded', 'region_encoded',
       'population', 'extraction_type_encoded',
       'extraction_type_group_encoded', 'extraction_type_class_encoded',
       'management_encoded', 'management_group_new_encoded', 'payment_encoded',
       'quantity_group_encoded', 'quality_group_new_encoded', 'source_encoded',
       'source_type_encoded', 'source_class_encoded',
       'waterpoint_type_new_encoded', 'status_group_encoded'],
      dtype='object')

In [51]:
X_train_null = df_train_null.iloc[:,:-1]
y_train_null = df_train_null['status_group_encoded']
X_train_null.columns

Index(['amount_tsh', 'gps_height', 'basin_encoded', 'region_encoded',
       'population', 'extraction_type_encoded',
       'extraction_type_group_encoded', 'extraction_type_class_encoded',
       'management_encoded', 'management_group_new_encoded', 'payment_encoded',
       'quantity_group_encoded', 'quality_group_new_encoded', 'source_encoded',
       'source_type_encoded', 'source_class_encoded',
       'waterpoint_type_new_encoded'],
      dtype='object')

In [61]:
# fit with 10 fold 2-replicate cross validation
num_replicates = 5
min_samples_leaf = list([1,5,6]) # minimum number of observations in leaf (node size)
max_features = list(range(2,5)) # number of features to consider at each split (mtry)
cur_best_nsize = -1
cur_best_mtry = -1
cur_best_cv = -1

for node_size in min_samples_leaf:
    for mtry in max_features: 
        rf = RandomForestClassifier(random_state=0, min_samples_leaf=node_size, max_features=mtry)
        rf.fit(X_train_null, y_train_null)
        rep_cvs = [] # to store cv of replicates
        for i in range(num_replicates):
            cur_cv = np.mean(cross_val_score(rf, X_train_null, y_train_null, cv=KFold(n_splits=10, shuffle=True, random_state=i)))
            rep_cvs.append(cur_cv)
        avg_rep_cv = np.mean(rep_cvs) # average cv of replicates
        print (node_size)
        print (mtry)
        print ("current avg rep cv: " + str(avg_rep_cv))
        
        if avg_rep_cv > cur_best_cv:
            cur_best_cv = avg_rep_cv
            cur_best_nsize = node_size
            cur_best_mtry = mtry
            
print ('-----------------------------------------------------------------')
print (cur_best_cv)
print (cur_best_nsize)
print (cur_best_mtry)

1
2
current avg rep cv: 0.7475784808384477
1
3
current avg rep cv: 0.747790901265929
1
4
current avg rep cv: 0.7479358614592592
5
2
current avg rep cv: 0.7390219432372982
5
3
current avg rep cv: 0.7452895961483285
5
4
current avg rep cv: 0.7489305126931141
6
2
current avg rep cv: 0.7369551454757091
6
3
current avg rep cv: 0.7445364628163947
6
4
current avg rep cv: 0.7481388159935806
-----------------------------------------------------------------
0.7489305126931141
5
4


## Variable Importance

In [60]:
rf_best = RandomForestClassifier(random_state=0, min_samples_leaf=5, max_features=4)
rf_best.fit(X_train_not_null, y_train_not_null)

importance_val = rf_best.feature_importances_

feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(X_train_not_null.columns, importance_val):
    feats[feature] = importance #add the name/value pair 

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
print (importances.sort_values(['Gini-importance'], ascending=False))

                               Gini-importance
quantity_group_encoded                0.183951
age                                   0.124547
gps_height                            0.112368
amount_tsh                            0.101955
population                            0.068893
waterpoint_type_new_encoded           0.066150
region_encoded                        0.064076
extraction_type_class_encoded         0.055704
basin_encoded                         0.037600
payment_encoded                       0.033003
extraction_type_group_encoded         0.028590
management_encoded                    0.027804
source_encoded                        0.024753
source_type_encoded                   0.018372
management_group_new_encoded          0.018016
extraction_type_encoded               0.014390
quality_group_new_encoded             0.010096
source_class_encoded                  0.009734


In [55]:
X_train_not_null.columns

Index(['amount_tsh', 'gps_height', 'basin_encoded', 'region_encoded',
       'population', 'extraction_type_encoded',
       'extraction_type_group_encoded', 'extraction_type_class_encoded',
       'management_encoded', 'management_group_new_encoded', 'payment_encoded',
       'quantity_group_encoded', 'quality_group_new_encoded', 'source_encoded',
       'source_type_encoded', 'source_class_encoded',
       'waterpoint_type_new_encoded'],
      dtype='object')

## Neural Network

In [77]:
X_train_not_null_scaled = pd.DataFrame(preprocessing.scale(X_train_not_null),columns = X_train_not_null.columns)

In [81]:
alpha = [0.01]
node_size = [20,50,100,200,500]

for a in alpha:
    for n in node_size:
        clf = MLPClassifier(solver='adam', alpha=a ,hidden_layer_sizes=(20,), random_state=1)
        clf.fit(X_train_not_null_scaled, y_train_not_null)

        rep_cvs = []
        for i in range(2):
            cur_cv = np.mean(cross_val_score(clf, X_train_not_null_scaled, y_train_not_null, cv=KFold(n_splits=10, shuffle=True, random_state=i)))
            rep_cvs.append(cur_cv)
        avg_rep_cv = np.mean(rep_cvs)
        print(a)
        print(avg_rep_cv)

0.01
0.7523843904673937
0.01
0.7523843904673937
0.01
0.7523843904673937
0.01
0.7523843904673937
0.01
0.7523843904673937


0.7519578802687231