# Modeling Random Forest 

## Import packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold

## Read data

### This data has three features (age, public_meeting_new, permit_new) imputed with MICE in R.

In [2]:
# Directory holding the imputed CSVs, relative to this notebook.
path = '../../data/cleaned'

training_file = path + '/training_imputed_r_mice_v1.csv'
testing_file = path + '/test_imputed_r_mice_v1.csv'

training = pd.read_csv(training_file)
testing = pd.read_csv(testing_file)

## Some EDA and quality checks

### Check NA

In [3]:
# Count the missing values in every column of the training frame.
na_counts = training.isnull().sum()
print (na_counts)

id                               0
amount_tsh                       0
year_recorded                    0
month_recorded                   0
day_recorded                     0
gps_height                       0
basin                            0
basin_encoded                    0
region                           0
region_encoded                   0
population                       0
public_meeting_new               0
permit_new                       0
age                              0
extraction_type                  0
extraction_type_encoded          0
extraction_type_group            0
extraction_type_group_encoded    0
extraction_type_class            0
extraction_type_class_encoded    0
management                       0
management_encoded               0
management_group_new             0
management_group_new_encoded     0
payment                          0
payment_encoded                  0
quantity_group                   0
quantity_group_encoded           0
quality_group_new   

### Change data types

In [30]:
# Columns fed to the model, in the order they are passed to the classifier.
initial_features = [
    'amount_tsh',
    'year_recorded',
    'month_recorded',
    'day_recorded',
    'gps_height',
    'basin_encoded',
    'region_encoded',
    'population',
    'extraction_type_encoded',
    'extraction_type_group_encoded',
    'extraction_type_class_encoded',
    'management_encoded',
    'management_group_new_encoded',
    'payment_encoded',
    'quantity_group_encoded',
    'quality_group_new_encoded',
    'source_encoded',
    'source_type_encoded',
    'source_class_encoded',
    'waterpoint_type_new_encoded',
    'age',
    'permit_new',
    'public_meeting_new',
]

In [31]:
# Encoded columns that should be treated as categorical rather than numeric.
cols_to_change = [
    'basin_encoded',
    'region_encoded',
    'extraction_type_encoded',
    'extraction_type_group_encoded',
    'extraction_type_class_encoded',
    'management_encoded',
    'management_group_new_encoded',
    'payment_encoded',
    'quantity_group_encoded',
    'quality_group_new_encoded',
    'source_encoded',
    'source_type_encoded',
    'source_class_encoded',
    'waterpoint_type_new_encoded',
]

# Apply the same cast to both frames so their feature columns share a dtype.
for frame in (training, testing):
    for col in cols_to_change:
        frame[col] = frame[col].astype('category')

# The encoded status column is cast on the training frame only.
training['status_group_encoded'] = training['status_group_encoded'].astype('category')

# Displayed as the cell output: resulting dtype of every model feature.
training[initial_features].dtypes

amount_tsh                        float64
year_recorded                       int64
month_recorded                      int64
day_recorded                        int64
gps_height                          int64
basin_encoded                    category
region_encoded                   category
population                          int64
extraction_type_encoded          category
extraction_type_group_encoded    category
extraction_type_class_encoded    category
management_encoded               category
management_group_new_encoded     category
payment_encoded                  category
quantity_group_encoded           category
quality_group_new_encoded        category
source_encoded                   category
source_type_encoded              category
source_class_encoded             category
waterpoint_type_new_encoded      category
age                               float64
dtype: object

## Random Forest

In [32]:
# Target vector and feature matrix for model fitting.
y_train = training['status_group_encoded']
X_train = training[initial_features]

In [34]:
# Grid search over (min_samples_leaf, max_features), scored by 10-fold
# cross-validation repeated `num_replicates` times. Each candidate's score is
# the mean over replicates of the mean fold score (cross_val_score's default
# scorer for the estimator). The best combination is kept in cur_best_*.
num_replicates = 10
min_samples_leaf = list(range(1,11)) # minimum number of observations in leaf (node size)
max_features = list(range(1,11)) # number of features to consider at each split (mtry)
cur_best_nsize = -1
cur_best_mtry = -1
cur_best_cv = -1  # sentinel below any achievable score, so the first candidate always wins

for node_size in min_samples_leaf:
    for mtry in max_features:
        # The estimator is intentionally NOT fitted here: cross_val_score clones
        # it and fits the clones on each training fold, so a fit on the full
        # training set would be discarded work for every grid point.
        rf = RandomForestClassifier(random_state=0, min_samples_leaf=node_size, max_features=mtry)
        rep_cvs = [] # to store cv of replicates
        for i in range(num_replicates):
            # Seeding the shuffle with the replicate index gives every replicate
            # a different, but reproducible, 10-fold partition.
            folds = KFold(n_splits=10, shuffle=True, random_state=i)
            cur_cv = np.mean(cross_val_score(rf, X_train, y_train, cv=folds))
            rep_cvs.append(cur_cv)
        avg_rep_cv = np.mean(rep_cvs) # average cv of replicates
        print ('current CV score for node size ' + str(node_size) + ' mtry ' + str(mtry) + ': ' + str(avg_rep_cv))

        # Strict '>' keeps the earliest (smallest node size / mtry) candidate on ties.
        if avg_rep_cv > cur_best_cv:
            cur_best_cv = avg_rep_cv
            cur_best_nsize = node_size
            cur_best_mtry = mtry

print ('-----------------------------------------------------------------')
print (cur_best_cv)
print (cur_best_nsize)
print (cur_best_mtry)

current CV score for node size 1 mtry 1: 0.7845151515151516
current CV score for node size 1 mtry 2: 0.7863821548821549
current CV score for node size 1 mtry 3: 0.7861565656565656
current CV score for node size 1 mtry 4: 0.7869225589225589
current CV score for node size 1 mtry 5: 0.7853956228956228
current CV score for node size 1 mtry 6: 0.7855387205387205
current CV score for node size 1 mtry 7: 0.7851346801346801
current CV score for node size 1 mtry 8: 0.7843670033670033
current CV score for node size 1 mtry 9: 0.78406734006734
current CV score for node size 1 mtry 10: 0.7845538720538722
current CV score for node size 2 mtry 1: 0.7961986531986532
current CV score for node size 2 mtry 2: 0.8013097643097643
current CV score for node size 2 mtry 3: 0.8025420875420876
current CV score for node size 2 mtry 4: 0.8026195286195286
current CV score for node size 2 mtry 5: 0.8006043771043769
current CV score for node size 2 mtry 6: 0.8003888888888888
current CV score for node size 2 mtry 7: 

### Best results

In [35]:
# Report the winning hyper-parameters and their cross-validation score.
best_summary = (
    ('best node size: ', cur_best_nsize),
    ('best mtry: ', cur_best_mtry),
    ('best cv score: ', cur_best_cv),
)
for label, value in best_summary:
    print (label + str(value))
print ('-------------------------------------------------')

best node size: 2
best mtry: 4
best cv score: 0.8026195286195286
-------------------------------------------------


In [36]:
# Final model: a forest built with the best node size / mtry found above,
# fitted on the complete training set.
rf_out = RandomForestClassifier(
    random_state=0,
    min_samples_leaf=cur_best_nsize,
    max_features=cur_best_mtry,
)
rf_out.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Feature importance

In [37]:
importance_val = rf_out.feature_importances_

# Map feature_name -> feature_importance, pairing names and values by position.
feats = dict(zip(initial_features, importance_val))

# One row per feature; the single unnamed column (0) gets a readable label.
importances = pd.DataFrame.from_dict(feats, orient='index')
importances = importances.rename(columns={0: 'Gini-importance'})

ranked_importances = importances.sort_values(['Gini-importance'], ascending=False)
print (ranked_importances)

                               Gini-importance
quantity_group_encoded                0.164512
age                                   0.153805
gps_height                            0.092545
day_recorded                          0.078262
population                            0.062779
waterpoint_type_new_encoded           0.057352
amount_tsh                            0.045873
extraction_type_class_encoded         0.043621
region_encoded                        0.040952
payment_encoded                       0.038046
basin_encoded                         0.035601
month_recorded                        0.033643
extraction_type_group_encoded         0.028752
source_encoded                        0.024605
management_encoded                    0.024302
source_type_encoded                   0.018854
quality_group_new_encoded             0.014932
extraction_type_encoded               0.014321
year_recorded                         0.010545
management_group_new_encoded          0.009934
source_class_

## Output

### Generate the prediction set

In [38]:
# Select the same feature columns, in the same order, as used for training.
X_test = testing[initial_features]
prediction_test = rf_out.predict(X_test)

### Get the submission format

In [39]:
testing['prediction_label'] = prediction_test

# Translate encoded predictions back to text labels:
#   1 -> 'non functional', 3 -> 'functional', anything else -> 'functional needs repair'.
predicted = testing['prediction_label']
not_class_one = np.where(predicted == 3, 'functional', 'functional needs repair')
testing['status_group'] = np.where(predicted == 1, 'non functional', not_class_one)

# Keep only the two columns written to the submission file.
out = testing.loc[:, ['id', 'status_group']]

### Write out

In [40]:
# Destination directory for submission files, relative to this notebook.
path_out = '../../data/submissions'
submission_file = path_out + '/out_v6.csv'

# index=False keeps the DataFrame index out of the CSV.
out.to_csv(submission_file, index = False)