# Modeling Random Forest 

## Import packages

In [17]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold

## Read data

### This data is after MICE-Imputed of three features (age, public_meeting_new, permit_new) using R.

In [3]:
path = '../../data/cleaned'
training = pd.read_csv(path + '/training_imputed_r_mice_v1.csv')
testing = pd.read_csv(path + '/test_imputed_r_mice_v1.csv')

## Some EDA and quality check

### Check NA

In [4]:
print (training.isnull().sum())

id                               0
amount_tsh                       0
year_recorded                    0
month_recorded                   0
day_recorded                     0
gps_height                       0
basin                            0
basin_encoded                    0
region                           0
region_encoded                   0
population                       0
public_meeting_new               0
permit_new                       0
age                              0
extraction_type                  0
extraction_type_encoded          0
extraction_type_group            0
extraction_type_group_encoded    0
extraction_type_class            0
extraction_type_class_encoded    0
management                       0
management_encoded               0
management_group_new             0
management_group_new_encoded     0
payment                          0
payment_encoded                  0
quantity_group                   0
quantity_group_encoded           0
quality_group_new   

### Change data types

In [10]:
initial_features = ['amount_tsh', 'year_recorded', 'month_recorded', 'day_recorded', 'gps_height', 'basin_encoded','region_encoded', 
                    'population', 'extraction_type_encoded', 'extraction_type_group_encoded','extraction_type_class_encoded',
                    'management_encoded', 'management_group_new_encoded', 'payment_encoded',  'quantity_group_encoded', 
                    'quality_group_new_encoded', 'source_encoded', 'source_type_encoded', 'source_class_encoded', 
                    'waterpoint_type_new_encoded', 'age', 'permit_new', 'public_meeting_new']

In [11]:
cols_to_change = ['basin_encoded','region_encoded','extraction_type_encoded',
      'extraction_type_group_encoded', 'extraction_type_class_encoded',
      'management_encoded', 'management_group_new_encoded', 'payment_encoded',
      'quantity_group_encoded', 'quality_group_new_encoded', 'source_encoded',
      'source_type_encoded', 'source_class_encoded',
      'waterpoint_type_new_encoded']

for col in cols_to_change:
    training[col] = training[col].astype('category')
    testing[col] = testing[col].astype('category')
    
training['status_group_encoded'] = training['status_group_encoded'].astype('category')

training[initial_features].dtypes

amount_tsh                        float64
year_recorded                       int64
month_recorded                      int64
day_recorded                        int64
gps_height                          int64
basin_encoded                    category
region_encoded                   category
population                          int64
extraction_type_encoded          category
extraction_type_group_encoded    category
extraction_type_class_encoded    category
management_encoded               category
management_group_new_encoded     category
payment_encoded                  category
quantity_group_encoded           category
quality_group_new_encoded        category
source_encoded                   category
source_type_encoded              category
source_class_encoded             category
waterpoint_type_new_encoded      category
age                               float64
permit_new                          int64
public_meeting_new                  int64
dtype: object

## Random Forest

In [12]:
X_train = training.loc[:,initial_features]
y_train = training['status_group_encoded']

In [30]:
# fit with 10 fold 2-replicate cross validation
num_replicates = 2
min_samples_leaf = [1, 5] # minimum number of observations in leaf (node size)
max_features = [1, 3] # number of features to consider at each split (mtry)
cur_best_nsize = -1
cur_best_mtry = -1
cur_best_cv = -1

for node_size in min_samples_leaf:
    for mtry in max_features: 
        rf = RandomForestClassifier(random_state=0, min_samples_leaf=node_size, max_features=mtry)
        rf.fit(X_train, y_train)
        rep_cvs = [] # to store cv of replicates
        for i in range(num_replicates):
            cur_cv = np.mean(cross_val_score(rf_1, X_train, y_train, cv=KFold(n_splits=10, shuffle=True, random_state=i)))
            rep_cvs.append(cur_cv)
        avg_rep_cv = np.mean(rep_cvs) # average cv of replicates
        print (node_size)
        print (mtry)
        print ("current avg rep cv: " + str(avg_rep_cv))
        
        if avg_rep_cv > cur_best_cv:
            cur_best_cv = avg_rep_cv
            cur_best_nsize = node_size
            cur_best_mtry = mtry

1
1
current avg rep cv: 0.7871548821548822
1
3
current avg rep cv: 0.7871548821548822
5
1
current avg rep cv: 0.7871548821548822
5
3
current avg rep cv: 0.7871548821548822


### It seems that the random forest is able to converge!

In [31]:
print ('best node size: ' + str(cur_best_nsize))
print ('best mtry: ' + str(cur_best_mtry))
print ('best cv score: ' + str(cur_best_cv))
print ('-------------------------------------------------')

best node size: 1
best mtry: 1
best cv score: 0.7871548821548822
-------------------------------------------------


In [32]:
rf_out = RandomForestClassifier(random_state=0, min_samples_leaf=5, max_features=3)
rf_out.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

## Output

### Generate the prediction set

In [33]:
X_test = testing.loc[:,initial_features]
prediction_test = rf_out.predict(X_test)

### get the submission format

In [34]:
testing['prediction_label'] = prediction_test
testing['status_group'] = np.where((testing['prediction_label'] == 1), 
                                                 'non functional', np.where((testing['prediction_label'] == 3), 'functional', 'functional needs repair'))
out = testing[['id', 'status_group']]

### Write out

In [35]:
path_out = '../../data/submissions'
out.to_csv(path_out + '/out_v4.csv', index = False)