# Modeling

## Import packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score



## Read data

In [2]:
path = '../../data/cleaned'
training = pd.read_csv(path + '/training_cleaned_v2.csv')
testing = pd.read_csv(path + '/testing_cleaned_v2.csv')

In [3]:
print (training.isnull().sum())
print ('----------------------------------')
print (testing.isnull().sum())

id                                   0
amount_tsh                           0
year_recorded                        0
month_recorded                       0
day_recorded                         0
gps_height                           0
basin                                0
basin_encoded                        0
region                               0
region_encoded                       0
population                           0
public_meeting_new                3334
permit_new                        3056
age                              20709
extraction_type                      0
extraction_type_encoded              0
extraction_type_group                0
extraction_type_group_encoded        0
extraction_type_class                0
extraction_type_class_encoded        0
management                           0
management_encoded                   0
management_group_new                 0
management_group_new_encoded         0
payment                              0
payment_encoded          

In [4]:
initial_features = ['amount_tsh', 'year_recorded', 'month_recorded', 'day_recorded', 'gps_height', 'basin_encoded','region_encoded', 
                    'population', 'extraction_type_encoded', 'extraction_type_group_encoded','extraction_type_class_encoded',
                    'management_encoded', 'management_group_new_encoded', 'payment_encoded',  'quantity_group_encoded', 
                    'quality_group_new_encoded', 'source_encoded', 'source_type_encoded', 'source_class_encoded', 
                    'waterpoint_type_new_encoded']
## with NA's 'public_meeting_new','permit_new', 'age'

## Random Forest

#### with non-imputed

In [5]:
X_train = training.loc[:,initial_features]
y_train = training['status_group_encoded']

In [6]:
rf_1 = RandomForestClassifier(max_depth=50, random_state=0)
rf_1.fit(X_train, y_train)

# 10-Fold Cross validation
cv_scores = cross_val_score(rf_1, X_train, y_train, cv=10)
print (cv_scores)
print ('-------------------------------------------------')
print (np.mean(cv_scores))

[0.77731022 0.78538967 0.78034001 0.77613196 0.78737374 0.77424242
 0.78215488 0.7774036  0.77757198 0.77787134]
-------------------------------------------------
0.7795789822626739


In [7]:
print(rf_1.feature_importances_)

[0.04405297 0.01094124 0.04000185 0.13235261 0.16436566 0.03748047
 0.04127217 0.09380327 0.02607772 0.021188   0.03150953 0.02963456
 0.01354877 0.0412393  0.14325146 0.02096819 0.02409128 0.01906031
 0.00767475 0.0574859 ]


#### what if impuate age?

In [8]:
print (np.mean(training['age']))

16.185314414204854


In [9]:
training['age_imputed'] = np.where(training['age'].isnull(),np.mean(training['age']), training['age'])

In [10]:
initial_features.append('age_imputed')

In [11]:
X_train_2 = training.loc[:,initial_features]

In [12]:
rf_2 = RandomForestClassifier(max_depth=50, random_state=0)
rf_2.fit(X_train_2, y_train)

# 10-Fold Cross validation
cv_scores = cross_val_score(rf_2, X_train_2, y_train, cv=10)
print (cv_scores)
print ('-------------------------------------------------')
print (np.mean(cv_scores))

[0.78926107 0.78909275 0.78808281 0.77815183 0.79579125 0.78232323
 0.78905724 0.78009766 0.77639333 0.78477602]
-------------------------------------------------
0.7853027180905109


## Make a submission

### Geneerate the predict set

In [13]:
testing['age_imputed'] = np.where(testing['age'].isnull(),np.mean(testing['age']), testing['age'])
X_test_2 = testing.loc[:,initial_features]
prediction_test = rf_2.predict(X_test_2)

In [14]:
print (prediction_test)
print (sum(prediction_test==1))
print (sum(prediction_test==2))
print (sum(prediction_test==3))

[3 3 3 ... 3 3 1]
5758
834
8258


### get the submission format

In [15]:
testing['prediction_label'] = prediction_test
testing['status_group'] = np.where((testing['prediction_label'] == 1), 
                                                 'non functional', np.where((testing['prediction_label'] == 3), 'functional', 'functional needs repair'))
out = testing[['id', 'status_group']]

### Write out

In [16]:
path_out = '../../data/submissions'
out.to_csv(path_out + '/out_v3.csv', index = False)