In [61]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
sns.set_style('darkgrid')
#try plotly and cufflinks later

import plotly.plotly as py
import plotly.graph_objs as go 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True) 

train_data = pd.read_csv('pump_it_up/train_data_clean.csv')
test_values = pd.read_csv('pump_it_up/test_data_clean.csv')



In [62]:
train_data.drop('Unnamed: 0',axis=1,inplace=True)
test_values.drop('Unnamed: 0',axis=1,inplace=True)


In [41]:
# Test removing amount tsh
#train_data.drop('amount_tsh',axis=1,inplace=True)
#test_values.drop('amount_tsh',axis=1,inplace=True)

### Quick initial accuracy benchmark with Random Forest

In [42]:
from sklearn.model_selection import train_test_split

y_values = train_data[['status_group_functional','status_group_functional needs repair','status_group_non functional'] ]

X_values = train_data.drop(['status_group_functional','status_group_functional needs repair','status_group_non functional'],axis=1,inplace=False)


X_train, X_test, y_train, y_test = train_test_split(X_values, 
                                                    y_values, test_size=0.30, 
                                                    random_state=101)

X_values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Columns: 548 entries, 0 to waterpoint_type_other
dtypes: float64(5), int64(543)
memory usage: 248.3 MB


In [58]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=1000,min_samples_split=4)

rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)


In [59]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
#print(confusion_matrix(y_test,rfc_pred))
print(classification_report(y_test,rfc_pred))

print(accuracy_score(y_test,rfc_pred))

             precision    recall  f1-score   support

          0       0.82      0.87      0.84      9550
          1       0.62      0.28      0.39      1330
          2       0.86      0.77      0.81      6940

avg / total       0.82      0.79      0.80     17820

0.7850729517396184


In [5]:
# Accuracy is 0.7850168350168351

In [23]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rfc, X_values, y_values, cv=10)

In [24]:
## Let's try figuring out what features are important

In [55]:
pd.concat((pd.DataFrame(X_train.columns, columns = ['variable']), 
           pd.DataFrame(rfc.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:10]

Unnamed: 0,variable,importance
1,1,0.10724
0,0,0.104566
525,quantity_dry,0.060999
3,date_recorded,0.052452
5,population,0.042315
6,construction_year,0.031524
526,quantity_enough,0.025651
547,waterpoint_type_other,0.025547
494,extraction_type_other,0.02233
2,amount_tsh,0.018663


#### Hyper parameter tuning

In [48]:
from sklearn.model_selection import GridSearchCV
rfc_2 = RandomForestClassifier(n_estimators=500)

param_grid = {"min_samples_split" : [4, 6, 8],
             "n_estimators" : [500, 700, 1000]}

grid_search = GridSearchCV(estimator=rfc_2,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2,
                  n_jobs=-1)

grid_search = grid_search.fit(X_values, y_values.values)





print(grid_search.best_score_)
print(grid_search.best_params_)
print(grid_search.grid_scores_)

0.7760942760942761
{'min_samples_split': 4, 'n_estimators': 700}
[mean: 0.77589, std: 0.00121, params: {'min_samples_split': 4, 'n_estimators': 500}, mean: 0.77609, std: 0.00114, params: {'min_samples_split': 4, 'n_estimators': 700}, mean: 0.77596, std: 0.00162, params: {'min_samples_split': 4, 'n_estimators': 1000}, mean: 0.77554, std: 0.00113, params: {'min_samples_split': 6, 'n_estimators': 500}, mean: 0.77556, std: 0.00094, params: {'min_samples_split': 6, 'n_estimators': 700}, mean: 0.77549, std: 0.00098, params: {'min_samples_split': 6, 'n_estimators': 1000}, mean: 0.77242, std: 0.00108, params: {'min_samples_split': 8, 'n_estimators': 500}, mean: 0.77288, std: 0.00140, params: {'min_samples_split': 8, 'n_estimators': 700}, mean: 0.77281, std: 0.00116, params: {'min_samples_split': 8, 'n_estimators': 1000}]



The grid_scores_ attribute was deprecated in version 0.18 in favor of the more elaborate cv_results_ attribute. The grid_scores_ attribute will not be available from 0.20



#### Feature elimination

In [84]:
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from sklearn.feature_selection import SelectKBest,SelectPercentile,SelectFromModel
from sklearn.feature_selection import mutual_info_classif



#convert status group label into numerical data


val_status_group={'functional':2, 'functional needs repair':1,
                   'non functional':0}
y_alternative_values=pd.read_csv('pump_it_up/training_labels.csv').replace(val_status_group).drop('id',axis=1)


####################


#X_new = SelectPercentile(mutual_info_classif,percentile=75).fit_transform(X_values,y_alternative_values.values.ravel())


X_train, X_test, y_train, y_test = train_test_split(X_values, 
                                                    y_values, test_size=0.30, 
                                                    random_state=101)




model = SelectFromModel(RandomForestClassifier(n_estimators=1000,min_samples_split=4))

model.fit(X_values, y_values)

#model.transform(X).shape



SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        norm_order=1, prefit=False, threshold=None)

In [88]:
X_new = pd.DataFrame(model.transform(X_values))


In [89]:


from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(X_new), 
                                                    y_values, test_size=0.30, 
                                                    random_state=101)


rfc = RandomForestClassifier(n_estimators=1000,min_samples_split=4)

rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

In [90]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
#print(confusion_matrix(y_test,rfc_pred))
print(classification_report(y_test,rfc_pred))

print(accuracy_score(y_test,rfc_pred))

             precision    recall  f1-score   support

          0       0.82      0.87      0.84      9550
          1       0.61      0.28      0.38      1330
          2       0.86      0.76      0.81      6940

avg / total       0.82      0.78      0.80     17820

0.7845679012345679
