In [1]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [2]:
import warnings
warnings.filterwarnings('ignore')
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [3]:
x_train = pd.read_csv('https://raw.githubusercontent.com/kayoyin/datasets/master/xtrain_clean.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/kayoyin/datasets/master/y_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/kayoyin/datasets/master/xtest_clean.csv')

In [4]:
# Turn all data types into categorical data types
def train_cats(df):
    for n,c in df.items():
        if is_string_dtype(c): 
            df[n] = pd.factorize(df[n])[0]
    return df

In [5]:
x_train = train_cats(x_train)

In [6]:
y_num = y_train.replace(['functional','non functional', 'functional needs repair'], [0,1,2]) # replace labels by numerical values


In [7]:
#x_train.drop('id', axis = 1, inplace = True)
#y_train.drop('id', axis = 1, inplace = True)
x_test.drop('id', axis = 1, inplace = True)

In [8]:
xtrain, xvalid, ytrain, yvalid = train_test_split(x_train, y_num, test_size=0.2)

In [9]:
xtrain.head()

Unnamed: 0.1,Unnamed: 0,id,gps_height,longitude,latitude,basin,public_meeting,scheme_management,construction_year,extraction_type_class,...,water_quality,quantity,source,source_class,waterpoint_type,year_recorded,date_recordedMonth,date_recordedElapsed,population_log,age
31620,31620,59881,1602.0,35.54,-4.195354,4,1,0,2012.0,0,...,0,0,0,0,0,2013,2,782,5.484797,1.0
56169,56169,17065,709.166667,32.77,-3.90973,4,1,0,2010.0,0,...,0,0,0,0,1,2012,10,639,6.249975,2.0
20334,20334,55939,781.09743,33.37,-2.797178,1,1,0,2000.0,2,...,0,0,5,0,2,2011,8,219,5.525453,11.0
41760,41760,51800,1529.0,37.44,-3.295832,2,1,0,1974.0,0,...,0,0,0,0,0,2013,2,785,4.110874,39.0
50380,50380,19671,1450.0,34.76,-8.812594,7,1,6,1978.0,0,...,0,2,6,1,0,2011,3,85,4.330733,33.0


# Decision Tree Classifier

In [33]:
dtc = DecisionTreeClassifier(max_depth=5).fit(xtrain, ytrain)
pred1 = dtc.predict(xvalid)

In [34]:
print('Training accuracy: ', accuracy_score(ytrain, dtc.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, pred1))

Training accuracy:  0.7111952861952862
Validation accuracy:  0.7082491582491582


# Random Forest Classifier

In [35]:
rf = RandomForestClassifier(max_depth=100).fit(xtrain,ytrain)
pred2 = rf.predict(xvalid)

In [36]:
print('Training accuracy: ', accuracy_score(ytrain, rf.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, pred2))

Training ccuracy:  0.9840488215488216
Validation accuracy:  0.7871212121212121


# Support Vector Machine

In [38]:
svm = SVC().fit(xtrain,ytrain)
pred3 = svm.predict(xvalid)

In [39]:
print('Training accuracy: ', accuracy_score(ytrain, svm.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, pred3))

Training ccuracy:  1.0
Validation accuracy:  0.5353535353535354


# SGD

In [41]:
sgd = SGDClassifier(loss="hinge", penalty="l2").fit(xtrain,ytrain)
pred4 = sgd.predict(xvalid)

In [42]:
print('Training accuracy: ', accuracy_score(ytrain, sgd.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, pred4))

Training accuracy:  0.54503367003367
Validation accuracy:  0.5361111111111111


# KNN

In [44]:
knn = KNeighborsClassifier(n_neighbors=7).fit(xtrain,ytrain)
pred5 = knn.predict(xvalid)

In [45]:
print('Training accuracy: ', accuracy_score(ytrain, knn.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, pred5))

Training accuracy:  0.6333122895622896
Validation accuracy:  0.49595959595959593


# Naive Bayes

In [47]:
gnb = GaussianNB().fit(xtrain,ytrain)
pred6 = gnb.predict(xvalid)

In [48]:
print('Training accuracy: ', accuracy_score(ytrain, gnb.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, pred6))

Training accuracy:  0.6289772727272728
Validation accuracy:  0.6266835016835017


# Neural Network

In [49]:
mlp = MLPClassifier().fit(xtrain,ytrain)
pred7 = mlp.predict(xvalid)

In [50]:
print('Training accuracy: ', accuracy_score(ytrain, mlp.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, pred7))

Training accuracy:  0.4540824915824916
Validation accuracy:  0.4535353535353535


# Optimizing the Random Forest

In [54]:
param_grid = { 
    'n_estimators': [300, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [5,10,30,50],
    'criterion' :['gini', 'entropy']
}

In [56]:
gridRF = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv= 5)
gridRF.fit(xtrain, ytrain)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [300, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [5, 10, 30, 50], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [57]:
gridRF.best_params_


{'criterion': 'gini',
 'max_depth': 30,
 'max_features': 'sqrt',
 'n_estimators': 500}

In [60]:
rf1 = RandomForestClassifier(max_depth=30, criterion = 'gini', max_features='sqrt', n_estimators=500).fit(xtrain,ytrain)
predrf1 = rf1.predict(xvalid)

In [61]:
print('Training accuracy: ', accuracy_score(ytrain, rf1.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, predrf1))

Training accuracy:  0.9999368686868687
Validation accuracy:  0.803030303030303


In [62]:
rf2 = RandomForestClassifier(max_depth=30, criterion = 'gini', max_features='sqrt', n_estimators=700).fit(xtrain,ytrain)
predrf2 = rf2.predict(xvalid)
print('Training accuracy: ', accuracy_score(ytrain, rf2.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, predrf2))


Training accuracy:  0.9998947811447811
Validation accuracy:  0.8035353535353535


In [68]:
rf3 = RandomForestClassifier(n_estimators=300, 
                                min_samples_split=8,
                                max_features='sqrt',
                                max_depth=95,
                                random_state=42)
rf3.fit(xtrain,ytrain)
predrf3 = rf3.predict(xvalid)
print('Training accuracy: ', accuracy_score(ytrain, rf3.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, predrf3))

Training accuracy:  0.9409722222222222
Validation accuracy:  0.8053872053872054


# Extremely Randomized Trees

In [None]:
etc = ExtraTreesClassifier(n_estimators=300, 
                                min_samples_split=8,
                                max_features='sqrt',
                                max_depth=95,
                                random_state=42)
etc.fit(xtrain,ytrain)
predetc = etc.predict(xvalid)
print('Training accuracy: ', accuracy_score(ytrain, etc.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, predetc))

# Combining 1 vs All Random Forests

In [11]:
y_func = y_train.copy(True)
y_nfunc = y_train.copy(True)
y_rep = y_train.copy(True)

y_func = y_func.replace(['functional','non functional', 'functional needs repair'], [1,0,0]) 
y_nfunc = y_nfunc.replace(['functional','non functional', 'functional needs repair'], [0,1,0]) 
y_rep = y_rep.replace(['functional','non functional', 'functional needs repair'], [0,0,1]) 


In [12]:
xtrain, xvalid, yftrain, yfvalid, yntrain, ynvalid, yrtrain, yrvalid = train_test_split(x_train, y_func, y_nfunc, y_rep, test_size=0.2)

In [None]:
# TODO: Use GridCV to optimize hyperparmeters for each sub forest

rffunc = RandomForestClassifier(n_estimators=300, 
                                min_samples_split=8,
                                max_features='sqrt',
                                max_depth=95,
                                random_state=42)

rfnfunc = RandomForestClassifier(n_estimators=300, 
                                min_samples_split=5,
                                max_features='log2',
                                max_depth=50,
                                random_state=42)

rfrep = RandomForestClassifier(n_estimators=50, 
                                min_samples_split=8,
                                max_features='sqrt',
                                max_depth=70,
                                random_state=42)

rffunc.fit(xtrain, yftrain)


In [None]:
rfnfunc.fit(xtrain, yntrain)


In [None]:
rfrep.fit(xtrain, yrtrain)