In [1]:
#import standard packages
import pandas as pd
import numpy as np

In [2]:
# mport the random forest package # it's the best model used so far but i want to prove that
from sklearn.ensemble import RandomForestClassifier

In [43]:
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.cross_validation import cross_val_score

In [3]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [20]:
#load the cleaned training values
train = pd.read_csv('cleaned_v2.csv')

In [21]:
#load the cleaned test set
test = pd.read_csv('cleaned_test_v2.csv')

In [7]:
#look at shape because will combine and separate again later
train.shape, test.shape

((59400, 35), (14850, 34))

In [8]:
#take off the status group column  - it is the outcome to predict
y_train = train.pop('status_group')

In [9]:
train.shape, test.shape

((59400, 34), (14850, 34))

In [22]:
#drop this unnamed column 
train.drop('Unnamed: 0', axis=1, inplace=True)
test.drop('Unnamed: 0', axis=1, inplace=True)

In [23]:
#define categorical variables to encode for model
categorical = ['funder',
 'installer',
 'scheme_management',
 'extraction_type',
 'management',
 'payment_type',
 'water_quality',
 'quantity']

In [26]:
#Encode the categorical data, with the complete set (train and test)

#this allows you to separate out your df again
train["train"] = 1
test["train"] = 0

#combine the dfs
combined = pd.concat([train, test])

#factorize for random forest
combined['funder'] = pd.factorize(combined['funder'])[0]
combined['installer'] = pd.factorize(combined['installer'])[0]
combined['scheme_management'] = pd.factorize(combined['scheme_management'])[0]
combined['extraction_type'] = pd.factorize(combined['extraction_type'])[0]
combined['management'] = pd.factorize(combined['management'])[0]
combined['payment_type'] = pd.factorize(combined['payment_type'])[0]
combined['water_quality'] = pd.factorize(combined['water_quality'])[0]
combined['quantity'] = pd.factorize(combined['quantity'])[0]

In [27]:
combined.shape

(74250, 35)

In [24]:
#Encode the categorical data, with the complete set (train and test)

#this allows you to separate out your df again
train["train"] = 1
test["train"] = 0

#combine the dfs
combined = pd.concat([train, test])

In [28]:
#boolean indexing to separate train vs test
train_df = combined[combined["train"] == 1]
test_df = combined[combined["train"] == 0]
train_df.drop(["train"], axis=1, inplace=True)
test_df.drop(["train"], axis=1, inplace=True)

In [29]:
train_df.shape, test_df.shape
#looks correct!

((59400, 34), (14850, 34))

In [28]:
train.shape, test.shape

((59400, 41), (14850, 41))

In [61]:
#define features columns
#some trial and error went into this 
feature_columns = [
 'funder',
 'installer',
 'scheme_management',
 'extraction_type',
 'management',
 'payment_type',
 'water_quality',
 'longitude',
 'latitude',
 'gps_height',
 'region_code',
 'district_code',
 'operation_year',
 'quantity'
]

In [62]:
#define X train
X = train_df[feature_columns]
#y_train indentified at the beginning 
y = y_train

In [63]:
#define X test
X_test = test_df[feature_columns]


In [69]:
#define and fit model - i found 1000 estimators gave the best score
model = RandomForestClassifier(n_estimators=1000)
model.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [74]:
#warning this takes forever
cross_val_score(model, X, y, 'accuracy', cv=10)

array([ 0.80642989,  0.80121192,  0.80020199,  0.79279583,  0.79983165,
        0.79175084,  0.80252525,  0.78868496,  0.79121064,  0.80060626])

In [70]:
#look at feature importance
importances = model.feature_importances_
importances

array([ 0.0557846 ,  0.04613513,  0.02024127,  0.08536828,  0.0205207 ,
        0.04047207,  0.01725403,  0.18043261,  0.17914069,  0.09945361,
        0.02803818,  0.03119936,  0.07887847,  0.117081  ])

In [71]:
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))


Feature ranking:
1. feature 7 (0.180433)
2. feature 8 (0.179141)
3. feature 13 (0.117081)
4. feature 9 (0.099454)
5. feature 3 (0.085368)
6. feature 12 (0.078878)
7. feature 0 (0.055785)
8. feature 1 (0.046135)
9. feature 5 (0.040472)
10. feature 11 (0.031199)
11. feature 10 (0.028038)
12. feature 4 (0.020521)
13. feature 2 (0.020241)
14. feature 6 (0.017254)


In [72]:
#in order: 
#longitude, latitude, quantity, gps height, extraction type, operation year, funder, installer
#payment type, district code, region code, management, scheme management, water quality


In [73]:
#create status_group column on test df
y_pred = model.predict(X_test)

In [52]:
y_pred

array(['non functional', 'functional', 'functional', ..., 'functional',
       'functional', 'non functional'], dtype=object)

In [68]:
#export it to csv
pd.DataFrame(y_pred).to_csv("submission.csv")