In [1]:
#import standard packages
import pandas as pd
import numpy as np

In [2]:
# mport the random forest package # it's the best model used so far but i want to prove that
from sklearn.ensemble import RandomForestClassifier

In [3]:
#import logistic reg as well
from sklearn.linear_model import LogisticRegression

In [4]:
%matplotlib inline

from matplotlib import pyplot as plt

from sklearn.cross_validation import cross_val_score

#importing these but may not use all of them
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [5]:
#load the cleaned training values
train = pd.read_csv('cleaned.csv')

In [6]:
#load the cleaned test set
test = pd.read_csv('cleaned_test.csv')

In [7]:
#drop this unnamed column 
train.drop('Unnamed: 0', axis=1, inplace=True)
test.drop('Unnamed: 0', axis=1, inplace=True)

In [8]:
#fill NaN values with the mean
train["construction_year"].fillna(train["construction_year"].mean(), inplace=True)
train["operation_year"].fillna(train["operation_year"].mean(), inplace=True)
test["construction_year"].fillna(test["construction_year"].mean(), inplace=True)
test["operation_year"].fillna(test["construction_year"].mean(), inplace=True)

In [9]:
#get numeric columns for model from train df
[(col, dtype) for col, dtype in zip(train.columns, train.dtypes) if dtype != 'object']
num_columns = [col for col, dtype in zip(train.columns, train.dtypes) if dtype != 'object']
num_columns

['id',
 'amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'num_private',
 'region_code',
 'district_code',
 'population',
 'construction_year',
 'operation_year',
 'month_recorded',
 'season']

In [10]:
#get numeric columns for model from test df
[(col, dtype) for col, dtype in zip(test.columns, test.dtypes) if dtype != 'object']
num_columns_test = [col for col, dtype in zip(test.columns, test.dtypes) if dtype != 'object']
num_columns_test

['id',
 'amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'num_private',
 'region_code',
 'district_code',
 'population',
 'construction_year',
 'operation_year',
 'month_recorded',
 'season']

In [11]:
#going to define feature columns for model because I don't want all of these num_columns 
#these features are a work in progress - want to factorize water type/quality and maintenance system
feature_columns = [
 'amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'region_code',
 'district_code',
 'population',
 'construction_year',
 'operation_year', 
 'season']

In [12]:
X = train[feature_columns]
y = train.status_group

In [13]:
X_test = test[feature_columns]


In [14]:
#try with logistic regression to establish a baseline
model = LogisticRegression()
model.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
cross_val_score(model, X, y, 'accuracy', cv=10)

Wall time: 0 ns


array([ 0.59771082,  0.60242383,  0.58761151,  0.58777984,  0.58417508,
        0.58552189,  0.59393939,  0.59622832,  0.59252399,  0.59279219])

In [16]:
#try with rfc and see if cross_val_score improves
model = RandomForestClassifier()
model.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
cross_val_score(model, X, y, 'accuracy', cv=10)
#these scores are improved 

array([ 0.71149638,  0.71907086,  0.71065477,  0.70846659,  0.72020202,
        0.70858586,  0.71734007,  0.69944435,  0.70213841,  0.70882452])

In [18]:
#try again and specify n_estimators 100
model = RandomForestClassifier(n_estimators=100)
model.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [19]:
cross_val_score(model, X, y, 'accuracy', cv=10)
#these scores are improved but not that much

array([ 0.7185659 ,  0.73051675,  0.71620939,  0.72226898,  0.73198653,
        0.71919192,  0.73114478,  0.70786328,  0.71712409,  0.72364432])

In [20]:
#try again and specify n_estimators 100
model = RandomForestClassifier(n_estimators=1000)
model.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
#am not going to run the cross_val_score because it is taking too long

In [22]:
#look at feature importance
importances = model.feature_importances_
importances

array([ 0.05445317,  0.11481486,  0.28526193,  0.28087867,  0.02705655,
        0.03252202,  0.07535033,  0.05932817,  0.05532201,  0.0150123 ])

In [23]:
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))


Feature ranking:
1. feature 2 (0.285262)
2. feature 3 (0.280879)
3. feature 1 (0.114815)
4. feature 6 (0.075350)
5. feature 7 (0.059328)
6. feature 8 (0.055322)
7. feature 0 (0.054453)
8. feature 5 (0.032522)
9. feature 4 (0.027057)
10. feature 9 (0.015012)


In [24]:
#in order: 
#2-longitude, 3-latitude, 1-elevation, 6-population
#7-construction year, 8-operation year, 0-free/not free (amt tsh)
#5-district, 6-region, 9-season


In [26]:
#create status_group column on test df
test['status_group'] = model.predict(X_test)

In [27]:
#export it to csv
pd.DataFrame(test['status_group']).to_csv("submission11.csv")

In [None]:
#looking forward to factorizing additional categorical variables
#want to calculate ROC and F1 score by doing train-test split on training df