In [1]:
import pandas as pd
import sklearn as sk
import seaborn as sn
import numpy as np
from sklearn.decomposition import PCA

## Random Forest Classifier

### Data Munging
Let's load the data first. Some initial prep is needed. You have to make sure that the index column is not in your dataframe. Then separate the dataframe into the features and the target. We do *not* want to run PCA with cover type!

In [2]:
df = pd.read_csv('train_and_test.csv', index_col=0)

In [3]:
features = df[df.columns[:54]]
target = df['Cover_Type']

A good way to make sure that your dataset is what you want it to be is by using the .head() function. It is not enough but at least the first 5 rows look good lol. 

In [4]:
features.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Dist_To_Water,Vertical_Dist_To_Water,Horizontal_Dist_To_Roadways,Hillshade_9am,Hillshade_noon,Hillshade_3pm,Horizontal_Dist_To_Fire_Points,...,Soiltype31,Soiltype32,Soiltype33,Soiltype34,Soiltype35,Soiltype36,Soiltype37,Soiltype38,Soiltype39,Soiltype40
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,0
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,0
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,0
5,2579,132,6,300,-15,67,230,237,140,6031,...,0,0,0,0,0,0,0,0,0,0
7,2605,49,4,234,7,573,222,230,144,6228,...,0,0,0,0,0,0,0,0,0,0


In [5]:
np.random.seed(69) # seed random number generator
# Separate into training (80%) and testing (20%):
msk = np.random.rand(len(features)) < 0.8
train_features = features[msk]
train_target = target[msk]
test_features = features[~msk]
test_target = target[~msk]

### Random Forest Classifier

In [6]:
features = df.columns[:54]
features

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Dist_To_Water',
       'Vertical_Dist_To_Water', 'Horizontal_Dist_To_Roadways',
       'Hillshade_9am', 'Hillshade_noon', 'Hillshade_3pm',
       'Horizontal_Dist_To_Fire_Points', 'WA_Rawah', 'Wa_Neota', 'WA_Comanche',
       'WA_CacheLaPoudre', 'Soiltype1', 'Soiltype2', 'Soiltype3', 'Soiltype4',
       'Soiltype5', 'Soiltype6', 'Soiltype7', 'Soiltype8', 'Soiltype9',
       'Soiltype10', 'Soiltype11', 'Soiltype12', 'Soiltype13', 'Soiltype14',
       'Soiltype15', 'Soiltype16', 'Soiltype17', 'Soiltype18', 'Soiltype19',
       'Soiltype20', 'Soiltype21', 'Soiltype22', 'Soiltype23', 'Soiltype24',
       'Soiltype25', 'Soiltype26', 'Soiltype27', 'Soiltype28', 'Soiltype29',
       'Soiltype30', 'Soiltype31', 'Soiltype32', 'Soiltype33', 'Soiltype34',
       'Soiltype35', 'Soiltype36', 'Soiltype37', 'Soiltype38', 'Soiltype39',
       'Soiltype40'],
      dtype='object')

In [7]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_jobs=2)

clf.fit(train_features, train_target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=2, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [8]:
clf.predict(test_features) #these are the predictions made by the classifier on the test set

array([2, 2, 2, ..., 3, 3, 3])

In [9]:
clf.predict_proba(test_features)[0:10] #these are the probabilities that it predicts for each row of testing data

array([[ 0.1,  0.9,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0.3,  0.7,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0.2,  0.5,  0. ,  0. ,  0.3,  0. ,  0. ],
       [ 0.6,  0.4,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0.1,  0.9,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  1. ,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0.9,  0.1,  0. ,  0. ,  0. ,  0. ],
       [ 0.8,  0.2,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0.1,  0.9,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  1. ,  0. ,  0. ,  0. ,  0. ,  0. ]])

In [10]:
pd.crosstab(test_target, clf.predict(test_features), rownames=['Actual type'], colnames=['Predicted Species'])

Predicted Species,1,2,3,4,5,6,7
Actual type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,31766,1960,0,0,15,4,50
2,2042,42991,119,1,46,72,10
3,1,128,5387,28,8,135,0
4,0,0,71,364,0,13,0
5,34,383,15,0,1083,4,0
6,4,121,329,12,2,2303,0
7,215,27,0,0,0,0,2952


In [11]:
predicted = clf.predict(test_features)
actual = np.array(test_target)

result = [None]*len(actual)
for i in range(len(actual)):
    result[i] = (predicted[i]==actual[i])

correct_count = [i for i in result if i==True]
correct_rate = float(len(correct_count))/len(result)
correct_rate

0.9369005879497276

### Feature importances
The classification tree that our random forest picked determined, according to the values below that the elevation is the most important predictor followed by Horizontal distance to roadways and horizontal distance to fire points.

In [12]:
list(zip(train_features, clf.feature_importances_))

[('Elevation', 0.26599915409165864),
 ('Aspect', 0.050243286779335969),
 ('Slope', 0.033715596488307534),
 ('Horizontal_Dist_To_Water', 0.059739437787144412),
 ('Vertical_Dist_To_Water', 0.058384516113076726),
 ('Horizontal_Dist_To_Roadways', 0.11516090833627816),
 ('Hillshade_9am', 0.041334738112380517),
 ('Hillshade_noon', 0.043962954943771629),
 ('Hillshade_3pm', 0.041409094198060901),
 ('Horizontal_Dist_To_Fire_Points', 0.11095161187228741),
 ('WA_Rawah', 0.01005378417501674),
 ('Wa_Neota', 0.0039997484443675202),
 ('WA_Comanche', 0.010771540388857496),
 ('WA_CacheLaPoudre', 0.014862710990769579),
 ('Soiltype1', 0.0016336848199325891),
 ('Soiltype2', 0.0060106823923758244),
 ('Soiltype3', 0.0023108212083113025),
 ('Soiltype4', 0.01035057074269982),
 ('Soiltype5', 0.00042689740760972134),
 ('Soiltype6', 0.0031703684726725372),
 ('Soiltype7', 8.8367607334456821e-06),
 ('Soiltype8', 5.3366883123665109e-05),
 ('Soiltype9', 0.00011011937808647677),
 ('Soiltype10', 0.011864007344320377),