In [15]:
import pandas as pd
import sklearn as sk
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
import matplotlib.pyplot as plt

## Random Forest Classifier

### Data Munging
Let's load the data first. Some initial prep is needed: make sure that the index column does not end up in your dataframe as a feature, then separate the dataframe into the features and the target. We do *not* want to run PCA with cover type!

In [2]:
# Read the combined dataset; the first CSV column is used as the row index,
# so it is not part of the dataframe's columns.
df = pd.read_csv(
    'train_and_test.csv',
    index_col=0,
)

In [3]:
# 'Cover_Type' is the label; the first 54 columns are the predictors.
target = df['Cover_Type']
features = df.loc[:, df.columns[:54]]

A good way to check that your dataset is what you expect it to be is the `.head()` function. It is not a sufficient check on its own, but at least the first 5 rows look good.

In [4]:
# Show the first five feature rows as a quick sanity check of the load/split.
features.head()

In [5]:
# Fix the global NumPy seed so the random split below is repeatable.
np.random.seed(69)

# One uniform draw per row; rows whose draw falls below 0.8 go to training
# (roughly 80%), the remaining rows go to testing (roughly 20%).
msk = np.random.rand(len(features)) < 0.8

train_features, test_features = features[msk], features[~msk]
train_target, test_target = target[msk], target[~msk]

### Random Forest Classifier

In [9]:
def k_fold_split(df, fold_number, target_fold_index):
    """Split ``df`` into the train and test parts of one interleaved fold.

    The test part is made of the rows at positions ``target_fold_index``,
    ``target_fold_index + fold_number``, ``target_fold_index + 2 * fold_number``
    and so on; every other row goes to the training part.

    Parameters
    ----------
    df : object supporting ``len`` and boolean-mask indexing
        The data to split (a pandas DataFrame in this notebook).
    fold_number : int
        Number of folds, i.e. the stride between consecutive test rows.
        Must be at least 1.
    target_fold_index : int
        Position of the first test row. Must be non-negative.

    Returns
    -------
    tuple
        ``(train, test)`` selected from ``df`` with the boolean mask.

    Raises
    ------
    ValueError
        If ``fold_number`` is smaller than 1 or ``target_fold_index`` is
        negative.
    """
    if fold_number < 1:
        raise ValueError("fold_number must be a positive integer")
    if target_fold_index < 0:
        raise ValueError("target_fold_index must be non-negative")

    test_mask = np.zeros(len(df), dtype=bool)
    # Strided slice marks every fold_number-th row starting at the offset,
    # without looping over the rows in Python.
    test_mask[target_fold_index::fold_number] = True

    train = df[~test_mask]
    test = df[test_mask]

    return train, test

In [10]:
def random_forest(train, test, bootstrap=True, random_state=None):
    """Fit a random forest on a resampled training set and score it on ``test``.

    Both frames must hold the features in their first 54 columns and the
    label in a ``"Cover_Type"`` column.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; resampled with ``sklearn.utils.resample`` before fitting.
    test : pandas.DataFrame
        Rows used to measure the prediction rate.
    bootstrap : bool, default True
        Passed to ``resample`` as ``replace``: whether the training rows are
        drawn with replacement.
    random_state : int, RandomState instance or None, default None
        Forwarded to both ``resample`` and ``RandomForestClassifier``. The
        default ``None`` keeps the previous behaviour.

    Returns
    -------
    float
        Fraction of ``test`` rows whose predicted label equals the actual one.
    """
    train = resample(train, replace=bootstrap, random_state=random_state)

    train_features = train[train.columns[:54]]
    train_target = train["Cover_Type"]
    test_features = test[test.columns[:54]]
    test_target = test["Cover_Type"]

    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(train_features, train_target)

    predicted = clf.predict(test_features)
    actual = np.asarray(test_target)

    # Element-wise comparison yields a boolean array; its mean is the share
    # of correct predictions.
    return float(np.mean(predicted == actual))

In [17]:
# Cross validation parameters:
fold = 10
bootsrap_rounds = 10

# One row per bootstrap round, one column per fold. Start from NaN rather
# than uninitialised memory, so that an interrupted run leaves clearly
# missing entries instead of arbitrary numbers.
cv = np.full((bootsrap_rounds, fold), np.nan)
for fold_index in range(fold):
    # The split depends only on the fold, so it is computed once per fold
    # and reused for every bootstrap round.
    train, test = k_fold_split(df, fold, fold_index)
    for i in range(bootsrap_rounds):
        correct_rate = random_forest(train, test)
        cv[i, fold_index] = correct_rate

KeyboardInterrupt: 

# Don't Execute After This !!!

In [None]:
cv_flat = cv.flatten()
# One hundredth of the observed score range (max - min, so it is non-negative).
# NOTE(review): bin_size is not referenced by any visible cell; the histogram
# uses `bins` instead.
bin_size = (np.amax(cv_flat) - np.amin(cv_flat)) / 100
# 20 evenly spaced edges from the lowest to the highest score (19 bins).
bins = np.linspace(np.amin(cv), np.amax(cv), 20)

In [None]:
# Average prediction rate over every fold and bootstrap round.
cv_flat.mean()

In [None]:
plt.close('all')  # close any figures that are still open

# Histogram of the cross-validation prediction rates, drawn once on explicit axes.
fig, ax = plt.subplots()
ax.hist(cv_flat, bins=bins)
ax.set_title('Random Forest Prediction Rate', fontsize=14)
ax.set_xlabel('Prediction Rate', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

plt.show()

In [8]:
# Fit a forest on the training split so that `clf` is defined when the
# notebook is run from top to bottom on a fresh kernel.
clf = RandomForestClassifier()
clf.fit(train_features, train_target)

clf.predict(test_features)  # predictions made by the classifier on the test set

array([2, 2, 2, ..., 3, 3, 3])

In [9]:
# Predicted class probabilities for the first ten rows of the test set.
clf.predict_proba(test_features)[:10]

array([[ 0.1,  0.9,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0.3,  0.7,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0.2,  0.5,  0. ,  0. ,  0.3,  0. ,  0. ],
       [ 0.6,  0.4,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0.1,  0.9,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  1. ,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0.9,  0.1,  0. ,  0. ,  0. ,  0. ],
       [ 0.8,  0.2,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0.1,  0.9,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  1. ,  0. ,  0. ,  0. ,  0. ,  0. ]])

In [10]:
# Cross-tabulate actual labels (rows) against predicted labels (columns).
pd.crosstab(
    index=test_target,
    columns=clf.predict(test_features),
    rownames=['Actual type'],
    colnames=['Predicted Species'],
)

Predicted Species,1,2,3,4,5,6,7
Actual type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,31766,1960,0,0,15,4,50
2,2042,42991,119,1,46,72,10
3,1,128,5387,28,8,135,0
4,0,0,71,364,0,13,0
5,34,383,15,0,1083,4,0
6,4,121,329,12,2,2303,0
7,215,27,0,0,0,0,2952


In [11]:
predicted = clf.predict(test_features)
actual = np.array(test_target)

# One hit/miss flag per test row: does the prediction equal the actual label?
result = [guess == truth for guess, truth in zip(predicted, actual)]

# Share of test rows that were predicted correctly.
correct_count = [flag for flag in result if flag == True]
correct_rate = float(len(correct_count)) / len(result)
correct_rate

0.9369005879497276

### Feature importances
According to the feature importances below, which the random forest computes across all of its trees, elevation is the most important predictor, followed by horizontal distance to roadways and horizontal distance to fire points.

In [12]:
# Pair each feature column name with its importance in the fitted forest.
list(zip(train_features.columns, clf.feature_importances_))

[('Elevation', 0.26599915409165864),
 ('Aspect', 0.050243286779335969),
 ('Slope', 0.033715596488307534),
 ('Horizontal_Dist_To_Water', 0.059739437787144412),
 ('Vertical_Dist_To_Water', 0.058384516113076726),
 ('Horizontal_Dist_To_Roadways', 0.11516090833627816),
 ('Hillshade_9am', 0.041334738112380517),
 ('Hillshade_noon', 0.043962954943771629),
 ('Hillshade_3pm', 0.041409094198060901),
 ('Horizontal_Dist_To_Fire_Points', 0.11095161187228741),
 ('WA_Rawah', 0.01005378417501674),
 ('Wa_Neota', 0.0039997484443675202),
 ('WA_Comanche', 0.010771540388857496),
 ('WA_CacheLaPoudre', 0.014862710990769579),
 ('Soiltype1', 0.0016336848199325891),
 ('Soiltype2', 0.0060106823923758244),
 ('Soiltype3', 0.0023108212083113025),
 ('Soiltype4', 0.01035057074269982),
 ('Soiltype5', 0.00042689740760972134),
 ('Soiltype6', 0.0031703684726725372),
 ('Soiltype7', 8.8367607334456821e-06),
 ('Soiltype8', 5.3366883123665109e-05),
 ('Soiltype9', 0.00011011937808647677),
 ('Soiltype10', 0.011864007344320377),