# Forest Cover Type Prediction
Use cartographic variables to classify forest categories

## Random Forest

#### Problem
The study area includes four wilderness areas located in the Roosevelt National Forest of northern Colorado. Each observation is a 30m x 30m patch. You are asked to predict an integer classification for the forest cover type. The seven types are:

    1 - Spruce/Fir
    2 - Lodgepole Pine
    3 - Ponderosa Pine
    4 - Cottonwood/Willow
    5 - Aspen
    6 - Douglas-fir
    7 - Krummholz

The training set (15120 observations) contains both features and the Cover_Type. The test set contains only the features. You must predict the Cover_Type for every row in the test set (565892 observations).

#### Evaluation Metric
Multi-class classification accuracy

## Imports

In [1]:
import pandas as pd
import numpy as np
import random
from random import randint
import matplotlib as m
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint


from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by
# the libraries used below; consider filtering specific categories instead.
import warnings
warnings.filterwarnings('ignore')

# Make pandas display all columns and all rows (no truncation)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Formatting the plots: start from matplotlib's default rcParams
plt.rcParams.update(plt.rcParamsDefault)
%matplotlib inline

# Base look: the 'fivethirtyeight' style sheet, followed by explicit
# size / colour overrides applied in one batch.
plt.style.use('fivethirtyeight')
m.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'figure.figsize': (15, 5),
    'font.size': 12,
    'legend.fontsize': 'large',
    'figure.titlesize': 'medium',
    'text.color': 'k',
})
# seaborn is configured last, with the same default figure size
sns.set(rc={'figure.figsize': (15, 5)})

In [2]:
# Versions of the packages used in this Jupyter notebook
%reload_ext watermark
%watermark -a "Forest Cover Type Prediction -- Jessica Cabral" --iversions
%watermark -n -t -z

seaborn    0.10.1
numpy      1.19.1
matplotlib 3.2.2
pandas     1.1.0
Forest Cover Type Prediction -- Jessica Cabral
Sat Aug 22 2020 18:59:27 Hora oficial do Brasil


In [3]:
# A single seed value shared by the stdlib RNG and NumPy's global RNG
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

## Import Data

In [4]:
# Read the three competition files
train = pd.read_csv('../../data/train.csv')
test = pd.read_csv('../../data/test.csv')
sample_submission = pd.read_csv('../../data/sampleSubmission.csv')

# Report the (rows, columns) shape of each frame
for template, frame in (
    ('Train: {}', train),
    ('test: {}', test),
    ('sample_submission: {}', sample_submission),
):
    print(template.format(frame.shape))

Train: (15120, 56)
test: (565892, 55)
sample_submission: (565892, 2)


In [5]:
# Preview the first rows of both frames, then report their (rows, columns) shapes
display(train.head(), test.head())

train.shape, test.shape

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,6121,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,6211,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,6172,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5


Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,15121,2680,354,14,0,0,2684,196,214,156,6645,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,15122,2683,0,13,0,0,2654,201,216,152,6675,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,15123,2713,16,15,0,0,2980,206,208,137,6344,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,15124,2709,24,17,0,0,2950,208,201,125,6374,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,15125,2706,29,19,0,0,2920,210,195,115,6404,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


((15120, 56), (565892, 55))

In [6]:
# Remove the Id column from both frames.
# errors='ignore' keeps this cell idempotent: re-running it after 'Id' has
# already been dropped no longer raises a KeyError. `columns=` already names
# the axis, so the redundant `axis=1` argument is omitted.

train = train.drop(columns=['Id'], errors='ignore')
test = test.drop(columns=['Id'], errors='ignore')

train.shape, test.shape

((15120, 55), (565892, 54))

## Pre-Processing

### Replace some col names

In [7]:
# Mapping from the original column names to the shorter names used in the
# rest of the notebook (wilderness areas and distance features).
cols_original_values = dict(
    Wilderness_Area1="Rawah_WA",
    Wilderness_Area2="Neota_WA",
    Wilderness_Area3="Comanche_Peak_WA",
    Wilderness_Area4="Cache_la_Poudre_WA",
    Horizontal_Distance_To_Hydrology="HD_Hydrology",
    Vertical_Distance_To_Hydrology="VD_Hydrology",
    Horizontal_Distance_To_Roadways="HD_Roadways",
    Horizontal_Distance_To_Fire_Points="HD_Fire_Points",
)

# Apply the identical mapping to both frames so their column names stay aligned
train, test = (
    frame.rename(columns=cols_original_values) for frame in (train, test)
)

### Let's try to "undo" the one-hot encoding (OHE) in our data

### Normalizing

In [8]:
from sklearn.preprocessing import normalize

# Columns that are rescaled; Elevation and HD_Roadways are left untouched.
# NOTE(review): with its default arguments `normalize` rescales each *row* to
# unit L2 norm across the listed columns (it is not a per-column scaler), so
# the features get mixed together per sample - confirm this is intended.
cols_to_normalize = [
    'Aspect',
    'Slope',
    'HD_Hydrology',
    'VD_Hydrology',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'HD_Fire_Points',
]

# Same transformation for the train and the test frame
for frame in (train, test):
    frame[cols_to_normalize] = normalize(frame[cols_to_normalize])

### Delete some non-correlated features

In [9]:
# Preview the first rows of the train and test frames
display(train.head(), test.head())

Unnamed: 0,Elevation,Aspect,Slope,HD_Hydrology,VD_Hydrology,HD_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,HD_Fire_Points,Rawah_WA,Neota_WA,Comanche_Peak_WA,Cache_la_Poudre_WA,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2596,0.008102,0.000477,0.040989,0.0,510,0.035111,0.036858,0.023513,0.997552,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
1,2590,0.008976,0.000321,0.03398,-0.000962,390,0.035262,0.037666,0.024203,0.997755,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2,2804,0.022641,0.001466,0.043653,0.010587,3180,0.038115,0.038766,0.021989,0.99701,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,2785,0.024883,0.00289,0.03885,0.018943,3090,0.038208,0.038208,0.019586,0.997096,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
4,2595,0.007277,0.000323,0.02474,-0.000162,391,0.035574,0.037838,0.024255,0.998023,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5


Unnamed: 0,Elevation,Aspect,Slope,HD_Hydrology,VD_Hydrology,HD_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,HD_Fire_Points,Rawah_WA,Neota_WA,Comanche_Peak_WA,Cache_la_Poudre_WA,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,2680,0.053133,0.002101,0.0,0.0,2684,0.029418,0.03212,0.023414,0.99736,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,2683,0.0,0.001945,0.0,0.0,2654,0.030075,0.03232,0.022743,0.998764,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,2713,0.002519,0.002361,0.0,0.0,2980,0.032429,0.032744,0.021567,0.998699,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,2709,0.003761,0.002664,0.0,0.0,2950,0.032592,0.031496,0.019587,0.99877,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,2706,0.004523,0.002963,0.0,0.0,2920,0.032753,0.030414,0.017936,0.998825,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


## Feature Engineering

In [10]:
def add_feature(data):
    """Return a copy of ``data`` with nine engineered columns added.

    The input frame is no longer modified in place: the new columns are built
    with ``DataFrame.assign``, which returns a new frame. Callers that rebind
    the result (``train = add_feature(train)``) behave exactly as before.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Elevation', 'VD_Hydrology', 'HD_Hydrology',
        'HD_Fire_Points' and 'HD_Roadways'.

    Returns
    -------
    pandas.DataFrame
        ``data`` plus sums/differences of elevation and distance columns and
        the Euclidean distance to hydrology. The column names (including the
        historical spelling 'Distanse_to_Hydrolody') are kept unchanged.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    elevation = data['Elevation']
    vd_hydro = data['VD_Hydrology']
    hd_hydro = data['HD_Hydrology']
    hd_fire = data['HD_Fire_Points']
    hd_road = data['HD_Roadways']

    return data.assign(
        Ele_minus_VDtHyd=elevation - vd_hydro,
        Ele_plus_VDtHyd=elevation + vd_hydro,
        # straight-line distance built from the horizontal and vertical parts
        Distanse_to_Hydrolody=(hd_hydro ** 2 + vd_hydro ** 2) ** 0.5,
        Hydro_plus_Fire=hd_hydro + hd_fire,
        Hydro_minus_Fire=hd_hydro - hd_fire,
        Hydro_plus_Road=hd_hydro + hd_road,
        Hydro_minus_Road=hd_hydro - hd_road,
        Fire_plus_Road=hd_fire + hd_road,
        Fire_minus_Road=hd_fire - hd_road,
    )

# Add the same engineered columns to both frames
train = add_feature(train)
test = add_feature(test)

In [11]:
# Preview the first rows of both frames, including the engineered columns
display(train.head(), test.head())

Unnamed: 0,Elevation,Aspect,Slope,HD_Hydrology,VD_Hydrology,HD_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,HD_Fire_Points,Rawah_WA,Neota_WA,Comanche_Peak_WA,Cache_la_Poudre_WA,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,Ele_minus_VDtHyd,Ele_plus_VDtHyd,Distanse_to_Hydrolody,Hydro_plus_Fire,Hydro_minus_Fire,Hydro_plus_Road,Hydro_minus_Road,Fire_plus_Road,Fire_minus_Road
0,2596,0.008102,0.000477,0.040989,0.0,510,0.035111,0.036858,0.023513,0.997552,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5,2596.0,2596.0,0.040989,1.038541,-0.956563,510.040989,-509.959011,510.997552,-509.002448
1,2590,0.008976,0.000321,0.03398,-0.000962,390,0.035262,0.037666,0.024203,0.997755,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5,2590.000962,2589.999038,0.033993,1.031735,-0.963776,390.03398,-389.96602,390.997755,-389.002245
2,2804,0.022641,0.001466,0.043653,0.010587,3180,0.038115,0.038766,0.021989,0.99701,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2803.989413,2804.010587,0.044918,1.040663,-0.953357,3180.043653,-3179.956347,3180.99701,-3179.00299
3,2785,0.024883,0.00289,0.03885,0.018943,3090,0.038208,0.038208,0.019586,0.997096,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2784.981057,2785.018943,0.043222,1.035946,-0.958246,3090.03885,-3089.96115,3090.997096,-3089.002904
4,2595,0.007277,0.000323,0.02474,-0.000162,391,0.035574,0.037838,0.024255,0.998023,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5,2595.000162,2594.999838,0.024741,1.022763,-0.973282,391.02474,-390.97526,391.998023,-390.001977


Unnamed: 0,Elevation,Aspect,Slope,HD_Hydrology,VD_Hydrology,HD_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,HD_Fire_Points,Rawah_WA,Neota_WA,Comanche_Peak_WA,Cache_la_Poudre_WA,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Ele_minus_VDtHyd,Ele_plus_VDtHyd,Distanse_to_Hydrolody,Hydro_plus_Fire,Hydro_minus_Fire,Hydro_plus_Road,Hydro_minus_Road,Fire_plus_Road,Fire_minus_Road
0,2680,0.053133,0.002101,0.0,0.0,2684,0.029418,0.03212,0.023414,0.99736,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2680.0,2680.0,0.0,0.99736,-0.99736,2684.0,-2684.0,2684.99736,-2683.00264
1,2683,0.0,0.001945,0.0,0.0,2654,0.030075,0.03232,0.022743,0.998764,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2683.0,2683.0,0.0,0.998764,-0.998764,2654.0,-2654.0,2654.998764,-2653.001236
2,2713,0.002519,0.002361,0.0,0.0,2980,0.032429,0.032744,0.021567,0.998699,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2713.0,2713.0,0.0,0.998699,-0.998699,2980.0,-2980.0,2980.998699,-2979.001301
3,2709,0.003761,0.002664,0.0,0.0,2950,0.032592,0.031496,0.019587,0.99877,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2709.0,2709.0,0.0,0.99877,-0.99877,2950.0,-2950.0,2950.99877,-2949.00123
4,2706,0.004523,0.002963,0.0,0.0,2920,0.032753,0.030414,0.017936,0.998825,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2706.0,2706.0,0.0,0.998825,-0.998825,2920.0,-2920.0,2920.998825,-2919.001175


## Save Model Results

In [12]:
# Create a dataframe to store the model results to compare later

def save_results(algorithm, acc_train, acc_test, sc, model_results):
    """Append one model's metrics to the results table, display and return it.

    ``DataFrame.append`` (used previously) was deprecated in pandas 1.4 and
    removed in pandas 2.0, so the new row is added with ``pd.concat``.

    Parameters
    ----------
    algorithm : str
        Label identifying the model.
    acc_train, acc_test : float
        Accuracy on the training split and on the held-out split.
    sc : array-like with ``.mean()`` and ``.std()``
        Cross-validation scores.
    model_results : pandas.DataFrame
        Existing results table; it is not modified in place.

    Returns
    -------
    pandas.DataFrame
        A new table with the extra row. Note that the column
        'cross_val_score_std' stores ``2 * sc.std()``, not the plain
        standard deviation.
    """
    new_row = pd.DataFrame([{
        'algorithm': algorithm,
        'accuracy_train': acc_train,
        'accuracy_test': acc_test,
        'cross_val_score_mean': sc.mean(),
        'cross_val_score_std': sc.std() * 2,
    }])

    model_results = pd.concat([model_results, new_row], ignore_index=True)
    display(model_results)
    return model_results


# Empty results table: one column per metric, no rows yet
result_columns = [
    'algorithm',
    'accuracy_train',
    'accuracy_test',
    'cross_val_score_mean',
    'cross_val_score_std',
]
model_results = pd.DataFrame(columns=result_columns)
model_results.head()

Unnamed: 0,algorithm,accuracy_train,accuracy_test,cross_val_score_mean,cross_val_score_std


## Confusion Matrix

In [13]:
def plot_confusion_matrix(cm, classes, normalized=True, cmap='bone'):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    cm : numpy.ndarray
        Square matrix of counts.
    classes : sequence
        Tick labels used for both axes; must be in the same order as the
        rows/columns of ``cm``.
    normalized : bool, default True
        If True, cell colours show each count divided by its row total;
        if False, colours show the raw counts.
    cmap : str, default 'bone'
        Colormap passed to ``sns.heatmap``.

    Notes
    -----
    Cells are always annotated with the raw counts from ``cm``. The function
    creates a new 7x6 figure and returns None.
    """
    plt.figure(figsize=[7, 6])
    color_values = cm
    if normalized:
        counts = cm.astype('float')
        row_totals = counts.sum(axis=1)[:, np.newaxis]
        # Rows without any sample stay at 0 instead of becoming NaN (0 / 0)
        color_values = np.divide(
            counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0
        )
    # Drawn for both modes: the call used to live inside the `if`, so
    # normalized=False produced an empty figure.
    sns.heatmap(color_values, annot=cm, fmt='g', xticklabels=classes, yticklabels=classes, cmap=cmap)


## Train - Test Split

In [14]:
cols_to_train = []

# Every column except the target is used as a feature.
# Note: Index.difference returns the names sorted, so the columns of X are in
# alphabetical order rather than in the order they have in `train`.
feature_columns = train.columns.difference(['Cover_Type'])
X = train[feature_columns]
y = train['Cover_Type']

display(X.head(), y.head())

Unnamed: 0,Aspect,Cache_la_Poudre_WA,Comanche_Peak_WA,Distanse_to_Hydrolody,Ele_minus_VDtHyd,Ele_plus_VDtHyd,Elevation,Fire_minus_Road,Fire_plus_Road,HD_Fire_Points,HD_Hydrology,HD_Roadways,Hillshade_3pm,Hillshade_9am,Hillshade_Noon,Hydro_minus_Fire,Hydro_minus_Road,Hydro_plus_Fire,Hydro_plus_Road,Neota_WA,Rawah_WA,Slope,Soil_Type1,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type2,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type3,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type4,Soil_Type40,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,VD_Hydrology
0,0.008102,0,0,0.040989,2596.0,2596.0,2596,-509.002448,510.997552,0.997552,0.040989,510,0.023513,0.035111,0.036858,-0.956563,-509.959011,1.038541,510.040989,0,1,0.000477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
1,0.008976,0,0,0.033993,2590.000962,2589.999038,2590,-389.002245,390.997755,0.997755,0.03398,390,0.024203,0.035262,0.037666,-0.963776,-389.96602,1.031735,390.03398,0,1,0.000321,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.000962
2,0.022641,0,0,0.044918,2803.989413,2804.010587,2804,-3179.00299,3180.99701,0.99701,0.043653,3180,0.021989,0.038115,0.038766,-0.953357,-3179.956347,1.040663,3180.043653,0,1,0.001466,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.010587
3,0.024883,0,0,0.043222,2784.981057,2785.018943,2785,-3089.002904,3090.997096,0.997096,0.03885,3090,0.019586,0.038208,0.038208,-0.958246,-3089.96115,1.035946,3090.03885,0,1,0.00289,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.018943
4,0.007277,0,0,0.024741,2595.000162,2594.999838,2595,-390.001977,391.998023,0.998023,0.02474,391,0.024255,0.035574,0.037838,-0.973282,-390.97526,1.022763,391.02474,0,1,0.000323,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.000162


0    5
1    5
2    2
3    2
4    5
Name: Cover_Type, dtype: int64

In [15]:
# Hold out 33% of the labelled data for evaluation (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=random_seed,
)

tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((10130, 63), (4990, 63), (10130,), (4990,))

## Pre-Processing

### Label Encoding

In [16]:
# Preview the first rows of the training and hold-out feature matrices
display(X_train.head(), X_test.head())

Unnamed: 0,Aspect,Cache_la_Poudre_WA,Comanche_Peak_WA,Distanse_to_Hydrolody,Ele_minus_VDtHyd,Ele_plus_VDtHyd,Elevation,Fire_minus_Road,Fire_plus_Road,HD_Fire_Points,HD_Hydrology,HD_Roadways,Hillshade_3pm,Hillshade_9am,Hillshade_Noon,Hydro_minus_Fire,Hydro_minus_Road,Hydro_plus_Fire,Hydro_plus_Road,Neota_WA,Rawah_WA,Slope,Soil_Type1,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type2,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type3,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type4,Soil_Type40,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,VD_Hydrology
3388,0.110514,1,0,0.0,2149.0,2149.0,2149,-779.080931,780.919069,0.919069,0.0,780,0.094726,0.27854,0.236815,-0.919069,-780.0,0.919069,780.0,0,0,0.021426,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0
10782,0.170271,0,1,0.067934,2605.979165,2606.020835,2606,-1444.053093,1445.946907,0.946907,0.06466,1445,0.158776,0.109922,0.178892,-0.882247,-1444.93534,1.011567,1445.06466,0,0,0.021553,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.020835
13624,0.001284,0,1,0.009627,3015.0,3015.0,3015,-1979.006196,1980.993804,0.993804,0.009627,1980,0.049097,0.067708,0.072522,-0.984177,-1979.990373,1.003431,1980.009627,0,0,0.002246,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
7855,0.012356,0,1,0.280927,3404.924033,3405.075967,3405,-3131.052245,3132.947755,0.947755,0.27046,3132,0.060407,0.097933,0.097018,-0.677295,-3131.72954,1.218215,3132.27046,0,0,0.005949,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.075967
11884,0.377036,0,0,0.122413,3234.994339,3235.005661,3235,-4595.172331,4596.827669,0.827669,0.122282,4596,0.18682,0.23211,0.26268,-0.705386,-4595.877718,0.949951,4596.122282,0,1,0.006793,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.005661


Unnamed: 0,Aspect,Cache_la_Poudre_WA,Comanche_Peak_WA,Distanse_to_Hydrolody,Ele_minus_VDtHyd,Ele_plus_VDtHyd,Elevation,Fire_minus_Road,Fire_plus_Road,HD_Fire_Points,HD_Hydrology,HD_Roadways,Hillshade_3pm,Hillshade_9am,Hillshade_Noon,Hydro_minus_Fire,Hydro_minus_Road,Hydro_plus_Fire,Hydro_plus_Road,Neota_WA,Rawah_WA,Slope,Soil_Type1,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type2,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type3,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type4,Soil_Type40,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,VD_Hydrology
6077,0.102027,1,0,0.062481,2431.998612,2432.001388,2432,-483.038725,484.961275,0.961275,0.062466,484,0.083287,0.166575,0.163798,-0.898809,-483.937534,1.02374,484.062466,0,0,0.011105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.001388
7894,0.047893,0,1,0.075233,2652.984036,2653.015964,2653,-2277.014836,2278.985164,0.985164,0.07352,2278,0.044952,0.102507,0.094525,-0.911644,-2277.92648,1.058684,2278.07352,0,0,0.005882,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.015964
6820,0.204363,0,0,0.185695,3541.91566,3542.08434,3542,-3217.060578,3218.939422,0.939422,0.165437,3218,0.133647,0.080448,0.128457,-0.773985,-3217.834563,1.104859,3218.165437,0,1,0.020112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.08434
1593,0.315909,0,0,0.03569,3190.986745,3191.013255,3191,-4964.137327,4965.862673,0.862673,0.033137,4965,0.23417,0.178941,0.259575,-0.829536,-4964.966863,0.895811,4965.033137,0,1,0.022092,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.013255
11986,0.107342,1,0,0.357038,2159.838988,2160.161012,2160,-239.188229,240.811771,0.811771,0.31867,240,0.025158,0.377373,0.23481,-0.493101,-239.68133,1.130441,240.31867,0,0,0.062057,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.161012


## Modeling

#### RandomForestClassifier

In [None]:
%%time 

# Candidate forest sizes: four random draws from each of three ranges.
# A local, seeded generator makes the candidates independent of how often the
# global `random` state has been consumed (e.g. when this cell is re-run).
n_estimators_rng = random.Random(random_seed)
n_estimators_candidates = (
    [n_estimators_rng.randint(10, 50) for _ in range(4)]
    + [n_estimators_rng.randint(50, 200) for _ in range(4)]
    + [n_estimators_rng.randint(200, 1000) for _ in range(4)]
)

param_grid = {
    'min_samples_leaf': [1, 4, 7],
    'min_samples_split': [2, 3, 4, 5],
    'max_depth': list(range(1, 60, 4)) + [None],
    'max_features': ["sqrt", "log2", None],
    # drop repeated draws so the same configuration is not fitted twice
    'n_estimators': list(dict.fromkeys(n_estimators_candidates)),
}

# NOTE(review): this is an exhaustive search over up to
# 3 * 4 * 16 * 3 * 12 = 6912 combinations, each fitted on 5 folds - expect a
# very long run time; consider a smaller grid or a randomized search.
# The n_estimators value given here is overridden by the grid.
clf_random_forest = RandomForestClassifier(n_estimators=500, random_state=random_seed)

# refit must be the boolean True (it was the string 'True');
# n_jobs=-1 spreads the independent fits over all available cores.
grid_search = GridSearchCV(clf_random_forest, param_grid, cv=5, refit=True, n_jobs=-1)
grid_search.fit(X_train, y_train)

print('best_score_: {}'.format(grid_search.best_score_))
print('best_params_: {}'.format(grid_search.best_params_))

In [None]:
# Calculate the accuracy of the best estimator found by the grid search.
# The search object is named `grid_search`; the previous reference to
# `rf_grid_search` was an undefined name and raised a NameError.

clf_random_forest = grid_search.best_estimator_

# accuracy in train
y_train_pred = clf_random_forest.predict(X_train)
acc_train = accuracy_score(y_train, y_train_pred) * 100
print('Accuracy in train data: {:0.2f}%'.format(acc_train))

# accuracy in test (the hold-out part of the labelled data)
y_test_pred = clf_random_forest.predict(X_test)
acc_test = accuracy_score(y_test, y_test_pred) * 100
print('Accuracy in test data: {:0.2f}%'.format(acc_test))

# Classification Report
print('\nClassification Report in test dataset')
print(classification_report(y_test, y_test_pred))

In [None]:
# 10-fold cross-validated accuracy of the selected forest on all labelled data;
# the reported interval is the mean plus/minus two standard deviations.
sc_random_forest = cross_val_score(clf_random_forest, X, y, cv=10, scoring='accuracy')
cv_mean = sc_random_forest.mean()
cv_spread = sc_random_forest.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_mean, cv_spread))

In [None]:
# Use one explicit, sorted label list for both the matrix and the tick labels.
# `Series.unique()` returns labels in order of first appearance, which does
# not match the sorted row/column order of `confusion_matrix`, so the axes
# were previously labelled with the wrong classes.
class_labels = np.sort(train['Cover_Type'].unique())
cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)
plot_confusion_matrix(cm, class_labels)

In [None]:
# Feature importance: horizontal bar chart of the 25 largest importances
feat_importances = pd.Series(clf_random_forest.feature_importances_, index=X_train.columns)
top_features = feat_importances.nlargest(25)
top_features.plot.barh()
plt.show()

## Submission 

In [None]:
# Predict the competition test frame.
# The model was fitted on X_train, whose columns are in alphabetical order,
# while `test` keeps its original column order. Selecting the columns by name
# guarantees every feature is fed to the model in the position it was trained
# on (a positional mismatch yields wrong predictions or an error).
y_pred = clf_random_forest.predict(test[X_train.columns])

# Fill the sample submission with the predicted classes
sample_submission['Cover_Type'] = y_pred

# Let's see the head of our submission file
display(sample_submission.head())

# Analyse the % of Cover Types predicted
display(sample_submission['Cover_Type'].value_counts(normalize=True)*100)

# Save the submission file
file_name = '3-sub_randomforest2'
sample_submission.to_csv('../../submissions/{}.csv'.format(file_name), index=False)