# Random Forest
- Random forests, or random decision forests, are an ensemble learning method for classification, regression, and other tasks that operates by constructing a multitude of decision trees at training time and combining their predictions (the majority vote for classification, the mean for regression).

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the Otto training CSV (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer="./Data/otto_train.csv")
# Preview the first five rows as the cell output.
data.head(n=5)

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [3]:
# DataFrame.shape is (number of rows, number of columns).
# nCar: number of rows (samples); nVar: number of columns (variables).
nCar, nVar = data.shape
print('nCar: %d' % nCar, 'nVar: %d' % nVar)

nCar: 61878 nVar: 95


In [4]:
# Remove the 'id' column so it is not picked up as a feature later on.
# errors='ignore' keeps this cell idempotent: `data` is rebound in place, so a
# second run of the cell would otherwise raise KeyError because 'id' is already gone.
data = data.drop(columns=['id'], errors='ignore')

In [5]:
# Convert the string class labels to integer codes 1..9.
# The position of each label in this list (starting at 1) is its code.
class_labels = ["Class_1",
                "Class_2",
                "Class_3",
                "Class_4",
                "Class_5",
                "Class_6",
                "Class_7",
                "Class_8",
                "Class_9"]
mapping_dict = {name: code for code, name in enumerate(class_labels, start=1)}
# Look up every target value; a label that is not in mapping_dict raises KeyError.
after_mapping_target = data['target'].apply(lambda label: mapping_dict[label])

In [6]:
# Separate features from the target, then split into train and test sets.
# Every column except 'target' is a feature (Index.difference returns them sorted).
feature_columns = data.columns.difference(['target']).tolist()
X = data.loc[:, feature_columns]  # feature matrix
y = after_mapping_target  # integer-encoded target
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Print the shape of each part to confirm the 8:2 split.
print(*(part.shape for part in (train_x, test_x, train_y, test_y)))

(49502, 93) (12376, 93) (49502,) (12376,)


In [8]:
# Baseline model: a small, shallow random forest.
# RandomForestClassifier and accuracy_score are imported in the notebook's
# top import cell, so every dependency is declared in one place.
random_forest_model1 = RandomForestClassifier(n_estimators = 20, # number of trees
                                             max_depth = 5, # maximum depth of each tree
                                             random_state = 42) # fixed seed for repeatable results
model1 = random_forest_model1.fit(train_x, train_y) # fit on the training split
predict1 = model1.predict(test_x) # predict labels for the held-out test split
# Report test accuracy as a percentage with two decimals.
print("Accuracy: %.2f" % (accuracy_score(test_y, predict1) * 100), "%")

Accuracy: 60.16 %


In [9]:
# Use more trees: 300 estimators at the same maximum depth of 5.
random_forest_model1 = RandomForestClassifier(
    n_estimators=300,  # number of trees
    max_depth=5,  # maximum depth of each tree
    random_state=42,  # fixed seed for repeatable results
)
model1 = random_forest_model1.fit(train_x, train_y)  # fit on the training split
predict1 = model1.predict(test_x)  # predict labels for the test split
# Test accuracy expressed as a percentage.
accuracy_pct = accuracy_score(test_y, predict1) * 100
print("Accuracy: %.2f" % accuracy_pct, "%")

Accuracy: 61.73 %


In [10]:
# Increase the depth: keep 300 trees but allow each to grow to depth 20.
random_forest_model1 = RandomForestClassifier(
    n_estimators=300,  # number of trees
    max_depth=20,  # maximum depth of each tree
    random_state=42,  # fixed seed for repeatable results
)
model1 = random_forest_model1.fit(train_x, train_y)  # fit on the training split
predict1 = model1.predict(test_x)  # predict labels for the test split
# Test accuracy expressed as a percentage.
accuracy_pct = accuracy_score(test_y, predict1) * 100
print("Accuracy: %.2f" % accuracy_pct, "%")

Accuracy: 78.09 %
