In [1]:
import numpy as np
import pandas as pd
import random

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from scipy.stats import mode
from sklearn.metrics import f1_score, confusion_matrix, classification_report, roc_curve, auc

%matplotlib inline

In [2]:
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [3]:
# rename column headers

col_names = {
    "sepal length (cm)" : "sepal_length",
    "sepal width (cm)" : "sepal_width",
    "petal length (cm)" : "petal_length",
    "petal width (cm)" : "petal_width"
}

df = df.rename(columns=col_names)

In [4]:
# set target
target = iris.target_names[iris.target]
target

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolo

In [5]:
# split data
X_train, X_test, y_train, y_test = train_test_split(
    df, target, test_size=0.2, stratify=target)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(120, 4)
(30, 4)
(120,)
(30,)


In [6]:
X_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
51,6.4,3.2,4.5,1.5
20,5.4,3.4,1.7,0.2
17,5.1,3.5,1.4,0.3
14,5.8,4.0,1.2,0.2
19,5.1,3.8,1.5,0.3


## Bootstrapping Function

In [7]:
def bootstrapping(df, num_samples):
    indices = np.random.randint(low=0, high=len(df), 
                                size=num_samples)
    return list(indices)

In [8]:
indices = bootstrapping(df, num_samples=100)
df.iloc[indices]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
13,4.3,3.0,1.1,0.1
93,5.0,2.3,3.3,1.0
88,5.6,3.0,4.1,1.3
93,5.0,2.3,3.3,1.0
50,7.0,3.2,4.7,1.4
...,...,...,...,...
135,7.7,3.0,6.1,2.3
86,6.7,3.1,4.7,1.5
21,5.1,3.7,1.5,0.4
98,5.1,2.5,3.0,1.1


## Random Forest Algorithm

Decision Tree Parameters: 
- criterion='gini',
- max_depth=None,
- min_samples_split=2,
- min_samples_leaf=1,    
- min_weight_fraction_leaf=0.0,
- max_features=None,
- max_leaf_nodes=None,
- min_impurity_decrease=0.0,
- min_impurity_split=None

In [9]:
class RandomForestClassifier():
    
    # constructor
    def __init__( self, n_estimators=100, criterion='gini', 
        max_depth=None, min_samples_split=2, min_samples_leaf=1, 
        min_weight_fraction_leaf=0.0, max_features='auto', 
        max_leaf_nodes=None, min_impurity_decrease=0.0,
        min_impurity_split=None):
        
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        
        self.forest = []
    
    # bootstrapping function
    def get_bootstrap(self, num_samples):
        indices = np.random.randint(low=0, high=num_samples, 
                                    size=num_samples)
        return list(indices)
    
    # sklearn APIs: fit, predict
    def fit(self, X, y):
        # number of samples
        num_samples, _ = X.shape
        
        for i in range(self.n_estimators):
            tree = DecisionTreeClassifier(
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                max_features=self.max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_impurity_decrease=self.min_impurity_decrease,
                min_impurity_split=self.min_impurity_split
            )
            
            self.forest.append(tree)
            indices = self.get_bootstrap(num_samples)
            tree.fit(X.iloc[indices], y[indices])
    
    def predict(self, X):
        num_examples, _ = X.shape
        predictions = {}
        for i in range(num_examples):
            predictions[i] = []

        for tree in self.forest:
            cur_pred = tree.predict(X)
            for i, val in enumerate(cur_pred):
                predictions[i].append(val)
        
        results = []
        for key in predictions.keys():
            results.append(mode(predictions[key]).mode[0])
        return np.array(results)

In [10]:
rf = RandomForestClassifier()

In [11]:
rf.fit(X_train, y_train)

In [12]:
y_test_pred = rf.predict(X_test)

In [13]:
pd.crosstab(y_test, y_test_pred, 
            rownames=['Actual Species'], colnames=['Predicted Species'])

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,10,0,0
versicolor,0,9,1
virginica,0,2,8
