In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

First lets load our data

In [2]:
data = pd.read_csv("./data/iris.csv")
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


We will represent species (or classes) as integers instead of their names. To do this we will use sci-kit's LabelEncoder

In [3]:
label_enc = LabelEncoder()
data["species"] = label_enc.fit_transform(data["species"])

In [4]:
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


Lets check how many classes we have and number of samples belonging to each class. As you can see we have a uniformly distributed data. Each class has 50 samples

In [5]:
print(np.unique(data["species"],return_counts=True))

(array([0, 1, 2]), array([50, 50, 50]))


Random forest classifier is an ensemble model. Ensemble models use multiple models to get better performance. Random forest classifier consists of multiple decision trees. We explain deision trees on another notebook in detail. So we will skip decision tree related parts which you can find in a seperate notebook. 

In [6]:
def compute_gini(data):
    _ , counts = np.unique(data,return_counts=True)

    gini = (counts/len(data))**2

    gini = 1 - gini.sum()
    return gini

def weighted_gini(data): # a list of arrays (two element list)
    w_gini = []
    n = 0
    
    for i in data:
        gini = compute_gini(i)
        w_gini.append(gini * len(i)) 
        n+=len(i)
        
    w_gini = np.array(w_gini)/n
    return w_gini.sum()

In [7]:
def gini_select(X,Y): # X is data for only a single feature
    X2 = X.copy() # we create a copy of X just for finding the thresholds (we don't want to sort the original data)
    X2 = X2.sort_values()
 
    thresholds = X2.rolling(2).mean()[1:] # moving average, since first value is NAN we remove it

    unique_vals , _ = np.unique(thresholds,return_counts=True) # get each unique threshold value
    
    gini_list = []

    for tresh in unique_vals: #iterate over each threshold
        # seperate the data using threshold
        left_d = np.where(X<tresh)[0] # actually we can iterate over thre X once and use if<thresh,else to divide data
        right_d = np.where(X>=tresh)[0] # and not use np.where twice but numpy is faster than regular for loop

        gini = weighted_gini([Y.iloc[left_d],Y.iloc[right_d]]) #select elements of Y (labels) and calculate gini
        gini_list.append(gini)

    arg_min = np.argmin(gini_list) #select lowest gini val
    thres_val = unique_vals[arg_min] # select threshold for lowest gini
    gini = gini_list[arg_min]
    return thres_val,gini

In [8]:
def find_split_feature(X,Y): # data is a pandas dataframe
    feature_select_list = []

    for feature_name in X.columns:
        split_val,gini = gini_select(X[feature_name],Y)
        feature_select_list.append([feature_name,gini,split_val])
  
    feature_select_list = np.array(feature_select_list)
    arg_min = np.argmin(feature_select_list[:,1])
    
    feature_name = feature_select_list[arg_min,0]
    threshold = feature_select_list[arg_min,2]
    gini_val = feature_select_list[arg_min,1]
    
    #because we can not have np arrays with different typed
    #elements numpy changed everything into string (due to feature name being a string) now we have to revert it
    return feature_name,float(threshold),float(gini_val)     

In [9]:
class Node():
    def __init__(self):
        self.left = None
        self.right = None
        self.feature = None
        self.threshold = None
        self.label = None

    def __repr__(self):
        if self.feature == None:
            return "Leaf->Label: {}".format(self.label)
        else:
            return "Node->Feature: {} Threshold: {}".format(self.feature,self.threshold)

In [10]:
class DesicionTree():
    def __init__(self):
        self.root = Node()

    def build_tree(self,node,X,Y): # data is pandas dataframe

    
        gini = compute_gini(Y)
        if gini!=0:
            feature_name,threshold, gini = find_split_feature(X,Y)

            d = X[feature_name]
            left_ind = np.where(d<threshold)[0]
            right_ind = np.where(d>=threshold)[0]

            if len(left_ind) !=0 and len(right_ind) != 0:
                node.feature = feature_name
                node.threshold = threshold

                node.left = Node()
                self.build_tree(node.left,X.iloc[left_ind,:],Y.iloc[left_ind])

                node.right = Node()
                self.build_tree(node.right,X.iloc[right_ind,:],Y.iloc[right_ind])
            else:
                unique_vals , counts = np.unique(Y,return_counts=True)
                max_count = np.argmax(counts)
                node.label = unique_vals[max_count]
        else:
            unique_vals , _ = np.unique(Y,return_counts=True)
            node.label = unique_vals[0]
            
    def predict(self,node,data): # data is a pandas dataframe with a single sample
        if node.label==None:
            d = data[node.feature]

            if d<node.threshold:

                return self.predict(node.left,data)
            else:

                return self.predict(node.right,data)
        else:

            return node.label
        
    def predict_dataset(self,data): # data is a pandas dataframe
        output = []
        for i in range(len(data)):
            output.append(self.predict(self.root,data.iloc[i,:]))
        return np.array(output)
    
            

In [11]:
class Randomforest():
    def __init__(self,n_trees = 1):
        
        self.n_trees = n_trees #number of trees (must be at least 1)
        self.trees = [DesicionTree() for i in range(self.n_trees)]
        self.tree_features = []
        
    def bootstrap(self,X,Y,min_feature_num,max_feature_num):
        indices = np.random.randint(0,len(X),len(X))
        n_features = np.random.randint(min_feature_num,max_feature_num)
        
        features = np.random.choice(np.arange(len(X.columns)),n_features,replace = False) #no feature repetition

        return X.iloc[indices,features],Y.iloc[indices]
    
    def train(self,X,Y,min_feature_num,max_feature_num): 
        for i in range(len(self.trees)):
            X_boot, Y_boot = self.bootstrap(X,Y,min_feature_num,max_feature_num)
            self.tree_features.append(X_boot.columns)
            self.trees[i].build_tree(self.trees[i].root,X_boot,Y_boot)
            
    def predict(self,X):
        preds = []
        for i in range(len(self.trees)):
            preds.append(self.trees[i].predict(self.trees[i].root,X))
        
        unique_vals , counts = np.unique(preds,return_counts=True,axis=0)
        return unique_vals[np.argmax(counts)]
    def predict_dataset(self,X):
        preds = []
        for i in range(len(X)):
            preds.append(self.predict(X.iloc[i,:]))
        return preds

Now we will define our Randomforest class. As we have said before random forest consists of multiple trees. We hold these trees in a list. The idea of random forest is simple. We create multiple trees. However we don't train each tree with the same data. In this step we use a method called bootstrapping. Bootstrapping is randomly sampling the data. So we resample the data. Two important things to note are: 1) We don't use all the features in  the new version of our data. In iris dataset we have 4 features. When we use bootstrapping we can have 1 to 4 features in our new data. 2) A sample can appear more than once in the new data. We use bootstrapping and train each tree with a different resampled version of our dataset. This leads to having different trees. Now we have multiple trees but how are we going to make a prediction? We use majority voting. Each tree classifies the input sample. To give the final decision we take the most predicted class by all the trees. This whole process is called bagging (bootstrapping + aggregation). We have four methods in our Randomforest class. We will go over them one by one.  

bootstrap method randomly resamples data and selects 2 to 4 features and returns bootsrapped version of data.

train method is used for building our trees. We go over each tree and build it.

predict method is for classification of a single sample. Since we have multiple trees we make each tree classify the input. Our final prediction is the most predicted class by all the trees.

predict_dataset method takes an input with multiple samples and classfies each sample

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.33, random_state=42)

Now lets divide our data into train and test sets and look the performance on test set.

In [13]:
np.random.seed(42)
random_f = Randomforest(10) # lets have 10 trees
random_f.train(X_train,y_train,2,4)

In [14]:
output = random_f.predict_dataset(X_test)

In [15]:
100*(y_test == output).sum()/len(y_test)

98.0

You can observe that each tree uses different features.

In [16]:
random_f.tree_features

[Index(['sepal_width', 'petal_length', 'petal_width'], dtype='object'),
 Index(['petal_width', 'sepal_length'], dtype='object'),
 Index(['petal_length', 'sepal_length', 'petal_width'], dtype='object'),
 Index(['petal_length', 'petal_width'], dtype='object'),
 Index(['sepal_width', 'petal_length', 'sepal_length'], dtype='object'),
 Index(['petal_width', 'sepal_width'], dtype='object'),
 Index(['petal_width', 'petal_length', 'sepal_length'], dtype='object'),
 Index(['sepal_width', 'petal_width', 'petal_length'], dtype='object'),
 Index(['petal_length', 'sepal_width'], dtype='object'),
 Index(['petal_width', 'sepal_width', 'petal_length'], dtype='object')]