# Random Forest

- Multiple Decision Trees
- Random Sub-Sampling of features
- Bootstrapping
- Averaging the results
- Reduces Variance by training on diferent sample data
- Limit overfitting without increasing bias

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
df = pd.read_csv("/content/haberman.csv", header=None)

In [3]:
df.columns = ['Age','OP_year','Nodes','output']

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Age      306 non-null    int64
 1   OP_year  306 non-null    int64
 2   Nodes    306 non-null    int64
 3   output   306 non-null    int64
dtypes: int64(4)
memory usage: 9.7 KB


In [5]:
def entropy(col):
  counts = np.unique(col,return_counts=True)
  ent = 0.0
  for ix in counts[1]:
    p = ix/col.shape[0]
    ent += (-1.0*p*np.log2(p))
  return ent

In [6]:
def divide_data(x_data,nkey,mval):
  x_right = pd.DataFrame([],columns=x_data.columns)
  x_left = pd.DataFrame([],columns=x_data.columns)
  for ix in range(x_data.shape[0]):
    val = x_data[nkey].loc[ix]
    if val >= mval:
      x_right = x_right.append(x_data.iloc[ix])
    else:
      x_left = x_left.append(x_data.iloc[ix])
  return x_right,x_left

In [7]:
def info_gain(x_data,nkey,mval):
  right,left = divide_data(x_data,nkey,mval)

  l = float(left.shape[0])/x_data.shape[0]
  r = float(right.shape[0])/x_data.shape[0]

  if left.shape[0]==0 or right.shape[0]==0:
    return -1e5
  i_gain = entropy(x_data.output) - (l * entropy(left.output) + r*entropy(right.output))
  return i_gain

In [68]:
class DecisionTree:
    
    def __init__(self,depth=0,max_depth=5,n_feat=None):
        self.left = None
        self.right = None
        self.nkey = None
        self.mval = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None
        self.n_feat = n_feat
    
    def train(self,X_train):
        features = ['Age','OP_year','Nodes']
        info_gains = []
        for i in features:
            i_gain = info_gain(X_train,i,X_train[i].mean())
            info_gains.append(i_gain)
        self.nkey = features[np.argmax(info_gains)]
        self.mval = X_train[self.nkey].mean()
        data_left,data_right = divide_data(X_train,self.nkey,self.mval)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)
        if data_left.shape[0] == 0 or data_right.shape[0] ==0 :
            if X_train.output.mean() >=0.5:
                self.target = 1
            else:
                self.target = 2
            return
        if self.depth >= self.max_depth:
            if X_train.output.mean() >=0.5:
                self.target = 1
            else:
                self.target = 2
            return
        self.left = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.left.train(data_left)
        
        self.right = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.right.train(data_right)
        if X_train.output.mean() >=0.5:
            self.target = 1
        else:
            self.target = 2
        return
    
    def predict(self,test):
        if test[self.nkey]>self.mval:
            if self.right is None:
                return self.target
            return self.right.predict(test)
        else:
            if self.left is None:
                return self.target
            return self.left.predict(test)

In [88]:
class RandomForest:
    def __init__(self,n_trees=5,n_bootstrap=100,max_depth=5,n_feat=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.forest = []
        self.n_bootstrap=n_bootstrap
        self.n_feat = n_feat

    def bootstraping(X,n_bootstrap):
      index = np.random.randint(low=0, high=len(X), size=n_bootstrap)
      data = X.iloc[index]
      data = data.reset_index(drop=True)
      return data
        
    def trainrf(self,X):
        self.forest = []
        for i in range(self.n_trees):
            tree = DecisionTree(max_depth=self.max_depth,n_feat=self.n_feat)
            x = bootstraping(X,n_bootstrap=self.n_bootstrap)
            tree.train(x)
            self.forest.append(tree)

    def predict(self,X):
        tree_predicted = np.array([tree.predict(X) for tree in self.forest])
        return tree_predicted

In [60]:
split = int(0.75*df.shape[0])
train_data = df[:split]
test_data = df[split:]
test_data = test_data.reset_index(drop = True)

In [89]:
rf = RandomForest()

In [90]:
rf.trainrf(train_data)

In [100]:
y_pred = []
for ix in range(test_data.shape[0]):
  y_pred.append(np.bincount(np.array(rf.predict(test_data.loc[ix]))).argmax())

In [102]:
print("Accuracy: ",np.mean(y_pred == test_data['output'])*100)

Accuracy:  71.42857142857143
