# Building a basic Random Forest

The Random Forest model is a popular bagging ensemble method. It combines many decision tree classifiers or regressors as the "base models" to make predictions.

By building this ourselves we will get to see the internals of exactly what is going on in a bagging ensemble model.

---

### Construction of the RF

The Random Forest classifier is built such that:

1. Multiple internal decision tree classifiers will be built as the base models.
2. For each base model, the data will be resampled with replacement, as in bootstrapping.
3. Each decision tree will be fit on its own bootstrapped sample of the data.
4. To predict, each internal base model will be passed the new data and will make its own predictions. The final output will be a majority vote across the base models for the class.

---



In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats

plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
# Load the wine-quality table used throughout the notebook.
# NOTE(review): absolute, machine-specific path -- adjust to wherever the
# CSV lives locally. Presumably the merged red + white wine data (later cells
# read a `red_wine` column); confirm against the file.
wine_csv_path = '/Users/tlee010/desktop/DSI-SF-2-timdavidlee/datasets/wine_quality/winequality_merged.csv'
wine = pd.read_csv(wine_csv_path)

---

### 1. Create the random forest class

Keyword arguments:

    n_estimators
    max_depth
    max_features
    

In [9]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [57]:
class RandomForest(object):
    """Minimal bagging ensemble of decision-tree classifiers.

    Each base model is a ``DecisionTreeClassifier`` fit on its own bootstrap
    resample (rows drawn with replacement) of the training data.

    Parameters
    ----------
    n_estimators : int
        Number of decision trees to build in ``fit``.
    max_depth : int or None
        Passed through to every ``DecisionTreeClassifier``.
    max_features : int, float, str or None
        Passed through to every ``DecisionTreeClassifier``.
    """

    def __init__(self, n_estimators=3, max_depth=None, max_features=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        # Fitted trees, keyed by estimator number.
        self.base_estimators = {}

    def _bootstrap_sample(self, X, y):
        """Return ``(Xr, yr)``: X and y resampled row-wise with replacement.

        The sample has the same number of rows as ``X``. Rows are selected by
        *position*, so a pandas ``y`` with a non-default index stays aligned
        with ``X``.
        """
        n_rows = len(X)
        random_indices = np.random.choice(n_rows, size=n_rows, replace=True)

        if hasattr(X, 'iloc'):
            # pandas input: keep it a DataFrame so column names survive.
            Xr = X.iloc[random_indices, :]
        else:
            Xr = np.asarray(X)[random_indices]
        # np.asarray drops any pandas index, making the lookup positional.
        yr = np.asarray(y)[random_indices]
        return Xr, yr

    def _make_base_estimator(self, X, y, estimator_number=1):
        """Fit one tree on a bootstrap sample and store it.

        The tree is stored in ``self.base_estimators[estimator_number]`` and
        its accuracy on its own bootstrap sample is printed.
        """
        Xr, yr = self._bootstrap_sample(X, y)

        dtc = DecisionTreeClassifier(max_depth=self.max_depth,
                                     max_features=self.max_features)
        dtc.fit(Xr, yr)
        self.base_estimators[estimator_number] = dtc
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(dtc.score(Xr, yr))

    def fit(self, X, y):
        """Fit ``n_estimators`` trees, each on a fresh bootstrap sample."""
        # Start clean so trees from an earlier fit do not linger.
        self.base_estimators = {}
        for i in range(self.n_estimators):
            self._make_base_estimator(X, y, i)

In [58]:
# X = pd.DataFrame({'a':[1,2,3,4,5,6,7,8]})
# y = [1,0,1,0,1,0,1,0]

# Target: the `red_wine` column (presumably a 0/1 red-vs-white flag -- confirm
# against the CSV). Features: three columns of the wine table.
y = wine.red_wine.values
X = wine[['quality', 'alcohol', 'chlorides']]
# %-formatting prints the same "(rows, cols) n" text on Python 2 and 3.
print("%s %s" % (X.shape, len(y)))

# Three shallow trees; max_features=3 equals the number of selected columns.
rf = RandomForest(n_estimators=3, max_depth=3, max_features=3)
rf.fit(X, y)


(6497, 3) 6497
0.932738186855
0.931660766508
0.9318146837


---

### 2. Write a base model creator function

---

### 3. Write a data bootstrapping function

---

### 4. Write the fit function

---

### 5. Write the predict function

In [61]:
class RandomForest(object):
    """Minimal bagging ensemble of decision-tree classifiers.

    Each base model is a ``DecisionTreeClassifier`` fit on its own bootstrap
    resample (rows drawn with replacement) of the training data. Predictions
    are a majority vote across the fitted trees.

    Parameters
    ----------
    n_estimators : int
        Number of decision trees to build in ``fit``.
    max_depth : int or None
        Passed through to every ``DecisionTreeClassifier``.
    max_features : int, float, str or None
        Passed through to every ``DecisionTreeClassifier``.
    """

    def __init__(self, n_estimators=3, max_depth=None, max_features=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        # Fitted trees, keyed by estimator number.
        self.base_estimators = {}

    def _bootstrap_sample(self, X, y):
        """Return ``(Xr, yr)``: X and y resampled row-wise with replacement.

        The sample has the same number of rows as ``X``. Rows are selected by
        *position*, so a pandas ``y`` with a non-default index stays aligned
        with ``X``.
        """
        n_rows = len(X)
        random_indices = np.random.choice(n_rows, size=n_rows, replace=True)

        if hasattr(X, 'iloc'):
            # pandas input: keep it a DataFrame so column names survive.
            Xr = X.iloc[random_indices, :]
        else:
            Xr = np.asarray(X)[random_indices]
        # np.asarray drops any pandas index, making the lookup positional.
        yr = np.asarray(y)[random_indices]
        return Xr, yr

    def _make_base_estimator(self, X, y, estimator_number=1):
        """Fit one tree on a bootstrap sample and store it.

        The tree is stored in ``self.base_estimators[estimator_number]`` and
        its accuracy on its own bootstrap sample is printed.
        """
        Xr, yr = self._bootstrap_sample(X, y)

        dtc = DecisionTreeClassifier(max_depth=self.max_depth,
                                     max_features=self.max_features)
        dtc.fit(Xr, yr)
        self.base_estimators[estimator_number] = dtc
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(dtc.score(Xr, yr))

    def fit(self, X, y):
        """Fit ``n_estimators`` trees, each on a fresh bootstrap sample."""
        # Start clean so trees from an earlier fit cannot vote in predict().
        self.base_estimators = {}
        for i in range(self.n_estimators):
            self._make_base_estimator(X, y, i)

    def predict(self, X, y=None):
        """Return the majority-vote class for every row of ``X``.

        Parameters
        ----------
        X : array-like or DataFrame
            Rows to classify; handed unchanged to each tree's ``predict``.
        y : ignored
            Unused. Kept (now optional) only so existing calls that pass it
            keep working.

        Returns
        -------
        numpy.ndarray
            One voted label per row of ``X``. On a tie the smallest label
            wins, so for 0/1 labels a tie yields 0.

        Raises
        ------
        ValueError
            If no base estimators have been fitted yet.
        """
        if not self.base_estimators:
            raise ValueError(
                "RandomForest has no fitted estimators; call fit(X, y) first.")

        # Rows: one per fitted tree. Columns: one per sample in X.
        predictions = np.array([base_model.predict(X)
                                for base_model in self.base_estimators.values()])

        voted_class = []
        for sample_votes in predictions.T:
            # np.unique returns labels sorted ascending, and argmax takes the
            # first maximum, so ties resolve to the smallest label.
            labels, counts = np.unique(sample_votes, return_counts=True)
            voted_class.append(labels[np.argmax(counts)])
        return np.array(voted_class)
                

---

### 6. Test on the wine data

In [62]:

# Target: the `red_wine` column (presumably a 0/1 red-vs-white flag -- confirm
# against the CSV). Features: three columns of the wine table.
y = wine.red_wine.values
X = wine[['quality', 'alcohol', 'chlorides']]
# %-formatting prints the same "(rows, cols) n" text on Python 2 and 3.
print("%s %s" % (X.shape, len(y)))

# Three shallow trees; max_features=3 equals the number of selected columns.
rf = RandomForest(n_estimators=3, max_depth=3, max_features=3)
rf.fit(X, y)

(6497, 3) 6497
0.933969524396
0.926581499153
0.924734492843


In [64]:
from sklearn.metrics import accuracy_score