# Application of Bootstrap samples in Random Forest

In [1]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
import random
from random import randint
random.seed = 42
from sklearn.tree import DecisionTreeRegressor

 <li> Load the boston house dataset </li>

In [2]:
boston = load_boston()
x=boston.data #independent variables
y=boston.target #target variable

In [3]:
x.shape

(506, 13)

### Task: 1
<font color='red'><b>Step 1 Creating samples: </b></font> Randomly create 30 samples from the whole boston data points.
<ol>
<li>Creating each sample: Consider any random 303(60% of 506) data points from whole data set and then replicate any 203 points from the sampled points</li>
<li>Ex: For better understanding of this procedure lets check this examples, assume we have 10 data points [1,2,3,4,5,6,7,8,9,10], first we take 6 data points randomly consider we have selected [4, 5, 7, 8, 9, 3] now we will replciate 4 points from [4, 5, 7, 8, 9, 3], consder they are [5, 8, 3,7] so our final sample will be [4, 5, 7, 8, 9, 3, 5, 8, 3,7]</li>
<li> we create 30 samples like this </li>
<li> Note that as a part of the Bagging when you are taking the random samples make sure each of the sample will have                different set of columns</li>
<li> Ex: assume we have 10 columns for the first sample we will select [3, 4, 5, 9, 1, 2] and for the second sample [7, 9, 1, 4, 5, 6, 2] and so on...</li>
<li> Make sure each sample will have atleast 3 feautres/columns/attributes</li>
</ol>

<font color='red'><b>Step 2 Building High Variance Models on each of the sample and finding train MSE value:</b></font> Build a DecisionTreeRegressor on each of the sample.
<ol><li>Build a regression trees on each of 30 samples.</li>
<li>computed the predicted values of each data point(506 data points) in your corpus.</li>
<li> predicted house price of $i^{th}$ data point $y^{i}_{pred} =  \frac{1}{30}\sum_{k=1}^{30}(\text{predicted value of } x^{i} \text{ with } k^{th} \text{ model})$.</li>
<li>Now calculate the $MSE =  \frac{1}{506}\sum_{i=1}^{506}(y^{i} - y^{i}_{pred})^{2}$.</li>
</ol>

<font color='red'><b>Step 3 Calculating the OOB score :</b></font>
<ol>
<li>Computed the predicted values of each data point(506 data points) in your corpus.</li>
<li>Predicted house price of $i^{th}$ data point $y^{i}_{pred} =  \frac{1}{k}\sum_{\text{k= model which was buit on samples not included } x^{i}}(\text{predicted value of } x^{i} \text{ with } k^{th} \text{ model})$.</li>
<li>Now calculate the $OOB Score =  \frac{1}{506}\sum_{i=1}^{506}(y^{i} - y^{i}_{pred})^{2}$.</li>
</ol>

### Task: 2
<pre>
<font color='red'><b>Computing CI of OOB Score and Train MSE</b></font>
<ol>
<li> Repeat Task 1 for 35 times, and for each iteration store the Train MSE and OOB score </li>
<li> After this we will have 35 Train MSE values and 35 OOB scores </li>
<li> using these 35 values (assume like a sample) find the confidence intravels of MSE and OOB Score </li>
<li> you need to report CI of MSE and CI of OOB Score </li>
<li> Note: Refer the Central_Limit_theorem.ipynb to check how to find the confidence intravel</li>
</ol>
</pre>
### Task: 3
<pre>
<font color='red'><b>Given a single query point predict the price of house.</b></font>

<li>Consider xq= [0.18,20.0,5.00,0.0,0.421,5.60,72.2,7.95,7.0,30.0,19.1,372.13,18.60] Predict the house price for this point as mentioned in the step 2 of Task 1. </li>
</pre>

## TASK - 1

In [4]:
# Row Sampling and Column Sampling for creating n Samples
def Sampling(n):
    index = []
    cols = []
    for i in range(0, n):
        id1 = random.sample(range(0, len(x)), 303)
        id2 = random.sample(id1, 203)
        index_row = (id1 + id2)
        col = random.sample(range(0, x.shape[1]), randint(3, x.shape[1]))
        index.append(index_row)
        cols.append(col)
    return index, cols

In [5]:
def RFT(index, cols, x, y):
    y_pred = {k:v for k, v in zip(range(0, len(index[0])), np.zeros(len(index[0])))}
    counts = {k:v for k, v in zip(range(0, len(index[0])), np.zeros(len(index[0])))}
    y_pred_ib = 0
    y_pred_oob = []
    for ind, col in zip(index, cols):
        x_new = x[:, col]
        X_new = x[ind, :][:, col]
        Y_new = y[ind]
        DTR = DecisionTreeRegressor()
        DTR.fit(X_new, Y_new)
        y_pred_ib += DTR.predict(x_new) # Calculating y_pred for mse

        
        for i in range(0, len(x)):
            if np.all(i not in ind):
                temp1 = y_pred[i] 
                temp1 += DTR.predict(x_new[i].reshape(1, -1)) # Calculating y_pred for oob points
                y_pred[i] = temp1 
            count = ind.count(i)
            count1 = np.ceil(count / 2)
            temp2 = counts[i]
            count = count1 + temp2
            counts[i] = count

    y_pred_ib = y_pred_ib / len(index) # Average of y_pred for mse
    for key, value in zip(counts.keys(), y_pred.values()):
        y_pred_oob.append(value / (len(index) - counts[key])) # k-Avg of y_pred for mse
    y_pred_oob = np.array(y_pred_oob).ravel() 
    
    return y_pred_ib, y_pred_oob

In [6]:
indexes, columns = Sampling(30) # Creating 30 samples
y_pred_mse, y_pred_oob = RFT(indexes, columns, x, y)

In [7]:
print("MSE : ", mean_squared_error(y, y_pred_mse))
print("OOB_Score : ", mean_squared_error(y, y_pred_oob))

MSE :  2.016079771359799
OOB_Score :  11.715156916910528


## TASK - 2

In [8]:
MSE = []
OOB = []

for i in range(0, 35):
    indexes, columns = Sampling(30)
    y_pred_mse, y_pred_oob = RFT(indexes, columns, x, y)
    
    mse = mean_squared_error(y, y_pred_mse)
    oob = mean_squared_error(y, y_pred_oob)
    
    MSE.append(mse)
    OOB.append(oob)
    

In [9]:
MSE = np.array(MSE)
OOB = np.array(OOB)

In [10]:
# Confidence Interval for MSE 
sample_mean = MSE.mean()
sample_std =  MSE.std()
sample_size = len(MSE)
# here we are using sample standard deviation instead of population standard deviation
left_limit  = np.round(sample_mean - 2*(sample_std/np.sqrt(sample_size)), 3)
right_limit = np.round(sample_mean + 2*(sample_std/np.sqrt(sample_size)), 3)
print('95% Confidence Interval of MSE is', left_limit, 'to', right_limit)

95% Confidence Interval of MSE is 2.299 to 2.502


In [11]:
# Confidence Interval for OOB
sample_mean = OOB.mean()
sample_std =  OOB.std()
sample_size = len(OOB)
# here we are using sample standard deviation instead of population standard deviation
left_limit  = np.round(sample_mean - 2*(sample_std/np.sqrt(sample_size)), 3)
right_limit = np.round(sample_mean + 2*(sample_std/np.sqrt(sample_size)), 3)
print('95% Confidence Interval of OOB is', left_limit, 'to', right_limit)

95% Confidence Interval of OOB is 13.193 to 14.023


## TASK - 3

In [12]:
# Predicting the price of xq
xq = [0.18,20.0,5.00,0.0,0.421,5.60,72.2,7.95,7.0,30.0,19.1,372.13,18.60]
xq = np.array(xq)
y_pred_ib = 0

for ind, col in zip(indexes, columns):
        x_new = xq[col]
        X_new = x[ind, :][:, col]
        Y_new = y[ind]
        DTR = DecisionTreeRegressor()
        DTR.fit(X_new, Y_new)
        y_pred_ib += DTR.predict(x_new.reshape(1, -1))
        
y_pred_ib = y_pred_ib / len(indexes)
print('Price of xq is: ', y_pred_ib)

Price of xq is:  [20.452]
