# Application of Bootstrap samples in Random Forest

In [136]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error

 <li> Load the boston house dataset </li>

In [137]:
boston = load_boston()
x=boston.data #independent variables
y=boston.target #target variable

### Task: 1
<font color='red'><b>Step 1 Creating samples: </b></font> Randomly create 30 samples from the whole boston data points.
<ol>
<li>Creating each sample: Consider any random 303(60% of 506) data points from whole data set and then replicate any 203 points from the sampled points</li>
<li>Ex: For better understanding of this procedure lets check this examples, assume we have 10 data points [1,2,3,4,5,6,7,8,9,10], first we take 6 data points randomly consider we have selected [4, 5, 7, 8, 9, 3] now we will replciate 4 points from [4, 5, 7, 8, 9, 3], consder they are [5, 8, 3,7] so our final sample will be [4, 5, 7, 8, 9, 3, 5, 8, 3,7]</li>
<li> we create 30 samples like this </li>
<li> Note that as a part of the Bagging when you are taking the random samples make sure each of the sample will have                different set of columns</li>
<li> Ex: assume we have 10 columns for the first sample we will select [3, 4, 5, 9, 1, 2] and for the second sample [7, 9, 1, 4, 5, 6, 2] and so on...</li>
<li> Make sure each sample will have atleast 3 feautres/columns/attributes</li>
</ol>

<font color='red'><b>Step 2 Building High Variance Models on each of the sample and finding train MSE value:</b></font> Build a DecisionTreeRegressor on each of the sample.
<ol><li>Build a regression trees on each of 30 samples.</li>
<li>computed the predicted values of each data point(506 data points) in your corpus.</li>
<li> predicted house price of $i^{th}$ data point $y^{i}_{pred} =  \frac{1}{30}\sum_{k=1}^{30}(\text{predicted value of } x^{i} \text{ with } k^{th} \text{ model})$.</li>
<li>Now calculate the $MSE =  \frac{1}{506}\sum_{i=1}^{506}(y^{i} - y^{i}_{pred})^{2}$.</li>
</ol>

<font color='red'><b>Step 3 Calculating the OOB score :</b></font>
<ol>
<li>Computed the predicted values of each data point(506 data points) in your corpus.</li>
<li>Predicted house price of $i^{th}$ data point $y^{i}_{pred} =  \frac{1}{k}\sum_{\text{k= model which was buit on samples not included } x^{i}}(\text{predicted value of } x^{i} \text{ with } k^{th} \text{ model})$.</li>
<li>Now calculate the $OOB Score =  \frac{1}{506}\sum_{i=1}^{506}(y^{i} - y^{i}_{pred})^{2}$.</li>
</ol>

### Task: 2
<pre>
<font color='red'><b>Computing CI of OOB Score and Train MSE</b></font>
<ol>
<li> Repeat Task 1 for 35 times, and for each iteration store the Train MSE and OOB score </li>
<li> After this we will have 35 Train MSE values and 35 OOB scores </li>
<li> using these 35 values (assume like a sample) find the confidence intravels of MSE and OOB Score </li>
<li> you need to report CI of MSE and CI of OOB Score </li>
<li> Note: Refer the Central_Limit_theorem.ipynb to check how to find the confidence intravel</li>
</ol>
</pre>
### Task: 3
<pre>
<font color='red'><b>Given a single query point predict the price of house.</b></font>

<li>Consider xq= [0.18,20.0,5.00,0.0,0.421,5.60,72.2,7.95,7.0,30.0,19.1,372.13,18.60] Predict the house price for this point as mentioned in the step 2 of Task 1. </li>
</pre>

### Task: 1

#### Step 1: Creating samples: Randomly create 30 samples from the whole boston data points.

In [169]:
from sklearn.tree import DecisionTreeRegressor
import random
from tqdm import tqdm

In [159]:
# Converting x values to dataframe
data = pd.DataFrame(data=x[:,:], index= range(len(x)), columns=boston.feature_names)  

In [212]:
X_n_sample = {}
Y_n_sample = {}
data_sample = {}
data_index = {}

def create_n_samples(data,y,n):
    data_size_60 = (int)(0.6*data.shape[0])
    data_size_40 = data.shape[0] - data_size_60
    
    for i in tqdm(range(n)):
#         column sampling
        random_n_feature = random.randrange(3, data.shape[1])
        data_column = data.sample(random_n_feature,axis = 1)
        data_sample[i] = data_column
        
#         row sampling
        idx = random.sample(range(data.shape[0]),data_size_60)
        idx2 = random.sample(idx,data_size_40)
        idx_row = idx + idx2
        
        data_index[i] = idx_row
        
        sample_x = data_column.iloc[idx_row].values
        sample_y = y[idx_row]
        
        X_n_sample[i] = sample_x
        Y_n_sample[i] = sample_y

In [215]:
def mean_square_error(y_orig,y_pred):
    return (1/506)*np.sum(np.subtract(y_orig,y_pred) )

In [214]:
sample_count = 30
create_n_samples(data,y,sample_count)

100%|██████████| 30/30 [00:00<00:00, 612.22it/s]


#### Step 2 Building High Variance Models on each of the sample and finding train MSE value

In [205]:
a = [2,2,3,4,5]
b = [2,2,3,4,5]
np.sum(np.add(b,a))

32

In [202]:
y_pred_total = np.zeros(506)
regressor = DecisionTreeRegressor(random_state=0)
for i in range(sample_count):
    regressor.fit(X_n_sample[i],Y_n_sample[i])
    y_pred_sample = regressor.predict(data_sample[i])
#     print(y_pred_sample)
    y_pred_total = np.add(y_pred_sample,y_pred_total)
y_pred = (1/30)*y_pred_total

In [211]:
mean_square_error(y,y_pred)

-0.08183127093613031

#### Step 3 Calculating the OOB score :

In [224]:
data_sample[0].iloc[0]

NOX      0.538
CHAS     0.000
DIS      4.090
RM       6.575
AGE     65.200
ZN      18.000
Name: 0, dtype: float64

In [225]:
y_pred_total = []
regressor = DecisionTreeRegressor(random_state=0)
for i in range(data.shape[0]):
    y_pred_sample = 0
    k = 0
    for j in range(sample_count):
        if i  not in data_index[j]:
            k+=1
            regressor.fit(X_n_sample[j],Y_n_sample[j])
            y_pred_sample += regressor.predict(data_sample[j].iloc[i])
    y_pred_total.append((1/k)*y_pred_sample)
    
y_pred_total    

ValueError: Expected 2D array, got 1D array instead:
array=[  2.96000000e+02   4.09000015e+00   5.37999988e-01   1.53000002e+01
   1.00000000e+00   1.80000000e+01   2.30999994e+00   0.00000000e+00
   6.51999969e+01   4.98000002e+00   6.57499981e+00   6.32000016e-03].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.