# Application of Bootstrap samples in Random Forest

# Task 1

## 1.1 Creating Bagging samples

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error as mse
from sklearn.tree import DecisionTreeRegressor as reg
import random
from collections import defaultdict

 <li> Load the boston house dataset </li>

In [2]:
boston = load_boston()
#x=boston.data #independent variables
#y=boston.target #target variable

In [3]:
data = pd.DataFrame(boston.data, columns=boston.feature_names)
data['target'] = pd.Series(boston.target)
x = data.drop(['target'],axis = 1)
y = data['target']

In [4]:
sample_row_idx = []
sample_col_idx = []
for i in range(30):
    sample_1 = np.array(random.sample(range(0, len(y)),303))
    sample_2 = np.random.choice(sample_1,203)
    sample = list(np.concatenate((sample_1,sample_2)))
    sample_row_idx.append(sample)
    size_val = np.random.randint(3,13)
    sample_col_idx.append(list(set(np.random.randint(0,13,size = size_val)))) 
    

## 1.2 Building High Variance Model

In [5]:
corpus = []
for i in range(30):
    x_sample = x.iloc[sample_row_idx[i],sample_col_idx[i]] 
    y_sample = y.iloc[sample_row_idx[i]]
    regressor = reg(min_samples_split = 10,min_samples_leaf = 5)
    regressor.fit(x_sample,y_sample)
    df = x[x.columns.intersection(x_sample.columns)]
    corpus.append(regressor.predict(df).tolist())
    

In [6]:
y_pred_ele = []
for p in range(len(corpus[0])):
    y_pred_list = []
    for q in range(len(corpus)):
        y_pred_list.append(corpus[q][p])
    y_pred_ele.append(y_pred_list)    

In [7]:
y_pred = [0]*506
for r in range(len(y_pred_ele)):
    y_pred[r] = sum(y_pred_ele[r])/len(y_pred_ele[r])

In [8]:
#mean square error
loss = mse(y,y_pred)
print(loss)

6.528608657692322


## 1.3 OOB score        

In [9]:
oob_corpus = []
uni_sample_row_idx = []
idx_pred_value = []
for i in range(30):
    x_sample = x.iloc[sample_row_idx[i],sample_col_idx[i]] 
    y_sample = y.iloc[sample_row_idx[i]]
    regressor = reg()
    regressor.fit(x_sample,y_sample)
    uni_sample_row_idx = list(np.unique(np.array(sample_row_idx[i])))
    oob_df = x[x.columns.intersection(x_sample.columns)]
    oob_df = oob_df.drop(uni_sample_row_idx)
    oob_corpus.append(regressor.predict(oob_df).tolist())       
    idx_pred_value.append(dict(zip(list(oob_df.index.values),oob_corpus[i])))       

In [10]:
from collections import defaultdict
some_dict = defaultdict(list)
index_dict = dict.fromkeys(range(506))
list_of_dicts = [index_dict]+[idx_pred_value[i] for i in range(30)]
for d in list_of_dicts:
    for key, value in d.items():
        some_dict[key].append(value)

In [11]:
values = list(some_dict.values())
oob_y_pred = [0]*506
for ele in range(len(values)):
    values[ele].pop(0)
    y_pred_value = sum(values[ele])/len(values[ele])
    oob_y_pred[ele] = y_pred_value 

In [12]:
OOB_Score = mse(y,oob_y_pred)
print(OOB_Score)

13.76566733746637


# Task 2 Computing CI of OOB Score and Train MSE

In [13]:
def MSE_loss():
    sample_row_idx = []
    sample_col_idx = []
    for i in range(30):
        sample_1 = np.array(random.sample(range(0, len(y)),303))
        sample_2 = np.random.choice(sample_1,203)
        sample = list(np.concatenate((sample_1,sample_2)))
        sample_row_idx.append(sample)
        size_val = np.random.randint(3,13)
        sample_col_idx.append(list(set(np.random.randint(0,13,size = size_val))))
    corpus = []
    for i in range(30):
        x_sample = x.iloc[sample_row_idx[i],sample_col_idx[i]] 
        y_sample = y.iloc[sample_row_idx[i]]
        regressor = reg()
        regressor.fit(x_sample,y_sample)
        df = x[x.columns.intersection(x_sample.columns)]
        corpus.append(regressor.predict(df).tolist())
    y_pred_idx = []
    for p in range(len(corpus[0])):
        y_pred_list = []
        for q in range(len(corpus)):
            y_pred_list.append(corpus[q][p])
        y_pred_idx.append(y_pred_list)
    
    y_pred = [0]*506
    for r in range(len(y_pred_idx)):
        y_pred[r] = sum(y_pred_idx[r])/len(y_pred_idx[r])    
    return mse(y,y_pred)


In [14]:
def OOB_score():
    sample_row_idx = []
    sample_col_idx = []
    for i in range(30):
        sample_1 = np.array(random.sample(range(0, len(y)),303))
        sample_2 = np.random.choice(sample_1,203)
        sample = list(np.concatenate((sample_1,sample_2)))
        sample_row_idx.append(sample)
        size_val = np.random.randint(3,13)
        sample_col_idx.append(list(set(np.random.randint(0,13,size = size_val))))
    oob_corpus = []
    uni_sample_row_idx = []
    idx_pred_value = []
    for i in range(30):
        x_sample = x.iloc[sample_row_idx[i],sample_col_idx[i]] 
        y_sample = y.iloc[sample_row_idx[i]]
        regressor = reg()
        regressor.fit(x_sample,y_sample)
        uni_sample_row_idx = list(np.unique(np.array(sample_row_idx[i])))
        oob_df = x[x.columns.intersection(x_sample.columns)]
        oob_df = oob_df.drop(uni_sample_row_idx)
        oob_corpus.append(regressor.predict(oob_df).tolist())       
        idx_pred_value.append(dict(zip(list(oob_df.index.values),oob_corpus[i])))
    some_dict = defaultdict(list)
    index_dict = dict.fromkeys(range(506))
    list_of_dicts = [index_dict]+[idx_pred_value[i] for i in range(30)]
    for d in list_of_dicts:
        for key, value in d.items():
            some_dict[key].append(value)
            
    values = list(some_dict.values())
    oob_y_pred = [0]*506
    for ele in range(len(values)):
        values[ele].pop(0)
        y_pred_value = sum(values[ele])/len(values[ele])
        oob_y_pred[ele] = y_pred_value        
    return mse(y,oob_y_pred)

In [15]:
mse_35 = []
oob_35 = []
for err in range(35):
    mse_val = MSE_loss()
    oob_val = OOB_score()
    mse_35.append(mse_val)
    oob_35.append(oob_val)

In [16]:
import statistics as stat
mean_mse = sum(mse_35)/len(mse_35)
mean_oob = sum(oob_35)/len(oob_35)
mse_std_dev = stat.stdev(mse_35)
oob_std_dev = stat.stdev(oob_35)

In [17]:
c_i_mse = [0]*2
c_i_mse[0] = mean_mse - (mse_std_dev*1.96) 
c_i_mse[1] = mean_mse + (mse_std_dev*1.96)
c_i_mse

[1.6587020546212852, 3.3971202205476656]

In [18]:
c_i_oob = [0]*2
c_i_oob[0] = mean_oob - (oob_std_dev*1.96) 
c_i_oob[1] = mean_oob + (oob_std_dev*1.96)
c_i_oob

[11.361155535999641, 17.573127945225437]

# Task 3

In [19]:
sample_row_idx = []
sample_col_idx = []
for i in range(30):
    sample_1 = np.array(random.sample(range(0, len(y)),303))
    sample_2 = np.random.choice(sample_1,203)
    sample = list(np.concatenate((sample_1,sample_2)))
    sample_row_idx.append(sample)
    size_val = np.random.randint(3,13)
    sample_col_idx.append(list(set(np.random.randint(0,13,size = size_val))))

In [20]:
x_q= [[0.18,20.0,5.00,0.0,0.421,5.60,72.2,7.95,7.0,30.0,19.1,372.13,18.60]]

In [21]:
df = pd.DataFrame(x_q, index =[0],columns =['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'])
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.18,20.0,5.0,0.0,0.421,5.6,72.2,7.95,7.0,30.0,19.1,372.13,18.6


In [22]:
corpus = []
for i in range(30):
    x_sample = x.iloc[sample_row_idx[i],sample_col_idx[i]] 
    y_sample = y.iloc[sample_row_idx[i]]
    regressor = reg()
    regressor.fit(x_sample,y_sample)
    df_q = df[df.columns.intersection(x_sample.columns)]
    corpus.append(regressor.predict(df_q).tolist())

In [23]:
avg_val = 0
for i in range(len(corpus)):
    for j in corpus[i]:
        avg_val += j
avg_val =  avg_val/30
avg_val

19.84666666666667