In [1]:
import numpy as np
from sklearn.datasets import load_boston

import random
from sklearn.tree import DecisionTreeRegressor as DTR

from tqdm import tqdm

In [2]:
boston = load_boston()
x=boston.data #independent variables
y=boston.target #target variable

In [3]:
x.shape

(506, 13)

## Task 1

### Step 1

In [4]:
def get_row_sample_and_oob_indices():
    '''This functions will get sample and oob indices'''    
    
    x_60 = random.sample(range(x.shape[0]), round(x.shape[0]*0.6))    
    
    x_40 =  random.sample(x_60, round(x.shape[0]*0.4))    
    
    sample_indices = x_60 + x_40    
    
    oob_indices = list(set(range(x.shape[0])) - set(sample_indices))
    
    return sample_indices, oob_indices

### Step 2

In [5]:
def training_model_using_random_forest(x):
    '''This functions will train a model based on k-Decision Trees and predict y'''
    
    lst_samples = []
    random_forest_model = []
        
    number_of_features = np.random.randint(3,10, 30)
        
    for idx, count in enumerate(number_of_features):
        
        
        sample_indices, oob_indices = get_row_sample_and_oob_indices()
        
        #Getting random row samples
        row_samples = x[sample_indices]
        y_samples = y[sample_indices]
        
        #Getting feature indices
        feature_indices = random.sample(range(2, 13), count)
        
        
        #Getting row samples with different features and oob indices
        lst_samples.append({
            'sample'          : str(idx+1),
            'x_sample'        : row_samples[:, feature_indices],
            'y_sample'       : y_samples,
            'sample_indices'  : sample_indices,
            'oob_indices'     : oob_indices,
            'feature_indices' : feature_indices
        })
   
    
    
    #Train all 30 samples and get predictions
    for sample in lst_samples:
        
        X= sample['x_sample']
        Y = sample['y_sample']
        
        regressor = DTR()
        regressor.fit(X, Y)
        
        random_forest_model.append({'sample': sample['sample'],
                                    'regressor' : regressor,
                                    'y_pred' : regressor.predict(x[:,sample['feature_indices']]),
                                    'sample_indices' : sample['sample_indices'],
                                    'oob_indices' : sample['oob_indices'],
                                    'feature_indices' : sample['feature_indices']
                                   })
        
    
        
    return random_forest_model

In [6]:
def get_y_pred_from_trained_model(random_forest_model):
    '''This function will get mean y_pred from all 30 samples'''
    
    y_pred = []
        
    #Getting y_pred by mean of 30 sample's y_pred
    for i_row in range(x.shape[0]):
        
        y_pred_sample = 0
        for model in random_forest_model:          
            
            
            y_pred_sample += model['y_pred'][i_row]
        
        y_pred.append(round(y_pred_sample/30,1))
        
    return y_pred

In [7]:
def calc_mean_square_error(y, y_pred):
    
    return np.mean([(y[i] - y_pred[i]) ** 2 for i in range(len(y))])        

In [8]:
random_forest_model = training_model_using_random_forest(x)

y_pred = get_y_pred_from_trained_model(random_forest_model)

mse = calc_mean_square_error(y, y_pred)
print('--------Mean Squared Error--------')
print(mse)

--------Mean Squared Error--------
2.437608695652174


### Step 3

In [9]:
def get_oob_score(random_forest_model):
    '''This function will get oob_score from the exist k-DecisionTrees'''
    
    y_pred = []
        
    
    for i_row in range(x.shape[0]):
        
        y_pred_sample = 0
        k_model = 0
        for model in random_forest_model:          
            
            #Consider only those models in which i_row sample is not include
            #which means i_row should in present in oob_indices            
            if i_row in model['oob_indices']:
                
                y_pred_sample += model['y_pred'][i_row]                
                k_model += 1
        
        y_pred.append(round(y_pred_sample/k_model,1))
        
    return y_pred

In [10]:
oob_y_pred = get_oob_score(random_forest_model)

oob_score = calc_mean_square_error(y, oob_y_pred)
print('--------OOB Score--------')
print(oob_score)

--------OOB Score--------
14.327569169960475


## Task 2

In [11]:
def get_mse_oob_score_for_CI(x, y):
    '''This function will get MSE values and OOB Scores for 35 iterations'''
    
    mse_values = []
    oob_scores = []
    
    for i in tqdm(range(35)):
        
        #Training data using Random Forest algorithm
        random_forest_model = training_model_using_random_forest(x)
        
        #Getting y_pred from random forest
        y_pred = get_y_pred_from_trained_model(random_forest_model)                
        mse = calc_mean_square_error(y, y_pred)                
        mse_values.append(mse)
        
        
        #Getting OOB Score
        oob_y_pred = get_oob_score(random_forest_model)                
        oob_score = calc_mean_square_error(y, oob_y_pred)                
        oob_scores.append(oob_score)
    
    return mse_values, oob_scores

In [12]:
mse_values, oob_scores = get_mse_oob_score_for_CI(x, y)

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:04<00:00,  8.18it/s]


In [13]:
def calculate_CI(values, y):
        
    _min = np.mean(values) - 2*(np.std(y)/35)        
    _max = np.mean(values) + 2*(np.std(y)/35)
    
    return _min, _max

In [14]:
print(f'Confidence Interval for MSE with 95% is between range \n{calculate_CI(mse_values, y)}')

print('\n\n--------------------------------\n\n')

print(f'Confidence Interval for OOB Score with 95% is between range \n{calculate_CI(oob_scores, y)}')

Confidence Interval for MSE with 95% is between range 
(2.1288550150298398, 3.1789134773473484)


--------------------------------


Confidence Interval for OOB Score with 95% is between range 
(14.690081440778005, 15.740139903095514)


## Task 3

In [15]:
def get_qth_prediction(random_forest_model, xq):
    '''This function will give the predict y for a given query'''
    
    y_pred = []       
    
    xq = np.array([xq])
    
    #Getting mean of prediction values of 30 samples for qth query
    y_pred = round(np.mean([model['regressor'].predict(xq[:, model['feature_indices']]) for model in random_forest_model]),1)    
        
    return y_pred

In [16]:
xq= [0.18,20.0,5.00,0.0,0.421,5.60,72.2,7.95,7.0,30.0,19.1,372.13,18.60]

#Getting yq_pred
yq_pred = get_qth_prediction(random_forest_model, xq)

print('--------yq Pred--------')
print(yq_pred)

--------yq Pred--------
20.6
