In [1]:
import pandas as pd
import numpy as np
import random
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor

In [2]:
def real_population(x1, size=5000, random_state=1234):
    """Generate the noisy quadratic response for the given feature values.

    The response is ``x1**2`` plus Gaussian noise with mean -5 and standard
    deviation 10. The noise comes from a private ``RandomState`` seeded with
    ``random_state``, so NumPy's global generator is not touched.

    Parameters
    ----------
    x1 : scalar or array-like
        Feature value(s). When an array, its length must be compatible with
        ``size`` for the addition to broadcast.
    size : int
        Number of noise draws.
    random_state : int
        Seed for the noise generator.

    Returns
    -------
    pandas.DataFrame
        Columns ``'target'`` (noisy response) and ``'X1'`` (the input).
    """
    rng = np.random.RandomState(random_state)
    noise = rng.normal(loc=-5, scale=10, size=size)
    target = x1**2 + noise
    return pd.DataFrame({'target': target, 'X1': x1})

In [11]:
def simulation_data(size = 5000, random_seed= 99):
    """Draw one simulated training set.

    Seeds NumPy's global generator with ``random_seed``, samples ``size``
    feature values uniformly from [-1, 1), and passes them to
    ``real_population`` to attach the noisy target.

    Note: ``real_population`` is called with its default ``random_state``,
    so ``random_seed`` only controls the ``x1`` draw; the noise vector is
    the same on every call with the same ``size``.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    random_seed : int
        Seed applied to ``np.random`` before sampling the features.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``real_population`` (columns ``'target'`` and
        ``'X1'``).
    """
    np.random.seed(random_seed)
    features = np.random.uniform(low=-1, high=1, size=size)
    return real_population(features, size)

In [5]:
def get_mse(mydf, model='Lin'):
    """Mean squared error of one model's predictions at the test point.

    The reference value is the ``'target'`` that ``real_population`` returns
    for the module-level ``X_test[0]`` with ``size=1`` (this includes that
    function's noise draw for its default seed).

    Parameters
    ----------
    mydf : pandas.DataFrame
        Simulation results, one row per simulation. Column ``1`` is read
        when ``model == 'Lin'``; column ``2`` is read for any other value.
    model : str
        ``'Lin'`` selects column 1, anything else selects column 2.

    Returns
    -------
    The mean over rows of ``(estimate - truth) ** 2``.
    """
    truth = real_population(X_test[0], size=1)['target'][0]
    estimate = mydf[1] if model == 'Lin' else mydf[2]
    # Subtract the scalar directly instead of building a list of length
    # `simulations`: the result is the same, but it no longer breaks when the
    # number of rows in `mydf` differs from the module-level `simulations`.
    return np.mean((estimate - truth) ** 2)

In [6]:
def get_bias(mydf, model='Lin'):
    """Bias of one model's predictions at the test point.

    The reference value is the ``'target'`` that ``real_population`` returns
    for the module-level ``X_test[0]`` with ``size=1``.

    Parameters
    ----------
    mydf : pandas.DataFrame
        Simulation results, one row per simulation. Column ``1`` is read
        when ``model == 'Lin'``; column ``2`` is read for any other value.
    model : str
        ``'Lin'`` selects column 1, anything else selects column 2.

    Returns
    -------
    Mean of the selected predictions minus the reference value (signed, not
    squared).
    """
    column = 1 if model == 'Lin' else 2
    reference = real_population(X_test[0], size=1)['target'][0]
    return np.mean(mydf[column]) - reference

In [7]:
def get_var(mydf, model='Lin'):
    """Variance of one model's predictions across simulations.

    Computed as the mean squared deviation from the mean prediction (i.e.
    dividing by the number of rows, not by ``n - 1``).

    Parameters
    ----------
    mydf : pandas.DataFrame
        Simulation results, one row per simulation. Column ``1`` is read
        when ``model == 'Lin'``; column ``2`` is read for any other value.
    model : str
        ``'Lin'`` selects column 1, anything else selects column 2.

    Returns
    -------
    The mean of ``(estimate - mean(estimate)) ** 2``.
    """
    column = 1 if model == 'Lin' else 2
    predictions = mydf[column]
    centered = predictions - np.mean(predictions)
    return np.mean(centered**2)

In [8]:
def run_simulation(lin_model, tree_model, sims = 100, x_test=None):
    """Refit both models on fresh simulated data and record their predictions.

    For each simulation index ``i`` in ``range(sims)``, a 5000-row data set
    is drawn with ``simulation_data(5000, i)``, both models are fitted on
    it, and each model's prediction at the test point is stored.

    Parameters
    ----------
    lin_model, tree_model : estimator
        Objects exposing ``fit(X, Y)`` and ``predict(X)``. They are refitted
        in place on every iteration.
    sims : int
        Number of simulated data sets.
    x_test : scalar or 1-element array-like, optional
        Feature value at which to predict. When omitted, the module-level
        ``X_test`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per simulation with columns ``0`` (simulation index),
        ``1`` (``lin_model`` prediction) and ``2`` (``tree_model``
        prediction). Each prediction is stored exactly as ``predict``
        returns it.
    """
    if x_test is None:
        x_test = X_test  # fall back to the notebook-level test point

    # A single-row frame whose column is named like the training feature, so
    # the estimators see the same feature name at fit and predict time.
    test_frame = pd.DataFrame(np.asarray(x_test).reshape(1, -1), columns=['X1'])

    predicted = []
    for i in range(sims):
        D = simulation_data(5000, i)
        X = D[['X1']]
        Y = D['target']
        lin_model.fit(X, Y)
        tree_model.fit(X, Y)
        # Predict with the models that were passed in; the previous code
        # used the globals `reg` and `tree` and ignored these arguments.
        predicted.append(
            (i, lin_model.predict(test_frame), tree_model.predict(test_frame))
        )
    return pd.DataFrame(predicted)

In [9]:
## Print squared bias, variance and MSE for both models of a simulation run
def evaluate_simulation(prediction_df):
    """Print the error summary for the linear and the tree model.

    Parameters
    ----------
    prediction_df : pandas.DataFrame
        Simulation results as consumed by ``get_bias``, ``get_var`` and
        ``get_mse`` (column 1 for the linear model, column 2 for the tree).

    Returns
    -------
    tuple
        Always the empty tuple; the function is used for its printed output.
    """
    # The printed value is get_bias(...)**2, so it is labelled as squared bias.
    print("Squared bias for Lin model is: ", get_bias(prediction_df, 'Lin')**2)
    print("Squared bias for Tree model is: ", get_bias(prediction_df, 'tree')**2)

    print("Var for Lin model is:", get_var(prediction_df, 'Lin'))
    print("var for Tree model is:", get_var(prediction_df, 'tree'))

    print("MSE for Lin model is:", get_mse(prediction_df, 'Lin'))
    print("MSE for Tree model is:", get_mse(prediction_df, 'tree'))

    return ()


In [13]:
## Run the simulation for a range of tree depths and report the metrics
reg = linear_model.LinearRegression()
simulations = 100

# Fix the single test point at which both models are evaluated.
np.random.seed(22)
X_test = np.random.uniform(-1,1,1)

for depth in [3,4,6,8,9,10]:
    tree = DecisionTreeRegressor(max_depth=depth)
    # Pass the simulation count explicitly so the number of rows produced
    # always matches the module-level `simulations` the metrics refer to.
    results = run_simulation(reg, tree, sims=simulations)
    evaluate_simulation(results)
    print("\n end of iter for depth", depth)
    print('\n')


Bias for Lin model is:  [20.10455431]
Bias for Tree model is:  [20.49605375]
Var for Lin model is: [0.02338395]
var for Tree model is: [0.14802885]
MSE for Lin model is: [20.12793825]
MSE for Tree model is: [20.6440826]

 end of iter for depth 3


Bias for Lin model is:  [20.10455431]
Bias for Tree model is:  [20.59523615]
Var for Lin model is: [0.02338395]
var for Tree model is: [0.16950299]
MSE for Lin model is: [20.12793825]
MSE for Tree model is: [20.76473914]

 end of iter for depth 4


Bias for Lin model is:  [20.10455431]
Bias for Tree model is:  [19.51780859]
Var for Lin model is: [0.02338395]
var for Tree model is: [0.72026649]
MSE for Lin model is: [20.12793825]
MSE for Tree model is: [20.23807508]

 end of iter for depth 6


Bias for Lin model is:  [20.10455431]
Bias for Tree model is:  [20.14544712]
Var for Lin model is: [0.02338395]
var for Tree model is: [1.1708267]
MSE for Lin model is: [20.12793825]
MSE for Tree model is: [21.31627382]

 end of iter for depth 8


Bias f