<div style="text-align:center"><span style="font-size:2em; font-weight: bold;">  Lecture 8—Ensemble Learning</span></div>

# Programming: Bootstrap

In [71]:
import numpy as np
import pandas as pd
from cleands import *

def generate_mvt_normal(n,r,means=None):
    """Simulate ``n`` draws of ``r`` correlated, normally distributed variables.

    Every pair of variables shares one common standard-normal factor, scaled
    by a loading drawn uniformly from [-1, 2). Each variable additionally
    receives its own unit-variance normal noise centred on its entry of
    ``means``.

    Parameters
    ----------
    n : int
        Number of rows (observations) to draw.
    r : int
        Number of columns (variables); must be at least 1.
    means : array-like of shape (r,), optional
        Column means. Defaults to all zeros.

    Returns
    -------
    numpy.ndarray of shape (n, r)

    Raises
    ------
    ValueError
        If ``r`` is smaller than 1.
    """
    if r < 1: raise ValueError("r must be at least 1")
    # np.zeros takes the shape positionally; it has no `size` keyword.
    if means is None: means = np.zeros((r,))
    if r == 1: return np.random.normal(loc=means,size=(n,1))
    # P is the pair-incidence matrix: one row per pair of variables, with ones
    # in the two columns belonging to that pair. It is grown one variable at a
    # time: the new first variable is paired with each of the i existing ones
    # (upper), and the existing pairs are kept with a zero prepended (lower).
    P = np.array([[1,1]])
    for i in range(2,r):
        ones = np.ones([i,1])
        zeros = np.zeros([P.shape[0],1])
        ident = np.eye(i)
        upper = np.hstack([ones,ident])
        lower = np.hstack([zeros,P])
        P = np.vstack([upper,lower])
    covariates = np.random.normal(size=(n,P.shape[0]))               # one common factor per pair
    idiosyncratics = np.random.normal(loc=means,size=(n,P.shape[1])) # own noise; carries the means
    covariate_loadings = np.random.uniform(size=P.shape[0])*3-1      # uniform on [-1, 2)
    return covariates@np.diagflat(covariate_loadings)@P+idiosyncratics

# Simulated regression data: four correlated regressors whose means are drawn
# uniformly from [-1, 1), an intercept column, true coefficients drawn
# uniformly from [0, 1), and standard-normal noise.
n = 1000
npx = generate_mvt_normal(n, 4, means=2*np.random.uniform(size=(4,)) - 1)
ones = np.ones((n, 1))
onpx = np.column_stack([ones, npx])   # design matrix with the intercept first
bvec = np.random.uniform(size=(5,))   # true coefficients (intercept + 4 slopes)
npy = onpx @ bvec + np.random.normal(size=(n,))

In [72]:
bvec

array([0.59580961, 0.62677995, 0.2553745 , 0.85809453, 0.75770855])

In [73]:
# Fit least squares of npy on onpx (onpx already contains the intercept column).
# NOTE(review): least_squares_regressor is presumably provided by the cleands star import — confirm.
model = least_squares_regressor(onpx,npy)
b = model.params     # estimated coefficients, to be compared with the true bvec
e = model.residuals  # residuals, resampled by the bootstrap cells below
b

array([0.57904729, 0.60979805, 0.24622022, 0.85004829, 0.77330445])

In [74]:
## simple bootstrap: resample (x, y) rows jointly and refit on each resample
bootstraps = 10000
bnew = b.copy()  # not needed by this variant; kept so the namespace matches the other bootstrap cells
draws = []
for rep in range(bootstraps):
    idx = np.random.randint(n, size=n)  # row indices drawn with replacement
    draws.append(least_squares_regressor(onpx[idx], npy[idx]).params)
outp = np.array(draws)  # one row of coefficient estimates per replication

In [75]:
## slightly complicated bootstrap: regressors stay fixed, only residuals are resampled
## (loses heteroskedasticity information, because residuals are detached from their rows)
bootstraps = 10000
bnew = b.copy()
#bnew[3] = 0 # imposing the null
fitted = onpx@bnew  # identical in every replication, so computed once
draws = []
for rep in range(bootstraps):
    idx = np.random.randint(n, size=n)  # residual indices drawn with replacement
    draws.append(least_squares_regressor(onpx, fitted + e[idx]).params)
outp = np.array(draws)  # one row of coefficient estimates per replication

In [76]:
## complicated bootstrap: each row of x is resampled together with its own residual
## (preserves heteroskedasticity), and y is rebuilt with coefficient 3 set to zero
bootstraps = 10000
bnew = b.copy()
bnew[3] = 0 # imposing the null
draws = []
for rep in range(bootstraps):
    idx = np.random.randint(n, size=n)  # row indices drawn with replacement
    xs = onpx[idx]
    draws.append(least_squares_regressor(xs, xs@bnew + e[idx]).params)
outp = np.array(draws)  # one row of coefficient estimates per replication

In [77]:
outp

array([[ 0.60872075,  0.59839994,  0.25457121, -0.01213717,  0.79764942],
       [ 0.56835776,  0.56749752,  0.27615951, -0.02465991,  0.81916297],
       [ 0.52231276,  0.58439009,  0.2748202 , -0.03228486,  0.78182452],
       ...,
       [ 0.54027412,  0.62590518,  0.2748044 , -0.02137151,  0.76315605],
       [ 0.57433773,  0.59885574,  0.28106649, -0.01056872,  0.76092433],
       [ 0.56229817,  0.60140955,  0.23868917,  0.00543007,  0.78163952]])

In [78]:
outp.mean(0)

array([5.79417173e-01, 6.09812687e-01, 2.46085081e-01, 1.82956429e-04,
       7.73349002e-01])

In [79]:
b

array([0.57904729, 0.60979805, 0.24622022, 0.85004829, 0.77330445])

In [80]:
np.median(outp,0)

array([5.79951685e-01, 6.09809001e-01, 2.46178348e-01, 3.50371665e-05,
       7.73549857e-01])

In [81]:
bvec

array([0.59580961, 0.62677995, 0.2553745 , 0.85809453, 0.75770855])

In [82]:
outp.std(0)

array([0.04271117, 0.01929997, 0.0174799 , 0.01869805, 0.01888869])

In [83]:
# Analytic standard errors from the fitted model, shown for comparison with the
# bootstrap standard deviations computed from outp.
# NOTE(review): assumes model.vcov_params is the coefficient covariance matrix — confirm in cleands.
np.sqrt(np.diag(model.vcov_params))

array([0.04281331, 0.01916156, 0.01749131, 0.0191231 , 0.01891748])

In [84]:
# 95% percentile interval from the bootstrap draws, one column per coefficient.
# With the cells in notebook order, outp comes from the bootstrap that imposes
# bnew[3] = 0, so column 3 describes the null distribution rather than betahat[3].
outp.sort(axis=0)  # in place and column by column: rows no longer belong to one replication
n_draws = outp.shape[0]
print(outp[int(n_draws*0.025)])
print(outp[int(n_draws*0.975)])

[ 0.49492648  0.57199882  0.21198957 -0.03677253  0.73632199]
[0.66238495 0.64770269 0.28080505 0.03630404 0.81040574]


In [85]:
# imposed null hypothesis: bootstrap critical values for coefficient 3.
# Sort a copy of that column here rather than relying on an earlier cell having
# sorted outp in place, so the quantiles are correct whichever cells ran before.
null_draws = np.sort(outp[:,3])
lowercv = null_draws[int(null_draws.shape[0]*0.025)]
uppercv = null_draws[int(null_draws.shape[0]*0.975)]
print((lowercv,uppercv))
print(b[3])  # the estimate; reject the null when it falls outside (lowercv, uppercv)

(-0.036772529874827195, 0.03630403993639802)
0.8500482856766594


In [86]:
def bootstrap(model,dgp,x,e,seed=None,bootstraps:int=1000):
    """Run a row-resampling bootstrap and collect one fitted result per replication.

    In every replication, row indices are drawn with replacement, the same
    indices are applied to both ``x`` and ``e``, a response is built with
    ``dgp(x_resampled, e_resampled)``, and ``model(x_resampled, response)``
    is stored.

    Parameters
    ----------
    model : callable
        Called as ``model(x_resampled, y)``; its return value is collected.
    dgp : callable
        Called as ``dgp(x_resampled, e_resampled)``; returns the response.
    x, e : array-like
        Indexed along their first axis with the resampled row indices.
    seed : int, optional
        When given, seeds NumPy's global random state before resampling.
    bootstraps : int
        Number of replications.

    Returns
    -------
    list
        The ``model(...)`` results, in replication order.
    """
    if seed is not None:
        np.random.seed(seed)
    n_obs = x.shape[0]
    fits = []
    for _ in range(bootstraps):
        idx = np.random.randint(n_obs, size=(n_obs,))  # with replacement
        fits.append(model(x[idx], dgp(x[idx], e[idx])))
    return fits

In [87]:
# complicated bootstrap (standard error): rows of x are resampled together with
# their residuals and y is rebuilt from the original estimates b
bs = bootstrap(
    least_squares_regressor,
    lambda xs, es: xs@b + es,
    onpx,
    e,
)
np.array([fit.params for fit in bs]).std(0)


array([0.04133424, 0.01941965, 0.01767699, 0.01832018, 0.01894115])

In [88]:
# simple bootstrap: npy is passed in the residual slot, so the "dgp" simply
# hands back the resampled y unchanged
bs = bootstrap(
    least_squares_regressor,
    lambda xs, ys: ys,
    onpx,
    npy,
)
np.array([fit.params for fit in bs]).std(0)

array([0.04285462, 0.01958486, 0.0170425 , 0.01832926, 0.01886355])

In [89]:
npx[:,3]

array([ 2.58820370e+00,  1.37011058e+00, -3.92234076e+00,  2.97130322e-01,
        1.40277955e+00, -2.20503875e-02,  6.14142883e-02, -1.25991588e+00,
        3.84000903e+00,  5.04444380e+00,  2.97669864e+00, -2.73901130e+00,
        4.40652396e+00,  2.15221201e+00, -2.32422395e+00, -1.34563751e+00,
       -2.00870454e+00,  6.41130363e-01, -4.70524312e+00,  9.88968668e-01,
       -2.05553605e+00,  5.23758126e-01,  1.51591356e+00, -2.92482796e+00,
       -2.63899239e+00,  4.37007392e-02, -9.48331684e-01,  1.16371216e+00,
       -2.10308310e+00,  2.23005717e-01, -1.42646247e+00, -2.05600581e+00,
        1.35907218e+00, -4.32733695e+00,  6.54789722e-02,  2.16826718e+00,
       -4.39690389e+00, -9.66580223e-01, -2.33971921e+00, -2.55466955e+00,
       -6.33679305e-01,  4.23803503e-02, -3.33234880e+00,  2.44287619e+00,
        2.85648910e+00, -4.82711147e-01, -1.67495356e+00, -1.14712518e-01,
       -1.50565017e+00, -3.08199938e+00, -1.07502541e+00,  2.48083771e+00,
       -7.28574560e-02, -

In [90]:
# Tabular copy of the simulated data: response y followed by regressors x1..x4.
df = pd.DataFrame({'y': npy, **{f'x{j+1}': npx[:,j] for j in range(4)}})

In [10]:
#model = LassoCV(*add_intercept(['x1','x2','x3','x4'],'y',df))
#model.glance

# Data Science


## Bagging

Average the predictions of multiple models, each fit on a different bootstrap sample.

In [91]:
# Bagged prediction: average the predictions of all bootstrap-fitted models.
np.mean([fit.predict(onpx) for fit in bs], axis=0)

array([ 4.09440907e+00,  1.26409109e+00, -3.88601540e+00,  3.32739220e+00,
        5.52722659e+00,  1.19050181e+00,  4.91284785e+00, -4.12441868e-01,
        2.83727640e+00,  4.99028265e+00,  3.59869102e+00, -4.60727520e+00,
        6.10149847e+00,  1.16696368e+00, -3.46525739e+00,  1.23526979e+00,
       -4.54183616e-01,  4.33978041e+00, -3.24251706e+00,  5.43078440e-01,
       -2.95237180e+00,  4.39235211e-01,  2.77157418e+00, -4.23325565e+00,
       -4.26521124e+00, -3.17738572e+00, -3.40691420e+00,  7.06781555e-01,
       -1.58292152e+00,  2.27673889e+00, -2.45814073e+00,  5.94859572e-01,
        2.34466246e+00, -3.25241676e+00, -4.59769805e-01,  1.77263624e+00,
       -3.37981793e+00, -2.31148049e+00, -4.98711533e+00, -6.10263764e+00,
       -1.83762736e-01,  5.34576242e-02, -1.16517908e+00,  4.50361054e+00,
        8.97108943e+00, -5.54360277e-01, -5.59190206e+00,  1.21392147e+00,
       -3.48720980e+00,  1.37283807e+00, -2.61502192e+00,  6.90778179e+00,
       -2.36583476e-01, -

In [92]:
# Averaged predictions minus the prediction from the averaged coefficients;
# the displayed differences are on the order of 1e-15.
np.mean([fit.predict(onpx) for fit in bs], axis=0) - onpx@np.mean([fit.params for fit in bs], axis=0)

array([ 3.55271368e-15,  3.33066907e-15,  2.66453526e-15, -4.44089210e-15,
       -4.44089210e-15, -1.99840144e-15, -1.15463195e-14,  8.32667268e-15,
        8.88178420e-15, -2.66453526e-15,  4.44089210e-15, -8.88178420e-16,
        1.59872116e-14,  6.43929354e-15,  6.21724894e-15, -2.22044605e-16,
       -1.49880108e-15,  2.66453526e-15, -4.44089210e-15, -1.44328993e-15,
        5.32907052e-15, -1.11022302e-16, -5.32907052e-15, -2.66453526e-15,
        7.99360578e-15,  1.02140518e-14, -3.55271368e-15, -6.66133815e-16,
        2.44249065e-15, -1.77635684e-15,  2.22044605e-15, -2.10942375e-15,
       -3.10862447e-15, -1.77635684e-15, -1.99840144e-15,  5.32907052e-15,
        1.33226763e-15,  2.22044605e-15,  4.44089210e-15,  9.76996262e-15,
        1.66533454e-15,  5.30825384e-15,  2.22044605e-16,  2.66453526e-15,
        3.55271368e-15,  1.33226763e-15, -4.44089210e-15, -3.55271368e-15,
        3.99680289e-15, -9.99200722e-15, -2.22044605e-15,  7.99360578e-15,
       -1.47104551e-15,  

In [93]:
# Random train/test split over row indices: the first 20% of a shuffled index
# vector is held out for testing, the rest is used for training.
deck = np.arange(n)
np.random.shuffle(deck)
n_test = int(n*0.2)
test, train = deck[:n_test], deck[n_test:]

In [94]:
# Baseline: one least-squares fit on the training rows, scored by mean squared
# prediction error on the held-out rows.
model = least_squares_regressor(onpx[train], npy[train])
err = npy[test] - model.predict(onpx[test])
mspe = np.mean(np.square(err))
mspe


0.9353986152069933

In [95]:
# Bagged least squares: fit on bootstrap resamples of the training rows,
# average the held-out predictions, then score them.
bs = bootstrap(
    least_squares_regressor,
    lambda xs, ys: ys,
    onpx[train],
    npy[train],
)
bs = np.mean([fit.predict(onpx[test]) for fit in bs], axis=0)  # bs now holds the averaged predictions
err = npy[test] - bs
mspe = np.mean(np.square(err))
mspe

0.935340769978871

In [96]:
# Single tree fit on the training rows (no intercept column), scored on the
# held-out rows.
# NOTE(review): rpart is presumably the regression tree from cleands — confirm.
model = rpart(npx[train], npy[train])
err = npy[test] - model.predict(npx[test])
mspe = np.mean(np.square(err))
mspe

2.9987580724318765

In [29]:
# Bagged trees: 100 trees fit on bootstrap resamples of the training rows,
# with their held-out predictions averaged before scoring.
bs = bootstrap(
    rpart,
    lambda xs, ys: ys,
    npx[train],
    npy[train],
    bootstraps=100,
)
bs = np.mean([tree.predict(npx[test]) for tree in bs], axis=0)  # bs now holds the averaged predictions
err = npy[test] - bs
mspe = np.mean(np.square(err))
mspe

  tstat = model.params/np.sqrt(np.diag(model.vcov_params))
  return self.residuals.var()*(self.n_obs-1)/self.degrees_of_freedom


1.3859435382211769

## Random Forest

Average the predictions of multiple models, each fit on a different bootstrap sample using a random subset of the x variables.

In [30]:
# Random forest: like bagged trees, but every tree is built with random_x=True.
bs = bootstrap(
    lambda xs, ys: rpart(xs, ys, random_x=True),
    lambda xs, ys: ys,
    npx[train],
    npy[train],
    bootstraps=100,
)
bs = np.mean([tree.predict(npx[test]) for tree in bs], axis=0)  # bs now holds the averaged predictions
err = npy[test] - bs
mspe = np.mean(np.square(err))
mspe

1.3977376605291494

## Boosting

Repeatedly fit trees on the residuals

In [98]:
# Boosting with early stopping: repeatedly fit a shallow tree to the current
# training residuals and keep it only while the held-out MSPE does not get worse.
# Note: the stopping rule looks at the test rows, so the printed MSPE is an
# optimistic estimate of out-of-sample error.
resid = npy.copy()
mspe = (resid[test]**2).mean()  # MSPE of the empty ensemble (predicting zero)
models = []
for i in range(100):
    model = rpart(npx[train],resid[train],max_level=3)
    # Evaluate the tree on a candidate copy so that a rejected tree never
    # leaks into resid; resid stays equal to npy minus the sum of the
    # predictions of the trees actually stored in models.
    candidate = resid - model.predict(npx)
    mspe_new = (candidate[test]**2).mean()
    if mspe_new>mspe: break
    resid = candidate
    models += [model]
    mspe = mspe_new
    print(mspe)

8.44634711586615
5.548646862433834
4.814466204606754
3.8973677921809644
3.7228522883722723


In [None]:
# Fitted values of the boosted ensemble: the sum of every accepted tree's prediction
np.sum([tree.predict(npx) for tree in models], axis=0)

# Programming challenges

## Gradient boosting

Create plots for gradient boosting.

## Insertion sort

Write code that implements the insertion sort algorithm.


## Principal components analysis

Write a PCA class. Add an estimator for the number of principal components to use.