In [None]:
import numpy as np
import sklearn.ensemble as ske
import sklearn.cross_validation as skv
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.tree as skt
import pandas as pd
%matplotlib inline
sns.set_style("white")

# <center> Week 9 - Tree Based Methods </center>

<b>Generate Data</b>

Generate data <font face ="Courier">(X,y)</font> so that the relationship looks like the figure below. Both should be (25,1) arrays. Note that there's no error term!

<img src = "example.png">

In [None]:
# Inputs: 25 evenly spaced points on [-5, 5], stored as a (25, 1) column.
X = np.linspace(-5, 5, 25).reshape(25, 1)
# Noise-free response: a logistic curve scaled by 5, evaluated at X, so it is also (25, 1).
y = 5/(1 + np.exp(-X))

<b>Discussion</b> 

+ If you could fit *one horizontal line* to this data, at what height would you fit it? [Based on what criterion?]

+ If you could fit *two horizontal lines*, where would you place them?

+ How about more horizontal lines?

<hr>

# Regression Trees

Use <font face="Courier">sklearn.tree.DecisionTreeRegressor</font> to fit trees of different <font face="Courier">max_depth</font>s. Then plot the predicted value over a grid of $X$s.

In [None]:
# Regression tree limited to depth 1, i.e. at most one split.
tree = skt.DecisionTreeRegressor(max_depth=1)

In [None]:
# Train the tree on the generated sample.
tree.fit(X, y)

# Evaluate the fitted tree on a dense column of 1000 points spanning [-5, 5].
x_grid = np.linspace(-5, 5, 1000).reshape(-1, 1)
yhat = tree.predict(x_grid)

Plot it!

In [None]:
fig, ax = plt.subplots(1, figsize = (15,7))
ax.grid()
# Training points. The input array is named `X` (upper case); no lower-case `x` exists.
ax.scatter(X, y, s = 200)
# Piecewise-constant tree prediction over the dense grid.
# The trailing `;` suppresses the list-of-Line2D repr in the cell output.
ax.plot(x_grid, yhat, linewidth = 5, color = "darkorange");

<b>Discussion</b> 

+ Which variables are we optimizing over?

+ How many leaves are there when max_depth = 2? 4? 6?

## <b>Bias vs. Variance in Trees</b>

Let's introduce some error.

In [None]:
# Knobs for the bootstrap experiment in the next cell:
# the tree depth limit and the number of bootstrap resamples to draw.
max_depth, n_bootstrap = 1, 1

In [None]:
fig, ax = plt.subplots(1, figsize = (12,6))

# Noisy sample: the logistic curve evaluated at XX plus Gaussian noise (sd 0.5).
# The signal is computed from XX directly rather than from the notebook-level `y`,
# so this cell works for any n_obs and regardless of what `y` currently holds.
n_obs = 25
XX = np.linspace(-5, 5, n_obs).reshape(n_obs, 1)
yy = 5/(1 + np.exp(-XX)) + np.random.normal(0, .5, size = (n_obs, 1))

# Dense grid used both for the tree predictions and for the true curve.
# It is defined before the loop that needs it, so the cell is self-contained.
x_grid = np.linspace(-5, 5, 1000)[:,np.newaxis]
y_true = 5/(1 + np.exp(-x_grid))

ax.scatter(XX, yy, s = 250, alpha = .5)

for i in range(n_bootstrap):

    # Bootstrap: draw n_obs row indices with replacement, then fit a fresh tree on them.
    idx = np.random.choice(n_obs, replace = True, size = n_obs)
    tree = skt.DecisionTreeRegressor(max_depth = max_depth) # Note the max_depth!
    tree.fit(XX[idx], yy[idx])
    yhat = tree.predict(x_grid)
    ax.plot(x_grid, yhat, linewidth = 2, color = "darkorange")

# True (noise-free) curve, drawn on top of the bootstrap fits.
ax.plot(x_grid, y_true, linewidth = 5)

#ax.set_title("Bootstrap iterations: %d" % n_bootstrap, fontsize = 20)
# Label with the configured depth; `tree` would be stale if the loop ran zero times.
ax.text(-4, 4, "Max Depth: %d" % max_depth, fontsize = 20)
sns.despine()


<b>Discussion</b>

+ How does the variance of *one* tree behave as we increase the number of leaves? 

+ When we resample the data, how does the *average* of the fitted trees perform?

+ Use your insight from the previous question to propose an estimator. Then, immediately time-travel to 25 years ago, publish your idea in an academic paper and go enjoy your professorship at Stanford.

<b>By the way</b>

+ Don't worry about algorithm 8.1 in the book unless you're really into trees. 

+ No one uses trees anymore. They've been superseded by...

<hr>


# Random Forests

<font size = 1>[This is one of my favorite algorithms.]</font>

<b>Warm-up</b>

+ Given a bunch of data $\{Z_{i}\}$ *iid* with mean $\mu$ and variance $\sigma^2$, propose an estimator for $E[Z]$.

+ What is the variance of that estimator?

+ What if your data is not independently drawn? [Hint: think about extreme cases of non-independence.]

<b>Intuition</b>

The *random* in *random* forests comes from the idea of "decorrelating" each tree estimate, by injecting randomness in the estimate. This is accomplished by:

+ Bagging

+ Not using all regressors to build each tree (!)

Sounds stupid, works wonders.

<b>Data</b>

We'll use the *Heart* dataset, as in the textbook. 

In [None]:
# Load the Heart data (the first CSV column is used as the index) and drop rows with missing values.
heart = pd.read_csv("heart.csv", index_col = 0).dropna()

# Integer codes for the two string-valued columns.
chest_pain_codes = {"asymptomatic":0, "typical":1,  "nonanginal":2, "nontypical":3}
ahd_codes = {"Yes":1, "No":0}

# map(...).astype(int) yields integer columns without relying on Series.replace's
# implicit object-to-int downcast (deprecated in pandas 2.2). A label missing from
# the dict maps to NaN, so astype(int) raises instead of leaving strings in the column.
heart["ChestPain"] = heart["ChestPain"].map(chest_pain_codes).astype(int)
heart["AHD"] = heart["AHD"].map(ahd_codes).astype(int)
heart.head()

In [None]:
# Columns fed to the forest as predictors; the response is the AHD column.
feature_names = [
    "Age", "Sex", "ChestPain", "RestBP", "Chol", "Fbs",
    "RestECG", "MaxHR", "ExAng", "Oldpeak", "Ca",
]
X = heart[feature_names]
y = heart["AHD"]

Let's use <font face = "Courier">sklearn.ensemble.RandomForestClassifier</font> 

In [None]:
# Forest with a single tree; oob_score=True requests the out-of-bag score during fitting.
rf = ske.RandomForestClassifier(n_estimators=1, oob_score=True)

In [None]:
# fit() returns the estimator itself, so the OOB score can be read from the same expression.
oob_accuracy = rf.fit(X, y).oob_score_
print(oob_accuracy)  # If n_estimators is low, it will produce a warning. Don't worry about it.

<b>Variable Importance</b>

In [None]:
fig, ax = plt.subplots(1, figsize = (10, 5))
# One importance per feature column, sorted ascending so the largest bar ends up on top.
# sort_values() is used because Series.order() no longer exists in pandas.
importances = pd.Series(index = X.columns, data = rf.feature_importances_).sort_values()
importances.plot(ax = ax, kind = "barh", color = "purple")
# Set the y tick label size in one call; per-tick `tick.label` is gone from current matplotlib.
ax.tick_params(axis = "y", labelsize = 14)
ax.set_title("Variable Importance", fontsize = 20)
# feature_importances_ is the impurity-based importance (normalized mean decrease in
# impurity), not a permutation-based accuracy drop, so the axis is labelled accordingly.
ax.set_xlabel("Mean decrease in impurity (normalized)", fontsize = 18);

<b>Note</b>

Feel free to use this in your HW5 / Final Project!