# Decision Trees

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

<img src="./images/decision_tree.png"/>

## Building decision trees

<img src="./images/two_moons.png"/>

<img src="./images/depth_1.png"/>

<img src="./images/depth_2.png">

<img src="./images/depth_9.png">

---

In [None]:
from helpers import helpers_tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_blobs

        
# Four side-by-side panels, one per tree depth (1 through 4).
fig, ax = plt.subplots(1, 4, figsize=(16, 3))
fig.subplots_adjust(left=0.02, right=0.98, wspace=0.1)

X, y = make_blobs(n_samples=300, centers=4, random_state=0, cluster_std=1.0)

for axi, depth in zip(ax, range(1, 5)):
    # Pin random_state so the panels come out the same on every
    # Restart & Run All, like the seeded classifiers later in this notebook.
    model = DecisionTreeClassifier(max_depth=depth, random_state=0)
    # NOTE(review): visualize_tree is a project-local helper; presumably it
    # fits `model` on (X, y) and draws the result on `axi` — confirm.
    helpers_tree.visualize_tree(model, X, y, ax=axi)
    axi.set_title(f'depth = {depth}')

plt.show()

---

In [None]:
from sklearn.datasets import make_blobs

# Build the four-centre blob dataset and show it, coloured by class label.
X, y = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=1.0,
    random_state=0,
)
# X.T has one row per feature, so unpacking it yields the x and y coordinates.
plt.scatter(*X.T, c=y, s=50, cmap="rainbow")
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a tree with default settings on the blob data. random_state is pinned so
# that re-running the cell rebuilds the same tree, as the later cells do.
tree = DecisionTreeClassifier(random_state=0).fit(X, y)

In [None]:
from helpers import helpers_tree

# Hand the blob data to the project-local interactive tree helper.
# NOTE(review): presumably this fits a tree and exposes its settings through a
# widget — confirm against helpers_tree before relying on its output.
helpers_tree.plot_tree_interactive(X, y)
plt.show()

## Controlling complexity of decision trees

In [None]:
from helpers import helpers_tree

# Train on two disjoint halves of the blob data (even rows vs. odd rows) and
# draw the results side by side. random_state is pinned so that any difference
# between the panels comes from the data, not from the estimator's randomness.
model = DecisionTreeClassifier(random_state=0)

fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
for axi, offset in zip(ax, (0, 1)):
    # offset=0 selects rows 0, 2, 4, ...; offset=1 selects rows 1, 3, 5, ...
    helpers_tree.visualize_tree(model, X[offset::2], y[offset::2], boundaries=False, ax=axi)
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast-cancer dataset that ships with scikit-learn.
cancer = load_breast_cancer()

# Split with stratification on the target and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=42,
)

# Tree with no depth limit; fit() returns the estimator itself.
tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

train_accuracy = tree.score(X_train, y_train)
test_accuracy = tree.score(X_test, y_test)
print(f"Accuracy on training set: {train_accuracy:.3f}")
print(f"Accuracy on test set: {test_accuracy:.3f}")

In [None]:
# Same split, but cap the tree at depth 4 and compare the two accuracies again.
tree = DecisionTreeClassifier(random_state=0, max_depth=4).fit(X_train, y_train)

train_accuracy = tree.score(X_train, y_train)
test_accuracy = tree.score(X_test, y_test)
print(f"Accuracy on training set: {train_accuracy:.3f}")
print(f"Accuracy on test set: {test_accuracy:.3f}")

---

In [None]:
# Sweep max_depth from 1 to 9 and record each tree's accuracy on both splits,
# keyed by depth.
# NOTE: the spelling `traning_scores` is read by the plotting cell that follows,
# so the name is kept as is.
depths = list(range(1, 10))
traning_scores, testing_scores = {}, {}

for depth in depths:
    tree = DecisionTreeClassifier(random_state=0, max_depth=depth).fit(X_train, y_train)
    traning_scores[depth] = tree.score(X_train, y_train)
    testing_scores[depth] = tree.score(X_test, y_test)

In [None]:
# Accuracy versus max_depth for both splits. The dict views are materialised
# as lists so the plotted data are plain sequences.
plt.plot(list(traning_scores.keys()), list(traning_scores.values()), c="blue", label="training")
plt.plot(list(testing_scores.keys()), list(testing_scores.values()), c="red", label="testing")
# Label the axes so the figure can be read on its own.
plt.xlabel("max_depth")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

## Analyzing decision trees

In [None]:
from sklearn.tree import export_graphviz

In [None]:
# Refit the depth-4 tree, then write it out in Graphviz .dot format.
tree = DecisionTreeClassifier(random_state=0, max_depth=4).fit(X_train, y_train)

export_graphviz(
    tree,
    out_file="data/tree.dot",
    feature_names=cancer.feature_names,
    class_names=["malignant", "benign"],
    filled=True,
    impurity=False,
)

In [None]:
import graphviz

# Read the exported .dot file back as UTF-8 rather than the platform default
# encoding (export_graphviz is understood to write its output file as UTF-8),
# so the round trip does not depend on the machine's locale.
with open("data/tree.dot", encoding="utf-8") as f:
    dot_graph = f.read()

# Last expression of the cell: its rich repr is what the notebook displays.
graphviz.Source(dot_graph)

## Feature importance in trees

In [None]:
# Importance scores of the most recently fitted `tree`, shown as the cell output.
tree.feature_importances_

In [None]:
def plot_feature_importances_cancer(model):
    """Show a horizontal bar chart of a fitted model's feature importances.

    One bar is drawn per column of the notebook-level ``cancer`` dataset, and
    the y-axis ticks are labelled with ``cancer.feature_names``.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``feature_importances_`` with one entry
        per column of ``cancer.data``.
    """
    feature_count = cancer.data.shape[1]
    bar_positions = np.arange(feature_count)

    plt.figure(figsize=(5, 7))
    plt.barh(bar_positions, model.feature_importances_, align='center')
    plt.yticks(bar_positions, cancer.feature_names)
    plt.ylabel("Feature")
    plt.xlabel("Feature importance")
    plt.show()

# Chart the importances of the tree fitted above.
plot_feature_importances_cancer(tree)

## Decision trees for regression

In [None]:
# Load the RAM price table; the first CSV column becomes the row index.
ram_prices = pd.read_csv("data/ram_price.csv", index_col=0)

In [None]:
# Preview the first rows to check the loaded columns.
ram_prices.head()

In [None]:
# Price against date, with the price axis on a logarithmic scale.
plt.semilogy(ram_prices.date, ram_prices.price)
plt.ylabel("Price in $/Mbyte")
plt.xlabel("Year")
plt.show()

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

# Split at the year 2000: earlier rows are the history we train on,
# rows from 2000 onward are what we try to forecast.
data_train = ram_prices.loc[ram_prices.date < 2000]
data_test = ram_prices.loc[ram_prices.date >= 2000]

In [None]:
# Predict prices from the date alone: turn the date column into an
# (n_samples, 1) feature matrix. The row count is inferred with -1 instead of
# being read from a previous `X_train`, which at this point still holds the
# breast-cancer split and would make the reshape fail on a fresh run.
X_train = data_train["date"].values.reshape(-1, 1)

# we use a log-transform to get a simpler relationship of data to target
y_train = np.log(data_train["price"])

# Fit both models on the same (date, log-price) pairs for comparison.
tree = DecisionTreeRegressor().fit(X_train, y_train)
linear_reg = LinearRegression().fit(X_train, y_train)

In [None]:
# Column vector holding every date in the table (training and test years alike).
X_all = ram_prices.date.values[:, np.newaxis]

# Both models were fitted on log-prices, so these predictions are in log space.
pred_lr = linear_reg.predict(X_all)
pred_tree = tree.predict(X_all)

In [None]:
# Map the log-space predictions back to prices by exponentiating.
price_tree, price_lr = np.exp(pred_tree), np.exp(pred_lr)

In [None]:
# Overlay the observed prices (train and test) with both models'
# back-transformed predictions, on a logarithmic price axis.
plt.semilogy(data_train.date, data_train.price, label="Training data")
plt.semilogy(data_test.date, data_test.price, label="Test data")
plt.semilogy(ram_prices.date, price_tree, label="Tree prediction")
plt.semilogy(ram_prices.date, price_lr, label="Linear prediction")
# Same axis labels as the raw price plot earlier in this section.
plt.xlabel("Year")
plt.ylabel("Price in $/Mbyte")
plt.legend()
plt.show()

## Strengths, weaknesses, and parameters