# Del 9: Decision Trees and Random Forests

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

## Introduction to Decision Trees

<img src="https://jakevdp.github.io/PythonDataScienceHandbook/figures/05.08-decision-tree.png" alt="Example of a decision tree">

### Overview of the Data Set

In [None]:
income = pd.read_csv("data/income.csv", index_col=False)
income.head(5)

In [None]:
income.shape

### Converting Categorical Variables

In [None]:
# Columns treated as categorical; each one is overwritten with integer category codes.
cat_columns = [
    "workclass", "education", "marital_status", "occupation", "relationship",
    "race", "sex", "native_country", "high_income",
]

for name in cat_columns:
    # pd.Categorical maps every distinct value of the column to one integer code.
    income[name] = pd.Categorical(income[name]).codes

In [None]:
income.head(3)

### Splitting Data

In [None]:
# Split the rows on workclass code 4.
# NOTE(review): presumably code 4 is the private sector — confirm against the category order.
workclass = income["workclass"]
private_incomes = income.loc[workclass == 4]
public_incomes = income.loc[workclass != 4]

for subset in (private_incomes, public_incomes):
    print(subset.shape)

### Overview of Data Set Entropy

$H(X) = -\sum_{i=1}^{c} {\mathrm{P}(x_i) \log_b \mathrm{P}(x_i)}$

In [None]:
import math


# Share of rows falling into each of the two high_income classes.
total_rows = income.shape[0]
prob_0 = len(income[income["high_income"] == 0]) / total_rows
prob_1 = len(income[income["high_income"] == 1]) / total_rows

# Entropy in bits: the negated sum of p * log2(p) over both classes.
income_entropy = -sum(prob * math.log(prob, 2) for prob in (prob_0, prob_1))

In [None]:
income_entropy

### Information Gain

$IG(T,A) = Entropy(T)-\sum_{v\in A}\frac{|T_{v}|}{|T|} \cdot Entropy(T_{v})$

In [None]:
import numpy as np

def calc_entropy(column):
    """
    Calculate the Shannon entropy (in bits) of the labels in a column.

    Parameters
    ----------
    column : pandas Series, list, or numpy array
        Class labels. Distinct values are counted with np.unique, so labels
        are not restricted to non-negative integers: negative category codes
        and strings work as well.

    Returns
    -------
    float
        -sum(p * log2(p)) over the distinct values, or 0.0 for an empty column.
    """
    values = np.asarray(column)
    # An empty column carries no information; returning early also avoids dividing by zero.
    if values.size == 0:
        return 0.0

    # Only values that actually occur are returned, so every probability is > 0
    # and log2 is always defined.
    _, counts = np.unique(values, return_counts=True)
    probabilities = counts / values.size

    return float(-np.sum(probabilities * np.log2(probabilities)))

In [None]:
income_entropy = calc_entropy(income["high_income"])
income_entropy

In [None]:
# Split the rows into two groups at the median age.
age = income["age"]
median_age = age.median()

left_split = income.loc[age <= median_age]
right_split = income.loc[age > median_age]

In [None]:
# Information gain of the age split: parent entropy minus the size-weighted
# entropies of the two child subsets.
total_rows = income.shape[0]
weighted_child_entropy = sum(
    (split.shape[0] / total_rows) * calc_entropy(split["high_income"])
    for split in (left_split, right_split)
)
age_information_gain = income_entropy - weighted_child_entropy

In [None]:
age_information_gain

### Finding the Best Split

In [None]:
def calc_information_gain(data, split_name, target_name):
    """
    Information gain from splitting ``data`` at the median of one column.

    Rows whose ``split_name`` value is at or below the column median form one
    subset and rows above it form the other. The result is the entropy of
    ``target_name`` over all rows minus the two subset entropies, each
    weighted by the fraction of rows it contains.
    """
    # Entropy of the target before any split
    parent_entropy = calc_entropy(data[target_name])

    # The median of the split column is the threshold
    split_values = data[split_name]
    threshold = split_values.median()
    subsets = (data[split_values <= threshold], data[split_values > threshold])

    # Size-weighted entropy that remains after the split
    total_rows = data.shape[0]
    remaining_entropy = sum(
        (subset.shape[0] / total_rows) * calc_entropy(subset[target_name])
        for subset in subsets
    )

    return parent_entropy - remaining_entropy

In [None]:
columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", 
           "race", "sex", "hours_per_week", "native_country"]

In [None]:
# Information gain of a median split on each candidate column, in the order of `columns`.
information_gains = [
    calc_information_gain(income, name, "high_income") for name in columns
]

In [None]:
# Position of the largest gain (the first one if several tie), then the matching column name.
highest_gain_index = max(range(len(information_gains)), key=information_gains.__getitem__)
highest_gain = columns[highest_gain_index]
highest_gain

## Prikaz delovanja: Decision Trees

In [None]:
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=300, centers=4,
                  random_state=0, cluster_std=1.0)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow')
plt.show()

In [None]:
from helpers import visualize_tree
from sklearn.tree import DecisionTreeClassifier
        
# One panel per tree depth, 1 through 4, side by side.
fig, ax = plt.subplots(1, 4, figsize=(16, 3))
fig.subplots_adjust(left=0.02, right=0.98, wspace=0.1)

for depth, axi in enumerate(ax, start=1):
    visualize_tree(DecisionTreeClassifier(max_depth=depth), X, y, ax=axi)
    axi.set_title(f'depth = {depth}')

plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier().fit(X, y)

In [None]:
from helpers import visualize_classifier

In [None]:
visualize_classifier(DecisionTreeClassifier(), X, y)

In [None]:
from helpers import plot_tree_interactive

In [None]:
plot_tree_interactive(X, y)
plt.show()

## Applying Decision Trees

### Using Decision Trees With scikit-learn

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [None]:
columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", 
           "sex", "hours_per_week", "native_country"]

In [None]:
# Feature matrix and binary target for the income data set.
X = income[columns]
y = income["high_income"]

# Seed the split so the scores in this section are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# The following cells call clf.score / clf.predict, but no classifier has been
# created at this point of the notebook; fit an unconstrained tree here so the
# notebook survives Restart & Run All.
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train, y_train);

In [None]:
print(f"Accuracy on training set: {clf.score(X_train, y_train):.3f}")

In [None]:
print(f"Accuracy on test set: {clf.score(X_test, y_test):.3f}")

### Evaluating Error With AUC

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
predictions = clf.predict(X_test)

In [None]:
# NOTE: despite its name, `error` holds the ROC AUC score (higher is better).
# NOTE(review): `predictions` comes from clf.predict, i.e. hard class labels;
# roc_auc_score is normally given scores or probabilities — confirm this is intended.
error = roc_auc_score(y_test, predictions)
print(error)

In [None]:
predictions = clf.predict(X_train)
print(roc_auc_score(y_train, predictions))

### Decision Tree Overfitting

### Combat overfitting: Restrict the depth of the tree

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
clf = DecisionTreeClassifier(random_state=1)

In [None]:
param_grid = {'min_samples_split': list(range(2,15))}

grid = GridSearchCV(clf, param_grid)

grid.fit(X_train, y_train)

In [None]:
print(grid.best_params_)

In [None]:
# Repeat the evaluation with the best parameter found by the grid search.
# GridSearchCV refits the winning configuration on all of X_train by default,
# so best_estimator_ is used rather than a hard-coded value that may not
# match grid.best_params_ for the current split.
clf = grid.best_estimator_

predictions = clf.predict(X_test)
test_auc = roc_auc_score(y_test, predictions)

train_predictions = clf.predict(X_train)
train_auc = roc_auc_score(y_train, train_predictions)

print('Test:', test_auc)
print('Train:', train_auc)

In [None]:
clf = DecisionTreeClassifier(random_state=1, min_samples_split=14)

param_grid = {'max_depth': list(range(2,12))}

grid = GridSearchCV(clf, param_grid)

grid.fit(X_train, y_train)

print(grid.best_params_)

In [None]:
# Repeat the evaluation with the best parameter found by the grid search.
# best_estimator_ keeps the settings of the estimator handed to the search and
# adds the selected max_depth, refit on all of X_train, so no value needs to
# be hard-coded here.
clf = grid.best_estimator_

predictions = clf.predict(X_test)
test_auc = roc_auc_score(y_test, predictions)

train_predictions = clf.predict(X_train)
train_auc = roc_auc_score(y_train, train_predictions)

print('Test:', test_auc)
print('Train:', train_auc)

In [None]:
# Demonstrate underfitting: a very shallow tree that also needs many samples before it splits.
clf = DecisionTreeClassifier(min_samples_split=100, max_depth=2, random_state=1).fit(X_train, y_train)

predictions = clf.predict(X_test)
train_predictions = clf.predict(X_train)

test_auc = roc_auc_score(y_test, predictions)
train_auc = roc_auc_score(y_train, train_predictions)

print('Test:', test_auc)
print('Train:', train_auc)

In [None]:
# Compare test/train scores for different tree depths.

max_depths = list(range(1, 25))

test_aucs, train_aucs = [], []

for max_depth in max_depths:
    clf = DecisionTreeClassifier(min_samples_split=14, max_depth=max_depth, random_state=1)
    clf.fit(X_train, y_train)

    # Score the same fitted tree on the held-out data first, then on the training data.
    for features, labels, aucs in ((X_test, y_test, test_aucs), (X_train, y_train, train_aucs)):
        aucs.append(roc_auc_score(labels, clf.predict(features)))

In [None]:
plt.plot(max_depths, test_aucs, label='test')
plt.plot(max_depths, train_aucs, c='r', label='train')
plt.legend()
plt.title('Train/test AUC for different max_depth values')
plt.xlabel('max_depth')
plt.ylabel('AUC')
plt.show()

#### The Bias-Variance Tradeoff

### Analyzing decision trees

In [None]:
clf = DecisionTreeClassifier(max_depth=2, random_state=1)
clf.fit(X_train, y_train)

In [None]:
from sklearn.tree import export_graphviz

export_graphviz(clf, 
                out_file="data/tree.dot", 
                class_names=["<=50K", ">50K"],
                feature_names=columns,
                impurity=False, 
                filled=True)

In [None]:
import graphviz

with open("data/tree.dot") as f:
    dot_graph = f.read()
    
graphviz.Source(dot_graph)

### Feature importance in trees

In [None]:
clf.feature_importances_

In [None]:
n_features = len(columns)
plt.barh(range(n_features), clf.feature_importances_, align='center')
plt.yticks(np.arange(n_features), columns)
plt.xlabel("Feature importance")
plt.ylabel("Feature")
plt.show()

### Knowing When to Use Decision Trees

### Prikaz delovanja: Decision trees and over-fitting

In [None]:
from sklearn.datasets import make_blobs
from helpers import visualize_tree

model = DecisionTreeClassifier()

X, y = make_blobs(n_samples=300, centers=4,
                  random_state=0, cluster_std=1.0)

fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
visualize_tree(model, X[::2], y[::2], boundaries=False, ax=ax[0])
visualize_tree(model, X[1::2], y[1::2], boundaries=False, ax=ax[1])
plt.show()

In [None]:
import helpers

helpers.randomized_tree_interactive(X, y)
plt.show()

## Introduction to Random Forests

### Combining Model Predictions With Ensembles

In [None]:
columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", 
           "race", "sex", "hours_per_week", "native_country"]

In [None]:
X = income[columns]
y = income["high_income"]

# Seed the split so the comparison between the models below is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)
clf.fit(X_train, y_train)

clf2 = DecisionTreeClassifier(random_state=1, max_depth=5)
clf2.fit(X_train, y_train)

In [None]:
predictions = clf.predict(X_test)
print(roc_auc_score(y_test, predictions))

In [None]:
predictions = clf2.predict(X_test)
print(roc_auc_score(y_test, predictions))

In [None]:
# Average the class-1 probabilities of the two trees, then round to a 0/1 label.
# X_test already holds exactly the feature columns, so it is passed as is.
predictions, predictions2 = (
    model.predict_proba(X_test)[:, 1] for model in (clf, clf2)
)
combined = (predictions + predictions2) / 2
rounded = combined.round()

print(roc_auc_score(y_test, rounded))

### Introducing Variation With Bagging

In [None]:
tree_count = 10

bag_proportion = .6

In [None]:
# Shuffle the rows with a fixed seed, then keep the first 80% for training
# and the remaining rows for testing.
np.random.seed(1)
shuffled_index = np.random.permutation(income.index)
income = income.reindex(shuffled_index)

train_max_row = math.floor(income.shape[0] * .8)
train, test = income.iloc[:train_max_row], income.iloc[train_max_row:]

In [None]:
# Bagging: fit every tree on its own bootstrap sample of the training rows.
predictions = []
for i in range(tree_count):
    # Seeding the sample with i gives each tree a different but repeatable bag.
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)

    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)
    clf.fit(bag[columns], bag["high_income"])

    predictions.append(clf.predict_proba(test[columns])[:, 1])

# Average over the trees actually fitted; the divisor used to be a hard-coded 10,
# which silently mis-scaled the result whenever tree_count was changed.
combined = np.sum(predictions, axis=0) / len(predictions)
rounded = np.round(combined)

print(roc_auc_score(test["high_income"], rounded))

### Selecting Random Features

In [None]:
# Bagging plus random feature selection at every split.
predictions = []

for i in range(tree_count):
    # Seeding the sample with i gives each tree a different but repeatable bag.
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)

    # max_features="auto" has been removed from scikit-learn; for classifiers it
    # was an alias of "sqrt", which is spelled out here.
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2, splitter="random", max_features="sqrt")
    clf.fit(bag[columns], bag["high_income"])

    predictions.append(clf.predict_proba(test[columns])[:, 1])

# Average over the trees actually fitted instead of a hard-coded 10.
combined = np.sum(predictions, axis=0) / len(predictions)
rounded = np.round(combined)

print(roc_auc_score(test["high_income"], rounded))

### Prikaz delovanja: Random Forests

In [None]:
X, y = make_blobs(n_samples=300, centers=4,
                  random_state=0, cluster_std=1.0)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

tree = DecisionTreeClassifier()
bag = BaggingClassifier(tree, n_estimators=100, max_samples=0.8,
                        random_state=1)

bag.fit(X, y)
visualize_classifier(bag, X, y)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=0)
visualize_classifier(model, X, y);

### Using RandomForestClassifier 

In [None]:
X = income[columns]
y = income["high_income"]

# Seed the split so the random-forest scores below are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

clf = RandomForestClassifier(n_estimators=5, random_state=1, min_samples_leaf=2)
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)
print(roc_auc_score(y_test, predictions))

In [None]:
print(f"Accuracy on training set: {clf.score(X_train, y_train):.3f}")
print(f"Accuracy on test set: {clf.score(X_test, y_test):.3f}")

In [None]:
importances = pd.Series(data=clf.feature_importances_, index= X_train.columns)

In [None]:
importances_sorted = importances.sort_values()

importances_sorted.plot(kind='barh', color='lightgreen')
plt.title('Features Importances')
plt.show()

### Tweaking Parameters to Increase Accuracy

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=2)
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)
print(roc_auc_score(y_test, predictions))

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=2, n_jobs=-1)
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)
print(roc_auc_score(y_test, predictions))

### Reducing Overfitting

### Summary of Random Forests

## Example: Random Forest for Classifying Digits

In [None]:
from sklearn.datasets import load_digits
digits = load_digits()
digits.keys()

In [None]:
# A 6x6 inch figure with almost no spacing between the panels.
fig = plt.figure(figsize=(6, 6))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)

# Show the first 64 digits on an 8x8 grid of axes without ticks.
for i in range(64):
    image, label = digits.images[i], digits.target[i]
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(image, cmap=plt.cm.binary, interpolation='nearest')

    # Annotate the panel with the target value at data coordinates (0, 7).
    ax.text(0, 7, str(label))
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(digits.data, digits.target,
                                                random_state=0)

# Seed the forest as well: the split is already fixed, so an unseeded model
# would make the predictions vary from run to run.
model = RandomForestClassifier(n_estimators=1000, random_state=0)
model.fit(Xtrain, ytrain)
ypred = model.predict(Xtest)

In [None]:
from sklearn import metrics
# classification_report expects (y_true, y_pred); passing them reversed swaps
# the precision and recall columns of the report.
print(metrics.classification_report(ytest, ypred))

In [None]:
from sklearn.metrics import confusion_matrix

mat = confusion_matrix(ytest, ypred)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)

plt.xlabel('true label')
plt.ylabel('predicted label');

## Gradient boosted regression trees (gradient boosting machines)

In [None]:
columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", 
           "race", "sex", "hours_per_week", "native_country"]

X = income[columns]
y = income["high_income"]

# Seed the split so the gradient-boosting scores and sweeps below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)

In [None]:
print(f"Accuracy on training set: {gbrt.score(X_train, y_train):.3f}")
print(f"Accuracy on test set: {gbrt.score(X_test, y_test):.3f}")

In [None]:
gbrt = GradientBoostingClassifier(random_state=0)

param_grid = {'max_depth': list(range(1,7)), 'learning_rate': [0.001, 0.025, 0.01, 0.1, 0.5]}

grid = GridSearchCV(gbrt, param_grid, n_jobs=-1, verbose=1)

grid.fit(X_train, y_train)

print(grid.best_params_)

In [None]:
best_model = grid.best_estimator_

In [None]:
print(f"Accuracy on training set: {best_model.score(X_train, y_train):.3f}")
print(f"Accuracy on test set: {best_model.score(X_test, y_test):.3f}")

In [None]:
# Compare test/train scores for an increasing number of boosting stages.
n_estimators_list = list(range(100, 1100, 100))

test_aucs, train_aucs = [], []
test_accuracy, train_accuracy = [], []

for n_estimators in n_estimators_list:
    clf = GradientBoostingClassifier(max_depth=5, n_estimators=n_estimators, random_state=0)
    clf.fit(X_train, y_train)

    # Score the same fitted model on the test data first, then on the training data.
    for features, labels, aucs, accuracies in (
        (X_test, y_test, test_aucs, test_accuracy),
        (X_train, y_train, train_aucs, train_accuracy),
    ):
        aucs.append(roc_auc_score(labels, clf.predict(features)))
        accuracies.append(clf.score(features, labels))

In [None]:
plt.plot(n_estimators_list, test_aucs, c='b', label='test AUC')
plt.plot(n_estimators_list, train_aucs, c='r', label='train AUC')

plt.plot(n_estimators_list, test_accuracy, c='skyblue', label='test accuracy')
plt.plot(n_estimators_list, train_accuracy, c='lightcoral', label='train accuracy')

plt.legend()
plt.title('Train/test AUC/accuracy for different n_estimators values')
plt.xlabel('n_estimators')
plt.ylabel('AUC/accuracy')
plt.show()

In [None]:
# Compare test/train scores for different tree depths.

max_depths = list(range(1, 10))

test_aucs, train_aucs = [], []
test_accuracy, train_accuracy = [], []

for max_depth in max_depths:
    clf = GradientBoostingClassifier(max_depth=max_depth, random_state=0)
    clf.fit(X_train, y_train)

    # Score the same fitted model on the test data first, then on the training data.
    for features, labels, aucs, accuracies in (
        (X_test, y_test, test_aucs, test_accuracy),
        (X_train, y_train, train_aucs, train_accuracy),
    ):
        aucs.append(roc_auc_score(labels, clf.predict(features)))
        accuracies.append(clf.score(features, labels))

In [None]:
plt.plot(max_depths, test_aucs, c='b', label='test AUC')
plt.plot(max_depths, train_aucs, c='r', label='train AUC')

plt.plot(max_depths, test_accuracy, c='skyblue', label='test accuracy')
plt.plot(max_depths, train_accuracy, c='lightcoral', label='train accuracy')

plt.legend()
plt.title('Train/test AUC/accuracy for different max_depth values')
plt.xlabel('max_depth')
plt.ylabel('AUC/accuracy')
plt.show()

In [None]:
# Compare test/train scores for different learning_rate values at max_depth = 5.

learning_rates = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]

test_aucs, train_aucs = [], []
test_accuracy, train_accuracy = [], []

for learning_rate in learning_rates:
    clf = GradientBoostingClassifier(max_depth=5, learning_rate=learning_rate, random_state=0)
    clf.fit(X_train, y_train)

    # Score the same fitted model on the test data first, then on the training data.
    for features, labels, aucs, accuracies in (
        (X_test, y_test, test_aucs, test_accuracy),
        (X_train, y_train, train_aucs, train_accuracy),
    ):
        aucs.append(roc_auc_score(labels, clf.predict(features)))
        accuracies.append(clf.score(features, labels))

In [None]:
plt.plot(learning_rates, test_aucs, c='b', label='test AUC')
plt.plot(learning_rates, train_aucs, c='r', label='train AUC')

plt.plot(learning_rates, test_accuracy, c='skyblue', label='test accuracy')
plt.plot(learning_rates, train_accuracy, c='lightcoral', label='train accuracy')

plt.legend()
plt.title('Train/test AUC/accuracy for different learning_rate values')
plt.xlabel('learning_rate')
plt.ylabel('AUC/accuracy')
plt.xscale('log')
plt.show()