# Decision Trees and Random Forests

**Goal: identify the penguin species based on their measurements.**

In [None]:
import pickle

import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

### Preparations
Similar to the ML Workflow notebook.

In [None]:
# load the penguin data set; the CSV file uses ';' as its field separator
df = pd.read_csv(
    '../data/penguins_simple.csv',
    sep=';',
)

In [None]:
# train-validation-test split
# step 1: set aside 20% of all rows as the test set
#         (the fixed seed makes the split reproducible, which pays off now)
train_val, test = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)
# step 2: split the remainder again, keeping 20% of it for validation
train, val = train_test_split(
    train_val,
    random_state=43,
    test_size=0.2,
)

In [None]:
# define X and y
# feature columns; 'Sex' is deliberately last so that COLUMNS[:-1]
# selects the measurement columns only
COLUMNS = [
    'Culmen Length (mm)',
    'Culmen Depth (mm)',
    'Flipper Length (mm)',
    'Body Mass (g)',
    'Sex',
]

# features (X) and species labels (y) for the training and validation splits
Xtrain, Xval = train[COLUMNS], val[COLUMNS]
ytrain, yval = train['Species'], val['Species']

### Data Exploration

In [None]:
# culmen depth against culmen length for the training penguins, coloured by species
sns.scatterplot(
    data=Xtrain,
    x='Culmen Length (mm)',
    y='Culmen Depth (mm)',
    hue=ytrain,
)

### Building a Decision Tree from Scratch

#### Exercise 1: Edit the conditionals to make one prediction for every penguin

In [None]:
def _guess_species(penguin):
    """Hand-written two-level decision tree: return a species name for one row.

    In a DecisionTreeClassifier the column names and thresholds used below
    are found automatically; here they are picked by hand.
    The possible answers are 'Adelie', 'Chinstrap' and 'Gentoo'.
    """
    # first question: is the culmen deeper than 16 mm?
    if not (penguin['Culmen Depth (mm)'] > 16):
        return 'Gentoo'
    # second question (deep culmen only): is the culmen shorter than 45 mm?
    if penguin['Culmen Length (mm)'] < 45:
        return 'Adelie'
    return 'Chinstrap'


# one prediction for every penguin in the training data
predictions = [_guess_species(penguin) for _, penguin in Xtrain.iterrows()]

In [None]:
# calculate the accuracy of the model
# accuracy_score expects (y_true, y_pred): true labels first, predictions second.
# Accuracy itself is symmetric, but other metrics (precision, recall, ...) are not,
# so the documented order is used here.
round(accuracy_score(ytrain, predictions), 3)

## Find the optimal separation with scikit-learn

In [None]:
# feature engineering
# One-hot encode the categorical 'Sex' column (dropping the first category) and
# pass the remaining columns through unchanged. The encoded column(s) come first
# in the output, followed by COLUMNS[:-1].
#
# The encoder's `sparse=False` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4. To stay compatible with old and new
# versions, dense output is requested from the ColumnTransformer instead:
# `sparse_threshold=0` makes it always return a dense array.
ohc = ColumnTransformer(
    [
        ('one-hot', OneHotEncoder(drop='first', handle_unknown='error'), ['Sex']),
        ('do nothing', 'passthrough', COLUMNS[:-1]),
    ],
    sparse_threshold=0,
)
ohc.fit(Xtrain)
Xtrans = ohc.transform(Xtrain)

In [None]:
# train the model
m = DecisionTreeClassifier(
    max_depth=5,       # we allow that many questions
    random_state=42,   # fixed seed: ties between equally good splits are broken
                       # randomly, so without it the tree can differ between runs
)
m.fit(Xtrans, ytrain)

In [None]:
# evaluate on the training data
ypred = m.predict(Xtrans)
# accuracy_score expects (y_true, y_pred): true labels first, predictions second
round(accuracy_score(ytrain, ypred), 3)

In [None]:
# evaluate on the validation data
# reuse the transformer fitted on the training data (transform only, no re-fit)
Xtrans_val = ohc.transform(Xval)
ypred_val = m.predict(Xtrans_val)
# accuracy_score expects (y_true, y_pred): true labels first, predictions second
round(accuracy_score(yval, ypred_val), 3)

#### Exercise 2: Try a `max_depth` hyperparameter of 1, 5 and 10 as well. What do you observe?

## Plot the tree

In [None]:
plt.figure(figsize=(12, 8))
# assigning the return value keeps the notebook from printing the list of annotations
t = plot_tree(
    m,
    # same order as the transformer output: encoded 'Sex' first, then the other columns
    # (assumes the one-hot step yields a single 'Sex' column)
    feature_names=['Sex'] + COLUMNS[:-1],
    # take the class labels from the fitted model instead of hard-coding them,
    # so they always match the order the model uses internally
    class_names=list(m.classes_),
    filled=True,
)

## Random Forests

In [None]:
# same procedure, different model, different hyperparameters
m = RandomForestClassifier(
    n_estimators=200,   # number of trees
    max_depth=5,
    random_state=42,    # random seed
    n_jobs=-1,
)
m.fit(Xtrans, ytrain)

In [None]:
# evaluate on the training data
ypred = m.predict(Xtrans)
# accuracy_score expects (y_true, y_pred): true labels first, predictions second
round(accuracy_score(ytrain, ypred), 3)

In [None]:
# evaluate on the validation data
# reuse the transformer fitted on the training data (transform only, no re-fit)
Xtrans_val = ohc.transform(Xval)
ypred_val = m.predict(Xtrans_val)
# accuracy_score expects (y_true, y_pred): true labels first, predictions second
round(accuracy_score(yval, ypred_val), 3)

#### Exercise 3: Tune the hyperparameters so that you get a validation accuracy of at least 97%

In [None]:
# label every importance with its feature name; the importances follow the column
# order of the transformed matrix (encoded 'Sex' first, then the other columns),
# which is NOT the order of Xtrain.columns
# (assumes the one-hot step yields a single 'Sex' column)
pd.Series(m.feature_importances_, index=['Sex'] + COLUMNS[:-1])

In [None]:
# feature names in the order the model saw them: the transformer puts the encoded
# 'Sex' column first, whereas Xtrain.columns lists 'Sex' last
# (assumes the one-hot step yields a single 'Sex' column)
pd.Index(['Sex'] + COLUMNS[:-1])

In [None]:
# save the trained model to disk; the context manager guarantees that the file
# is flushed and closed even if pickling fails
with open('penguin_forest.pkl', 'wb') as model_file:
    pickle.dump(m, model_file)