# Decision Trees

In [None]:
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz

## Scikit-Learn Decision Trees

The main Decision Tree Classifier in Scikit Learn is the `DecisionTreeClassifier()`.

There are several parameters that you can set for your decision tree model in Scikit Learn too. Here are a few of the more interesting ones to play around with to try and get some better results:
* **max_depth**: The max depth of the tree where we will stop splitting the nodes. This is similar to controlling the maximum number of layers in a deep neural network. Lower will make your model faster but not as accurate; higher can give you accuracy but risks overfitting and may be slow.
* **min_samples_split**: The minimum number of samples required to split a node. We discussed this aspect of decision trees above and how setting it to a higher value would help mitigate overfitting.
* **max_features**: The number of features to consider when looking for the best split. Higher means potentially better results with the tradeoff of training taking longer.
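* **min_impurity_split**: Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold. This can be used to trade off combating overfitting (high value, small tree) against high accuracy (low value, big tree). Note: this parameter was deprecated in favor of `min_impurity_decrease` and has been removed from recent versions of Scikit Learn.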
* **presort**: Whether to presort the data to speed up the finding of best splits in fitting. If we sort our data on each feature beforehand, our training algorithm will have a much easier time finding good values to split on. Note: this parameter was deprecated and has been removed from recent versions of Scikit Learn.

https://scikit-learn.org/stable/modules/tree.html#decision-trees

## Example #1
Let's work with our toy example from when we first thought about classification. We'll use our four inputs for training a decision tree (weather outlook, temperature, humidity, and wind) to predict if we would play outside or not.

We'll have to do a bit of preprocessing to get our values to be numeric.

In [None]:
# Raw categorical observations: four input columns with 14 entries each.
outlook = [
    'Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast',
    'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain',
]
temp = [
    'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
    'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
]
humidity = [
    'High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal',
    'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High',
]
wind = [
    'Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong',
    'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong',
]

# Target column: the outcome we want the tree to predict for each row.
play = [
    "Don't Play", "Don't Play", "Play", "Play", "Play", "Don't Play", "Play",
    "Don't Play", "Play", "Play", "Play", "Play", "Play", "Don't Play",
]

# A single encoder instance is re-fitted for every column, so after this
# cell it only remembers the mapping of the last column it saw (the target).
le = LabelEncoder()

# Each fit_transform call replaces the strings of one column with integer
# codes; the codes follow the sorted order of the distinct labels.
outlook_encoded = le.fit_transform(outlook)
print(f"Weather: {outlook_encoded}")

temp_encoded = le.fit_transform(temp)
print(f"Temp: {temp_encoded}")

humidity_encoded = le.fit_transform(humidity)
print(f"Humidity: {humidity_encoded}")

wind_encoded = le.fit_transform(wind)
print(f"Wind: {wind_encoded}")

# Encode the target the same way (0 = "Don't Play", 1 = "Play").
label = le.fit_transform(play)
print(f"Play: {label}")

In [None]:
# Assemble the design matrix: one row per observation, one column per
# encoded feature (outlook, temperature, humidity, wind).
features = np.column_stack(
    (outlook_encoded, temp_encoded, humidity_encoded, wind_encoded)
)

# Default, untuned classifier; fit() hands back the fitted estimator, which
# we keep under a second name for the prediction and plotting cells below.
classification_tree = DecisionTreeClassifier()
tree_model = classification_tree.fit(features, label)

In [None]:
# Lookup tables from each original string label to its integer code, built
# by pairing the raw column with its encoded counterpart.
outlook_dictionary = dict(zip(outlook, outlook_encoded))
temperature_dictionary = dict(zip(temp, temp_encoded))
humidity_dictionary = dict(zip(humidity, humidity_encoded))
wind_dictionary = dict(zip(wind, wind_encoded))

# Reverse direction for the target: integer code -> human-readable outcome.
predict_outcomes = dict(zip(label, play))

# Describe a new day using the same labels the model was trained on:
#   outlook:     Sunny, Overcast, Rain
#   temperature: Hot, Mild, Cool
#   humidity:    High, Normal
#   wind:        Weak, Strong
new_outlook = outlook_dictionary['Rain']
new_temp = temperature_dictionary['Hot']
new_humidity = humidity_dictionary['High']
new_wind = wind_dictionary['Weak']

# The model expects a 2-D input, hence the single-row nested list.
sample = [[new_outlook, new_temp, new_humidity, new_wind]]

ypred = tree_model.predict(sample)
print(f'The model predicts: {predict_outcomes[ypred[0]]}')

# Probability columns follow the encoded class order (0, then 1).
yprob = tree_model.predict_proba(sample)
print(f"Predicted Probability of Don't Play: {yprob[0, 0]*100:.2f}%")
print(f"Predicted Probability of Play: {yprob[0, 1]*100:.2f}%")

One of the benefits of the Decision Tree is that we can visualize the tree graphically. Here we'll use the graphviz module to make a nice looking tree.

In [None]:
# Export the fitted toy tree to DOT source and render it inline.
#
# class_names must be listed in ascending order of the encoded class ids.
# The target was label-encoded from the strings "Don't Play" / "Play", which
# sort as "Don't Play" -> 0 and "Play" -> 1, so "Dont Play" has to come
# first; the reverse order labels every node with the wrong class.
dot_data = export_graphviz(
    tree_model,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=['Outlook', 'Temp', 'Humidity', 'Wind'],
    class_names=['Dont Play', 'Play'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

## Example #2

Now let's work with our Iris dataset and only train our model on a subset of our total data so we can validate our model based on the held back data.

In [None]:
# Fetch the bundled Iris dataset (feature matrix, targets and metadata).
iris_data = load_iris()

# Hold back part of the samples so the model can be scored on data it has
# never seen; no random_state is given, so the split differs on every run.
xtrain, xtest, ytrain, ytest = train_test_split(
    iris_data.data,
    iris_data.target,
)

# Create a default decision tree and fit it on the training portion only.
classification_tree = DecisionTreeClassifier().fit(xtrain, ytrain)

In [None]:
# Turn the fitted Iris tree into DOT source, labelling splits and leaves
# with the feature and class names that ship with the dataset.
dot_data = export_graphviz(
    classification_tree,
    out_file=None,  # return the DOT source as a string
    feature_names=iris_data.feature_names,
    class_names=iris_data.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Leaving the Source object as the last expression displays it in the notebook.
graph = graphviz.Source(dot_data)
graph
# To render to a file instead, uncomment:
#graph.render("iris", view=True)

In [None]:
# Predict a class for every held-back sample.
ypred = classification_tree.predict(X=xtest)

In [None]:
# Fraction of held-back samples whose predicted class matches the true one.
metrics.accuracy_score(y_true=ytest, y_pred=ypred)

In [None]:
# Rows are the true classes, columns the predicted ones; off-diagonal cells
# are the misclassifications.
confusion = metrics.confusion_matrix(y_true=ytest, y_pred=ypred)

# Annotate each cell with its count and draw the matrix as a heatmap.
sns.heatmap(confusion, annot=True, cmap=plt.cm.BuPu)
plt.show()