# Decision trees with scikit-learn

Let's create a decision tree using the popular Iris dataset.

In [None]:
from sklearn.datasets import load_iris

# Load the Iris data set, then unpack the feature matrix and the label vector
iris = load_iris()
X, y = iris.data, iris.target

In [None]:
import matplotlib.pyplot as plt

class_names = iris.target_names
feature_names = iris.feature_names

# Take the number of features from the data instead of hard-coding it
n_features = X.shape[1]

# Create a scatter plot for each unordered pair of distinct features
for i in range(n_features):
    for j in range(i + 1, n_features):
        plt.figure(figsize=(5, 5))
        scatter = plt.scatter(X[:, i], X[:, j], c=y, cmap=plt.cm.Set1, edgecolor='k')
        plt.xlabel(feature_names[i])
        plt.ylabel(feature_names[j])
        plt.title(f"{feature_names[i]} vs {feature_names[j]}")

        # Map each colour back to its class name so the plot is readable
        handles, _ = scatter.legend_elements()
        plt.legend(handles, list(class_names), title="Species")
        plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text

# Hold out 30% of the samples for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Seed the tree as well: scikit-learn permutes the features at every split,
# so equally good splits may otherwise be resolved differently on each run
clf = DecisionTreeClassifier(random_state=1)

clf = clf.fit(X_train, y_train)

Now, let's display the resulting tree.

In [None]:
# Render the fitted tree as indented text rules labelled with the feature names
tree_rules = export_text(clf, feature_names=iris.feature_names)

# Print the class names first, then the rules themselves
print(iris.target_names)
print(tree_rules)

And calculate its accuracy on the test set.

In [None]:
from sklearn import metrics

# Predict the held-out samples and compare the predictions with the true labels
y_pred = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)

Let's select the best two features so that we can visualize the results in two dimensions.

In [None]:
# Column indices of the two features kept for the 2-D visualisation
selected_features = [0, 2]

X_train_2 = X_train[:, selected_features]
X_test_2 = X_test[:, selected_features]

# Create the decision tree classifier (seeded so the fitted tree is reproducible)
clf = DecisionTreeClassifier(random_state=1)

# Train the classifier on the two selected features only
clf.fit(X_train_2, y_train)

In [None]:
import numpy as np

def plot_classification_regions(X, y, clf, step=0.02):
    """Plot the class regions predicted by a fitted classifier in 2-D.

    The first two columns of ``X`` span the plane. A regular grid covering
    the data range (padded by one unit on every side) is classified with
    ``clf.predict`` and drawn as filled contours; the samples in ``X`` are
    then drawn on top, coloured by ``y``.

    Args:
        X: 2-D array whose columns 0 and 1 are used as the plot axes.
        y: Values used to colour the samples of ``X``.
        clf: Object with a ``predict`` method that accepts an
            ``(n_points, 2)`` array.
        step: Spacing of the evaluation grid along both axes. Smaller
            values give smoother region borders but need more predictions.

    Raises:
        ValueError: If ``step`` is not strictly positive.
    """
    if step <= 0:
        raise ValueError("step must be strictly positive")

    # Generate a grid of points that spans the (padded) feature space
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))

    # Obtain the predicted label for each point in the grid
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Reshape the flat predictions back into the grid shape
    Z = Z.reshape(xx.shape)

    # Draw the decision regions as filled contours of the predicted labels
    plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.Paired)

    # Overlay the original data points
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap=plt.cm.Paired)

    # Show the plot
    plt.show()
    
# Show the regions of the two-feature tree together with its training samples
plot_classification_regions(X=X_train_2, y=y_train, clf=clf)

### Limiting the minimum number of samples in a leaf

In [None]:
# Require at least 5 training samples in every leaf; the seed keeps the
# fitted tree (and therefore the printed rules) identical across runs
clf = DecisionTreeClassifier(min_samples_leaf=5, random_state=1)

clf = clf.fit(X_train, y_train)

tree_rules = export_text(clf, feature_names=iris.feature_names)
print(tree_rules)

In [None]:
# Score the current tree on the held-out test samples
y_pred = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)

This tree is simpler, but as accurate as the original one:
- The removed branches contain information that is specific to the training set

Let's see the classification regions now.

In [None]:
# Refit the leaf-size-limited tree on the two plotting features only;
# the seed keeps the drawn regions identical across runs
clf = DecisionTreeClassifier(min_samples_leaf=5, random_state=1)
clf.fit(X_train_2, y_train)
plot_classification_regions(X_train_2, y_train, clf)

### Limiting the tree height

In [None]:
# Allow at most two levels of splits; the seed keeps the fitted tree
# (and therefore the printed rules) identical across runs
clf = DecisionTreeClassifier(max_depth=2, random_state=1)

clf = clf.fit(X_train, y_train)

tree_rules = export_text(clf, feature_names=iris.feature_names)
print(tree_rules)

In [None]:
# Score the current tree on the held-out test samples
y_pred = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)

The tree is simpler, and it is as accurate as before. However, this simplification has its limits...

In [None]:
# Allow a single split only; the seed keeps the chosen split
# (and therefore the printed rules) identical across runs
clf = DecisionTreeClassifier(max_depth=1, random_state=1)

clf = clf.fit(X_train, y_train)

tree_rules = export_text(clf, feature_names=iris.feature_names)
print(tree_rules)

In [None]:
# Score the current tree on the held-out test samples
y_pred = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)

In [None]:
# Score the same tree on the samples it was trained on, for comparison
# with its accuracy on the test set
y_pred = clf.predict(X_train)
accuracy = metrics.accuracy_score(y_true=y_train, y_pred=y_pred)

print("Accuracy on Training:", accuracy)

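Now, we have a clear example of **underfitting**: a tree of depth 1 has only two leaves, so it cannot separate the three classes, and its accuracy is low on both the training set and the test set.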