<a href="https://colab.research.google.com/github/junietan/basicPythonMLlessons/blob/main/3_Iris%2BDT%2Bsplit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import the relevant packages
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree


### The dataset

In [None]:
# We can load the iris dataset straight from sklearn.
# The returned object bundles everything used in the following cells:
# the measurements (iris.data), the labels (iris.target), the feature and
# class names, and a text description (iris.DESCR).
iris = load_iris()


In [None]:
# Understanding the iris dataset before modelling it
# Names of the input columns (the features)
print("Feature names:", iris.feature_names)
# Names of the classes we want to predict
print("Target names:", iris.target_names)
# Shape of the input table: (number of samples, number of features)
print("Shape of data:", iris.data.shape)
# Text description that comes with the dataset
print("Description:\n", iris.DESCR)

In [None]:
# load data and target from iris dataset
X, y = iris.data, iris.target

# Check what the input looks like.
# Only the first few rows are displayed: every row has the same layout, and
# printing the whole array would bury the rest of the notebook in output.
X[:5]

In [None]:
# Check what the target looks like
# (the whole array is displayed, one entry per sample in X)
y

In [None]:
# X is a 2-D array: 150 rows (samples), each with 4 columns (features)
X.shape

### Splitting the data
When a model is trained on the entire dataset, it can memorize the data rather than learning the underlying patterns. This phenomenon is called **overfitting**.

The model will perform exceptionally well on the data it has seen but will likely perform poorly on new, unseen data. Splitting the data allows you to train the model on one portion (the training set) and then test its ability to generalize to a different, unseen portion (the testing set).

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation: 80% of the samples are used for
# training and the remaining 20% are kept aside for testing.
train_input, test_input, train_target, test_target = train_test_split(
    X,
    y,
    test_size=0.2,    # fraction of the samples reserved for testing
    random_state=66,  # fixed seed so the same split is produced on every run
)

print(f"Training set size: {train_input.shape[0]} samples")
print(f"Testing set size: {test_input.shape[0]} samples")

### Creating the decision tree

In [None]:
# First, we need to define the decision tree and its parameters, if any
# Then, we need to train/create the tree based on the data
# Both are easily achieved through sklearn, with 2 simple commands

In [None]:
# Defining the tree classifier.
# random_state is fixed (same seed as the train/test split) because the tree
# builder shuffles the features at every split; without a seed, ties between
# equally good splits can be broken differently on each run, which gives a
# different tree and possibly a different accuracy.
clf = DecisionTreeClassifier(random_state=66)

In [None]:
# Training/creating the decision tree
# Only the training split is passed in; the test split stays unseen so it can
# be used afterwards to check how well the tree generalizes.
clf = clf.fit(train_input, train_target)

In [None]:
# At this point, we have created a fully working decision tree for the Iris dataset

In [None]:
# We can now give this tree an input, and it will predict the class of the flower
# Here the inputs are the held-out test samples, which were not used for training
test_pred = clf.predict(test_input)
# Display the predicted labels
test_pred

In [None]:
# The true labels of the test samples, to compare by eye with the predictions above
test_target

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy: compare the true test labels with the labels predicted by the tree
accuracy = accuracy_score(y_true=test_target, y_pred=test_pred)

print(f"The accuracy of the model is: {accuracy:.2f}")

### Visualizing the tree

In [None]:
# With sklearn, we also have capabilities to plot the tree

In [None]:
# The default plot_tree function returns a list with text information about the tree and also plots a small image of it
# No figure size, colours, or feature/class names are passed here, so this is the plain default rendering
plot_tree(clf)

In [None]:
# NOTE(review): this cell draws the same labelled tree as the last cell of this
# section, and it appears before the explanation that motivates it -- it looks
# like a leftover copy and is a candidate for removal.
# Larger figure, with feature names and class names passed to plot_tree
plt.figure(figsize=(15,12))
plot_tree(clf, filled=True,
          feature_names=iris.feature_names,
          class_names=iris.target_names)

plt.show()

In [None]:
# NOTE(review): this cell repeats the code of the previous cell unchanged and
# produces the same figure again -- candidate for removal.
# Larger figure, with feature names and class names passed to plot_tree
plt.figure(figsize=(15,12))
plot_tree(clf, filled=True,
          feature_names=iris.feature_names,
          class_names=iris.target_names)

plt.show()

In [None]:
# To better visualize it, we can use matplotlib, and control the size of the figure
# Feature and class names are deliberately NOT passed yet: this step only
# enlarges and colours the plot, so the nodes still refer to features by index.
# The names are added in a later cell.
plt.figure(figsize=(15,12))
plot_tree(clf, filled=True)

plt.show()

In [None]:
# As can be seen from the image above, though, the features and classes don't have names
# It is not informative, as we don't know what feature X[2] corresponds to
# So, we can add a list of feature and class names to the function

In [None]:
# Final version of the plot: large figure, coloured nodes, and human-readable
# labels (feature names on the splits, class names on the nodes)
fig, ax = plt.subplots(figsize=(15, 12))
plot_tree(
    clf,
    filled=True,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    ax=ax,
)
plt.show()