## Import necessary libraries

In [38]:
import sqlite3
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Get the data
Connect to a sqlite database in the data/raw directory

In [4]:
conn = sqlite3.connect('../data/raw/datasets_19_420_database.sqlite')
cur = conn.cursor()

Query the database

In [5]:
query = """ SELECT * FROM iris; """

Place the data into a dataframe

In [6]:
cur.execute(query)

df = pd.DataFrame(cur.fetchall())
df.columns = [x[0] for x in cur.description]  # for column names

## Observe the data

In [7]:
# look at the first 5 rows of data
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [8]:
# look at a sample of the data
df.sample(6)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
59,60,5.2,2.7,3.9,1.4,Iris-versicolor
135,136,7.7,3.0,6.1,2.3,Iris-virginica
60,61,5.0,2.0,3.5,1.0,Iris-versicolor
69,70,5.6,2.5,3.9,1.1,Iris-versicolor
12,13,4.8,3.0,1.4,0.1,Iris-setosa
108,109,6.7,2.5,5.8,1.8,Iris-virginica


In [10]:
# How many rows are in the dataframe?
len(df)

150

In [11]:
# How many unique species are listed?
len(list(set(df["Species"].tolist())))

3

## Training a Decision Tree

In [12]:
X = df[["PetalLengthCm", "PetalWidthCm"]]
y = df["Species"]

In [20]:
dtc = DecisionTreeClassifier(max_depth=2)
dtc.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

## Vizualizing the Decision Tree
The image files live in the results/ d

In [21]:
export_graphviz(dtc,
               feature_names=['PetalLengthCm','PetalWidthCm'],
               class_names=list(set(df["Species"].tolist())),
               rounded=True,
               filled=True,
               out_file='../results/iris_tree.dot'
               )

In [22]:
! dot -Tpng ../results/iris_tree.dot -o ../results/iris_tree.png

In [28]:
dtc_entropy = DecisionTreeClassifier(max_depth=2, criterion='entropy')
dtc_entropy.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [29]:
export_graphviz(dtc_entropy,
               feature_names=['PetalLengthCm','PetalWidthCm'],
               class_names=list(set(df["Species"].tolist())),
               rounded=True,
               filled=True,
               out_file='../results/iris_tree_entropy.dot'
               )

In [30]:
! dot -Tpng ../results/iris_tree_entropy.dot -o ../results/iris_tree_entropy.png

## Experimenting with `max_depth`

In [48]:
for n in range(2, 6):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    dtc = DecisionTreeClassifier(max_depth=n)
    dtc.fit(X_train, y_train)
    y_pred = dtc.predict(X_test)
    print(n , classification_report(y_true=y_test, y_pred=y_pred))

2                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        14
Iris-versicolor       0.88      1.00      0.94        15
 Iris-virginica       1.00      0.88      0.93        16

       accuracy                           0.96        45
      macro avg       0.96      0.96      0.96        45
   weighted avg       0.96      0.96      0.96        45

3                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        16
Iris-versicolor       1.00      0.81      0.90        16
 Iris-virginica       0.81      1.00      0.90        13

       accuracy                           0.93        45
      macro avg       0.94      0.94      0.93        45
   weighted avg       0.95      0.93      0.93        45

4                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        18
Iris-versicolor       1.00      1.00      1.00        16
 Iris-virginica   

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
dtc_experiment = DecisionTreeClassifier(max_depth=4)
dtc_experiment.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [51]:
export_graphviz(dtc_experiment,
               feature_names=['PetalLengthCm','PetalWidthCm'],
               class_names=list(set(df["Species"].tolist())),
               rounded=True,
               filled=True,
               out_file='../results/iris_tree_experiment.dot'
               )

! dot -Tpng ../results/iris_tree_experiment.dot -o ../results/iris_tree_experiment.png