## Import necessary libraries

In [33]:
import sqlite3
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz

## Get the data
Connect to a sqlite database in the data/raw directory

In [5]:
conn = sqlite3.connect('../data/raw/datasets_19_420_database.sqlite')
cur = conn.cursor()

Query the database

In [9]:
query = """ SELECT * FROM iris; """

Place the data into a dataframe

In [12]:
cur.execute(query)

df = pd.DataFrame(cur.fetchall())
df.columns = [x[0] for x in cur.description]  # for column names

## Observe the data

In [11]:
# look at the first 5 rows of data
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [15]:
# look at a sample of the data
df.sample(6)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
80,81,5.5,2.4,3.8,1.1,Iris-versicolor
25,26,5.0,3.0,1.6,0.2,Iris-setosa
59,60,5.2,2.7,3.9,1.4,Iris-versicolor
128,129,6.4,2.8,5.6,2.1,Iris-virginica
95,96,5.7,3.0,4.2,1.2,Iris-versicolor
135,136,7.7,3.0,6.1,2.3,Iris-virginica


In [16]:
# How many rows are in the dataframe?
len(df)

150

In [61]:
# How many unique species are listed?
len(list(set(df["Species"].tolist())))

3

## Training a Decision Tree

In [20]:
X = df[["PetalLengthCm", "PetalWidthCm"]]
y = df["Species"]

In [21]:
dtc = DecisionTreeClassifier(max_depth=2)
dtc.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

## Vizualizing the Decision Tree
The image files live in the results/ d

In [65]:
export_graphviz(dtc,
               feature_names=['PetalLengthCm','PetalWidthCm'],
               class_names=list(set(df["Species"].tolist())),
               rounded=True,
               filled=True,
               out_file='../results/iris_tree.dot'
               )

In [66]:
! dot -Tpng ../results/iris_tree.dot -o ../results/iris_tree.png

In [67]:
dtc_entropy = DecisionTreeClassifier(max_depth=2, criterion='entropy')
dtc_entropy.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [68]:
export_graphviz(dtc2,
               feature_names=['PetalLengthCm','PetalWidthCm'],
               class_names=list(set(df["Species"].tolist())),
               rounded=True,
               filled=True,
               out_file='../results/iris_tree_entropy.dot'
               )

In [69]:
! dot -Tpng ../results/iris_tree_entropy.dot -o ../results/iris_tree_entropy.png