<a href="https://colab.research.google.com/github/harnalashok/classification/blob/main/decisionTree_iris.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Last amended: 21st April, 2021
# Myfolder: D:\data\OneDrive\Documents\decision_trees
# Ref: https://towardsdatascience.com/a-guide-to-decision-trees-for-machine-learning-and-data-science-fe2607241956
# Objectives:
#            i) To quickly create a decision tree
#           ii) To see the decision tree


In [64]:
# 1.0 Call libraries

import numpy as np
import pandas as pd
import os

# 1.1 Call sklearn libraries
# 1.1.1 Convert target values from string to integers
from sklearn.preprocessing import LabelEncoder as le

# 1.1.2 Split data into train and test data
from sklearn.model_selection import train_test_split

# 1.1.3 Preprocessing:
from sklearn.preprocessing import StandardScaler

# 1.1.4 Import class DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier as dtree

# 1.1.5 To draw decision tree
from sklearn.tree import export_graphviz

# 1.1.6
import graphviz

In [None]:
# 1.2 For tree visualization

"""
Ref: https://stackoverflow.com/questions/33433274/anaconda-graphviz-cant-import-after-installation
Install on Anaconda using the following command:

conda install python-graphviz

"""

In [None]:
# 1.3 Display the output of every expression statement in a cell,
#     not only that of the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# 2.0 Path to the folder holding the dataset
#     'iris_wheader.csv'
#     NOTE(review): "/content/" looks like the Google Colab working
#     folder -- change it when running elsewhere.

path = "/content/"
os.chdir(path)      # Make 'path' the current working directory
os.listdir()        # List the contents of the current directory

In [None]:
# 2.0.1 Read data:
#       header = None tells pandas that the first line of the file is a
#       data record; the column labels come from 'names' instead.
#       NOTE(review): the file name suggests 'with header' -- confirm the
#       csv really has no header row, otherwise that row is read as data.

iris = pd.read_csv(
                  "iris_wheader.csv",     # Data is without headers
                   header = None,
                   names = ["c1","c2","c3", "c4", "target"]
                   )


# 2.0.2 Look at the first few rows
iris.head()

In [None]:
# 2.1 Split the frame column-wise into target and predictors:

y = iris.iloc[:, 4]       # Target: fifth (last) column
X = iris.iloc[:, :4]      # Predictors: the four columns before it

In [None]:
# 2.2 Standardize data:
#     A decision tree does not need standardized
#     data, but it is a good habit to do so:

# 2.2.1 Instantiate object
ss = StandardScaler()

# 2.2.2 Train object on data
ss.fit(X)

# 2.2.3 Transform data.
#       ss.transform() returns a plain NumPy array, which carries no
#       column labels. Wrap the result back into a DataFrame (same column
#       names, same row index) so that later cells can still use
#       X.columns and select predictors by name, for example
#       X_train[['c1', 'c3', 'c4']].
X = pd.DataFrame(
                 ss.transform(X),
                 columns = X.columns,
                 index = X.index
                )

In [None]:
# 2.2 Split X and y into train and test data:
#     random_state fixes the shuffle, so every run produces the same
#     split (and therefore the same accuracy figures).
#     stratify = y keeps the class proportions the same in both parts,
#     so every class occurring in y_test also occurs in y_train, on
#     which the label encoder is fitted.

X_train, X_test, y_train, y_test = train_test_split(
                                                     X,
                                                     y,
                                                     test_size = 0.3,
                                                     random_state = 42,
                                                     stratify = y
                                                    )

# 2.3 About train data:

X_train.shape  # (105,4)
print()

# 2.3.1 Test data
X_test.shape   # (45,4)
print()

# 2.3.2 Number of training rows per class
y_train.value_counts()

In [67]:
# 2.4 Encode y_train from object to integer
# Instantiate LabelEncoder object ('le' is the import alias of LabelEncoder)

enc = le()                  # Create an instance of class LabelEncoder

In [None]:
# 2.4.1 Train 'enc' object on the training labels only:

enc.fit(y_train)             # Let the object learn the distinct class labels

In [None]:
# 2.4.2 Transform y_train. Output is np array:

y_tr = enc.transform(y_train)      # Replace each label by its integer code
y_tr

In [None]:
# 2.5 Check mapping of classes to integers:

enc.classes_     # array(['setosa', 'versicolor', 'virginica'])
                 # Corresponds to 0,1,2

In [None]:
# 2.6 Verify transformation by encoding the three class labels directly:

enc.transform(['setosa','versicolor', 'virginica'])

In [72]:
# 3. Start modeling
# 3.1 Initialize our decision tree object
#     ('dtree' is the import alias of DecisionTreeClassifier).
#     Supply relevant parameters

ct = dtree(
           criterion="gini",    # Alternative 'entropy'
           max_depth=None       # Alternative, specify an integer
                              # 'None' puts no limit on the depth of the tree
           )

In [73]:
# 3.2 Train our decision tree on all four (standardized) predictors
#     and the integer-encoded training labels.
#     The value returned by fit() is kept in c_tree; it is used
#     later for drawing the tree.

c_tree = ct.fit(X_train,y_tr)

In [74]:
# 4.0 Make predictions of test data
# 4.1 First transform y_test into integers
#     just as in y_tr
#     We use the already trained 'enc' object (no re-fitting),
#     so that train and test labels share the same integer codes:

y_te = enc.transform(y_test)

In [None]:
# 4.2 Now make prediction: predicted integer class for each test row
out = ct.predict(X_test)
out

In [None]:
# 4.3 Get accuracy:
#     fraction of test rows whose predicted class equals the true class

np.mean(out == y_te)

In [None]:
# 5.0 Which features are important:
#     one importance value per predictor, in the column order used for fitting

fi = ct.feature_importances_
fi

In [None]:
# 5.1 Get a list of (feature-name, importance) pairs:
#     X may be a NumPy array at this point (StandardScaler.transform()
#     returns one), and an array has no .columns attribute. So take the
#     predictor names from 'iris', whose first four columns are the
#     predictors, in the same order as used for fitting.

list(zip(iris.columns[:4], fi))

In [None]:
######### Drop 'c2' and repeat above steps #############

# 6. Start modeling
# 6.1 Initialize a second, independent decision tree object;
#     it is fitted on three predictors only (c1, c3, c4)
ct1 = dtree(
            criterion="gini",    # Alternative 'entropy'
            splitter="best",     # Alternative 'random'
            max_depth=None       # Alternative, specify an integer
                              # 'None' puts no limit on the depth of the tree
            )

In [None]:
# 6.2 Train our decision tree on predictors c1, c3 and c4 only.
#     (No cost-complexity pruning is requested: ct1 was created
#     without a ccp_alpha argument.)
#     Selecting columns by label needs a DataFrame, while X_train may be
#     a plain NumPy array. So (re)wrap it with the four predictor names
#     first; this works whether X_train is an array or is already a
#     DataFrame having these columns.
ct1.fit(
        pd.DataFrame(X_train, columns = ['c1', 'c2', 'c3', 'c4'])[['c1', 'c3', 'c4']],
        y_tr
       )

In [None]:
# 6.3 Now make prediction on the same three predictors (c1, c3, c4).
#     Selecting columns by label needs a DataFrame, while X_test may be
#     a plain NumPy array. So (re)wrap it with the four predictor names
#     first; this works whether X_test is an array or is already a
#     DataFrame having these columns.
out = ct1.predict(
                  pd.DataFrame(X_test, columns = ['c1', 'c2', 'c3', 'c4'])[['c1', 'c3', 'c4']]
                 )
out

In [None]:
# 6.4 Get accuracy: share of test rows predicted correctly
np.mean(out == y_te)

In [None]:
# 7.0 Which features are important
#     ct1 was fitted on c1, c3 and c4 (in that order), so pair the
#     importances with exactly these names. The names are written out
#     explicitly because X may be a NumPy array, on which selection by
#     column label and .columns are not available.
fi = ct1.feature_importances_
fi
list(zip(['c1', 'c3', 'c4'], fi))

In [None]:
# Ref: https://stackoverflow.com/a/46374279/3282777
# 8.0 Draw the first tree (c_tree, fitted on all four predictors)

# 8.1 Labels to print inside the tree nodes
feature_names = ['c1', 'c2', 'c3', 'c4']
class_names = ['setosa', 'versicolor', 'virginica']

# 8.2 Export the tree; with out_file=None the result is kept in dot_data
#     instead of being written to a file
dot_data = export_graphviz(
    c_tree,
    out_file=None,
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    special_characters=True
)

# 8.3 Build a graphviz object from the exported data
graph = graphviz.Source(dot_data)

# 8.4 A pdf file is created in your current folder
graph.render("iris")

In [None]:
####### I am done ###########