# Decision Trees implementation using sklearn

In [17]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## IRIS dataset

In [6]:
#data is a dictionary
data = load_iris()
print('Classes to predict: ', data.target_names)

Classes to predict:  ['setosa' 'versicolor' 'virginica']


In [7]:
X = data.data
Y = data.target
print('Number of examples in the data:', X.shape[0])

Number of examples in the data: 150


In [4]:
# Display the first 4 training examples
X[:4, :]

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2]])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state = 47, test_size = 0.25)

In [10]:
clf = DecisionTreeClassifier(criterion = 'entropy')
# decision tree is constructed on the basis of training examples
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [11]:
y_pred =  clf.predict(X_test)

In [12]:

print('Accuracy Score on train data: ', accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)))
print('Accuracy Score on test data: ', accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy Score on train data:  1.0
Accuracy Score on test data:  0.973684210526


In [15]:
# fine tuning the decision tree to prevent overfitting as indicated by above accuracy scores
clf = DecisionTreeClassifier(criterion='entropy', min_samples_split=50)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=50,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [16]:
print('Accuracy Score on train data: ', accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)))
print('Accuracy Score on the test data: ', accuracy_score(y_true=y_test, y_pred=clf.predict(X_test)))

Accuracy Score on train data:  0.955357142857
Accuracy Score on the test data:  0.973684210526


## Balance Scale Weight & Distance Database

In [19]:
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-'+'databases/balance-scale/balance-scale.data',sep= ',', header = None)   
print ("Dataset Lenght: ", len(data))
print ("Dataset Shape: ", data.shape)    
print ("Dataset: ",data.head())

Dataset Lenght:  625
Dataset Shape:  (625, 5)
Dataset:     0  1  2  3  4
0  B  1  1  1  1
1  R  1  1  1  2
2  R  1  1  1  3
3  R  1  1  1  4
4  R  1  1  1  5


In [20]:
X = data.values[:,1:5]
Y = data.values[:,0]
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.3, random_state = 100)


In [21]:
# Build a decision tree with gini index 
clf_gini = DecisionTreeClassifier(criterion= "gini", random_state = 100, max_depth=3, min_samples_leaf=5)
clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [23]:
# Build a decision tree with entropy 
clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth = 3, min_samples_leaf = 5)
clf_entropy.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [28]:
print("Accuracy measure using gini index")
y_pred_gini = clf_gini.predict(X_test)
#print("Confusion Matrix: ",confusion_matrix(y_test, y_pred_gini))
print ("Accuracy : ", accuracy_score(y_test,y_pred_gini)*100)
#print("Report : ", classification_report(y_test, y_pred_gini))

Accuracy measure using gini index
Accuracy :  73.4042553191


In [29]:
print("Accuracy measure using entropy")
y_pred_entropy = clf_entropy.predict(X_test)
#print("Confusion Matrix: ",confusion_matrix(y_test, y_pred_gini))
print ("Accuracy : ", accuracy_score(y_test,y_pred_entropy)*100)
#print("Report : ", classification_report(y_test, y_pred_gini))

Accuracy measure using entropy
Accuracy :  70.7446808511


In [31]:
DecisionTreeClassifier?