In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
#from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score




In [2]:
# Load the UCI balance-scale data set. The file has no header row, so pandas
# assigns integer column labels.
balance_data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data',
    sep=',',
    header=None,
)

In [8]:
# Summarise the column dtypes and non-null counts of the loaded frame.
balance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625 entries, 0 to 624
Data columns (total 5 columns):
0    625 non-null object
1    625 non-null int64
2    625 non-null int64
3    625 non-null int64
4    625 non-null int64
dtypes: int64(4), object(1)
memory usage: 24.5+ KB


In [9]:
# Report the (rows, columns) shape, then preview the first five rows.
print(balance_data.shape)

balance_data.head()

(625, 5)


Unnamed: 0,0,1,2,3,4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


### Checking missing values

In [10]:
# Count missing values per column.
balance_data.isnull().sum()

0    0
1    0
2    0
3    0
4    0
dtype: int64

### Splitting as X and Y

In [15]:
# Column 0 holds the class label (the target).
Y = balance_data.values[:, 0]
# Columns 1 through 4 hold the four feature values.
X = balance_data.values[:, 1:5]

#balance_data[balance_data[0]=='B']

###  Splitting into train and test

**The train_test_split function lives in sklearn.model_selection (the older sklearn.cross_validation module was removed in scikit-learn 0.20)**

In [49]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20, so importing from it
# raises ImportError on current installs. `train_test_split` is provided by
# `sklearn.model_selection` and is called the same way.
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

Decision Tree Training
Now we fit the decision tree algorithm on the training data, predict labels for the test dataset, and print the accuracy of the model using various parameters.

DecisionTreeClassifier(): This is the classifier function for DecisionTree. It is the main function for implementing the algorithms. Some important parameters are:

criterion: It defines the function to measure the quality of a split. Sklearn supports “gini” criteria for Gini Index & “entropy” for Information Gain. By default, it takes “gini” value.
splitter: It defines the strategy to choose the split at each node. Supports “best” value to choose the best split & “random” to choose the best random split. By default, it takes “best” value.
max_features: It defines the no. of features to consider when looking for the best split. We can input integer, float, string & None value.
If an integer is inputted then it considers that value as max features at each split.
If float value is taken then it shows the percentage of features at each split.
If “auto” or “sqrt” is taken then max_features=sqrt(n_features).
If “log2” is taken then max_features= log2(n_features).
If None, then max_features=n_features. By default, it takes “None” value.
max_depth: The max_depth parameter denotes maximum depth of the tree. It can take any integer value or None. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. By default, it takes “None” value.
min_samples_split: This specifies the minimum number of samples required to split an internal node. If an integer value is given, min_samples_split is used as the minimum number. If a float is given, it is interpreted as a fraction of the samples. By default, it takes “2” value.
min_samples_leaf: The minimum number of samples required to be at a leaf node. If an integer value is taken then consider min_samples_leaf as the minimum no. If float, then it shows percentage. By default, it takes “1” value.
max_leaf_nodes: It defines the maximum number of possible leaf nodes. If None then it takes an unlimited number of leaf nodes. By default, it takes “None” value.
min_impurity_split: It defines the threshold for early stopping tree growth. A node will split if its impurity is above the threshold otherwise it is a leaf.
Let’s build classifiers using criterion as gini index & information gain. We need to fit our classifier using fit(). We will plot our decision tree classifier’s visualization too.

Decision Tree Classifier with criterion gini index

### Model creation

**_1. Gini_**                                                                                                                   
**_2. Entropy_**


In [46]:
# Decision tree that measures split quality with the Gini index.
# Depth and leaf size are capped; the seed is fixed.
clf_gini = DecisionTreeClassifier(
    criterion="gini",
    max_depth=3,
    min_samples_leaf=5,
    random_state=100,
)
clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [19]:
# Decision tree that measures split quality with entropy (information gain).
# Depth and leaf size are capped; the seed is fixed.
clf_entropy = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=3,
    min_samples_leaf=5,
    random_state=100,
)
clf_entropy.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [20]:
# Predict the class of a single hand-written sample with the Gini tree.
# The input is a 2-D nested list (one inner list per sample), not a 1-D array.

clf_gini.predict([[1, 5, 4, 4]])

array(['R'], dtype=object)

### Test Data Prediction

In [23]:
# Predict the class of the same hand-written sample with the entropy tree.
# The input is a 2-D nested list (one inner list per sample), not a 1-D array.

clf_entropy.predict([[1, 5, 4, 4]])

array(['R'], dtype=object)

In [32]:
# pd.Series(clf_gini.predict(X_test)).value_counts()

In [47]:
# Predict labels for the held-out test features with the Gini tree and display them.
y_pred_gini = clf_gini.predict(X_test)
y_pred_gini

array(['R', 'L', 'R', 'R', 'R', 'L', 'R', 'L', 'L', 'L', 'R', 'L', 'L',
       'L', 'R', 'L', 'R', 'L', 'L', 'R', 'L', 'R', 'L', 'L', 'R', 'L',
       'L', 'L', 'R', 'L', 'L', 'L', 'R', 'L', 'L', 'L', 'L', 'R', 'L',
       'L', 'R', 'L', 'R', 'L', 'R', 'R', 'L', 'L', 'R', 'L', 'R', 'R',
       'L', 'R', 'R', 'L', 'R', 'R', 'L', 'L', 'R', 'R', 'L', 'L', 'L',
       'L', 'L', 'R', 'R', 'L', 'L', 'R', 'R', 'L', 'R', 'L', 'R', 'R',
       'R', 'L', 'R', 'L', 'L', 'L', 'L', 'R', 'R', 'L', 'R', 'L', 'R',
       'R', 'L', 'L', 'L', 'R', 'R', 'L', 'L', 'L', 'R', 'L', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'L', 'R', 'L', 'R', 'R', 'L', 'R', 'R',
       'R', 'R', 'R', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'R',
       'R', 'R', 'R', 'L', 'R', 'R', 'R', 'L', 'L', 'R', 'L', 'R', 'L',
       'R', 'L', 'L', 'R', 'L', 'L', 'R', 'L', 'R', 'L', 'R', 'R', 'R',
       'L', 'R', 'R', 'R', 'R', 'R', 'L', 'L', 'R', 'R', 'R', 'R', 'L',
       'R', 'R', 'R', 'L', 'R', 'L', 'L', 'L', 'L', 'R', 'R', 'L

In [34]:
# Predict labels for the held-out test features with the entropy tree and display them.
y_pred_entropy = clf_entropy.predict(X_test)
y_pred_entropy

array(['R', 'L', 'R', 'L', 'R', 'L', 'R', 'L', 'R', 'R', 'R', 'R', 'L',
       'L', 'R', 'L', 'R', 'L', 'L', 'R', 'L', 'R', 'L', 'L', 'R', 'L',
       'R', 'L', 'R', 'L', 'R', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'R',
       'L', 'R', 'L', 'R', 'L', 'R', 'R', 'L', 'L', 'R', 'L', 'L', 'R',
       'L', 'L', 'R', 'L', 'R', 'R', 'L', 'R', 'R', 'R', 'L', 'L', 'R',
       'L', 'L', 'R', 'L', 'L', 'L', 'R', 'R', 'L', 'R', 'L', 'R', 'R',
       'R', 'L', 'R', 'L', 'L', 'L', 'L', 'R', 'R', 'L', 'R', 'L', 'R',
       'R', 'L', 'L', 'L', 'R', 'R', 'L', 'L', 'L', 'R', 'L', 'L', 'R',
       'R', 'R', 'R', 'R', 'R', 'L', 'R', 'L', 'R', 'R', 'L', 'R', 'R',
       'L', 'R', 'R', 'L', 'R', 'R', 'R', 'L', 'L', 'L', 'L', 'L', 'R',
       'R', 'R', 'R', 'L', 'R', 'R', 'R', 'L', 'L', 'R', 'L', 'R', 'L',
       'R', 'L', 'R', 'R', 'L', 'L', 'R', 'L', 'R', 'R', 'R', 'R', 'R',
       'L', 'R', 'R', 'R', 'R', 'R', 'R', 'L', 'R', 'L', 'R', 'R', 'L',
       'R', 'L', 'R', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'R', 'R

### Metric Evaluation

### Gini Model

In [48]:
from sklearn.metrics import confusion_matrix, classification_report

# Score the Gini tree's test-set predictions: confusion matrix, per-class
# report, then overall accuracy as a percentage.
print(confusion_matrix(y_test, y_pred_gini))
print(classification_report(y_test, y_pred_gini))
print(
    "Accuracy for Gini is ",
    accuracy_score(y_test, y_pred_gini) * 100,
)

[[ 0  6  7]
 [ 0 67 18]
 [ 0 19 71]]
             precision    recall  f1-score   support

          B       0.00      0.00      0.00        13
          L       0.73      0.79      0.76        85
          R       0.74      0.79      0.76        90

avg / total       0.68      0.73      0.71       188

Accuracy for Gini is  73.40425531914893


  'precision', 'predicted', average, warn_for)


### Entropy Model

In [44]:
from sklearn.metrics import confusion_matrix, classification_report

# Score the entropy tree's test-set predictions: confusion matrix, per-class
# report, then overall accuracy as a percentage.
print(confusion_matrix(y_test, y_pred_entropy))
print(classification_report(y_test, y_pred_entropy))
# The label previously read "Gini" (copied from the Gini cell) although the
# score is computed from the entropy model's predictions.
print("Accuracy for Entropy is ", accuracy_score(y_test, y_pred_entropy) * 100)

[[ 0  6  7]
 [ 0 63 22]
 [ 0 20 70]]
             precision    recall  f1-score   support

          B       0.00      0.00      0.00        13
          L       0.71      0.74      0.72        85
          R       0.71      0.78      0.74        90

avg / total       0.66      0.71      0.68       188

Accuracy for Gini is  70.74468085106383


  'precision', 'predicted', average, warn_for)


## Comparison with Logistic Regression Model

In [54]:
from sklearn.linear_model import LogisticRegression

# Re-create the 70/30 split with the same arguments and seed used for the decision trees.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=100, test_size=0.3)

# Fit a logistic regression with default settings (fit() returns the estimator itself).
logreg = LogisticRegression()
logreg = logreg.fit(X_train, y_train)

# Predict the test-set labels and display them.
y_pred_log = logreg.predict(X_test)
y_pred_log

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Metric Evaluation for Logistic Regression

In [60]:
# Score the logistic regression's test-set predictions: confusion matrix,
# per-class report, then overall accuracy as a percentage.
print(confusion_matrix(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))
print(
    "Accuracy for Logistic Regression is ",
    accuracy_score(y_test, y_pred_log) * 100,
)

[[ 0  8  5]
 [ 0 79  6]
 [ 0  5 85]]
             precision    recall  f1-score   support

          B       0.00      0.00      0.00        13
          L       0.86      0.93      0.89        85
          R       0.89      0.94      0.91        90

avg / total       0.81      0.87      0.84       188

Accuracy for Logistic Regression is  87.2340425531915


  'precision', 'predicted', average, warn_for)


__The Logistic Regression model seems to perform better on the current data__

# Visualise one tree from a random forest (a single decision tree can be
# exported the same way).
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz

# Model: the seed is fixed so the same forest, and therefore the same
# extracted tree, is produced on every run.
model = RandomForestClassifier(n_estimators=10, random_state=100)

# Train
model.fit(X_train, y_train)

# Extract single tree
estimator = model.estimators_[5]

# Export as dot file.
# The snippet previously passed `iris.feature_names` / `iris.target_names`, but
# no `iris` object is defined in this notebook. The feature names below follow
# the UCI balance-scale attribute description (NOTE(review): confirm the order
# matches columns 1-4); the class names are the labels the forest was trained on.
export_graphviz(estimator, out_file='tree.dot',
                feature_names=['Left-Weight', 'Left-Distance',
                               'Right-Weight', 'Right-Distance'],
                class_names=[str(label) for label in model.classes_],
                rounded=True, proportion=False,
                precision=2, filled=True)

# Convert to png using system command (requires Graphviz)
#from subprocess import call
#call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
#from IPython.display import Image
#Image(filename = 'tree.png')