In [1]:
# Load libraries
import pandas as pd

### The datasets consist of several medical predictor (independent) variables and one target (dependent) variable, Outcome. Independent variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

In [2]:

# load dataset
pima = pd.read_csv("diabetes.csv" )
pima.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Feature Selection

In [3]:
#split dataset in features and target variable

X = pima.iloc[:,0:8] # Features
y = pima.Outcome # Target variable
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [4]:
feature_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


# Splitting Data

In [5]:
from sklearn.model_selection import train_test_split # Import train_test_split function
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5) # 70% training and 30% test

# Building Decision Tree Model

In [6]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Evaluating Model

In [7]:

from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7186147186147186


 classification rate of 67.53%, can be improved by tuning the parameters in the Decision Tree Algorithm.

# Visualizing Decision Trees

In [8]:
# export_graphviz function converts decision tree classifier into dot file and
# pydotplus convert this dot file to png or displayable form on Jupyter

In [21]:
sudo apt-get install graphviz

SyntaxError: invalid syntax (<ipython-input-21-ed26e7b4d7d8>, line 1)

In [9]:
pip install  pydotplus

Note: you may need to restart the kernel to use updated packages.


In [24]:
conda install graphviz

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\logic\anaconda3

  added / updated specs:
    - graphviz


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.13.0               |   py38haa95532_0         926 KB
    ------------------------------------------------------------
                                           Total:         926 KB

The following packages will be UPDATED:

  conda                               4.12.0-py38haa95532_0 --> 4.13.0-py38haa95532_0



Downloading and Extracting Packages

conda-4.13.0         | 926 KB    |            |   0% 
conda-4.13.0         | 926 KB    | 1          |   2% 
conda-4.13.0         | 926 KB    | ########2  |  83% 
conda-4.13.0         | 926 KB    | ########## | 100% 
Preparing transaction: ...working... done
Verifying 

In [17]:
pip install StringIO

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement StringIO (from versions: none)
ERROR: No matching distribution found for StringIO


In [25]:
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image  
import pydotplus

dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,feature_names = feature_names,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('diabetes.png')
Image(graph.create_png())

InvocationException: GraphViz's executables not found

# Optimizing Decision Tree Performance

### criterion : optional (default=”gini”) or Choose attribute selection measure: This parameter allows us to use the different-different attribute selection measure. Supported criteria are “gini” for the Gini index and “entropy” for the information gain.

### splitter : string, optional (default=”best”) or Split Strategy: This parameter allows us to choose the split strategy. Supported strategies are “best” to choose the best split and “random” to choose the best random split.

### max_depth : int or None, optional (default=None) or Maximum Depth of a Tree: The maximum depth of the tree. If None, then nodes are expanded until all the leaves contain less than min_samples_split samples.The higher value of maximum depth causes overfitting, and a lower value causes underfitting 

In [22]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7532467532467533


# Pros
### Decision trees are easy to interpret and visualize.
### It can easily capture Non-linear patterns.
### It requires fewer data preprocessing from the user, for example, there is no need to normalize columns.
### It can be used for feature engineering such as predicting missing values, suitable for variable selection.
### The decision tree has no assumptions about distribution because of the non-parametric nature of the algorithm. 

# Cons
### Sensitive to noisy data. It can overfit noisy data.
### The small variation(or variance) in data can result in the different decision tree. This can be reduced by bagging and boosting algorithms.
### Decision trees are biased with imbalance dataset, so it is recommended that balance out the dataset before creating the decision tree.