## Importing the necessary libraries

In [1]:
import numpy as np 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

## Getting the data ready

### Pulling the data

In [2]:
data_raw = pd.read_csv('drugs.csv')

# data_raw has 200 rows and 6 columns.
# 'Drug' is the label we want to predict, so it is kept out of the feature
# matrix and stored separately as the target.
data = data_raw[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values
target = data_raw['Drug']

# See the result. Only the last expression of a cell is rendered automatically,
# so the feature preview is printed explicitly; the target preview stays as the
# cell's output.
print(data[0:5])
target[0:5]

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: Drug, dtype: object

The columns 'Sex', 'BP' and 'Cholesterol' contain strings, and scikit-learn's decision trees only accept numeric features, so we have to encode them as numbers (int or float).

In [3]:
from sklearn import preprocessing
# LabelEncoder numbers the categories in sorted (alphabetical) order, whatever
# order they are listed in fit(). The source columns are read from data_raw
# rather than from `data`, so this cell can be re-run safely: after the first
# run `data` already holds the encoded integers, and transforming those again
# would fail on unseen labels.

# Sex: F -> 0, M -> 1
transform_sex = preprocessing.LabelEncoder()
transform_sex.fit(['F', 'M'])
data[:,1] = transform_sex.transform(data_raw['Sex'])

# BP: HIGH -> 0, LOW -> 1, NORMAL -> 2
transform_bp = preprocessing.LabelEncoder()
transform_bp.fit(['LOW', 'NORMAL', 'HIGH'])
data[:,2] = transform_bp.transform(data_raw['BP'])

# Cholesterol: HIGH -> 0, NORMAL -> 1
transform_cholesterol = preprocessing.LabelEncoder()
transform_cholesterol.fit(['NORMAL', 'HIGH'])
data[:,3] = transform_cholesterol.transform(data_raw['Cholesterol'])

# Preview the encoded feature matrix
data[0:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.113999999999999],
       [28, 0, 2, 0, 7.797999999999999],
       [61, 0, 1, 0, 18.043]], dtype=object)

## Setting up the Decision Tree
### Creating the train and test sets

In [4]:
from sklearn.model_selection import train_test_split as tts

# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = tts(
    data,
    target,
    test_size=0.3,
    random_state=3,
)

# Sanity check: the training features and training labels must have the same
# number of rows.
shape_report = 'Size of X train: {}\nSize of Y train: {}\n'
print(shape_report.format(x_train.shape, y_train.shape))

Size of X train: (140, 5)
Size of Y train: (140,)



### Modelling the tree

In [5]:
# max_depth caps how many levels of splits the tree may grow. The value 5 is a
# chosen hyperparameter that keeps the tree small; it is not a limit imposed by
# the number of feature columns. random_state fixes the tie-breaking between
# equally good splits so that re-running the notebook rebuilds the same tree.
drugTree = DecisionTreeClassifier(criterion='entropy', max_depth = 5, random_state = 3)

# Now we'll fit this tree with our training data
drugTree.fit(x_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5)

### Prediction
With the tree trained, we can use the test set to see how well it performs on data it has not seen

In [6]:
# Predict a drug for every row of the held-out test features
prediction = drugTree.predict(x_test)

# Show the first five predicted labels, then the first five true labels
for preview in (prediction[:5], y_test[:5]):
    print(preview)

['drugY' 'drugX' 'drugX' 'drugX' 'drugX']
40     drugY
51     drugX
139    drugX
197    drugX
170    drugX
Name: Drug, dtype: object


We can see that our model has predicted the first 5 cases correctly, but we need to evaluate it on the whole test set

### Evaluation

In [7]:
from sklearn import metrics
import matplotlib.pyplot as plt

# Fraction of test rows whose predicted drug matches the true drug
tree_accuracy = metrics.accuracy_score(y_test, prediction)
print('Decision Tree Accuracy: ', tree_accuracy)

Decision Tree Accuracy:  0.9833333333333333


Our tree reaches 98.33% accuracy on the test set. That's amazing!

## Visualization

First we need to install some libraries (if you don't have them yet)

In [9]:
#!conda install -c conda-forge pydotplus -y
#!conda install -c conda-forge python-graphviz -y

In [10]:
from  io import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
%matplotlib inline 

In [None]:
# File config
dot_data = StringIO()
filename = 'drugtree.png'

# Plot config
# Name the features explicitly, in the same order as the columns used to build
# the training matrix, instead of relying on the column order of the CSV.
featureNames = ['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']
# Class names must be in ascending order of the labels; np.unique returns them
# sorted. Plain lists are passed because some scikit-learn releases reject
# pandas Index / ndarray arguments here.
classNames = np.unique(y_train).tolist()

# export_graphviz writes the DOT source into dot_data, so its return value is
# not needed.
tree.export_graphviz(drugTree, feature_names = featureNames, out_file = dot_data, class_names = classNames, filled = True, special_characters = True, rotate=False)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png(filename)
img = mpimg.imread(filename)

# figsize is given in inches; a moderate canvas is enough to read the tree and
# avoids allocating an enormous image.
plt.figure(figsize=(20, 12))
plt.imshow(img,interpolation='nearest')
plt.axis('off')  # pixel-coordinate ticks carry no meaning for a rendered diagram
plt.show()       # render the figure without echoing the AxesImage repr

<matplotlib.image.AxesImage at 0x26a04c9ad60>