In [1]:
import numpy as np 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [2]:
my_data = pd.read_csv("drug_train_data_updated.csv")
my_data[0:5]

Unnamed: 0,AGE,SEX,BP,CHOLESTEROL,NA,K,DRUG
0,43.0,M,HIGH,HIGH,0.656371,0.046979,drugA
1,32.0,M,HIGH,NORMAL,0.52975,0.056087,drugA
2,37.0,F,HIGH,HIGH,0.559171,0.042713,drugA
3,24.0,M,HIGH,NORMAL,0.613261,0.064726,drugA
4,29.0,M,HIGH,HIGH,0.625272,0.048637,drugA


In [3]:
X = my_data[['AGE', 'SEX', 'BP', 'CHOLESTEROL', 'NA','K']].values
X[0:5]

array([[43.0, 'M', 'HIGH', 'HIGH', 0.656371, 0.046979],
       [32.0, 'M', 'HIGH', 'NORMAL', 0.52975, 0.056087],
       [37.0, 'F', 'HIGH', 'HIGH', 0.559171, 0.042713],
       [24.0, 'M', 'HIGH', 'NORMAL', 0.613261, 0.064726],
       [29.0, 'M', 'HIGH', 'HIGH', 0.625272, 0.048637]], dtype=object)

In [4]:
from sklearn import preprocessing
le_sex = preprocessing.LabelEncoder()
le_sex.fit(['F','M'])
X[:,1] = le_sex.transform(X[:,1]) 


le_BP = preprocessing.LabelEncoder()
le_BP.fit([ 'LOW', 'NORMAL', 'HIGH'])
X[:,2] = le_BP.transform(X[:,2])


le_Chol = preprocessing.LabelEncoder()
le_Chol.fit([ 'NORMAL', 'HIGH'])
X[:,3] = le_Chol.transform(X[:,3]) 

X[0:5]


array([[43.0, 1, 0, 0, 0.656371, 0.046979],
       [32.0, 1, 0, 1, 0.52975, 0.056087],
       [37.0, 0, 0, 0, 0.559171, 0.042713],
       [24.0, 1, 0, 1, 0.613261, 0.064726],
       [29.0, 1, 0, 0, 0.625272, 0.048637]], dtype=object)

In [5]:
y = my_data["DRUG"]


In [6]:
from sklearn.model_selection import train_test_split
X_trainset, X_testset, y_trainset, y_testset = train_test_split(X, y, test_size=0.3, random_state=3)

In [7]:
print(X_trainset.shape)
print(X_testset.shape)
print(y_trainset.shape)
print(y_testset.shape)


(126, 6)
(55, 6)
(126,)
(55,)


In [8]:
drugTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
drugTree

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [9]:
drugTree.fit(X_trainset,y_trainset)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [10]:
predTree = drugTree.predict(X_testset)

In [11]:
print (predTree [0:5])
print (y_testset [0:5])

['drugA' 'drugA' 'drugX' 'drugY' 'drugY']
25     drugB
3      drugA
82     drugX
178    drugY
157    drugY
Name: DRUG, dtype: object


In [12]:
from sklearn import metrics
import matplotlib.pyplot as plt
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_testset, predTree))

DecisionTrees's Accuracy:  0.9090909090909091


In [16]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_testset, predTree)
cm

array([[ 8,  0,  0,  0,  1],
       [ 2,  1,  0,  0,  0],
       [ 0,  0,  2,  0,  0],
       [ 0,  0,  0, 18,  2],
       [ 0,  0,  0,  0, 21]], dtype=int64)

In [17]:
predTree

array(['drugA', 'drugA', 'drugX', 'drugY', 'drugY', 'drugY', 'drugY',
       'drugX', 'drugX', 'drugY', 'drugX', 'drugC', 'drugX', 'drugY',
       'drugA', 'drugY', 'drugA', 'drugX', 'drugX', 'drugA', 'drugY',
       'drugY', 'drugY', 'drugX', 'drugY', 'drugY', 'drugY', 'drugX',
       'drugX', 'drugY', 'drugY', 'drugX', 'drugX', 'drugX', 'drugX',
       'drugY', 'drugX', 'drugY', 'drugY', 'drugX', 'drugA', 'drugX',
       'drugC', 'drugY', 'drugA', 'drugY', 'drugY', 'drugY', 'drugY',
       'drugA', 'drugA', 'drugB', 'drugY', 'drugX', 'drugA'], dtype=object)

In [18]:
y_testset

25     drugB
3      drugA
82     drugX
178    drugY
157    drugY
132    drugY
6      drugA
86     drugX
70     drugX
58     drugX
83     drugX
34     drugC
38     drugX
101    drugY
4      drugA
115    drugY
23     drugB
77     drugX
87     drugX
15     drugA
167    drugY
71     drugX
162    drugY
47     drugX
120    drugY
98     drugY
172    drugY
42     drugX
40     drugX
143    drugY
122    drugY
89     drugX
51     drugX
73     drugX
72     drugX
102    drugY
64     drugX
136    drugY
153    drugY
46     drugX
17     drugA
41     drugX
31     drugC
154    drugY
5      drugA
165    drugY
92     drugY
174    drugY
175    drugY
12     drugA
14     drugA
24     drugB
100    drugY
67     drugX
8      drugA
Name: DRUG, dtype: object

In [19]:
y_c=list(y_testset)

In [21]:
d_dict = {'real_value':y_c, 'predicted_value':predTree}
drug_compare = pd.DataFrame.from_dict(d_dict)


In [22]:
drug_compare

Unnamed: 0,real_value,predicted_value
0,drugB,drugA
1,drugA,drugA
2,drugX,drugX
3,drugY,drugY
4,drugY,drugY
5,drugY,drugY
6,drugA,drugY
7,drugX,drugX
8,drugX,drugX
9,drugX,drugY


In [24]:
count=0
for i in range(54):
    if (drug_compare["real_value"][i]==drug_compare["predicted_value"][i]):
        count=count+1

In [26]:
print("correct value= ",count)

correct value=  49


In [27]:
print("wrong prediction =",55-count)

wrong prediction = 6
