In [28]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import tree

In [29]:
iris = datasets.load_iris() 
data = pd.DataFrame(data=iris.data) 
target = pd.DataFrame(data=iris.target) 

df = pd.concat((data,target), axis=1) 

col_names = ['sepal_length', 'sepal_width', 'petal_length', \
             'petal_width', 'species'] 

df.columns = col_names

df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [30]:
vals_to_replace = {0: 'setosa', 1: 'versicolor', 2: 'vergincia'}
df['species'] = df['species'].map(vals_to_replace)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [31]:
# split into train and test sets
# 70% will be train and 30% will be test

train_number = int(df['sepal_length'].count() * (0.70))
print(train_number)

105


In [32]:
train = df.sample(n = train_number, random_state = 1)
train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
14,5.8,4.0,1.2,0.2,setosa
98,5.1,2.5,3.0,1.1,versicolor
75,6.6,3.0,4.4,1.4,versicolor
16,5.4,3.9,1.3,0.4,setosa
131,7.9,3.8,6.4,2.0,vergincia


In [33]:
test = df.drop(train.index)
test.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa


In [34]:
train_X = np.asarray(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])
train_y = np.asarray(train['species'])

In [35]:
test_X = np.asarray(test[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])
test_y = np.asarray(test['species'])

In [36]:
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(train_X, train_y)

In [40]:
y_pred = clf.fit(train_X, train_y).predict(test_X)

print("Number of mislabeled points out of a total {} points : {}"
      .format(len(test),(test_y != y_pred).sum()))

Number of mislabeled points out of a total 45 points : 6


In [42]:
1 - ((test_y != y_pred).sum()/len(test)) # get success rate

0.8666666666666667

In [44]:
test['evaluation'] = (test_y == y_pred)
test['pred'] = y_pred

In [45]:
test[test['evaluation'] == False] # get misclassified predictions

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,evaluation,pred
68,6.2,2.2,4.5,1.5,versicolor,False,vergincia
70,5.9,3.2,4.8,1.8,versicolor,False,vergincia
106,4.9,2.5,4.5,1.7,vergincia,False,versicolor
129,7.2,3.0,5.8,1.6,vergincia,False,versicolor
133,6.3,2.8,5.1,1.5,vergincia,False,versicolor
134,6.1,2.6,5.6,1.4,vergincia,False,versicolor


In [38]:
z = zip(col_names[0:4],clf.feature_importances_)
list(z)

[('sepal_length', 0.0),
 ('sepal_width', 0.027622412916362648),
 ('petal_length', 0.0),
 ('petal_width', 0.97237758708363731)]