In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import export_text

[Mushroom classification](https://www.kaggle.com/uciml/mushroom-classification)

This dataset includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota family, drawn from The Audubon Society Field Guide to North American Mushrooms (1981). Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended. This latter class was combined with the poisonous one. The Guide clearly states that there is no simple rule for determining the edibility of a mushroom; no rule like "leaflets three, let it be" for Poisonous Oak and Ivy.

In [2]:
# Read the mushroom table from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='data/mushrooms.csv')
data.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


**Attribute Information**

- classes: edible=e, poisonous=p
- cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
- cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
- cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
- bruises: bruises=t,no=f
- odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
- gill-attachment: attached=a,descending=d,free=f,notched=n
- gill-spacing: close=c,crowded=w,distant=d
- gill-size: broad=b,narrow=n
- gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y
- stalk-shape: enlarging=e,tapering=t
- stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?
- stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
- stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
- stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
- stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
- veil-type: partial=p,universal=u
- veil-color: brown=n,orange=o,white=w,yellow=y
- ring-number: none=n,one=o,two=t
- ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z
- spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
- population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
- habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

1. Train a decision tree classifier, print the tree and evaluate its accuracy.

**Hint:** you will need to convert the variables to numerical variables

2. Retrain and evaluate the tree using separate train and test data, and compare the accuracy to the previous step.
3. Prune the tree by changing its hyperparameters, and evaluate the accuracy of the new tree.

In [3]:
# One-hot encode every categorical predictor so the tree receives numeric
# inputs; each indicator column is named "<feature>_<code>" (e.g. odor_n).
X = pd.get_dummies(data.drop(columns='class'))
y = data['class']

# Seed the classifier: scikit-learn breaks ties between equally good splits at
# random, so an unseeded fit can print a different tree on every fresh run.
dtree = DecisionTreeClassifier(random_state=1)
dtree.fit(X, y)

# Text rendering of the fitted tree, labelled with the encoded column names.
print(export_text(dtree, feature_names=list(X.columns)))

|--- odor_n <= 0.50
|   |--- stalk-root_c <= 0.50
|   |   |--- stalk-root_r <= 0.50
|   |   |   |--- spore-print-color_u <= 0.50
|   |   |   |   |--- odor_a <= 0.50
|   |   |   |   |   |--- odor_l <= 0.50
|   |   |   |   |   |   |--- class: p
|   |   |   |   |   |--- odor_l >  0.50
|   |   |   |   |   |   |--- class: e
|   |   |   |   |--- odor_a >  0.50
|   |   |   |   |   |--- class: e
|   |   |   |--- spore-print-color_u >  0.50
|   |   |   |   |--- class: e
|   |   |--- stalk-root_r >  0.50
|   |   |   |--- class: e
|   |--- stalk-root_c >  0.50
|   |   |--- bruises_f <= 0.50
|   |   |   |--- class: e
|   |   |--- bruises_f >  0.50
|   |   |   |--- class: p
|--- odor_n >  0.50
|   |--- spore-print-color_r <= 0.50
|   |   |--- stalk-surface-below-ring_y <= 0.50
|   |   |   |--- cap-surface_g <= 0.50
|   |   |   |   |--- cap-shape_c <= 0.50
|   |   |   |   |   |--- gill-size_n <= 0.50
|   |   |   |   |   |   |--- class: e
|   |   |   |   |   |--- gill-size_n >  0.50
|   |   |   |   |

In [4]:
# Confusion matrix on the full data set X/y (true labels vs. tree predictions).
confusion_matrix(y_true=y, y_pred=dtree.predict(X))

array([[4208,    0],
       [   0, 3916]])

In [5]:
# Fraction of rows in X whose predicted class matches the true label in y.
accuracy_score(y_true=y, y_pred=dtree.predict(X))

1.0

In [6]:
# Hold out 40% of the rows for validation; the fixed seed keeps the split
# identical across runs.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, test_size=0.4, random_state=1
)

# Seed the tree too: ties between equally good splits are broken at random, so
# without a seed the printed tree can change from one run to the next.
dtree = DecisionTreeClassifier(random_state=1)
dtree.fit(train_X, train_y)

# Take the feature names from the frame the tree was actually fitted on.
print(export_text(dtree, feature_names=list(train_X.columns)))

|--- odor_n <= 0.50
|   |--- stalk-root_c <= 0.50
|   |   |--- stalk-surface-below-ring_y <= 0.50
|   |   |   |--- gill-spacing_w <= 0.50
|   |   |   |   |--- class: p
|   |   |   |--- gill-spacing_w >  0.50
|   |   |   |   |--- bruises_f <= 0.50
|   |   |   |   |   |--- class: e
|   |   |   |   |--- bruises_f >  0.50
|   |   |   |   |   |--- class: p
|   |   |--- stalk-surface-below-ring_y >  0.50
|   |   |   |--- class: e
|   |--- stalk-root_c >  0.50
|   |   |--- stalk-color-above-ring_w <= 0.50
|   |   |   |--- class: p
|   |   |--- stalk-color-above-ring_w >  0.50
|   |   |   |--- class: e
|--- odor_n >  0.50
|   |--- spore-print-color_r <= 0.50
|   |   |--- stalk-surface-below-ring_y <= 0.50
|   |   |   |--- cap-shape_c <= 0.50
|   |   |   |   |--- cap-surface_g <= 0.50
|   |   |   |   |   |--- gill-size_n <= 0.50
|   |   |   |   |   |   |--- class: e
|   |   |   |   |   |--- gill-size_n >  0.50
|   |   |   |   |   |   |--- population_c <= 0.50
|   |   |   |   |   |   |   |--- cl

In [7]:
# Confusion matrix on the held-out validation rows (true vs. predicted class).
confusion_matrix(y_true=valid_y, y_pred=dtree.predict(valid_X))

array([[1670,    0],
       [   0, 1580]])

In [8]:
# Share of validation rows that the current tree classifies correctly.
accuracy_score(y_true=valid_y, y_pred=dtree.predict(valid_X))

1.0

In [9]:
# Prune by capping the tree at two levels of splits. The seed makes the chosen
# splits reproducible, since ties between equally good splits are otherwise
# broken at random.
dtree = DecisionTreeClassifier(max_depth=2, random_state=1)
dtree.fit(train_X, train_y)

# Take the feature names from the frame the tree was actually fitted on.
print(export_text(dtree, feature_names=list(train_X.columns)))

|--- odor_n <= 0.50
|   |--- stalk-root_c <= 0.50
|   |   |--- class: p
|   |--- stalk-root_c >  0.50
|   |   |--- class: e
|--- odor_n >  0.50
|   |--- spore-print-color_r <= 0.50
|   |   |--- class: e
|   |--- spore-print-color_r >  0.50
|   |   |--- class: p



In [10]:
# Confusion matrix of the depth-limited tree on the validation rows.
confusion_matrix(y_true=valid_y, y_pred=dtree.predict(valid_X))

array([[1559,  111],
       [  40, 1540]])

In [11]:
# Validation accuracy of the depth-limited tree.
accuracy_score(y_true=valid_y, y_pred=dtree.predict(valid_X))

0.9535384615384616