## Poisonous Mushroom classifier

I used the https://archive.ics.uci.edu/ml/datasets/mushroom dataset to train a classifier to predict if a mushroom is poisonous or edible. Although I describe a decision tree classifier, svm and knn classifiers performed equally well (although svm is visibly slower).


*An interestint additional project will be to perform the same classification based in images of these mushrooms, using a convolutional deep neural network.*




In [167]:
import operator
import pickle

#import pydot
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import r2_score
from sklearn.tree import export_graphviz
from sklearn import metrics
from sklearn.externals.six import StringIO

In [168]:
# read data from remote file
data_url = 'https://s3.amazonaws.com/lfcunha-files/data.csv'
data = pd.read_csv(data_url)

In [204]:
data.head()

Unnamed: 0,edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
1,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS
2,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
3,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS
4,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,BROWN,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS


In [169]:
#Labels

print("{}: {}\n".format("edible", ", ".join(list(set(data['edible'])))))
print("{}: {}".format("Number of Edible examples: ", len(data[data['edible'].str.contains("EDIBLE")])))
print("{}: {}\n".format("Number of POISONOUS examples: ", len(data[data['edible'].str.contains("POISONOUS")])))

edible: POISONOUS, EDIBLE

Number of Edible examples: : 4488
Number of POISONOUS examples: : 3928



In [170]:
#Features

for x in data:
    if x == "edible": continue
    print("{}: {}".format(x, ", ".join(list(set(data[x])))))

cap-shape: FLAT, SUNKEN, BELL, KNOBBED, CONVEX, CONICAL
cap-surface: FIBROUS, GROOVES, SMOOTH, SCALY
cap-color: PINK, BROWN, GRAY, PURPLE, YELLOW, GREEN, CINNAMON, WHITE, RED, BUFF
bruises: BRUISES, NO
odor: NONE, ANISE, ALMOND, PUNGENT, MUSTY, FOUL, SPICY, FISHY, CREOSOTE
gill-attachment: ATTACHED, FREE
gill-spacing: CLOSE, CROWDED
gill-size: BROAD, NARROW
gill-color: PINK, BROWN, GRAY, PURPLE, YELLOW, CHOCOLATE, BLACK, ORANGE, GREEN, WHITE, RED, BUFF
stalk-shape: ENLARGING, TAPERING
stalk-root: CLUB, BULBOUS, ROOTED, ?, EQUAL
stalk-surface-above-ring: FIBROUS, SCALY, SMOOTH, SILKY
stalk-surface-below-ring: FIBROUS, SMOOTH, SILKY, SCALY
stalk-color-above-ring: PINK, GRAY, BROWN, YELLOW, CINNAMON, ORANGE, WHITE, RED, BUFF
stalk-color-below-ring: PINK, GRAY, BROWN, YELLOW, CINNAMON, ORANGE, WHITE, RED, BUFF
veil-type: PARTIAL
veil-color: ORANGE, BROWN, WHITE, YELLOW
 ring-number: NONE, TWO, ONE
ring-type: LARGE, FLARING, EVANESCENT, PENDANT, NONE
spore-print-color: BROWN, PURPLE, YELLOW

In [195]:
""" Feature distribution """

print(data.describe().T)

                         count unique       top  freq
edible                    8416      2    EDIBLE  4488
cap-shape                 8416      6    CONVEX  3796
cap-surface               8416      4     SCALY  3268
cap-color                 8416     10     BROWN  2320
bruises                   8416      2        NO  5040
odor                      8416      9      NONE  3808
gill-attachment           8416      2      FREE  8200
gill-spacing              8416      2     CLOSE  6824
gill-size                 8416      2     BROAD  5880
gill-color                8416     12      BUFF  1728
stalk-shape               8416      2  TAPERING  4864
stalk-root                8416      5   BULBOUS  3800
stalk-surface-above-ring  8416      4    SMOOTH  5316
stalk-surface-below-ring  8416      4    SMOOTH  5076
stalk-color-above-ring    8416      9     WHITE  4744
stalk-color-below-ring    8416      9     WHITE  4640
veil-type                 8416      1   PARTIAL  8416
veil-color                84

In [185]:
""" Encode labels (categorial data to numerically encoded) """

"""general transform"""
#le = LabelEncoder()
#data_encoded = data.apply(le.fit_transform)

"""we need a hack to store and recover the encodings"""
encoder = defaultdict(LabelEncoder)
data_encoded = data.apply(lambda x: encoder[x.name].fit_transform(x))


""" Later can recover the encodings """
## Invert the encoded
#data_encoded.apply(lambda x: encoder[x.name].inverse_transform(x))

## Using the dictionary to label future data
#df.apply(lambda x: encoder[x.name].transform(x))


' Later can recover the encodings '

In [196]:
""" fill in any missing data"""
for c in data_encoded.columns:
    data_encoded[c]=data_encoded[c].fillna(-1)

In [201]:
"""Overview the encoded data """
print(data_encoded.describe().T)


                           count      mean       std  min  25%  50%  75%   max
edible                    8416.0  0.466730  0.498922  0.0  0.0  0.0  1.0   1.0
cap-shape                 8416.0  2.494297  0.892014  0.0  2.0  2.0  3.0   5.0
cap-surface               8416.0  1.733840  1.190765  0.0  0.0  2.0  3.0   3.0
cap-color                 8416.0  4.264734  3.403228  0.0  0.0  3.0  8.0   9.0
bruises                   8416.0  0.598859  0.490159  0.0  0.0  1.0  1.0   1.0
odor                      8416.0  4.828897  1.961047  0.0  4.0  6.0  6.0   8.0
gill-attachment           8416.0  0.974335  0.158144  0.0  1.0  1.0  1.0   1.0
gill-spacing              8416.0  0.189163  0.391662  0.0  0.0  0.0  0.0   1.0
gill-size                 8416.0  0.301331  0.458863  0.0  0.0  0.0  1.0   1.0
gill-color                8416.0  4.692490  3.353737  0.0  2.0  4.0  7.0  11.0
stalk-shape               8416.0  0.577947  0.493916  0.0  0.0  1.0  1.0   1.0
stalk-root                8416.0  1.168251  1.092390

In [202]:
data_encoded.head()

Unnamed: 0,edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,2,3,8,0,0,1,1,1,10,...,3,7,7,0,2,1,4,6,4,6
1,0,2,3,8,0,0,1,1,1,10,...,3,7,7,0,2,1,4,1,4,6
2,0,2,3,8,0,0,1,1,1,7,...,3,7,7,0,2,1,4,6,4,6
3,0,2,3,8,0,0,1,1,1,7,...,3,7,7,0,2,1,4,1,4,6
4,0,2,3,8,0,0,1,1,1,1,...,3,7,7,0,2,1,4,6,4,6


In [174]:
# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data_encoded.iloc[:, 1:], data_encoded.iloc[:,0], test_size=0.20, random_state=1)

In [186]:
n_edible = reduce(lambda a, b: a+b, y_test)
print("{}: {}".format("number of edible examples in test set", n_edible))
print("{}: {}".format("number of poisonous examples in test set", len(y_test)-n_edible))

number of edible examples in test set: 823
number of poisonous examples in test set: 861


In [187]:
def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between
        true and predicted values based on the metric chosen. """

    # Calculate the performance score between 'y_true' and 'y_predict'
    return r2_score(y_true, y_predict)

In [177]:
def fit_model(X, y):
    """Grid search for the best tree depth
    """
    # Create cross-validation sets from the training data
    rs = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
    cv_sets = rs.get_n_splits(X)

    # Create a decision tree classifier object
    classifier = DecisionTreeClassifier(random_state=0)
    
    # Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {"max_depth": range(1, 11)}
    
    # Transform 'performance_metric' into a scoring function using 'make_scorer' 
    scoring_fnc = make_scorer(performance_metric)

    # Create the grid search cv object --> GridSearchCV()
    grid = GridSearchCV(classifier, param_grid=params, scoring=scoring_fnc, cv=cv_sets)
    
    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)
    print(pd.DataFrame(grid.cv_results_))
    
    # Return the optimal model after fitting the data
    return grid.best_estimator_

In [178]:
# grid search for the best tree depth of the classifier
reg = fit_model(X_train, y_train)
reg.fit(X_train, y_train)

   mean_fit_time  mean_score_time  mean_test_score  mean_train_score  \
0       0.003511         0.000540         0.399834          0.399837   
1       0.004806         0.000611         0.758502          0.758500   
2       0.006659         0.000681         0.855937          0.870151   
3       0.005768         0.000686         0.961743          0.959949   
4       0.006290         0.000599         0.971308          0.970775   
5       0.005733         0.000537         0.995816          0.995284   
6       0.005603         0.000589         1.000000          1.000000   
7       0.005580         0.000493         1.000000          1.000000   
8       0.005390         0.000459         1.000000          1.000000   
9       0.005617         0.002011         1.000000          1.000000   

  param_max_depth              params  rank_test_score  split0_test_score  \
0               1   {u'max_depth': 1}               10           0.426856   
1               2   {u'max_depth': 2}                

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')

In [189]:
# save model
Z = reg.predict(X_test)
model = pickle.dumps(reg)
# pickle.dump(reg, "model.pk")  # save to file on disk

In [191]:
""" Confusion Matrix """

from IPython.display import Markdown


cm = metrics.confusion_matrix(y_test, Z)

Markdown("""
|               | predict poisonous     | predict edible  |
| ------------- |:---------------------:| ---------------:|
|  poisonous    | {}                    |   {}            |
|  edible       | {}                    |   {}            |

""".format(cm[0][0], cm[0][1], cm[1][0], cm[1][1]))



|               | predict poisonous     | predict edible  |
| ------------- |:---------------------:| ---------------:|
|  poisonous    | 861                    |   0            |
|  edible       | 0                    |   823            |



In [192]:
""" Accuracy metrics"""

print("{}:{}\n".format("r squared", performance_metric(y_test, Z)))

print(metrics.classification_report(y_test, Z))

r squared:1.0

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       861
          1       1.00      1.00      1.00       823

avg / total       1.00      1.00      1.00      1684



In [193]:
"""Relative importance of each feature in predicting the label"""

feature_importances = reg.feature_importances_

feature_importance = dict(zip(feature_importances, list(data)[1:]))
fi_sorted = sorted(feature_importance.items(), key=operator.itemgetter(0), reverse=True)

for score, feature in fi_sorted:
    print("{}: {}".format(feature, round(score, 2)))

spore-print-color: 0.53
 ring-number: 0.16
gill-size: 0.14
gill-spacing: 0.05
veil-color: 0.04
bruises: 0.02
stalk-root: 0.02
cap-surface: 0.02
odor: 0.01
stalk-shape: 0.01
stalk-surface-below-ring: 0.01
population: 0.01
habitat: 0.0


In [194]:
"""Save Tree graph """

#dot_data = StringIO()
#export_graphviz(reg, out_file="dot.dot", feature_names=list(data)[1:], class_names=["poisonous", "edible"])
#export_graphviz(reg, out_file=dot_data, feature_names=list(data)[1:], class_names=["poisonous", "edible")
# graph_ = pydot.graph_from_dot_data(dot_data.getvalue())
# graph_.write_pdf("tree.pdf")

'Save Tree graph '

### Convert dot to pdf file

In [None]:
%%bash
dot -Tpdf dot.dot -o tree.pdf; open tree.pdf