In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

In [51]:
# Read the mushroom dataset from the working directory and preview the first rows.
csv_path = "mushrooms.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [52]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
class                       8124 non-null object
cap-shape                   8124 non-null object
cap-surface                 8124 non-null object
cap-color                   8124 non-null object
bruises                     8124 non-null object
odor                        8124 non-null object
gill-attachment             8124 non-null object
gill-spacing                8124 non-null object
gill-size                   8124 non-null object
gill-color                  8124 non-null object
stalk-shape                 8124 non-null object
stalk-root                  8124 non-null object
stalk-surface-above-ring    8124 non-null object
stalk-surface-below-ring    8124 non-null object
stalk-color-above-ring      8124 non-null object
stalk-color-below-ring      8124 non-null object
veil-type                   8124 non-null object
veil-color                  8124 non-null object
ring-number

In [53]:
# Per-column summary (count / unique / top / freq), flipped so each feature is a row.
summary = df.describe()
summary.T

Unnamed: 0,count,unique,top,freq
class,8124,2,e,4208
cap-shape,8124,6,x,3656
cap-surface,8124,4,y,3244
cap-color,8124,10,n,2284
bruises,8124,2,f,4748
odor,8124,9,n,3528
gill-attachment,8124,2,f,7914
gill-spacing,8124,2,c,6812
gill-size,8124,2,b,5612
gill-color,8124,12,b,1728


In [54]:
# Show how often each category occurs, one column at a time.
for column_name in df.columns:
    counts = df[column_name].value_counts()
    print(counts, '\n')

e    4208
p    3916
Name: class, dtype: int64 

x    3656
f    3152
k     828
b     452
s      32
c       4
Name: cap-shape, dtype: int64 

y    3244
s    2556
f    2320
g       4
Name: cap-surface, dtype: int64 

n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
r      16
u      16
Name: cap-color, dtype: int64 

f    4748
t    3376
Name: bruises, dtype: int64 

n    3528
f    2160
s     576
y     576
l     400
a     400
p     256
c     192
m      36
Name: odor, dtype: int64 

f    7914
a     210
Name: gill-attachment, dtype: int64 

c    6812
w    1312
Name: gill-spacing, dtype: int64 

b    5612
n    2512
Name: gill-size, dtype: int64 

b    1728
p    1492
w    1202
n    1048
g     752
h     732
u     492
k     408
e      96
y      86
o      64
r      24
Name: gill-color, dtype: int64 

t    4608
e    3516
Name: stalk-shape, dtype: int64 

b    3776
?    2480
e    1120
c     556
r     192
Name: stalk-root, dtype: int64 

s    5176
k    2372
f     552
y 

In [55]:
# Drop stalk-root: 2480 of its 8124 values are the missing-value marker '?'.
# The frame is rebound instead of mutated with inplace=True, and errors='ignore'
# is passed, so re-running this cell is a harmless no-op rather than a KeyError
# on the already-removed column.
df = df.drop(columns=['stalk-root'], errors='ignore')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [56]:
# Side exploration (not needed for the models): how many mushrooms fall into
# every (class, odor) combination, listed in descending index order.
odor_counts = df.groupby(['class', 'odor']).size()
grouped = odor_counts.sort_index(ascending=False).reset_index(name='num')
grouped

Unnamed: 0,class,odor,num
0,p,y,576
1,p,s,576
2,p,p,256
3,p,n,120
4,p,m,36
5,p,f,2160
6,p,c,192
7,e,n,3408
8,e,l,400
9,e,a,400


In [69]:
# Share (in percent) of odor == 'n' mushrooms whose class is 'e'.
# Must run before the label-encoding cell, while the values are still letters.
no_odor = grouped[grouped['odor'] == 'n']
edible_no_odor = no_odor[no_odor['class'] == 'e']
edible_no_odor['num'].sum() / no_odor['num'].sum() * 100

96.5986394557823

In [70]:
# ~97% of the mushrooms with no odor (odor == 'n') are edible

In [71]:
# The classifiers need numeric input, so every column (target included) is
# replaced by integer category codes. The single encoder is re-fitted per
# column, so afterwards it only remembers the mapping of the last column.
# NOTE(review): integer codes put an arbitrary order on nominal categories,
# which distance-based and linear models may pick up on - consider one-hot
# encoding the feature columns instead.
labelencoder = LabelEncoder()
df = df.apply(labelencoder.fit_transform)
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [72]:
# Prepare the inputs for the train/test split.
# Column 0 of the frame is the encoded class label; every other column is a feature.
seed, t_size = 33, 0.2
y = df['class'].values
x = df.iloc[:, 1:].values

In [73]:
# Hold out a t_size fraction of the rows for testing; the fixed seed keeps the split repeatable.
splits = model_selection.train_test_split(x, y, test_size=t_size, random_state=seed)
x_train, x_test, y_train, y_test = splits

In [74]:
# Candidate classifiers, keyed by the label printed in the evaluation cells.
# - LogisticRegression no longer receives multi_class='ovr': the argument is
#   deprecated in recent scikit-learn releases and makes no difference here,
#   because the target has only two classes and liblinear is one-vs-rest anyway.
# - DecisionTreeClassifier is seeded with the notebook-wide `seed` so that its
#   tie-breaking between equally good splits is repeatable across runs.
models = {'KNN Clustering n=5': KNeighborsClassifier(n_neighbors=5),
          'Logistic Regression': LogisticRegression(solver='liblinear'),
          'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
          'Decision Tree': DecisionTreeClassifier(random_state=seed),
          'Gaussian Naive Bayes': GaussianNB(),
          'Support Vector': SVC(gamma='auto')
         }

In [75]:
# Fit every model on the training split and report its accuracy on the test split.
for name, model in models.items():
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)
    result = accuracy_score(y_test, predicted)
    # Built-in round() works whether accuracy_score hands back a NumPy scalar or a
    # plain Python float; the latter has no .round() method and would raise.
    print(name, ': ', round(result, 5))

KNN Clustering n=5 :  1.0
Logistic Regression :  0.952
Linear Discriminant Analysis :  0.94585
Decision Tree :  1.0
Gaussian Naive Bayes :  0.91631




Support Vector :  1.0


In [76]:
# Every model scores very well on the single hold-out split above.
# One split can be lucky, so repeat the check with 10-fold cross-validation on
# the training data and report the mean and standard deviation per model.
# NOTE: no `scoring` argument is given, so cross_val_score falls back to each
# estimator's default scorer (accuracy for these classifiers, per scikit-learn docs).
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
names = list(models)
results = []
for name in names:
    crossval_result = model_selection.cross_val_score(models[name], x_train, y_train, cv=kfold)
    results.append(crossval_result)
    print(name, crossval_result.mean().round(5), crossval_result.std().round(5))

KNN Clustering n=5 0.99754 0.00157
Logistic Regression 0.94722 0.00973




Linear Discriminant Analysis 0.94568 0.00812
Decision Tree 1.0 0.0
Gaussian Naive Bayes 0.91414 0.01156
Support Vector 0.99985 0.00046


In [77]:
# Detailed evaluation of each model on the held-out test split.
# Class 1 is poisonous (the label encoding maps 'e' -> 0 and 'p' -> 1). The hazardous
# mistake is a false negative: a poisonous mushroom predicted as edible. Those errors
# lower the *recall* of class 1 (and the precision of class 0), so recall for class 1
# is the figure to watch; in the confusion matrix they are the bottom-left count.
for name, model in models.items():
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)
    print(name)
    print("Accuracy: ", accuracy_score(y_test, predicted))
    print("Confusion Matrix: ")
    print(confusion_matrix(y_test, predicted))
    print("Classification Report: ")
    print(classification_report(y_test, predicted))

KNN Clustering n=5
Accuracy:  1.0
Confusion Matrix: 
[[831   0]
 [  0 794]]
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       831
           1       1.00      1.00      1.00       794

   micro avg       1.00      1.00      1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Logistic Regression
Accuracy:  0.952
Confusion Matrix: 
[[794  37]
 [ 41 753]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       831
           1       0.95      0.95      0.95       794

   micro avg       0.95      0.95      0.95      1625
   macro avg       0.95      0.95      0.95      1625
weighted avg       0.95      0.95      0.95      1625

Linear Discriminant Analysis
Accuracy:  0.9458461538461539
Confusion Matrix: 
[[794  37]
 [ 51 743]]
Classification Report: 
              prec



Support Vector
Accuracy:  1.0
Confusion Matrix: 
[[831   0]
 [  0 794]]
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       831
           1       1.00      1.00      1.00       794

   micro avg       1.00      1.00      1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [None]:
# kNN, decision tree, and support vector have the best results (accuracy 1.0 on the test split)