In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the raw mushroom records from the CSV that sits next to this notebook.
df = pd.read_csv(filepath_or_buffer='mushrooms.csv')

In [6]:
# Show the first five records transposed, so each column reads as a row.
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
class,p,e,e,p,e
cap-shape,x,x,b,x,x
cap-surface,s,s,s,y,s
cap-color,n,y,w,w,g
bruises,t,t,t,t,f
odor,p,a,l,p,n
gill-attachment,f,f,f,f,f
gill-spacing,c,c,c,c,w
gill-size,n,b,b,n,b
gill-color,k,k,n,n,k


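### Let's check for null values.  There are none!  Wow!  (Note, though, that the attribute notes below list `?` as the code for a missing stalk-root; `isnull()` only counts true NaN values, so that placeholder would not show up here.)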

In [4]:
# Per-column count of NaN cells. Placeholder strings are ordinary values
# to pandas and are not included in this count.
df.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

### These are all objects, which makes sense since they are all letters.  The first column, class, is our y-variable.  This column indicates whether a mushroom is poisonous (p) or edible (e).

Attribute Information: 
    
classes: edible=e, poisonous=p

cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s

cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

bruises: bruises=t,no=f

odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

gill-attachment: attached=a,descending=d,free=f,notched=n

gill-spacing: close=c,crowded=w,distant=d

gill-size: broad=b,narrow=n

gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

stalk-shape: enlarging=e,tapering=t

stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

veil-type: partial=p,universal=u

veil-color: brown=n,orange=o,white=w,yellow=y

ring-number: none=n,one=o,two=t

ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

### Let's go straight into our train/test split and then explore our train set.

In [37]:
# Hold out 30% of the rows as a test set. Stratifying on the target keeps the
# edible/poisonous ratio the same in both splits, and the fixed random_state
# makes the split reproducible. The labels are passed as a 1-D Series
# (df['class']) rather than a one-column DataFrame, matching the documented
# form of the `stratify` argument.
train, test = train_test_split(
    df,
    test_size=0.3,
    random_state=123,
    stratify=df['class'],
)

In [38]:
# (rows, columns) of the training split, i.e. what remains after the 30% hold-out.
train.shape

(5686, 23)

In [39]:
# (rows, columns) of the held-out test split, i.e. the 30% set aside above.
test.shape

(2438, 23)

In [40]:
# Count how often each category occurs in every column of the training split.
# Aligning the per-column counts into one frame pads categories a column never
# uses with NaN, which upcasts every count to float. Drop that padding
# explicitly (stack() only does so implicitly on older pandas) and cast back
# to int so the counts display as whole numbers.
(
    train.apply(lambda column: column.value_counts())
    .T
    .stack()
    .dropna()
    .astype(int)
)

class              e    2945.0
                   p    2741.0
cap-shape          b     331.0
                   c       4.0
                   f    2229.0
                   k     569.0
                   s      19.0
                   x    2534.0
cap-surface        f    1602.0
                   g       3.0
                   s    1790.0
                   y    2291.0
cap-color          b     118.0
                   c      31.0
                   e    1035.0
                   g    1297.0
                   n    1567.0
                   p      98.0
                   r      10.0
                   u      12.0
                   w     754.0
                   y     764.0
bruises            f    3330.0
                   t    2356.0
odor               a     280.0
                   c     137.0
                   f    1510.0
                   l     300.0
                   m      28.0
                   n    2439.0
                         ...  
ring-number        n      28.0
        

## Hypothesis 1: Odor can be a strong indicator of poisonous or edible.  Fishy (y), foul (f), or spicy (s) may contain more poisonous mushrooms.

In [41]:
# Class counts (edible vs. poisonous) within each odor category.
odor_groups = train.groupby('odor')
odor_groups['class'].value_counts()

odor  class
a     e         280
c     p         137
f     p        1510
l     e         300
m     p          28
n     e        2365
      p          74
p     p         175
s     p         406
y     p         411
Name: class, dtype: int64

### First off: WOW!  It appears that odor is a very useful feature in distinguishing between poisonous and edible mushrooms, as all but one odor value falls entirely into a single class.
### Second, H1 is supported by the training data: every fishy (y), foul (f), and spicy (s) mushroom is poisonous.
### Let's take a deeper dive into mushrooms with no odor to find what differences we can see between the edible and the 74 poisonous varieties.

### We can now see that given there is no odor and the cap-shape is conical, the mushroom is poisonous.  Additionally, given no odor and the cap-shape is sunken, it is edible.

In [42]:
# Keep only the odorless ('n') training rows, then count classes per cap shape.
odorless = train.loc[train['odor'] == 'n']
odorless.groupby('cap-shape')['class'].value_counts()

cap-shape  class
b          e         105
           p          32
c          p           4
f          e        1026
           p          26
k          e         157
           p           9
s          e          19
x          e        1058
           p           3
Name: class, dtype: int64

### Similar to above, given the mushroom has no odor, if its habitat is a meadow, it's poisonous. If its habitat is paths, urban, or waste (yummy!), it's edible.  Are you really going to eat a mushroom found in waste though?

In [43]:
# Keep only the odorless ('n') training rows, then count classes per habitat.
odorless = train.loc[train['odor'] == 'n']
odorless.groupby('habitat')['class'].value_counts()

habitat  class
d        e        1230
         p          16
g        e         741
         p          24
l        e         171
         p          11
m        p          23
p        e          25
u        e          66
w        e         132
Name: class, dtype: int64

### Gill-color is very helpful.  Given the mushroom is odorless, there are multiple sub-categories that take the guesswork out of whether a mushroom is edible or poisonous.

In [44]:
# Keep only the odorless ('n') training rows, then count classes per gill color.
odorless = train.loc[train['odor'] == 'n']
odorless.groupby('gill-color')['class'].value_counts()

gill-color  class
e           e         65
g           e         85
            p         14
h           e        149
k           e        146
n           e        505
o           e         46
p           e        537
r           p         15
u           e        296
w           e        490
            p         42
y           e         46
            p          3
Name: class, dtype: int64

### Let's combine the use of gill-color and then habitat on the odorless mushrooms that we have.  It's really starting to narrow down as we can see.

In [46]:
# Keep only the odorless ('n') training rows, then count classes for every
# (gill color, habitat) combination.
odorless = train.loc[train['odor'] == 'n']
grouping_keys = ['gill-color', 'habitat']
odorless.groupby(grouping_keys)['class'].value_counts()

gill-color  habitat  class
e           w        e         65
g           g        e         67
                     p          8
            m        p          6
            u        e         18
h           d        e          9
            g        e        140
k           g        e        128
            u        e         18
n           d        e        305
            g        e        137
            l        e         46
            u        e         17
o           l        e         46
p           d        e        317
            g        e        207
            u        e         13
r           g        p          7
            m        p          8
u           d        e        296
w           d        e        303
                     p         16
            g        e         62
                     p          9
            l        e         33
                     p          8
            m        p          9
            p        e         25
            w        