In [17]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

from sklearn.feature_extraction.text import CountVectorizer

## Cleaning


In [13]:

labels = ["edible", "cap-shape", "cap-surface", "cap-color", "bruises", "odor", "gill-attachment",
          "gill-spacing", "gill-size", "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring",
          "stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color",
          "ring-number", "ring-type", "spore-print-color", "population", "habitat"]
df = pd.read_csv("agaricus-lepiota.data", names=labels)
df.replace('?', np.NaN, inplace=True)

# EDA
# (8124, 23)
print(df.shape)
print(df.head(5))
# Response variable is "edible": either 'e' for "edible", or 'p' for "poisonous"
count = df["edible"].value_counts()
total = count["p"] + count["e"]
print((count["p"] / total) * 100, "% of the dataset is poisonous")
print((count["e"] / total) * 100, "% of the dataset is edible")
# 48% are poisonous
# 52% are edible
# Good split
na_vec = df.isnull().sum()
print(na_vec)
# All missing values (2480) are found in "stalk-root" column
# Exploring "stalk-root" column
count = df["stalk-root"].value_counts()
print(count)
# Drop?
df = df.drop(["stalk-root"], axis=1)
print(df.shape)
shrooms = df

(8124, 23)
  edible cap-shape cap-surface cap-color bruises odor gill-attachment  \
0      p         x           s         n       t    p               f   
1      e         x           s         y       t    a               f   
2      e         b           s         w       t    l               f   
3      p         x           y         w       t    p               f   
4      e         x           s         g       f    n               f   

  gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0            c         n          k  ...                        s   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            c         n          n  ...                        s   
4            w         b          k  ...                        s   

  stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
0                      w                      w         p      

In [14]:
#just an extra box for testing stuff
rng = np.random.RandomState(1)
# This creates an matrix of 6X100 rangeing from 0 to 5
# (5 being the "low" parameter, and if high parameter isn't specified default is from 0 to low)
X = rng.randint(5, size=(6, 100))

y = np.array([1, 2, 3, 4, 5, 6])
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

clf.fit(X, y)
print(X[2:3])
print(clf.predict(X[3:4]))

[[2 4 4 0 3 3 0 3 1 0 2 2 2 0 2 1 4 0 4 4 1 3 1 4 1 2 1 0 0 2 4 1 0 0 3 1
  0 4 3 2 3 4 4 3 0 0 0 4 1 4 1 2 2 4 3 4 4 0 3 2 4 3 4 2 3 0 2 1 3 2 0 1
  4 1 3 3 1 2 0 2 4 0 2 4 3 4 3 0 4 2 2 4 1 2 1 1 1 0 4 4]]
[4]


#### Implementing a naive bayes algorithm

In [44]:
# Split data into training and testing

X_train, X_test, y_train, y_test = train_test_split(shrooms, shrooms["edible"], random_state=0)


In [45]:
#Here I'm making the training numeric, because turns out the nb function doesn't like strings
X_trainDum = pd.get_dummies(X_train)

In [66]:
print(X_trainDum.columns)
print(X_testDum.isin(X_trainDum))
X_trainDum.columns[~X_trainDum.columns.isin(X_testDum.columns)]

Index(['edible_e', 'edible_p', 'cap-shape_b', 'cap-shape_c', 'cap-shape_f',
       'cap-shape_k', 'cap-shape_s', 'cap-shape_x', 'cap-surface_f',
       'cap-surface_g',
       ...
       'population_s', 'population_v', 'population_y', 'habitat_d',
       'habitat_g', 'habitat_l', 'habitat_m', 'habitat_p', 'habitat_u',
       'habitat_w'],
      dtype='object', length=114)
      edible_e  edible_p  cap-shape_b  cap-shape_c  cap-shape_f  cap-shape_k  \
380      False     False        False        False        False        False   
3641     False     False        False        False        False        False   
273      False     False        False        False        False        False   
1029     False     False        False        False        False        False   
684      False     False        False        False        False        False   
...        ...       ...          ...          ...          ...          ...   
1859     False     False        False        False        False  

Index(['cap-surface_g'], dtype='object')

In [91]:
#Running the function to see what happens lol ###REMEMBER TO TRY THIS AGAIN WITHOUT DROPPING NA's
mnb = MultinomialNB()
mnb.fit(X_trainDum,y_train)
mnb.get_params()
print(mnb.predict(X_trainDum[0:20]))
print(y_train[0:20] == mnb.predict(X_trainDum[0:20]))
#These two check out. Probably a good enough sample, lets see how it compares with the test data.
#Though a problem I'm comprehending from this is that there are a lot of zeros. I don't know if this counts each zero as an instance, or how it works all the way. Still, lets look at accuracy
 



['p' 'e' 'e' 'p' 'p' 'e' 'e' 'e' 'e' 'e' 'e' 'p' 'e' 'p' 'p' 'p' 'e' 'e'
 'p' 'e']
5832    True
601     True
1601    True
4941    True
7492    True
2103    True
1421    True
7349    True
4945    True
3464    True
3680    True
5927    True
2455    True
5871    True
5358    True
7409    True
2529    True
1692    True
5336    True
3373    True
Name: edible, dtype: bool


In [92]:
#Testing with training data
mnb = MultinomialNB()
mnb.fit(X_trainDum,y_train)
mnb.get_params()
labels1 = mnb.predict(X_trainDum)
len = y_train.shape[0]
prop_correct = sum(labels1[0:len-1] == y_train[0:len-1])/len 
print(prop_correct)

0.9973740357787625


In [93]:
#Test data time

#converting X_test to 
X_testDum = pd.get_dummies(X_test)
# So compared to the data above, this is missing a column ['cap-surface_g'], I will add it with 0's
X_testDum['cap-surface_g'] = 0
labels = mnb.predict(X_testDum)

# for i in range(labels.shape[0]):
#     labels[i] == y_test[0:i] 
len = y_test.shape[0]
prop_correct = sum(labels[0:len-1] == y_test[0:len-1])/len 
print(prop_correct)
#Dang that didn't work. 


0.6745445593303792


Feature names must be in the same order as they were in fit.

