In [2]:
from IPython.core.display import HTML

# Read both stylesheets with context managers so each file handle is closed
# as soon as its contents have been read (bare open(...).read() leaks the handle).
css_parts = []
for css_path in ('style-table.css', 'style-notebook.css'):
    with open(css_path) as css_file:
        css_parts.append(css_file.read())
css = ''.join(css_parts)

# Inject the combined CSS into the notebook page as a single <style> element.
HTML('<style>{}</style>'.format(css))

In [83]:
import pandas as pd
import random
import math

# Seed Python's RNG; the random.sample-based train/test split further down
# depends on this state.
random.seed(31415)

# DataFrame.from_csv is deprecated (and gone from pandas 1.0 onwards);
# pd.read_csv is the supported replacement. index_col=False keeps a plain
# integer index instead of using the first CSV column as the index.
mdata = pd.read_csv('mdata/expanded.csv', index_col=False).drop_duplicates()

# Shuffle all rows with a fixed random_state so the order is reproducible.
mdata = mdata.sample(frac=1, random_state=31415)

# Row count after de-duplication.
len(mdata)

8124

In [114]:
# Class balance check: fraction of rows whose target class is EDIBLE
edible_rows = mdata.loc[mdata['class'] == 'EDIBLE']
len(edible_rows) / len(mdata)

0.517971442639094

In [84]:
# Keep only the complete rows, i.e. those whose stalk_root is not the '?' placeholder
has_known_root = mdata['stalk_root'] != '?'
complete = mdata.loc[has_known_root].drop_duplicates()
len(complete)

5644

In [85]:
# Preview the first row of mdata to check the column layout
mdata.head(1)

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
5895,EDIBLE,CONVEX,SMOOTH,RED,BRUISES,NONE,FREE,CLOSE,BROAD,RED,...,SMOOTH,RED,RED,PARTIAL,WHITE,TWO,EVANESCENT,WHITE,CLUSTERED,WASTE


In [86]:
# Build a variant of the data with the stalk_root column (the one containing
# missing values) removed; the removed column is kept separately in `roots`.
roots = mdata.stalk_root
# axis is passed by keyword: the positional form drop(label, 1) is deprecated
# and rejected by pandas 2.x.
ignore_roots = mdata.drop('stalk_root', axis=1)
ignore_roots.head(1)

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
5895,EDIBLE,CONVEX,SMOOTH,RED,BRUISES,NONE,FREE,CLOSE,BROAD,RED,...,SMOOTH,RED,RED,PARTIAL,WHITE,TWO,EVANESCENT,WHITE,CLUSTERED,WASTE


In [87]:
# Separate the target column ('class') from the attribute columns.
target = ignore_roots['class']
# axis is passed by keyword: the positional form drop(label, 1) is deprecated
# and rejected by pandas 2.x.
data = ignore_roots.drop('class', axis=1)

In [88]:
from sklearn.feature_extraction import DictVectorizer
# Vectorizer used by later cells to encode the attribute records;
# sparse=False requests dense arrays rather than scipy sparse matrices.
v = DictVectorizer(sparse=False)


In [89]:
# One dict per row ({column: value}), the input shape DictVectorizer consumes
data_dict = data.to_dict(orient='records')

In [90]:
# Binary-encode the target: EDIBLE -> 0, every other class -> 1
target_list = [0 if label == 'EDIBLE' else 1 for label in target.values]
len(target_list)

#target_dict = target.to_dict('records')

8124

In [91]:
# One-hot encode the categorical attributes into a numeric matrix
enc_data = v.fit_transform(data_dict)
enc_data.shape[0]

8124

In [92]:
# Display the encoded matrix
enc_data

array([[ 1.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  1.,  0., ...,  1.,  0.,  1.],
       [ 1.,  0.,  0., ...,  1.,  0.,  1.],
       ..., 
       [ 0.,  1.,  0., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  0.,  1.],
       [ 0.,  1.,  0., ...,  1.,  0.,  1.]])

In [93]:
from numpy import array
# Convert the 0/1 label list to a numpy array so it can be compared
# elementwise against the classifier's predictions.
target_array = array(target_list)

In [94]:
# Fit Gaussian naive Bayes on the whole data set and count how many of those
# same rows it then mislabels (no held-out data involved here).
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(enc_data, target_array)
y_pred = gnb.predict(enc_data)
n_mislabeled = (target_array != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d" % (enc_data.shape[0], n_mislabeled))


Number of mislabeled points out of a total 8124 points : 91


In [95]:
# Draw a random 30% of the row positions to hold out as the test set.
# Re-seed in this cell (as the later complete-rows split does) so that
# re-running the cell on its own reproduces the same sample instead of
# depending on RNG state left over from an earlier cell.
random.seed(31415)
n_rows = len(enc_data)
test_ids = random.sample(range(n_rows), math.floor(n_rows * .3))

In [96]:
# Split rows into a training set and a test set according to test_ids.
# Membership is checked against a set: testing `i in test_ids` on the list
# rescans the list for every row, which makes the loop quadratic.
test_id_set = set(test_ids)
train_data = []
test_data = []
train_target = []
test_target = []
for i in range(len(enc_data)):
    if i in test_id_set:
        test_data.append(enc_data[i])
        test_target.append(target_array[i])
    else:
        train_data.append(enc_data[i])
        train_target.append(target_array[i])
        


In [97]:
# Stack the list of training rows into a single 2-D numpy array.
train_data = array(train_data)
# Show the training-set size. This must be the cell's last expression to be
# displayed; placed before the assignment its value was silently discarded.
len(train_data)

In [98]:
# Fit naive Bayes on the training split, then count misclassified rows
# in the held-out test split.
gnb = GaussianNB()
gnb.fit(train_data, train_target)
new_pred = gnb.predict(test_data)
n_mislabeled = (test_target != new_pred).sum()
print("Number of mislabeled points out of a total %d points : %d" % (len(test_data), n_mislabeled))


Number of mislabeled points out of a total 2437 points : 34


In [99]:
# Repeat the experiment using only the complete rows (stalk_root known).
target = complete['class']
# axis is passed by keyword: the positional form drop(label, 1) is deprecated
# and rejected by pandas 2.x.
data = complete.drop('class', axis=1)

In [100]:
# Binary-encode the target: EDIBLE -> 0, every other class -> 1
target_list = [0 if label == 'EDIBLE' else 1 for label in target.values]
len(target_list)


5644

In [101]:
# Array form of the 0/1 labels, for elementwise comparison with predictions
target_array = array(target_list)

In [102]:
# Re-encode the attribute records for the current `data` frame
data_dict = data.to_dict(orient='records')
enc_data = v.fit_transform(data_dict)
enc_data.shape[0]

5644

In [103]:
# Refit on every encoded row and count how many of those same rows are mislabeled
gnb.fit(enc_data, target_array)
y_pred = gnb.predict(enc_data)
n_mislabeled = (target_array != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d" % (enc_data.shape[0], n_mislabeled))


Number of mislabeled points out of a total 5644 points : 8


In [104]:
# Reset the RNG, then pick 30% of the row positions as the held-out test set
random.seed(31415)
n_rows = len(enc_data)
n_test = math.floor(n_rows * .3)
test_ids = random.sample(range(0, n_rows), n_test)

In [105]:
# Split rows into a training set and a test set according to test_ids.
# Membership is checked against a set: testing `i in test_ids` on the list
# rescans the list for every row, which makes the loop quadratic.
test_id_set = set(test_ids)
train_data = []
test_data = []
train_target = []
test_target = []
for i in range(len(enc_data)):
    if i in test_id_set:
        test_data.append(enc_data[i])
        test_target.append(target_array[i])
    else:
        train_data.append(enc_data[i])
        train_target.append(target_array[i])

In [106]:
# Stack the list of training rows into a single 2-D numpy array.
train_data = array(train_data)
# Show the training-set size. This must be the cell's last expression to be
# displayed; placed before the assignment its value was silently discarded.
len(train_data)

In [107]:
# Fit naive Bayes on the training split, then count misclassified rows
# in the held-out test split.
gnb.fit(train_data, train_target)
new_pred = gnb.predict(test_data)
n_mislabeled = (test_target != new_pred).sum()
print("Number of mislabeled points out of a total %d points : %d" % (len(test_data), n_mislabeled))


Number of mislabeled points out of a total 1693 points : 3


In [108]:
# Refit on the training split. scikit-learn's fit() returns the estimator
# itself, so `curve` is another name for the fitted `gnb`, not a separate model.
curve= gnb.fit(train_data, train_target)

In [109]:
# Predicted class probabilities for each test row (one column per class)
curve.predict_proba(test_data)

array([[  1.00000000e+000,   3.45951074e-089],
       [  0.00000000e+000,   1.00000000e+000],
       [  0.00000000e+000,   1.00000000e+000],
       ..., 
       [  1.00000000e+000,   2.03279129e-133],
       [  1.00000000e+000,   1.02043540e-121],
       [  1.00000000e+000,   0.00000000e+000]])

In [110]:
# Mean accuracy of the fitted model on the held-out test split
curve.score(test_data, test_target)

0.99822799763733017

In [115]:
# List the column labels of the full data frame
mdata.columns

Index(['class', 'cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor',
       'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
       'stalk_shape', 'stalk_root', 'stalk_surface_above_ring',
       'stalk_surface_below_ring', 'stalk_color_above_ring',
       'stalk_color_below_ring', 'veil_type', 'veil_color', 'ring_number',
       'ring_type', 'spore_print_color', 'population', 'habitat'],
      dtype='object')

In [130]:
# Separate the target column from the attribute columns again, starting from
# the frame that has stalk_root removed.
target = ignore_roots['class']
# axis is passed by keyword: the positional form drop(label, 1) is deprecated
# and rejected by pandas 2.x.
data = ignore_roots.drop('class', axis=1)

# Number of distinct values in each attribute column. The counts are taken
# from `data` itself (the frame whose columns are iterated) rather than from
# `mdata`, so they stay correct if `data` is built from a different subset.
col_count = [data[col].nunique() for col in data.columns]
col_count

[6, 4, 10, 2, 9, 2, 2, 2, 12, 2, 4, 4, 9, 9, 1, 4, 3, 5, 9, 6, 7]

In [131]:
# Binary-encode the target (EDIBLE -> 0, every other class -> 1) and keep
# both the list and its numpy-array form.
target_list = [0 if label == 'EDIBLE' else 1 for label in target.values]
target_array = array(target_list)
len(target_list)

8124

In [136]:
# Encode only the first attribute column, to test it as a lone predictor
first_col = data.columns[0]
single_col_frame = data[[first_col]]
data_dict = single_col_frame.to_dict('records')
enc_data = v.fit_transform(data_dict)
enc_data.shape[0]

8124

In [137]:
# Fit naive Bayes on the currently encoded data and count mislabeled rows
# among the same rows it was fitted on.
gnb = GaussianNB()
gnb.fit(enc_data, target_array)
y_pred = gnb.predict(enc_data)
n_mislabeled = (target_array != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d" % (enc_data.shape[0], n_mislabeled))


Number of mislabeled points out of a total 8124 points : 3820


In [None]:
# For every attribute on its own: encode that single column, fit a fresh
# naive Bayes model on it, and report how many rows the model mislabels.
for col in data.columns:
    single_col_frame = data[[col]]
    data_dict = single_col_frame.to_dict('records')
    enc_data = v.fit_transform(data_dict)
    gnb = GaussianNB()
    gnb.fit(enc_data, target_array)
    y_pred = gnb.predict(enc_data)
    n_mislabeled = (target_array != y_pred).sum()
    print("Number of mislabeled points out of a total %d points : %d for col=%s" % (enc_data.shape[0], n_mislabeled, col))


Number of mislabeled points out of a total 8124 points : 3820 for col=cap_shape
Number of mislabeled points out of a total 8124 points : 3912 for col=cap_surface
Number of mislabeled points out of a total 8124 points : 4156 for col=cap_color
Number of mislabeled points out of a total 8124 points : 2080 for col=bruises
Number of mislabeled points out of a total 8124 points : 120 for col=odor
Number of mislabeled points out of a total 8124 points : 4034 for col=gill_attachment
Number of mislabeled points out of a total 8124 points : 3120 for col=gill_spacing
Number of mislabeled points out of a total 8124 points : 1980 for col=gill_size
Number of mislabeled points out of a total 8124 points : 1796 for col=gill_color
Number of mislabeled points out of a total 8124 points : 3632 for col=stalk_shape
Number of mislabeled points out of a total 8124 points : 1832 for col=stalk_surface_above_ring
Number of mislabeled points out of a total 8124 points : 1900 for col=stalk_surface_below_ring
Numb