In [2]:
from IPython.core.display import HTML
# Read both stylesheets inside context managers so the file handles are closed
# deterministically instead of being left open until garbage collection.
with open('style-table.css') as table_css_file:
    css = table_css_file.read()
with open('style-notebook.css') as notebook_css_file:
    css += notebook_css_file.read()
# Last expression of the cell: renders a <style> element carrying the combined CSS.
HTML('<style>{}</style>'.format(css))

In [83]:
import pandas as pd
import random
import math
# Seed the stdlib RNG; random.sample is used further down to pick test rows.
random.seed(31415)
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# pd.read_csv is the supported loader; index_col=False keeps the first CSV
# column as data rather than promoting it to the index.
mdata = pd.read_csv('mdata/expanded.csv', index_col=False).drop_duplicates()
# frac=1 returns every row in a shuffled order; random_state makes it repeatable.
mdata = mdata.sample(frac=1, random_state=31415)
# Row count after de-duplication (displayed as the cell output).
len(mdata)

8124

In [114]:
# Share of rows labelled EDIBLE, to see how evenly the two target classes split.
len(mdata.loc[mdata['class'] == 'EDIBLE']) / len(mdata)

0.517971442639094

In [84]:
# Build the "complete rows" subset: discard every row whose stalk_root is '?'.
complete = mdata.loc[mdata['stalk_root'] != '?'].drop_duplicates()
# Number of rows that survive the filter.
len(complete)

5644

In [85]:
# Show the first row of the shuffled frame to inspect the column layout.
mdata.head(1)

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
5895,EDIBLE,CONVEX,SMOOTH,RED,BRUISES,NONE,FREE,CLOSE,BROAD,RED,...,SMOOTH,RED,RED,PARTIAL,WHITE,TWO,EVANESCENT,WHITE,CLUSTERED,WASTE


In [86]:
#create dataset with column of missing values removed
# Keep the original stalk_root column under its own name before dropping it.
roots = mdata.stalk_root
# Pass the axis by keyword: the positional form drop(label, 1) is deprecated
# and rejected by pandas 2.x, while axis=1 works on every pandas version.
ignore_roots = mdata.drop('stalk_root', axis=1)
ignore_roots.head(1)

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
5895,EDIBLE,CONVEX,SMOOTH,RED,BRUISES,NONE,FREE,CLOSE,BROAD,RED,...,SMOOTH,RED,RED,PARTIAL,WHITE,TWO,EVANESCENT,WHITE,CLUSTERED,WASTE


In [87]:
#split data and target
# Labels are the 'class' column; the features are every remaining column.
target = ignore_roots['class']
# axis=1 by keyword: the positional axis argument is rejected by pandas 2.x.
data = ignore_roots.drop('class', axis=1)

In [88]:
from sklearn.feature_extraction import DictVectorizer
# Single vectorizer instance reused by the later cells; each fit_transform call
# below refits it. sparse=False requests a dense array result.
v = DictVectorizer(sparse=False)


In [89]:
# One dict per row ({column: value}), the input shape the vectorizer is fed below.
data_dict = data.to_dict(orient='records')

In [90]:
# Binary-encode the target: 'EDIBLE' becomes 0, any other class value becomes 1.
target_list = [0 if label == 'EDIBLE' else 1 for label in target.values]
# Sanity check: one encoded label per row.
len(target_list)

#target_dict = target.to_dict('records')

8124

In [91]:
# One-hot encode the attribute dicts with the shared vectorizer.
enc_data = v.fit_transform(data_dict)
# Row count of the encoded matrix; should match the number of input records.
enc_data.shape[0]

8124

In [92]:
# Display the encoded matrix (the output below shows a dense array of 0./1. values).
enc_data

array([[ 1.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  1.,  0., ...,  1.,  0.,  1.],
       [ 1.,  0.,  0., ...,  1.,  0.,  1.],
       ..., 
       [ 0.,  1.,  0., ...,  1.,  0.,  1.],
       [ 0.,  1.,  1., ...,  1.,  0.,  1.],
       [ 0.,  1.,  0., ...,  1.,  0.,  1.]])

In [93]:
from numpy import array
# NumPy copy of the 0/1 labels so they can be compared elementwise with predictions.
target_array = array(target_list)

In [94]:
# Naive Bayes fitted and evaluated on the same full data set, so the count
# printed below is a training (resubstitution) error, not a held-out estimate.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(enc_data, target_array)
y_pred = gnb.predict(enc_data)
print("Number of mislabeled points out of a total %d points : %d"
      % (enc_data.shape[0], (target_array != y_pred).sum()))


Number of mislabeled points out of a total 8124 points : 91


In [95]:
# Draw 30% of the row positions, without replacement, to serve as the test set.
test_ids = random.sample(
    range(len(enc_data)),
    math.floor(len(enc_data) * .3),
)

In [96]:
#split data into training set and test set
# test_ids is a list, so "i in test_ids" scans it on every iteration and makes
# the loop quadratic in the number of rows. A set gives the same membership
# answers in constant time.
test_id_set = set(test_ids)
train_data = []
test_data = []
train_target = []
test_target = []
for i in range(len(enc_data)):
    if i in test_id_set:
        # Sampled position: goes to the held-out test split.
        test_data.append(enc_data[i])
        test_target.append(target_array[i])
    else:
        # Every other row is used for training.
        train_data.append(enc_data[i])
        train_target.append(target_array[i])
        


In [97]:
# Stack the list of training rows into a single NumPy array for the classifier.
# The bare len(train_data) that used to precede this line was removed: it was
# not the cell's last expression, so its value was computed and discarded.
train_data = array(train_data)

In [98]:
# Fit a fresh Naive Bayes model on the training split and count its errors
# on the held-out test split.
gnb = GaussianNB()
gnb.fit(train_data, train_target)
new_pred = gnb.predict(test_data)
print("Number of mislabeled points out of a total %d points : %d"
      % (len(test_data), (test_target != new_pred).sum()))


Number of mislabeled points out of a total 2437 points : 34


In [99]:
#repeat with complete rows
# Same label/feature split as before, now on the subset without '?' stalk_root.
target = complete['class']
# axis=1 by keyword: the positional axis argument is rejected by pandas 2.x.
data = complete.drop('class', axis=1)

In [100]:
# Binary-encode the target: 'EDIBLE' becomes 0, any other class value becomes 1.
target_list = [0 if label == 'EDIBLE' else 1 for label in target.values]
# Sanity check: one encoded label per row.
len(target_list)


5644

In [101]:
# NumPy copy of the complete-row labels for elementwise comparison with predictions.
target_array = array(target_list)

In [102]:
# Re-encode the complete-row features; fit_transform refits the shared vectorizer.
data_dict = data.to_dict(orient='records')
enc_data = v.fit_transform(data_dict)
# Row count of the encoded matrix.
enc_data.shape[0]

5644

In [103]:
# Refit the existing gnb estimator on the complete-row data and count the
# training (resubstitution) errors.
gnb.fit(enc_data, target_array)
y_pred = gnb.predict(enc_data)
print("Number of mislabeled points out of a total %d points : %d"
      % (enc_data.shape[0], (target_array != y_pred).sum()))


Number of mislabeled points out of a total 5644 points : 8


In [104]:
# Re-seed so this draw does not depend on how much randomness earlier cells used,
# then sample 30% of the row positions as the test set.
random.seed(31415)
test_ids = random.sample(
    range(len(enc_data)),
    math.floor(len(enc_data) * .3),
)

In [105]:
#split data into training set and test set
# test_ids is a list, so "i in test_ids" scans it on every iteration and makes
# the loop quadratic in the number of rows. A set gives the same membership
# answers in constant time.
test_id_set = set(test_ids)
train_data = []
test_data = []
train_target = []
test_target = []
for i in range(len(enc_data)):
    if i in test_id_set:
        # Sampled position: goes to the held-out test split.
        test_data.append(enc_data[i])
        test_target.append(target_array[i])
    else:
        # Every other row is used for training.
        train_data.append(enc_data[i])
        train_target.append(target_array[i])

In [106]:
# Stack the list of training rows into a single NumPy array for the classifier.
# The bare len(train_data) that used to precede this line was removed: it was
# not the cell's last expression, so its value was computed and discarded.
train_data = array(train_data)

In [107]:
# Refit gnb on the training split of the complete rows and count its errors
# on the held-out test split.
gnb.fit(train_data, train_target)
new_pred = gnb.predict(test_data)
print("Number of mislabeled points out of a total %d points : %d"
      % (len(test_data), (test_target != new_pred).sum()))


Number of mislabeled points out of a total 1693 points : 3


In [108]:
# Refit on the training split and keep the result of fit() under the name `curve`
# (scikit-learn's fit() returns the estimator itself, so this aliases gnb).
curve= gnb.fit(train_data, train_target)

In [109]:
# Per-row class probabilities on the test split; the output has one column per class.
curve.predict_proba(test_data)

array([[  1.00000000e+000,   3.45951074e-089],
       [  0.00000000e+000,   1.00000000e+000],
       [  0.00000000e+000,   1.00000000e+000],
       ..., 
       [  1.00000000e+000,   2.03279129e-133],
       [  1.00000000e+000,   1.02043540e-121],
       [  1.00000000e+000,   0.00000000e+000]])

In [110]:
# Score the fitted model on the held-out rows (consistent with 3 errors out of 1693 above).
curve.score(test_data, test_target)

0.99822799763733017

In [115]:
# List every column name in the source frame (target 'class' plus the attributes).
mdata.columns

Index(['class', 'cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor',
       'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
       'stalk_shape', 'stalk_root', 'stalk_surface_above_ring',
       'stalk_surface_below_ring', 'stalk_color_above_ring',
       'stalk_color_below_ring', 'veil_type', 'veil_color', 'ring_number',
       'ring_type', 'spore_print_color', 'population', 'habitat'],
      dtype='object')

In [130]:
#split data and target
# Go back to the full data set without the stalk_root column.
target = ignore_roots['class']
# axis=1 by keyword: the positional axis argument is rejected by pandas 2.x.
data = ignore_roots.drop('class', axis=1)

# Number of distinct values observed for each attribute, in column order.
col_count = [mdata[col].nunique() for col in data.columns]
col_count

[6, 4, 10, 2, 9, 2, 2, 2, 12, 2, 4, 4, 9, 9, 1, 4, 3, 5, 9, 6, 7]

In [131]:
# Binary-encode the target: 'EDIBLE' becomes 0, any other class value becomes 1.
target_list = [0 if label == 'EDIBLE' else 1 for label in target.values]
target_array = array(target_list)
# Sanity check: one encoded label per row.
len(target_list)

8124

In [136]:
# Encode only the first attribute column to try a single-feature model.
first_attribute = data.columns[0]
data_dict = data[[first_attribute]].to_dict(orient='records')
enc_data = v.fit_transform(data_dict)
# Row count of the encoded matrix.
enc_data.shape[0]

8124

In [137]:
# Fresh Naive Bayes model fitted and evaluated on the single encoded attribute.
gnb = GaussianNB()
gnb.fit(enc_data, target_array)
y_pred = gnb.predict(enc_data)
print("Number of mislabeled points out of a total %d points : %d"
      % (enc_data.shape[0], (target_array != y_pred).sum()))


Number of mislabeled points out of a total 8124 points : 3820


In [139]:
# For each attribute on its own: one-hot encode that single column, fit a fresh
# Naive Bayes model on it, and report how many rows it mislabels.
for col in data.columns:
    data_dict = data[[col]].to_dict(orient='records')
    enc_data = v.fit_transform(data_dict)
    gnb = GaussianNB()
    gnb.fit(enc_data, target_array)
    y_pred = gnb.predict(enc_data)
    print("Number of mislabeled points out of a total %d points : %d for col=%s"
          % (enc_data.shape[0], (target_array != y_pred).sum(), col))


Number of mislabeled points out of a total 8124 points : 3820 for col=cap_shape
Number of mislabeled points out of a total 8124 points : 3912 for col=cap_surface
Number of mislabeled points out of a total 8124 points : 4156 for col=cap_color
Number of mislabeled points out of a total 8124 points : 2080 for col=bruises
Number of mislabeled points out of a total 8124 points : 120 for col=odor
Number of mislabeled points out of a total 8124 points : 4034 for col=gill_attachment
Number of mislabeled points out of a total 8124 points : 3120 for col=gill_spacing
Number of mislabeled points out of a total 8124 points : 1980 for col=gill_size
Number of mislabeled points out of a total 8124 points : 1796 for col=gill_color
Number of mislabeled points out of a total 8124 points : 3632 for col=stalk_shape
Number of mislabeled points out of a total 8124 points : 1832 for col=stalk_surface_above_ring
Number of mislabeled points out of a total 8124 points : 1900 for col=stalk_surface_below_ring
Numb

In [154]:
# odor alone predicts the class very accurately, so tally the class counts
# for every odor value: {odor: {'EDIBLE': n, 'POISONOUS': m}}.
counts = {}
od = mdata[['class', 'odor']]
for class_label, odor_value in od.values:
    tally = counts.setdefault(odor_value, {'EDIBLE': 0, 'POISONOUS': 0})
    tally[class_label] += 1
counts

{'ALMOND': {'EDIBLE': 400, 'POISONOUS': 0},
 'ANISE': {'EDIBLE': 400, 'POISONOUS': 0},
 'CREOSOTE': {'EDIBLE': 0, 'POISONOUS': 192},
 'FISHY': {'EDIBLE': 0, 'POISONOUS': 576},
 'FOUL': {'EDIBLE': 0, 'POISONOUS': 2160},
 'MUSTY': {'EDIBLE': 0, 'POISONOUS': 36},
 'NONE': {'EDIBLE': 3408, 'POISONOUS': 120},
 'PUNGENT': {'EDIBLE': 0, 'POISONOUS': 256},
 'SPICY': {'EDIBLE': 0, 'POISONOUS': 576}}

In [157]:
# Keep only rows whose odor is 'NONE', then drop the odor column, which is
# constant after that filter. axis=1 is passed by keyword because the
# positional axis argument is rejected by pandas 2.x.
no_odor = mdata[mdata.odor == 'NONE'].drop('odor', axis=1)

In [163]:
#repeat with no odor rows
target = no_odor['class']
# axis=1 by keyword: the positional axis argument is rejected by pandas 2.x.
data = no_odor.drop('class', axis=1)

#encode target class as 0 or 1
# 'EDIBLE' becomes 0, any other class value becomes 1. The bare
# len(target_list) that used to sit mid-cell was removed: its value was discarded.
target_list = [0 if label == 'EDIBLE' else 1 for label in target.values]
target_array = array(target_list)

# One-hot encode the remaining attributes; fit_transform refits the shared vectorizer.
data_dict = data.to_dict('records')
enc_data = v.fit_transform(data_dict)
# Row count of the encoded matrix (displayed as the cell output).
len(enc_data)


3528

In [164]:
# Fresh Naive Bayes model fitted and evaluated on the no-odor rows
# (training error, since the same rows are used for both steps).
gnb = GaussianNB()
gnb.fit(enc_data, target_array)
y_pred = gnb.predict(enc_data)
print("Number of mislabeled points out of a total %d points : %d"
      % (enc_data.shape[0], (target_array != y_pred).sum()))


Number of mislabeled points out of a total 3528 points : 6


In [167]:
# True labels of the rows the model got wrong (boolean mask selects mismatches).
target[target_array != y_pred]


5860    EDIBLE
5848    EDIBLE
5824    EDIBLE
5812    EDIBLE
5836    EDIBLE
5800    EDIBLE
Name: class, dtype: object

In [168]:
# Attribute values of the same mislabeled rows, to look for what they have in common.
data[target_array != y_pred]

Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,stalk_root,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
5860,KNOBBED,SCALY,BROWN,NO,FREE,CROWDED,NARROW,WHITE,ENLARGING,BULBOUS,...,SMOOTH,WHITE,BROWN,PARTIAL,WHITE,ONE,EVANESCENT,WHITE,SEVERAL,LEAVES
5848,KNOBBED,FIBROUS,BROWN,NO,FREE,CROWDED,NARROW,WHITE,ENLARGING,BULBOUS,...,SMOOTH,WHITE,BROWN,PARTIAL,WHITE,ONE,EVANESCENT,WHITE,SEVERAL,LEAVES
5824,FLAT,FIBROUS,BROWN,NO,FREE,CROWDED,NARROW,WHITE,ENLARGING,BULBOUS,...,SMOOTH,WHITE,BROWN,PARTIAL,WHITE,ONE,EVANESCENT,WHITE,SEVERAL,LEAVES
5812,CONVEX,SCALY,BROWN,NO,FREE,CROWDED,NARROW,WHITE,ENLARGING,BULBOUS,...,SMOOTH,WHITE,BROWN,PARTIAL,WHITE,ONE,EVANESCENT,WHITE,SEVERAL,LEAVES
5836,FLAT,SCALY,BROWN,NO,FREE,CROWDED,NARROW,WHITE,ENLARGING,BULBOUS,...,SMOOTH,WHITE,BROWN,PARTIAL,WHITE,ONE,EVANESCENT,WHITE,SEVERAL,LEAVES
5800,CONVEX,FIBROUS,BROWN,NO,FREE,CROWDED,NARROW,WHITE,ENLARGING,BULBOUS,...,SMOOTH,WHITE,BROWN,PARTIAL,WHITE,ONE,EVANESCENT,WHITE,SEVERAL,LEAVES


In [169]:
# Repeat the single-attribute experiment on the no-odor rows: encode one column
# at a time, fit a fresh Naive Bayes model, and report its mislabel count.
for col in data.columns:
    data_dict = data[[col]].to_dict(orient='records')
    enc_data = v.fit_transform(data_dict)
    gnb = GaussianNB()
    gnb.fit(enc_data, target_array)
    y_pred = gnb.predict(enc_data)
    print("Number of mislabeled points out of a total %d points : %d for col=%s"
          % (enc_data.shape[0], (target_array != y_pred).sum(), col))


Number of mislabeled points out of a total 3528 points : 216 for col=cap_shape
Number of mislabeled points out of a total 3528 points : 116 for col=cap_surface
Number of mislabeled points out of a total 3528 points : 1688 for col=cap_color
Number of mislabeled points out of a total 3528 points : 120 for col=bruises
Number of mislabeled points out of a total 3528 points : 3216 for col=gill_attachment
Number of mislabeled points out of a total 3528 points : 120 for col=gill_spacing
Number of mislabeled points out of a total 3528 points : 264 for col=gill_size
Number of mislabeled points out of a total 3528 points : 916 for col=gill_color
Number of mislabeled points out of a total 3528 points : 912 for col=stalk_shape
Number of mislabeled points out of a total 3528 points : 112 for col=stalk_root
Number of mislabeled points out of a total 3528 points : 3000 for col=stalk_surface_above_ring
Number of mislabeled points out of a total 3528 points : 2808 for col=stalk_surface_below_ring
Numbe

In [174]:

# Class counts per stalk_root value among the no-odor rows:
# {stalk_root: {'EDIBLE': n, 'POISONOUS': m}}.
counts = {}
od = no_odor[['class', 'stalk_root']]
for class_label, root_value in od.values:
    tally = counts.setdefault(root_value, {'EDIBLE': 0, 'POISONOUS': 0})
    tally[class_label] += 1
counts

{'?': {'EDIBLE': 720, 'POISONOUS': 32},
 'BULBOUS': {'EDIBLE': 1824, 'POISONOUS': 80},
 'CLUB': {'EDIBLE': 0, 'POISONOUS': 8},
 'EQUAL': {'EDIBLE': 864, 'POISONOUS': 0}}

In [177]:
# Show the odorless, poisonous rows with a CLUB stalk root (the 8 rows counted above).
mdata[(mdata.odor=='NONE') & (mdata['class']=='POISONOUS') & (mdata['stalk_root']=='CLUB')]

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
7880,POISONOUS,BELL,SCALY,YELLOW,NO,NONE,FREE,CROWDED,NARROW,YELLOW,...,SCALY,YELLOW,YELLOW,PARTIAL,YELLOW,ONE,EVANESCENT,WHITE,CLUSTERED,LEAVES
7886,POISONOUS,KNOBBED,SCALY,YELLOW,NO,NONE,FREE,CROWDED,NARROW,YELLOW,...,SCALY,YELLOW,YELLOW,PARTIAL,YELLOW,ONE,EVANESCENT,WHITE,CLUSTERED,LEAVES
7884,POISONOUS,FLAT,SCALY,YELLOW,NO,NONE,FREE,CROWDED,NARROW,YELLOW,...,SCALY,YELLOW,YELLOW,PARTIAL,YELLOW,ONE,EVANESCENT,WHITE,CLUSTERED,LEAVES
7885,POISONOUS,FLAT,SCALY,YELLOW,NO,NONE,FREE,CROWDED,NARROW,WHITE,...,SCALY,YELLOW,YELLOW,PARTIAL,YELLOW,ONE,EVANESCENT,WHITE,CLUSTERED,LEAVES
7887,POISONOUS,KNOBBED,SCALY,YELLOW,NO,NONE,FREE,CROWDED,NARROW,WHITE,...,SCALY,YELLOW,YELLOW,PARTIAL,YELLOW,ONE,EVANESCENT,WHITE,CLUSTERED,LEAVES
7882,POISONOUS,CONICAL,SCALY,YELLOW,NO,NONE,FREE,CROWDED,NARROW,YELLOW,...,SCALY,YELLOW,YELLOW,PARTIAL,YELLOW,ONE,EVANESCENT,WHITE,CLUSTERED,LEAVES
7883,POISONOUS,CONICAL,SCALY,YELLOW,NO,NONE,FREE,CROWDED,NARROW,WHITE,...,SCALY,YELLOW,YELLOW,PARTIAL,YELLOW,ONE,EVANESCENT,WHITE,CLUSTERED,LEAVES
7881,POISONOUS,BELL,SCALY,YELLOW,NO,NONE,FREE,CROWDED,NARROW,WHITE,...,SCALY,YELLOW,YELLOW,PARTIAL,YELLOW,ONE,EVANESCENT,WHITE,CLUSTERED,LEAVES


In [194]:
# Class counts for every attribute/value pair among the no-odor rows, keyed as
# '<column>_<value>' -> {'EDIBLE': n, 'POISONOUS': m}.
counts = {}
for col in no_odor.columns:
    if col != 'class':
        od = no_odor[['class', col]]
        for class_label, value in od.values:
            key = col + '_' + value
            tally = counts.setdefault(key, {'EDIBLE': 0, 'POISONOUS': 0})
            tally[class_label] += 1
counts

{'bruises_BRUISES': {'EDIBLE': 1952, 'POISONOUS': 80},
 'bruises_NO': {'EDIBLE': 1456, 'POISONOUS': 40},
 'cap_color_BROWN': {'EDIBLE': 1168, 'POISONOUS': 16},
 'cap_color_BUFF': {'EDIBLE': 48, 'POISONOUS': 24},
 'cap_color_CINNAMON': {'EDIBLE': 32, 'POISONOUS': 0},
 'cap_color_GRAY': {'EDIBLE': 1032, 'POISONOUS': 0},
 'cap_color_GREEN': {'EDIBLE': 16, 'POISONOUS': 0},
 'cap_color_PINK': {'EDIBLE': 56, 'POISONOUS': 24},
 'cap_color_PURPLE': {'EDIBLE': 16, 'POISONOUS': 0},
 'cap_color_RED': {'EDIBLE': 624, 'POISONOUS': 0},
 'cap_color_WHITE': {'EDIBLE': 416, 'POISONOUS': 32},
 'cap_color_YELLOW': {'EDIBLE': 0, 'POISONOUS': 24},
 'cap_shape_BELL': {'EDIBLE': 148, 'POISONOUS': 48},
 'cap_shape_CONICAL': {'EDIBLE': 0, 'POISONOUS': 4},
 'cap_shape_CONVEX': {'EDIBLE': 1548, 'POISONOUS': 8},
 'cap_shape_FLAT': {'EDIBLE': 1452, 'POISONOUS': 48},
 'cap_shape_KNOBBED': {'EDIBLE': 228, 'POISONOUS': 12},
 'cap_shape_SUNKEN': {'EDIBLE': 32, 'POISONOUS': 0},
 'cap_surface_FIBROUS': {'EDIBLE': 1512, 

In [195]:
# Collect the attribute/value pairs that occur in only one class (a zero count
# for either EDIBLE or POISONOUS). Each selected tally dict is annotated in place
# with its key ('col') and its row total ('total').
interesting = []
for pair_name, tally in counts.items():
    if 0 in (tally['EDIBLE'], tally['POISONOUS']):
        tally['col'] = pair_name
        tally['total'] = tally['EDIBLE'] + tally['POISONOUS']
        interesting.append(tally)
from operator import itemgetter
# Largest single-class groups first.
sorted(interesting, key=itemgetter('total'), reverse=True)

[{'EDIBLE': 2496,
  'POISONOUS': 0,
  'col': 'stalk_shape_TAPERING',
  'total': 2496},
 {'EDIBLE': 1344,
  'POISONOUS': 0,
  'col': 'spore_print_color_BROWN',
  'total': 1344},
 {'EDIBLE': 1296,
  'POISONOUS': 0,
  'col': 'spore_print_color_BLACK',
  'total': 1296},
 {'EDIBLE': 1032, 'POISONOUS': 0, 'col': 'cap_color_GRAY', 'total': 1032},
 {'EDIBLE': 968, 'POISONOUS': 0, 'col': 'population_SOLITARY', 'total': 968},
 {'EDIBLE': 864, 'POISONOUS': 0, 'col': 'stalk_root_EQUAL', 'total': 864},
 {'EDIBLE': 756, 'POISONOUS': 0, 'col': 'gill_color_PINK', 'total': 756},
 {'EDIBLE': 712, 'POISONOUS': 0, 'col': 'gill_color_BROWN', 'total': 712},
 {'EDIBLE': 624, 'POISONOUS': 0, 'col': 'cap_color_RED', 'total': 624},
 {'EDIBLE': 576,
  'POISONOUS': 0,
  'col': 'stalk_color_above_ring_GRAY',
  'total': 576},
 {'EDIBLE': 576,
  'POISONOUS': 0,
  'col': 'stalk_color_below_ring_GRAY',
  'total': 576},
 {'EDIBLE': 576,
  'POISONOUS': 0,
  'col': 'stalk_color_above_ring_PINK',
  'total': 576},
 {'EDIBL