In [1]:
# Some exploratory data analysis conducted on a dataset. Each instance represents an individual mushroom and the attributes
# represent its properties. The species of the mushroom is ommitted. The task is to classify a mushroom as either 'poisonous'
# or 'not poisonous' based on its properties. Data obtained from www.kaggle.com.

In [2]:
import numpy 
import pandas 
import plotly

In [3]:
# Load data from file and display it for inspection.

pandas.options.display.max_columns=None
data=pandas.read_csv('mushrooms.csv')
data

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,e,?,s,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,e,?,s,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,e,?,s,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,t,?,s,k,w,w,p,w,o,e,w,v,l


In [4]:
# Show summary statistics for dataset.

data.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,2,5,4,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,t,b,s,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,4608,3776,5176,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [5]:
# Check if there are any missing values. This is done by first transforming the dataset to a table of Boolean vaules; 'True'
# if the value is missing, 'False' if not. The number of unique vaules in this new table is then calculated. It is shown to
# have only 1 unique value, 'False'. Therefore there are no missing values from the original dataset.

missing_vals=pandas.DataFrame(data.isnull().nunique())
missing_vals.columns=['Number of unique \'is null\' values']
missing_vals

Unnamed: 0,Number of unique 'is null' values
class,1
cap-shape,1
cap-surface,1
cap-color,1
bruises,1
odor,1
gill-attachment,1
gill-spacing,1
gill-size,1
gill-color,1


In [6]:
# Create a table to show each unique attribute value for every attribute. 

def extn(inp):
    length=len(inp)
    while length<12:
        inp=numpy.concatenate((inp,['-']),axis=0)
        length+=1
    return inp

class_vals=extn(data['class'].unique())
cap_shape_vals=extn(data['cap-shape'].unique())
cap_suf_vals=extn(data['cap-surface'].unique())
cap_col_vals=extn(data['cap-color'].unique())
bruises_vals=extn(data['bruises'].unique())
odor_vals=extn(data['odor'].unique())
gill_att_vals=extn(data['gill-attachment'].unique())
gill_spac_vals=extn(data['gill-spacing'].unique())
gill_size_vals=extn(data['gill-size'].unique())
gill_col_vals=extn(data['gill-color'].unique())
stalk_shape_vals=extn(data['stalk-shape'].unique())
stalk_root_vals=extn(data['stalk-root'].unique())
ssab_vals=extn(data['stalk-surface-above-ring'].unique())
ssbr_vals=extn(data['stalk-surface-below-ring'].unique())
scar_vals=extn(data['stalk-color-above-ring'].unique())
scbr_vals=extn(data['stalk-color-below-ring'].unique())
veil_type_vals=extn(data['veil-type'].unique())
veil_col_vals=extn(data['veil-color'].unique())
ring_num_vals=extn(data['ring-number'].unique())
ring_type_vals=extn(data['ring-type'].unique())
spc_vals=extn(data['spore-print-color'].unique())
pop_vals=extn(data['population'].unique())
hab_vals=extn(data['habitat'].unique())

unq_vals=pandas.DataFrame(numpy.array((class_vals, cap_shape_vals, cap_suf_vals, cap_col_vals, bruises_vals,
                                       odor_vals, gill_att_vals, gill_spac_vals, gill_size_vals, gill_col_vals,
                                       stalk_shape_vals, stalk_root_vals, ssab_vals, ssbr_vals, scar_vals, scbr_vals,
                                       veil_type_vals, veil_col_vals, ring_num_vals, ring_type_vals, spc_vals,
                                       pop_vals, hab_vals)))

unq_vals=unq_vals.rename(index={0:'class',1:'cap-shape',2:'cap-surface',3:'cap-color',4:'bruises',5:'odor',
                       6:'gill-attachment',7:'gill-spacing',8:'gill-size',9:'gill-color',10:'stalk-shape',
                       11:'stalk-root',12:'stalk-surface-above-ring',13:'stalk-surface-below-ring',
                       14:'stalk-color-above-ring',15:'stalk-color-below-ring',16:'veil-type',17:'veil-color',
                       18:'ring-number',19:'ring-type',20:'spore-print-color',21:'population',22:'habitat'})

unq_vals

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
class,p,e,-,-,-,-,-,-,-,-,-,-
cap-shape,x,b,s,f,k,c,-,-,-,-,-,-
cap-surface,s,y,f,g,-,-,-,-,-,-,-,-
cap-color,n,y,w,g,e,p,b,u,c,r,-,-
bruises,t,f,-,-,-,-,-,-,-,-,-,-
odor,p,a,l,n,f,c,y,s,m,-,-,-
gill-attachment,f,a,-,-,-,-,-,-,-,-,-,-
gill-spacing,c,w,-,-,-,-,-,-,-,-,-,-
gill-size,n,b,-,-,-,-,-,-,-,-,-,-
gill-color,k,n,g,p,w,h,u,e,b,r,y,o


In [7]:
# It can be seen that the 'stalk-root' attribute has some instances where the value is '?'. This indicates that there are
# some missing values after all, but they have been filled in rather than left empty. The number of these values is counted. 

pandas.DataFrame(data['stalk-root'].value_counts())

Unnamed: 0,stalk-root
b,3776
?,2480
e,1120
c,556
r,192


In [8]:
# The number of instances of each class is counted in order to check how balanced the data is. It is shown to be well balanced. 

pandas.DataFrame(data['class'].value_counts())

Unnamed: 0,class
e,4208
p,3916
