# Mushroom Classification
### Classify whether the mushroom is edible or posinous.

In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn import preprocessing

Read the dataset.
Data source: https://archive.ics.uci.edu/ml/datasets/mushroom

In [2]:
df = pd.read_csv('MushroomData-Class.csv')

In [3]:
df.head()

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,f,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,t,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,t,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,f,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,t,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
class_edible                8124 non-null object
cap-shape                   8124 non-null object
cap-surface                 8124 non-null object
cap-color                   8124 non-null object
bruises                     8124 non-null object
odor                        8124 non-null object
gill-attachment             8124 non-null object
gill-spacing                8124 non-null object
gill-size                   8124 non-null object
gill-color                  8124 non-null object
stalk-shape                 8124 non-null object
stalk-root                  8124 non-null object
stalk-surface-above-ring    8124 non-null object
stalk-surface-below-ring    8124 non-null object
stalk-color-above-ring      8124 non-null object
stalk-color-below-ring      8124 non-null object
veil-type                   8124 non-null object
veil-color                  8124 non-null object
ring-number

In [5]:
df.shape

(8124, 23)

In [6]:
df['class_edible'].value_counts()

t    4208
f    3916
Name: class_edible, dtype: int64

In [7]:
df.describe()

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,t,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


From the above its clear that 't' stands for true(edible) and 'f' stands for false(not edible or poisonous). All the features are categorical. So we need to transform this into categorical value.

#### Encoding categorical features

In [8]:
from collections import defaultdict
d = defaultdict(preprocessing.LabelEncoder)
df = df.apply(lambda x: d[x.name].fit_transform(x))
df.head()

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,1,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,1,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,0,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,1,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [9]:
d.keys()

dict_keys(['class_edible', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat'])

In [10]:
# distinct values in one column
d['class_edible'].classes_

array(['f', 't'], dtype=object)

In [11]:
# get all the distinct values
for key in d.keys():
    print (key, d[key].classes_)

class_edible ['f' 't']
cap-shape ['b' 'c' 'f' 'k' 's' 'x']
cap-surface ['f' 'g' 's' 'y']
cap-color ['b' 'c' 'e' 'g' 'n' 'p' 'r' 'u' 'w' 'y']
bruises ['f' 't']
odor ['a' 'c' 'f' 'l' 'm' 'n' 'p' 's' 'y']
gill-attachment ['a' 'f']
gill-spacing ['c' 'w']
gill-size ['b' 'n']
gill-color ['b' 'e' 'g' 'h' 'k' 'n' 'o' 'p' 'r' 'u' 'w' 'y']
stalk-shape ['e' 't']
stalk-root ['?' 'b' 'c' 'e' 'r']
stalk-surface-above-ring ['f' 'k' 's' 'y']
stalk-surface-below-ring ['f' 'k' 's' 'y']
stalk-color-above-ring ['b' 'c' 'e' 'g' 'n' 'o' 'p' 'w' 'y']
stalk-color-below-ring ['b' 'c' 'e' 'g' 'n' 'o' 'p' 'w' 'y']
veil-type ['p']
veil-color ['n' 'o' 'w' 'y']
ring-number ['n' 'o' 't']
ring-type ['e' 'f' 'l' 'n' 'p']
spore-print-color ['b' 'h' 'k' 'n' 'o' 'r' 'u' 'w' 'y']
population ['a' 'c' 'n' 's' 'v' 'y']
habitat ['d' 'g' 'l' 'm' 'p' 'u' 'w']


#### Now copy this new encoded data into another dataframe

In [12]:
df.to_csv('mushroom_encoded_all.csv', index=False)

#### Now divided the encoded training data into Training and Validatation sets

In [13]:
columns = df.columns.values
columns

array(['class_edible', 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
       'odor', 'gill-attachment', 'gill-spacing', 'gill-size',
       'gill-color', 'stalk-shape', 'stalk-root',
       'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
       'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
       'population', 'habitat'], dtype=object)

In [14]:
# Randomize the dataset.
# Training is 70% of data
# Validation is 30% of data
np.random.seed(5)
l = list(df.index)
np.random.shuffle(l)
df = df.iloc[l]

In [15]:
rows = df.shape[0]
train = int(.7 * rows)
test = int(.3 * rows)
rows, train, test

(8124, 5686, 2437)

In [16]:
# Write Training Set
df[:train].to_csv('mushroom_train.csv'
                          ,index=False,index_label='Row',header=False
                          ,columns=columns)

In [17]:
# Write Validation Set
df[train:].to_csv('mushroom_validation.csv'
                          ,index=False,index_label='Row',header=False
                          ,columns=columns)

In [18]:
# Write Column List
with open('mushroom_train_column_list.txt','w') as f:
    f.write(','.join(columns))

#### Now train the Model with XGBoost 

In [19]:
column_list_file = 'mushroom_train_column_list.txt'
train_file = 'mushroom_train.csv'
validation_file = 'mushroom_validation.csv'

In [20]:
columns

array(['class_edible', 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
       'odor', 'gill-attachment', 'gill-spacing', 'gill-size',
       'gill-color', 'stalk-shape', 'stalk-root',
       'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
       'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
       'population', 'habitat'], dtype=object)

In [21]:
df_train = pd.read_csv(train_file,names=columns)
df_validation = pd.read_csv(validation_file,names=columns)

In [22]:
df_train.head()

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,5,2,0,1,2,1,0,0,3,...,0,7,7,0,2,1,4,1,3,1
1,0,3,3,4,0,7,1,0,1,0,...,1,6,6,0,2,1,0,7,4,0
2,0,2,3,9,0,2,1,0,0,2,...,1,6,6,0,2,1,2,1,4,1
3,1,0,3,8,1,0,1,0,0,10,...,2,7,7,0,2,1,4,2,2,3
4,1,5,3,2,1,5,1,0,0,7,...,2,6,3,0,2,1,4,2,4,0


In [23]:
df_validation.head()

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,2,0,8,0,5,1,1,0,7,...,2,7,7,0,2,1,0,2,0,1
1,1,5,2,3,0,5,1,1,0,5,...,2,7,7,0,2,1,0,3,3,1
2,1,5,0,8,0,5,1,1,0,4,...,2,7,7,0,2,1,0,2,0,1
3,1,5,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,4,2
4,1,5,3,3,1,5,1,0,0,5,...,2,6,6,0,2,1,4,3,5,0


In [24]:
df_train.iloc[:,0].ravel()

array([0, 0, 0, ..., 1, 1, 1])

In [25]:
X_train = df_train.iloc[:,1:] # Features: 1st column onwards 
y_train = df_train.iloc[:,0].ravel() # Target: 0th column

X_validation = df_validation.iloc[:,1:]
y_validation = df_validation.iloc[:,0].ravel()

In [26]:
# Launch a classifier
classifier = xgb.XGBClassifier (objective='binary:logistic',n_estimators=50)

In [27]:
classifier

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=50,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [28]:
classifier.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_validation, y_validation)], eval_metric=['logloss'])

[0]	validation_0-logloss:0.610465	validation_1-logloss:0.611321
[1]	validation_0-logloss:0.542738	validation_1-logloss:0.544332
[2]	validation_0-logloss:0.486333	validation_1-logloss:0.488571
[3]	validation_0-logloss:0.438521	validation_1-logloss:0.440464
[4]	validation_0-logloss:0.397596	validation_1-logloss:0.40004
[5]	validation_0-logloss:0.362393	validation_1-logloss:0.365304
[6]	validation_0-logloss:0.331737	validation_1-logloss:0.334261
[7]	validation_0-logloss:0.301356	validation_1-logloss:0.303454
[8]	validation_0-logloss:0.274903	validation_1-logloss:0.276641
[9]	validation_0-logloss:0.251862	validation_1-logloss:0.253197
[10]	validation_0-logloss:0.231699	validation_1-logloss:0.232743
[11]	validation_0-logloss:0.214647	validation_1-logloss:0.216295
[12]	validation_0-logloss:0.199651	validation_1-logloss:0.201667
[13]	validation_0-logloss:0.182004	validation_1-logloss:0.184341
[14]	validation_0-logloss:0.166672	validation_1-logloss:0.169211
[15]	validation_0-logloss:0.153098	v

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=50,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [29]:
df.head()

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
5350,0,5,2,0,1,2,1,0,0,3,...,0,7,7,0,2,1,4,1,3,1
7926,0,3,3,4,0,7,1,0,1,0,...,1,6,6,0,2,1,0,7,4,0
4804,0,2,3,9,0,2,1,0,0,2,...,1,6,6,0,2,1,2,1,4,1
569,1,0,3,8,1,0,1,0,0,10,...,2,7,7,0,2,1,4,2,2,3
1844,1,5,3,2,1,5,1,0,0,7,...,2,6,3,0,2,1,4,2,4,0


In [30]:
eval_result = classifier.evals_result()

In [31]:
training_rounds = range(len(eval_result['validation_0']['logloss']))
training_rounds

range(0, 50)

In [32]:
# plt.scatter(x=training_rounds,y=eval_result['validation_0']['logloss'],label='Training Error')
# plt.scatter(x=training_rounds,y=eval_result['validation_1']['logloss'],label='Validation Error')
# plt.grid(True)
# plt.xlabel('Input Feature')
# plt.ylabel('LogLoss')
# plt.title('Training Vs Validation Error')
# plt.legend()

In [33]:
# xgb.plot_importance(classifier)

#### Now Test the Model.

For testing the model we will use the encoded csv file. We will use only the feature column, i.e. the 2nd column onwards.

In [34]:
df = pd.read_csv('mushroom_encoded_all.csv')

In [35]:
df.head()

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,1,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,1,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,0,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,1,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [36]:
X_test = df.iloc[:,1:]
print(X_test[:5])

   cap-shape  cap-surface  cap-color  bruises  odor  gill-attachment  \
0          5            2          4        1     6                1   
1          5            2          9        1     0                1   
2          0            2          8        1     3                1   
3          5            3          8        1     6                1   
4          5            2          3        0     5                1   

   gill-spacing  gill-size  gill-color  stalk-shape   ...     \
0             0          1           4            0   ...      
1             0          0           4            0   ...      
2             0          0           5            0   ...      
3             0          1           5            0   ...      
4             1          0           4            1   ...      

   stalk-surface-below-ring  stalk-color-above-ring  stalk-color-below-ring  \
0                         2                       7                       7   
1                       

In [37]:
result = classifier.predict(X_test)

In [38]:
result[:5]

array([0, 1, 1, 0, 1])

Now we have predicted the results. Let us keep the predicted results in the new column called predicted results.

In [39]:
df['predicted_class'] = result

In [40]:
df.head()

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,predicted_class
0,0,5,2,4,1,6,1,0,1,4,...,7,7,0,2,1,4,2,3,5,0
1,1,5,2,9,1,0,1,0,0,4,...,7,7,0,2,1,4,3,2,1,1
2,1,0,2,8,1,3,1,0,0,5,...,7,7,0,2,1,4,3,2,3,1
3,0,5,3,8,1,6,1,0,1,5,...,7,7,0,2,1,4,2,3,5,0
4,1,5,2,3,0,5,1,1,0,4,...,7,7,0,2,1,0,3,0,1,1


Let us encode back the class edible and predicted class columns from [0,1] to [edible, posionous]

In [41]:
df.class_edible = df.class_edible.map({1: 'edible', 0: 'posionous'})
df.predicted_class = df.predicted_class.map({1: 'edible', 0: 'posionous'})

In [42]:
df.head()

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,predicted_class
0,posionous,5,2,4,1,6,1,0,1,4,...,7,7,0,2,1,4,2,3,5,posionous
1,edible,5,2,9,1,0,1,0,0,4,...,7,7,0,2,1,4,3,2,1,edible
2,edible,0,2,8,1,3,1,0,0,5,...,7,7,0,2,1,4,3,2,3,edible
3,posionous,5,3,8,1,6,1,0,1,5,...,7,7,0,2,1,4,2,3,5,posionous
4,edible,5,2,3,0,5,1,1,0,4,...,7,7,0,2,1,0,3,0,1,edible


Count how may values in class edible(original) and predicted class(predicted) columns

In [43]:
df.class_edible.value_counts()

edible       4208
posionous    3916
Name: class_edible, dtype: int64

In [44]:
df.predicted_class.value_counts()

edible       4264
posionous    3860
Name: predicted_class, dtype: int64

In [45]:
print('Confusion matrix - Actual versus Predicted')
pd.crosstab(df['class_edible'], df['predicted_class'])

Confusion matrix - Actual versus Predicted


predicted_class,edible,posionous
class_edible,Unnamed: 1_level_1,Unnamed: 2_level_1
edible,4208,0
posionous,56,3860


In [46]:
import sklearn.metrics as metrics
print(metrics.classification_report(df['class_edible'], df['predicted_class']))

             precision    recall  f1-score   support

     edible       0.99      1.00      0.99      4208
  posionous       1.00      0.99      0.99      3916

avg / total       0.99      0.99      0.99      8124

