# Detect poison mushrooms with support vector machines (SVM) in python
11-detect-poison-mushrooms-w-svm-in-python

In [1]:
# Set-up libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [3]:
# Read data into dataframe
df = pd.read_csv('../00-Datasets/mushrooms.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [5]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [6]:
df.tail()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l
8123,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,o,c,l


In [7]:
df.sample(6)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
3934,p,f,f,y,f,f,f,c,b,h,...,k,b,b,p,w,o,l,h,v,p
2487,e,x,s,w,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,s,g
7754,p,k,s,n,f,f,f,c,n,b,...,s,p,w,p,w,o,e,w,v,p
6451,p,x,s,e,f,y,f,c,n,b,...,k,p,w,p,w,o,e,w,v,l
447,e,f,y,n,t,l,f,c,b,n,...,y,w,w,p,w,o,p,k,s,g
5808,p,x,s,b,t,f,f,c,b,h,...,s,w,w,p,w,o,p,h,s,u


In [8]:
# Check for missing values
df.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [7]:
# Check for duplicate values
df.duplicated().sum()

0

In [9]:
# Explore tabular summary
df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [11]:
# Prepare the data
df = df.sample(5000, random_state=0)
for col in df.columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 380 to 1100
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   class                     5000 non-null   int32
 1   cap-shape                 5000 non-null   int32
 2   cap-surface               5000 non-null   int32
 3   cap-color                 5000 non-null   int32
 4   bruises                   5000 non-null   int32
 5   odor                      5000 non-null   int32
 6   gill-attachment           5000 non-null   int32
 7   gill-spacing              5000 non-null   int32
 8   gill-size                 5000 non-null   int32
 9   gill-color                5000 non-null   int32
 10  stalk-shape               5000 non-null   int32
 11  stalk-root                5000 non-null   int32
 12  stalk-surface-above-ring  5000 non-null   int32
 13  stalk-surface-below-ring  5000 non-null   int32
 14  stalk-color-above-ring    5000 non-nul

In [13]:
# Split dataset into 80% train and 20% test
X = df.drop('class', axis=1)
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [14]:
# Build model
model = SVC()
model.fit(X_train, y_train)

In [16]:
# Feed test data to model
y_predict = model.predict(X_test)

actual_vs_predict = pd.DataFrame({'Actual': y_test,
                                 'Predict': y_predict
                                 })

actual_vs_predict.sample(12)

Unnamed: 0,Actual,Predict
2913,0,0
3112,0,0
422,1,1
1476,0,0
2943,0,0
59,0,0
6638,1,1
1702,0,0
4681,1,1
2714,0,0


In [17]:
# Evaluate model
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       512
           1       1.00      0.98      0.99       488

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000



## Notes
* We wanted to detect poisoned mushrooms.
* Our dataset had 8124 records and 23 features, all of which are of data type object.
* A quick look showed that there were no missing or duplicated values.
* We picked a sample size of 5000 to use in our model.
* The dataset was split into 80% train and 20% test sets. No validation set was created or used. No hyperparameter tuning occurred.
* We built, trained, and tested an SVM.
* We evaluated our model using ground truth and several metrics including precision, recall, and f1-score. 