In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('mushrooms.csv')

In [3]:
df.shape

(8124, 23)

In [7]:
df.describe().T

Unnamed: 0,count,unique,top,freq
class,8124,2,e,4208
cap-shape,8124,6,x,3656
cap-surface,8124,4,y,3244
cap-color,8124,10,n,2284
bruises,8124,2,f,4748
odor,8124,9,n,3528
gill-attachment,8124,2,f,7914
gill-spacing,8124,2,c,6812
gill-size,8124,2,b,5612
gill-color,8124,12,b,1728


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [10]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [11]:
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [14]:
for i in df.columns:
    print(f'{i} -> {df[i].unique()}')

class -> ['p' 'e']
cap-shape -> ['x' 'b' 's' 'f' 'k' 'c']
cap-surface -> ['s' 'y' 'f' 'g']
cap-color -> ['n' 'y' 'w' 'g' 'e' 'p' 'b' 'u' 'c' 'r']
bruises -> ['t' 'f']
odor -> ['p' 'a' 'l' 'n' 'f' 'c' 'y' 's' 'm']
gill-attachment -> ['f' 'a']
gill-spacing -> ['c' 'w']
gill-size -> ['n' 'b']
gill-color -> ['k' 'n' 'g' 'p' 'w' 'h' 'u' 'e' 'b' 'r' 'y' 'o']
stalk-shape -> ['e' 't']
stalk-root -> ['e' 'c' 'b' 'r' '?']
stalk-surface-above-ring -> ['s' 'f' 'k' 'y']
stalk-surface-below-ring -> ['s' 'f' 'y' 'k']
stalk-color-above-ring -> ['w' 'g' 'p' 'n' 'b' 'e' 'o' 'c' 'y']
stalk-color-below-ring -> ['w' 'p' 'g' 'b' 'n' 'e' 'y' 'o' 'c']
veil-type -> ['p']
veil-color -> ['w' 'n' 'o' 'y']
ring-number -> ['o' 't' 'n']
ring-type -> ['p' 'e' 'l' 'f' 'n']
spore-print-color -> ['k' 'n' 'u' 'h' 'w' 'r' 'o' 'y' 'b']
population -> ['s' 'n' 'a' 'v' 'y' 'c']
habitat -> ['u' 'g' 'm' 'd' 'p' 'w' 'l']


In [18]:
df['cap-shape'].head()

0    x
1    x
2    b
3    x
4    x
Name: cap-shape, dtype: object

In [20]:
df['class'].head() # either posioned or not

0    p
1    e
2    e
3    p
4    e
Name: class, dtype: object

In [21]:
# get the numeric representation
for i in df.columns: #all columns are objects so we will make it all of them numerical.
    df[i] = pd.factorize(df[i])[0]

In [23]:
df['class'].head()

0    0
1    1
2    1
3    0
4    1
Name: class, dtype: int32

In [26]:
pd.crosstab(df['cap-shape'], df['class']) # computes a frequency table of the factors

class,0,1
cap-shape,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1708,1948
1,48,404
2,0,32
3,1556,1596
4,600,228
5,4,0


### Feature Selection 

In [28]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.feature_selection import mutual_info_classif

X = df.drop('class', axis = 1)
y = df['class']

In [29]:
threshold = VarianceThreshold(threshold = 0.01)
threshold.fit(X,y) # Feature selector that removes all low-variance features. 

VarianceThreshold(threshold=0.01)

In [30]:
threshold.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True])

In [31]:
sum(threshold.get_support())

21

In [32]:
X = threshold.transform(X)

The chi-square test helps you to solve the problem in feature selection by testing the relationship between the features.

In [35]:
chi2selector = SelectKBest(chi2, k=11)
X_kbest = chi2selector.fit_transform(X,y)

In [36]:
X_kbest.shape

(8124, 11)

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

X_train,X_test,y_train,y_test = train_test_split(X_kbest,y,test_size=0.15,random_state=1)

In [39]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_features=9,max_depth=5,n_estimators=10)
rf.fit(X_train,y_train)
rf.score(X_train, y_train)

0.9991310644460536

In [40]:
cross_val_score(rf,X_train,y_train,cv=5)

array([0.99131064, 0.99927589, 0.99855177, 0.99855177, 1.        ])

In [41]:
rf.feature_importances_

array([0.00303876, 0.81065711, 0.01097902, 0.03303339, 0.        ,
       0.00779766, 0.00271875, 0.00104771, 0.00511861, 0.00528502,
       0.12032397])

In [42]:
rf.score(X_test,y_test)

0.9983593109105825

In [43]:
from sklearn.metrics import classification_report,roc_auc_score,roc_curve,auc

y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       604
           1       1.00      1.00      1.00       615

    accuracy                           1.00      1219
   macro avg       1.00      1.00      1.00      1219
weighted avg       1.00      1.00      1.00      1219



In [44]:
roc_auc_score(y_test,y_pred)

0.9983443708609271

- The roc_auc_score always runs from 0 to 1, and is sorting predictive possibilities. 0.5 is the baseline for random guessing, so you want to always get above 0.5.


- AUC - ROC curve is a performance measurement for classification problem at various thresholds settings. ROC is a probability curve and AUC represents degree or measure of separability