In [184]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load and Explore data

In [185]:
df = pd.read_csv('./mushroom2020_dataset.csv')

df.head()

Unnamed: 0,id,label,cap-shape,cap-surface,bruises,odor,gill-attachment,gill-spacing,gill-size,stalk-shape,...,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate,gill-color-rate,veil-color-rate,stalk-color-above-ring-rate,stalk-color-below-ring-rate
0,1,p,x,s,t,p,f,c,n,e,...,o,p,k,s,u,1.0,3.0,1.0,1.0,1.0
1,2,e,x,s,t,a,f,c,b,e,...,o,p,n,n,g,2.0,3.0,1.0,1.0,1.0
2,3,e,b,s,t,l,f,c,b,e,...,o,p,n,n,m,3.0,1.0,1.0,1.0,1.0
3,4,p,x,y,t,p,f,c,n,e,...,o,p,k,s,u,3.0,1.0,1.0,1.0,1.0
4,5,e,x,s,f,n,f,w,b,t,...,o,e,n,a,g,4.0,3.0,1.0,1.0,1.0


In [186]:
target_col = 'gill-size'

na_amt = df[df[target_col].isna()].shape[0]

print(f"Column {target_col} has {na_amt} rows that be NaN")

Column gill-size has 121 rows that be NaN


In [187]:
drop_col = ['id','gill-attachment', 'gill-spacing', 'gill-size','gill-color-rate','stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', \
            'stalk-color-above-ring-rate','stalk-color-below-ring-rate','veil-color-rate','veil-type']
df.drop(columns=drop_col, inplace=True)

df.dropna(subset=['label'], inplace=True)

df.reset_index()

df

Unnamed: 0,label,cap-shape,cap-surface,bruises,odor,stalk-shape,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate
0,p,x,s,t,p,e,o,p,k,s,u,1.0
1,e,x,s,t,a,e,o,p,n,n,g,2.0
2,e,b,s,t,l,e,o,p,n,n,m,3.0
3,p,x,y,t,p,e,o,p,k,s,u,3.0
4,e,x,s,f,n,t,o,e,n,a,g,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5819,e,k,s,f,n,e,o,p,b,c,l,1.0
5820,e,x,s,f,n,e,o,p,b,v,l,1.0
5821,e,f,s,f,n,e,o,p,b,c,l,1.0
5822,p,k,y,f,y,t,o,e,w,v,l,1.0


In [188]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5764 entries, 0 to 5823
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   label              5764 non-null   object 
 1   cap-shape          5764 non-null   object 
 2   cap-surface        5737 non-null   object 
 3   bruises            5665 non-null   object 
 4   odor               5665 non-null   object 
 5   stalk-shape        5643 non-null   object 
 6   ring-number        5702 non-null   object 
 7   ring-type          5702 non-null   object 
 8   spore-print-color  5708 non-null   object 
 9   population         5708 non-null   object 
 10  habitat            5733 non-null   object 
 11  cap-color-rate     5737 non-null   float64
dtypes: float64(1), object(11)
memory usage: 585.4+ KB


In [189]:
print(f"Shape of data is : {df.shape}")

missed_col = []

for col in df.columns:
    na_amt = df[col].isna().sum()
    if na_amt != 0:
        print(f"For column {col:20}has {na_amt:4} rows that be NaN ({100*na_amt/df.shape[0]:3.4f}%)")
        missed_col.append(col)

Shape of data is : (5764, 12)
For column cap-surface         has   27 rows that be NaN (0.4684%)
For column bruises             has   99 rows that be NaN (1.7176%)
For column odor                has   99 rows that be NaN (1.7176%)
For column stalk-shape         has  121 rows that be NaN (2.0992%)
For column ring-number         has   62 rows that be NaN (1.0756%)
For column ring-type           has   62 rows that be NaN (1.0756%)
For column spore-print-color   has   56 rows that be NaN (0.9715%)
For column population          has   56 rows that be NaN (0.9715%)
For column habitat             has   31 rows that be NaN (0.5378%)
For column cap-color-rate      has   27 rows that be NaN (0.4684%)


# Process data

In [190]:
mapped_dict = {}
for col in missed_col:
    if df[col].dtype == np.float64: # Numeric
        mapped_dict[col] = df[col].mean()
    else: # Category
        mapped_dict[col] = df[col].mode()[0]

df.fillna(mapped_dict, inplace=True)

df.head()

Unnamed: 0,label,cap-shape,cap-surface,bruises,odor,stalk-shape,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate
0,p,x,s,t,p,e,o,p,k,s,u,1.0
1,e,x,s,t,a,e,o,p,n,n,g,2.0
2,e,b,s,t,l,e,o,p,n,n,m,3.0
3,p,x,y,t,p,e,o,p,k,s,u,3.0
4,e,x,s,f,n,t,o,e,n,a,g,4.0


In [191]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5764 entries, 0 to 5823
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   label              5764 non-null   object 
 1   cap-shape          5764 non-null   object 
 2   cap-surface        5764 non-null   object 
 3   bruises            5764 non-null   object 
 4   odor               5764 non-null   object 
 5   stalk-shape        5764 non-null   object 
 6   ring-number        5764 non-null   object 
 7   ring-type          5764 non-null   object 
 8   spore-print-color  5764 non-null   object 
 9   population         5764 non-null   object 
 10  habitat            5764 non-null   object 
 11  cap-color-rate     5764 non-null   float64
dtypes: float64(1), object(11)
memory usage: 585.4+ KB


In [192]:
label_map = {'e':1,
             'p':0}

df['label'].replace(label_map, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5764 entries, 0 to 5823
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   label              5764 non-null   int64  
 1   cap-shape          5764 non-null   object 
 2   cap-surface        5764 non-null   object 
 3   bruises            5764 non-null   object 
 4   odor               5764 non-null   object 
 5   stalk-shape        5764 non-null   object 
 6   ring-number        5764 non-null   object 
 7   ring-type          5764 non-null   object 
 8   spore-print-color  5764 non-null   object 
 9   population         5764 non-null   object 
 10  habitat            5764 non-null   object 
 11  cap-color-rate     5764 non-null   float64
dtypes: float64(1), int64(1), object(10)
memory usage: 585.4+ KB


In [193]:
# show value count of each label
df['label'].value_counts()

label
0    3660
1    2104
Name: count, dtype: int64

In [194]:
# find column to encode
obj_col = []
for col in df.columns:
    if df[col].dtype == object:
        obj_col.append(col)

# create encoder
enc = OneHotEncoder(drop='first').fit(df[obj_col])

## encode
df[enc.get_feature_names_out()] = enc.transform(df[obj_col]).toarray()

# drop original
df.drop(columns=obj_col, inplace=True)

# show df
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5764 entries, 0 to 5823
Data columns (total 43 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   label                5764 non-null   int64  
 1   cap-color-rate       5764 non-null   float64
 2   cap-shape_c          5764 non-null   float64
 3   cap-shape_f          5764 non-null   float64
 4   cap-shape_k          5764 non-null   float64
 5   cap-shape_x          5764 non-null   float64
 6   cap-surface_g        5764 non-null   float64
 7   cap-surface_s        5764 non-null   float64
 8   cap-surface_y        5764 non-null   float64
 9   bruises_t            5764 non-null   float64
 10  odor_c               5764 non-null   float64
 11  odor_f               5764 non-null   float64
 12  odor_l               5764 non-null   float64
 13  odor_m               5764 non-null   float64
 14  odor_n               5764 non-null   float64
 15  odor_p               5764 non-null   float6

In [195]:
label = 'label'
features = list(df.columns)
features.remove(label)

X_train, X_test, y_train, y_test = train_test_split(df[features], 
                                                    df[label], 
                                                    test_size=0.2,
                                                    stratify=df[label],
                                                    random_state=2020)

print(f"X_train shape : {X_train.shape}")
print(f"X_test shape : {X_test.shape}")
print(f"y_train shape : {y_train.shape}")
print(f"y_test shape : {y_test.shape}")

X_train shape : (4611, 42)
X_test shape : (1153, 42)
y_train shape : (4611,)
y_test shape : (1153,)


# Fit model

In [196]:
base_model = RandomForestClassifier()

param_grid = {'criterion': ['gini','entropy'],
              'max_depth': [2,3,6],
              'min_samples_leaf':[2,5,10],
              'n_estimators':[100,200],
              'random_state': [2020]
}

model = GridSearchCV(base_model,
                     param_grid=param_grid,
                     cv=5)

model.fit(X_train, y_train)

model.best_params_

KeyboardInterrupt: 

# Evaluate

In [None]:
y_pred = model.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.993     0.999     0.996       732
           1      0.998     0.988     0.993       421

    accuracy                          0.995      1153
   macro avg      0.995     0.993     0.994      1153
weighted avg      0.995     0.995     0.995      1153



In [None]:
print(f'\t   False    True')
for idx, row in enumerate(confusion_matrix(y_test, y_pred)):
    print(['Negative','Positive'][idx], end='')
    for element in row:
        print(f"{element:8}", end='')
    print()

	   False    True
Negative     731       1
Positive       5     416
