In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score

In [2]:
# Download the uciml/mushroom-classification dataset archive from Kaggle into ../Data.
# NOTE(review): presumably needs the `kaggle` CLI installed and API credentials
# configured on this machine — confirm before running on a fresh environment.
!kaggle datasets download -d uciml/mushroom-classification -p ../Data

Dataset URL: https://www.kaggle.com/datasets/uciml/mushroom-classification
License(s): CC0-1.0
Downloading mushroom-classification.zip to ../Data
  0% 0.00/34.2k [00:00<?, ?B/s]
100% 34.2k/34.2k [00:00<00:00, 36.4MB/s]


In [3]:
# unzip the dataset
!unzip ../Data/mushroom-classification.zip -d ../Data

Archive:  ../Data/mushroom-classification.zip
  inflating: ../Data/mushrooms.csv   


In [10]:
# Read the extracted CSV into a DataFrame and show it as the cell output.
mushroom = pd.read_csv(filepath_or_buffer='../Data/mushrooms.csv')
mushroom

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [11]:
# Hold out 20% of the rows as a test set.
# A fixed random_state makes the split (and every result derived from it)
# reproducible across kernel restarts; stratifying on the target keeps the
# class proportions the same in both partitions.
train_data, test_data = train_test_split(
    mushroom,
    test_size = 0.2,
    random_state = 42,
    stratify = mushroom['class'],
)

In [12]:
# Report missing values in the training split, per column:
# first real nulls, then cells holding the '?' placeholder.
print(train_data.isnull().sum())
print(train_data[train_data == '?'].count())

# Turn the '?' placeholder into a proper missing value.
# Reassigning (instead of inplace=True) avoids mutating a frame that was
# derived from `mushroom` and keeps the step explicit.
train_data = train_data.replace('?', pd.NA)

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64
class                          0
cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0


In [13]:
# Report missing values in the test split, per column:
# first real nulls, then cells holding the '?' placeholder.
print(test_data.isnull().sum())
print(test_data[test_data == '?'].count())

# Turn the '?' placeholder into a proper missing value.
# Reassigning (instead of inplace=True) avoids mutating a frame that was
# derived from `mushroom` and keeps the step explicit.
test_data = test_data.replace('?', pd.NA)

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64
class                         0
cap-shape                     0
cap-surface                   0
cap-color                     0
bruises                       0
odor                          0
gill-attachment               0
gill-spacing                  0
gill-size                     0
gill-colo

In [14]:
# Impute missing 'stalk-root' entries with the most common category.
# The mode is taken from the training split only and then applied to both
# splits, so no information from the test rows feeds the imputation value.
most_frequent_value = train_data['stalk-root'].mode().iloc[0]

for frame in (train_data, test_data):
    frame['stalk-root'] = frame['stalk-root'].fillna(most_frequent_value)

In [15]:
# Encode every categorical feature column as integer codes.
# One LabelEncoder per column is fitted on the training split only and then
# reused on the test split, so both splits share the same mapping.
label_encoders = {}

columns = train_data.drop(columns = ['class']).columns

for column in columns:
    label_encoder = LabelEncoder()

    # Assign the whole column (as the 'stalk-root' imputation does) rather than
    # writing through .loc[:, column], so the column takes the integer dtype of
    # the encoded values instead of possibly staying object-typed.
    train_data[column] = label_encoder.fit_transform(train_data[column])

    # transform() raises ValueError if the test split contains a category that
    # was not present in the training split.
    test_data[column] = label_encoder.transform(test_data[column])

    # Keep the fitted encoder so codes can be mapped back to the original
    # category letters later via label_encoders[column].inverse_transform(...).
    label_encoders[column] = label_encoder

# Encode the target: 'e' -> 0, 'p' -> 1
# (presumably edible / poisonous — confirm against the dataset description).
# NOTE: re-running this cell on already-encoded data would map every value to NaN.
train_data['class'] = train_data['class'].map({'e': 0, 'p': 1})
test_data['class'] = test_data['class'].map({'e': 0, 'p': 1})

In [16]:
# show the training split as the cell output to inspect the result of the encoding step
train_data

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
5604,0,3,3,5,1,5,1,0,0,1,...,2,2,7,0,2,2,0,7,1,6
1888,0,2,0,3,1,5,1,0,0,5,...,2,3,3,0,2,1,4,2,4,0
7764,1,3,2,2,0,7,1,0,1,0,...,2,6,7,0,2,1,0,7,4,4
771,0,0,2,9,1,3,1,0,0,4,...,2,7,7,0,2,1,4,3,3,3
40,0,0,3,9,1,0,1,0,0,5,...,2,7,7,0,2,1,4,3,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2947,0,5,0,3,1,5,1,0,0,10,...,2,6,6,0,2,1,4,2,4,0
2481,0,5,0,2,1,5,1,0,0,5,...,2,7,3,0,2,1,4,2,5,0
4003,0,2,0,2,1,5,1,0,0,10,...,2,3,6,0,2,1,4,2,5,0
5553,0,2,3,7,0,5,1,0,1,7,...,0,7,7,0,2,1,1,1,5,0


In [27]:
# Separate each split into its feature matrix (every column except 'class')
# and its target vector (the 'class' column).
X_train, y_train = train_data.drop(columns = 'class'), train_data['class']
X_test, y_test = test_data.drop(columns = 'class'), test_data['class']

In [34]:
# Cast both target vectors to a plain integer dtype so the classifier and the
# metric functions receive numeric labels (in case the column is still
# object-typed after the mapping step).
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [33]:
# Train a random forest on the encoded training features.
# random_state fixes the bootstrap sampling and feature selection so that
# re-running the notebook reproduces the same fitted model and scores.
model = RandomForestClassifier(
    n_estimators = 100,
    max_depth = None,      # no depth limit on the individual trees
    bootstrap = True,
    oob_score = True,      # also compute the out-of-bag score (model.oob_score_)
    max_samples = None,
    random_state = 42,
)
model.fit(X_train, y_train)

In [31]:
# Generate class predictions for both splits with the fitted model
# (training split first, then test split).
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)

In [32]:
# Model evaluation: recall of the positive class (label 1) on each split.
# average='binary' is required for pos_label to take effect; with
# average='weighted' scikit-learn ignores pos_label and, for two classes,
# the support-weighted recall is just the overall accuracy.
recall = recall_score(y_train, y_train_pred, pos_label = 1, average = 'binary')
print(f"Recall on Training Data: {recall}")

recall = recall_score(y_test, y_test_pred, pos_label = 1, average = 'binary')
print(f"Recall on Test Data: {recall}")

Recall on Training Data: 1.0
Recall on Test Data: 1.0
