In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt

# AI related imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Column labels passed to read_csv via `names=`; the first one ('edible?') is the class label.
columnNames = [
    'edible?',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]
dataset_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'

# Every field is read as a string (dtype=str); numeric encoding happens later in the notebook.
df = pd.read_csv(dataset_url, dtype=str, names=columnNames)
df.head()

Unnamed: 0,edible?,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [None]:
# Count each class by its label ('p' = poisonous, 'e' = edible) instead of by position.
# value_counts() is ordered by frequency, so positional access would silently swap the
# two numbers if the class balance changed, and integer keys on a string-labelled
# Series rely on a positional fallback that recent pandas versions deprecate.
class_counts = df['edible?'].value_counts()
# .get(..., 0) keeps the report working even if one class is absent from the data.
print(f"Number of Poisonous Mushrooms: {class_counts.get('p', 0)}\nNumber of Edible Mushrooms: {class_counts.get('e', 0)}")
print(f"Shape of df = {df.shape}")

Number of Poisonous Mushrooms: 3916
Number of Edible Mushrooms: 4208
Shape of df = (8124, 23)


In [None]:
# Missing entries are marked with a literal '?'; count how many each column contains.
is_placeholder = df.isin(['?'])
is_placeholder.sum()

edible?                        0
cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64

In [None]:
# Modal imputation of the '?' placeholder in 'stalk-root' (the only column that contains it).
# The mode is taken over the observed values only: if '?' itself were the most common
# entry, a mode computed over the raw column would "impute" the placeholder with itself.
observed_stalk_root = df.loc[df['stalk-root'] != '?', 'stalk-root']
stalk_root_mode = observed_stalk_root.mode()[0]
# Replace within 'stalk-root' only, so this column's mode is never written into another column.
df['stalk-root'] = df['stalk-root'].replace('?', stalk_root_mode)
# check if data is clean (the check still covers the whole frame)
df.isin(['?']).sum().sum() == 0

True

In [None]:
# Remove unwanted features
# At this point every column still holds strings (the frame was read with dtype=str and
# label encoding happens in a later cell), so a plain df.var() has nothing numeric to
# work on. Compute the variance of each column's sorted integer codes instead: the
# distinct values are sorted and numbered from 0, the same scheme LabelEncoder uses.
encoded_variance = df.apply(lambda col: pd.factorize(col, sort=True)[0]).var()
print(encoded_variance)
# Reference values recorded by the author from an earlier run on an encoded frame
# (kept verbatim; small differences from the printout above are possible):
"""
edible?                      0.249708
cap-shape                    2.573872
cap-surface                  1.512586
cap-color                    6.481204
bruises                      0.242900
odor                         4.425676
gill-attachment              0.025184
gill-spacing                 0.135432
gill-size                    0.213624
gill-color                  12.534142
stalk-shape                  0.245513
stalk-root                   3.040309
stalk-surface-above-ring     0.386212
stalk-surface-below-ring     0.456941
stalk-color-above-ring       3.616643
stalk-color-below-ring       3.637761
veil-type                    0.000000
veil-color                   0.058888
ring-number                  0.073476
ring-type                    3.246022
spore-print-color            5.677084
population                   1.567709
habitat                      2.958316
"""
# from this array, veil-type, gill-attachment, veil-color, ring-number can be removed
# (the four lowest variances in the table above)
low_variance_columns = ['veil-type', 'gill-attachment', 'veil-color', 'ring-number']
df = df.drop(columns=low_variance_columns)

  df.var()


In [None]:
# Label encoding the df: every column is passed through its own freshly fitted
# LabelEncoder, replacing its values with integer codes.
df = df.apply(lambda feature: LabelEncoder().fit_transform(feature))
df.head()

Unnamed: 0,edible?,cap-shape,cap-surface,cap-color,bruises,odor,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,0,1,4,0,2,2,2,7,7,4,2,3,5
1,0,5,2,9,1,0,0,0,4,0,1,2,2,7,7,4,3,2,1
2,0,0,2,8,1,3,0,0,5,0,1,2,2,7,7,4,3,2,3
3,1,5,3,8,1,6,0,1,5,0,2,2,2,7,7,4,2,3,5
4,0,5,2,3,0,5,1,0,4,1,2,2,2,7,7,0,3,0,1


In [None]:
# defining X and y: 'edible?' (the first column) is the target, all remaining columns are features
target_column = df.columns[0]
y = df[target_column].to_numpy()
X = df.drop(columns=target_column).to_numpy()

In [None]:
# splitting into train and test sets (70% train / 30% test)
# random_state pins the shuffle so the split, and therefore every score reported
# further down, is the same on each Restart & Run All; 42 matches the seed already
# used for the random forest.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)
# sanity check: features and labels of the training set must stay aligned
len(X_train) == len(y_train)

True

In [None]:
# making the model: a two-tree random forest with a fixed seed, fitted on the training split
RFmodel = RFC(random_state=42, n_estimators=2)
RFmodel.fit(X=X_train, y=y_train)

In [None]:
# Score the random forest on the held-out test split and print accuracy plus confusion matrix.
y_pred = RFmodel.predict(X_test)
rf_accuracy_pct = accuracy_score(y_test, y_pred)*100
rf_confusion = confusion_matrix(y_test, y_pred)
print(
    "Random Forest Classifier statistics:\n"
    "----------------\n"
    f"Accuracy: {rf_accuracy_pct:.4f}%\n\n"
    f"Confusion Matrix:\n{rf_confusion}\n"
    "----------------"
)

Random Forest Classifier statistics:
----------------
Accuracy: 99.9180%

Confusion Matrix:
[[1272    0]
 [   2 1164]]
----------------


In [None]:
# Logistic regression fitted on the same training split; max_iter caps the solver at 500 iterations.
LRmodel = LR(max_iter=500)
LRmodel.fit(X=X_train, y=y_train)

In [None]:
# Score the logistic regression on the same held-out test split as the random forest.
LRy_pred = LRmodel.predict(X_test)
lr_accuracy_pct = accuracy_score(y_test, LRy_pred)*100
lr_confusion = confusion_matrix(y_test, LRy_pred)
print(
    "Logistic Regression statistics:\n"
    "----------------\n"
    f"Accuracy: {lr_accuracy_pct:.4f}%\n\n"
    f"Confusion Matrix:\n{lr_confusion}\n"
    "----------------"
)

Logistic Regression statistics:
----------------
Accuracy: 94.7498%

Confusion Matrix:
[[1211   61]
 [  67 1099]]
----------------
