# Import Libraries

In [None]:
import numpy as np
import pandas as pd 
import seaborn as sb
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Fraction of the rows held out for testing (passed to train_test_split below).
test_size = 0.3
# Seed shared by the train/test split and the scikit-learn estimators below.
random_state = 100

# Import and view data

In [None]:
# Read the raw CSV from the notebook's working directory, then print
# the column names, non-null counts and dtypes.
data = pd.read_csv(filepath_or_buffer = "./mushrooms.csv")
data.info()

In [None]:
# Preview the first ten raw rows.
data.head(n = 10)

# Handle null values

In [None]:
# Number of missing values in each column.
data.isna().sum()

# Handle categorical values

In [None]:
# Show the dtype of every column before encoding.
data.dtypes

In [None]:
def _label_codes(position):
    """Return a {label: integer code} dict for the column at `position`.

    Codes are assigned by enumerating the column's distinct values in the
    order `Series.unique()` returns them.
    """
    labels = data[data.columns[position]].unique()
    return {label: code for code, label in enumerate(labels)}


# One mapper per column, addressed by column position (0 is the target).
mapper_class = _label_codes(0)
mapper_cap_shape = _label_codes(1)
mapper_cap_surface = _label_codes(2)
mapper_cap_color = _label_codes(3)
mapper_bruises = _label_codes(4)
mapper_odor = _label_codes(5)
mapper_gill_attachment = _label_codes(6)
mapper_gill_spacing = _label_codes(7)
mapper_gill_size = _label_codes(8)
mapper_gill_color = _label_codes(9)
mapper_stalk_shape = _label_codes(10)
mapper_stalk_root = _label_codes(11)
mapper_stalk_surface_above_ring = _label_codes(12)
mapper_stalk_surface_below_ring = _label_codes(13)
mapper_stalk_color_above_ring = _label_codes(14)
mapper_stalk_color_below_ring = _label_codes(15)
mapper_veil_type = _label_codes(16)
mapper_veil_color = _label_codes(17)
mapper_ring_number = _label_codes(18)
mapper_ring_type = _label_codes(19)
mapper_spore_print_color = _label_codes(20)
mapper_population = _label_codes(21)
mapper_habitat = _label_codes(22)

In [None]:
# Mappers listed in the same order as the columns they encode
# (position 0 is the target, positions 1-22 are the features).
column_mappers = [
    mapper_class,
    mapper_cap_shape,
    mapper_cap_surface,
    mapper_cap_color,
    mapper_bruises,
    mapper_odor,
    mapper_gill_attachment,
    mapper_gill_spacing,
    mapper_gill_size,
    mapper_gill_color,
    mapper_stalk_shape,
    mapper_stalk_root,
    mapper_stalk_surface_above_ring,
    mapper_stalk_surface_below_ring,
    mapper_stalk_color_above_ring,
    mapper_stalk_color_below_ring,
    mapper_veil_type,
    mapper_veil_color,
    mapper_ring_number,
    mapper_ring_type,
    mapper_spore_print_color,
    mapper_population,
    mapper_habitat,
]

# Replace every raw label with its integer code.
for column_name, column_mapper in zip(data.columns, column_mappers):
    # Only encode while the column still holds the mapper's keys. Without this
    # guard, re-running the cell would look up the already-encoded integers in
    # a label-keyed dict and turn the whole column into NaN.
    if data[column_name].isin(list(column_mapper)).all():
        data[column_name] = data[column_name].map(column_mapper)

data.head(10)

In [None]:
# Re-inspect dtypes and non-null counts after the categorical columns were mapped.
data.info()

# Derive Data

In [None]:
# Preview the first five rows (the default for head).
data.head(n = 5)

In [None]:
# Features are every column after the first; the first column is the target.
x, y = data.iloc[:, 1:], data.iloc[:, 0]

# Scaling

In [None]:
# Rescale the features with MinMaxScaler, keeping the column names.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the transform - consider fitting on the
# training split only.
scaler = MinMaxScaler().fit(x)
x = pd.DataFrame(scaler.transform(x), columns = x.columns)
x

# Explore

In [None]:
# Use a large canvas so the annotated correlation matrix stays readable.
sb.set(rc = {"figure.figsize": (20, 15)})
# Pairwise correlations between all encoded columns, target included.
sb.heatmap(data = data.corr(), annot = True)

There are no useless columns.

In [None]:
# Scatter/histogram grid over every pair of columns.
# NOTE(review): the grid grows quadratically with the number of columns, so
# this cell may take a long time to render.
sb.pairplot(data = data)

# Train-Test split

In [None]:
# Hold out `test_size` of the rows for evaluation; seeded for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = test_size,
    random_state = random_state,
)

# Create and train model

## Logistic Regression

In [None]:
# L2-regularised logistic regression; fit() returns the estimator itself.
model_lr = LogisticRegression(
    penalty = "l2",
    random_state = random_state,
    max_iter = 10000,
    n_jobs = -1,
).fit(x_train, y_train)

# Mean accuracy on the held-out split.
model_lr.score(x_test, y_test)

Nearly 100% accuracy is unusual, but note that veil_type has a correlation of almost 1, so this result is actually expected.

In [None]:
y_predict_lr = model_lr.predict(x_test)
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. The arguments were previously swapped, which
# transposed the matrix.
confusion_matrix(y_test, y_predict_lr)

## Random Forest Classifier

In [None]:
# Search over the number of trees (2, 12, ..., 492) with 10-fold cross-validation.
grid_rf = GridSearchCV(
    estimator = RandomForestClassifier(criterion = "gini", random_state = random_state),
    param_grid = {"n_estimators": range(2, 502, 10)},
    cv = 10,
)
grid_rf.fit(x_train, y_train)

# Keep the best estimator found by the search and score it on the held-out split.
model_rf = grid_rf.best_estimator_
model_rf.score(x_test, y_test)

In [None]:
y_predict_rf = model_rf.predict(x_test)
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. The arguments were previously swapped, which
# transposed the matrix.
confusion_matrix(y_test, y_predict_rf)

The 100% accuracy can again be attributed to the high correlation in the data, so it is not a cause for concern.

In [None]:
# Hyper-parameters of the selected forest (deep=True is the default).
model_rf.get_params(deep = True)

## Naive Bayes

In [None]:
# Gaussian naive Bayes with default settings; fit() returns the estimator itself.
model_nb = GaussianNB().fit(x_train, y_train)

# Mean accuracy on the held-out split.
model_nb.score(x_test, y_test)

In [None]:
y_predict_nb = model_nb.predict(x_test)
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. The arguments were previously swapped, which
# transposed the matrix.
confusion_matrix(y_test, y_predict_nb)

## ANN

In [None]:
# Small fully connected binary classifier: input -> 32 -> 64 -> 1 (sigmoid).
# The intermediate tensor is called `hidden` rather than `x` so that it does
# not overwrite the scaled feature DataFrame `x` defined earlier; otherwise
# re-running the scaling or split cells after this one would fail.
# NOTE(review): no TensorFlow seed is set, so training results may differ
# between runs - consider seeding with `random_state`.
inputs = tf.keras.Input(shape = (x_train.shape[1], ))
hidden = tf.keras.layers.Dense(32, activation = "relu")(inputs)
hidden = tf.keras.layers.Dense(64, activation = "relu")(hidden)
outputs = tf.keras.layers.Dense(1, activation = "sigmoid")(hidden)

model = tf.keras.Model(inputs = inputs, outputs = outputs)
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.05)
# Metric names become the keys of `history.history` ("acc", "val_acc", ...).
metrics = [
    tf.keras.metrics.BinaryAccuracy(name = "acc"),
    tf.keras.metrics.AUC(name = "auc")
]

model.compile(optimizer, loss = "binary_crossentropy", metrics = metrics)

In [None]:
# Stop once the validation loss has not improved for 5 consecutive epochs and
# roll the model back to the best epoch's weights. Without
# restore_best_weights the model keeps the weights from the final epoch,
# i.e. 5 epochs past the best validation loss.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor = "val_loss",
    patience = 5,
    restore_best_weights = True,
)

# The epoch count is deliberately huge: early stopping is the real stop
# criterion. 20% of the training split is held out for validation.
history = model.fit(
    x_train,
    y_train,
    validation_split = 0.2,
    batch_size = 100,
    epochs = 99999999,
    callbacks = [early_stopping],
)

In [None]:
# Shrink the canvas for the learning curves.
sb.set(rc = {"figure.figsize": (5, 5)})

# Training and validation loss per epoch, drawn on the same axes.
for history_key, curve_label in (("loss", "Training"), ("val_loss", "Validation")):
    sb.lineplot(
        x = range(len(history.history["loss"])),
        y = history.history[history_key],
        label = curve_label,
    )

plt.ylabel("Loss")
plt.xlabel("Epochs")
plt.legend()

In [None]:
# Training and validation accuracy per epoch.
sb.lineplot(x = range(len(history.history["loss"])), y = history.history["acc"], label = "Training")
sb.lineplot(x = range(len(history.history["loss"])), y = history.history["val_acc"], label = "Validation")
# This figure shows accuracy; the axis was previously mislabelled "Loss".
plt.ylabel("Accuracy")
plt.xlabel("Epochs")
plt.legend()