In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Colab-only step: mount Google Drive at /content/drive. The dataset path read
# by the next cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# The dataset is a semicolon-delimited CSV stored on the mounted Drive.
csv_path = "/content/drive/MyDrive/Mushroom/mushroom_dataset.csv"
df = pd.read_csv(csv_path, sep=";")

# Preview the first five rows.
df.head(n=5)

In [None]:
# df.shape[1] counts every column, including the target column, which is why
# one is subtracted when reporting the number of features.
features = df.shape[1]
print(f"There are {features - 1} features.")

# Fixed the misspelling "poisonus" in the printed explanation.
print("It is a binary classification problem since the target variable is a class which has two categories - edible(e) or poisonous(p)")

In [None]:
# Number of rows (observations) in the dataset.
data_points = len(df)
print(f'{data_points} datapoints.')

In [None]:
# Per-column dtypes, followed by one blank separator line.
print(df.dtypes, end="\n\n")

# Plain-language summary of which columns are quantitative vs categorical.
dtype_summary = "Our dataset has a mix of quantitative and categorical features.\n-->quantitative features (cap-diameter, stem-height, stem-width).\n-->categorical features (all others, including target class)."
print(dtype_summary)

In [None]:
df.iloc[34920:35240]

In [None]:
print(df['class'].value_counts())
print("\nClass distribution (percentages):")
print(df['class'].value_counts(normalize=True) * 100)

In [None]:
import matplotlib.pyplot as plt

# Rows per class code, ordered by code so bars and labels line up.
class_counts = df['class'].value_counts().sort_index()

# Build the bar labels from the codes that are actually present instead of
# assuming exactly two codes in a fixed order. Codes without a known name fall
# back to their raw value, so the chart can never be silently mislabelled.
class_names = {'e': 'edible', 'p': 'poisonous'}
labels = [class_names.get(code, str(code)) for code in class_counts.index]

plt.figure(figsize=(6,4))
plt.bar(labels, class_counts.values, color=['skyblue', 'salmon'])
plt.xlabel('Class')
plt.ylabel('Count')
plt.title('Class Distribution of Mushrooms')
plt.show()

In [None]:
df.isnull().sum()

In [None]:
df.keys()

In [None]:
df.info()

In [None]:
# Columns excluded from the rest of the analysis.
# NOTE(review): presumably chosen from the missing-value counts shown above - confirm.
columns_to_drop = ['stem-root', 'veil-type', 'veil-color', 'spore-print-color', 'has-ring']

# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')
df.shape

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
numerical_data = df.select_dtypes(include='number')
numerical_features=numerical_data.columns.tolist()
print(f'There are {len(numerical_features)} numerical features:', '\n')
print(numerical_features)

In [None]:
categorical_data=df.select_dtypes(include= 'object')
categorical_features=categorical_data.columns.tolist()
print(f'There are {len(categorical_features)} categorical features:', '\n')
print(categorical_features)

In [None]:
# Replacement value for missing entries in each categorical column.
# NOTE(review): presumably each column's most frequent category - confirm.
fill_values = {
    'gill-spacing': 'c',
    'stem-surface': 's',
    'cap-surface': 't',
    'gill-attachment': 'a',
    'ring-type': 'f',
}

# Fill through the DataFrame itself. The previous `df[col].fillna(..., inplace=True)`
# form mutates a temporary Series (chained assignment): pandas 2.x emits a
# FutureWarning for it, and under Copy-on-Write it leaves `df` unchanged.
df = df.fillna(value=fill_values)

In [None]:
# Number of categorical columns, then the distinct values found in each one.
print(len(categorical_features))
for feature in categorical_features:
  distinct_values = df[feature].unique()
  print(f"{feature}{distinct_values}")

In [None]:
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per column, kept so the integer codes can be mapped back
# to the original categories later (via `.classes_` / `.inverse_transform`).
label_encoders = {}

# Select the columns to encode from the frame's current dtypes instead of the
# `numerical_features` list built in another cell. Already-encoded (numeric)
# columns are then skipped, so re-running this cell cannot re-encode integer
# codes as strings and scramble them.
for column in df.select_dtypes(exclude='number').columns:
    encoder = LabelEncoder()
    # astype(str) gives the encoder a uniform type to sort; any remaining NaN
    # becomes the literal category "nan".
    df[column] = encoder.fit_transform(df[column].astype(str))
    label_encoders[column] = encoder

In [None]:
df.isnull().sum()

In [None]:
numerical_data = df.select_dtypes(include='number')
numerical_features=numerical_data.columns.tolist()
print(f'There are {len(numerical_features)} numerical features:', '\n')
print(numerical_features)

In [None]:
df.iloc[34920:35240]

In [None]:
numerical_data.describe().T

In [None]:
categorical_data.describe().T

In [None]:
numerical_data.var()

In [None]:
numerical_data.skew()

In [None]:
numerical_data.hist(figsize=(12,12),bins=20)
plt.show()

In [None]:
# Columns stored as int64 or float64.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

plt.figure(figsize=(20, 30))

# One horizontal boxplot per column, stacked vertically in a single figure.
n_rows = len(numeric_cols)
for i, col in enumerate(numeric_cols, 1):
    ax = plt.subplot(n_rows, 1, i)
    sns.boxplot(x=df[col], color='skyblue', ax=ax)
    ax.set_title(f'Boxplot of {col}', fontsize=12)

# Compute the layout once, after every subplot exists. It was previously
# recomputed on each loop iteration, which is loop-invariant work.
plt.tight_layout()
plt.show()


In [None]:
numerical_data.nunique()

In [None]:
numerical_data.isnull().sum()

In [None]:
unique_counts=categorical_data.nunique()
print(unique_counts)

In [None]:
# One bar chart per categorical column, with bars ordered by category label.
for col in categorical_features:
    level_counts = categorical_data[col].value_counts().sort_index()
    fig, ax = plt.subplots()
    level_counts.plot(kind='bar', rot=0, xlabel=col, ylabel='count', ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()

In [None]:
correlation_matrix = numerical_data.corr()
correlation_matrix

In [None]:
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.3f', linewidths=0.3)
plt.show()

In [None]:
target_col = 'class'

# Correlation of every numeric column with the target, computed with three
# coefficients and sorted from most positive to most negative.
corr_pearson, corr_spearman, corr_kendall = (
    numerical_data.corr(method=method)[[target_col]].sort_values(by=target_col, ascending=False)
    for method in ('pearson', 'spearman', 'kendall')
)

titles = ['Pearson Correlation', 'Spearman Correlation', 'Kendall Correlation']
correlations = [corr_pearson, corr_spearman, corr_kendall]

# One single-column heatmap per coefficient, stacked vertically.
fig, axes = plt.subplots(3, 1, figsize=(10, 12))
for panel, ax in enumerate(axes):
    sns.heatmap(correlations[panel], ax=ax, annot=True, cmap="coolwarm", cbar=False, fmt=".2f")
    ax.set_title(titles[panel], fontsize=14)

plt.tight_layout()
plt.show()


In [None]:
# Number of rows for each value of the target column.
class_counts = df.groupby("class").size()

columns = ['class', 'count', 'percentage']

# Percentages are relative to the actual number of rows. The previous
# hard-coded denominator (105000) is only right if the dataset happens to
# have exactly that many rows.
total_rows = class_counts.sum()

outcome = class_counts.index.tolist()
count = class_counts.tolist()
percentage = (class_counts / total_rows * 100).tolist()

imbalance_df = pd.DataFrame(
    {'class': outcome, 'count': count, 'percentage': percentage},
    columns=columns,
)
imbalance_df

In [None]:
# Density plot for each column of the numeric view, arranged on a 4x4 grid
# with independent x-axes.
numerical_data.plot.density(
    subplots=True,
    layout=(4, 4),
    sharex=False,
    figsize=(14, 14),
    title="Density plot of Numerical features",
)
plt.show()

In [None]:
# Features are every column except the target.
X = df.drop(columns='class')
y = df['class']

# 70/30 split, stratified on the target. The seed is fixed so the split, and
# every metric computed from it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

In [None]:
fig,ax = plt.subplots()
ax.hist(y_train)
ax.set_xlabel('class label')
ax.set_ylabel('no of instances')
ax.grid(True)

In [None]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

In [None]:
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=6)

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred_KNN = knn.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy_KNN = accuracy_score(y_test, y_pred_KNN)
print("Accuracy k-nearest neighbors(KNN) = %f " % (accuracy_KNN * 100) + '%')

error_KNN = 1 - accuracy_KNN
print("Error k-nearest neighbors(KNN) = %f " % (error_KNN * 100) + '%')

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_KNN)
report = classification_report(y_test, y_pred_KNN)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)

In [None]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
y_pred_decision_tree = clf.predict(X_test)
from sklearn.metrics import accuracy_score

accuracy_decision_tree = accuracy_score(y_test, y_pred_decision_tree)
print("Accuracy Decision Tree = %f " % (accuracy_decision_tree * 100) + '%')

error_decision_tree = 1 - accuracy_decision_tree
print("Error Decision Tree = %f " % (error_decision_tree * 100) + '%')

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_decision_tree)
report = classification_report(y_test, y_pred_decision_tree)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)

In [None]:
from sklearn.neural_network import MLPClassifier
model_MLP = MLPClassifier()
model_MLP.fit(X_train, y_train)
pred_MLP = model_MLP.predict(X_test)
accuracy_neural = accuracy_score(y_test, pred_MLP)
print("Accuracy neural = %f " % (accuracy_neural * 100) + '%')

error_neural = 1 - accuracy_neural
print("Error neural = %f " % (error_neural * 100) + '%')

In [None]:
conf_matrix = confusion_matrix(y_test, pred_MLP)
report = classification_report(y_test, pred_MLP)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)

In [None]:
# Accuracy of the three baseline models, keyed by display name.
accuracy_by_model = {
    'KNN': accuracy_KNN,
    'Decision Tree': accuracy_decision_tree,
    "Neural Network": accuracy_neural,
}
categories = list(accuracy_by_model.keys())
values = list(accuracy_by_model.values())

# Bar chart comparing the three accuracies.
plt.bar(categories, values, width=0.6)
plt.title("Prediction Accuracy")
plt.xlabel("Models")
plt.ylabel("Accuracy")
plt.xticks(rotation=45, ha='right')
plt.grid(alpha=0.2)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Standardise the features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Two-cluster KMeans on the standardised features.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

sil_score = silhouette_score(X_scaled, clusters)
print(f"Silhouette score for KMeans (k=2): {sil_score:.4f}")

# Project onto the first two principal components for plotting.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]

# Left panel is coloured by cluster assignment, right panel by the true label.
fig, axes = plt.subplots(1, 2, figsize=(12,5))
panels = [('KMeans Clusters (k=2)', clusters), ('True Labels', y)]
for ax, (panel_title, colouring) in zip(axes, panels):
    ax.scatter(pc1, pc2, c=colouring, cmap='tab10', s=8)
    ax.set_title(panel_title)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')

plt.tight_layout()
plt.show()

# Cross-tabulate cluster assignment against the true label.
contingency = pd.crosstab(clusters, y, rownames=['cluster'], colnames=['true_label'])
print("Contingency table (clusters vs true labels):")
print(contingency)


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the target, with a fixed seed. This rebinds the
# four split variables.
split_parts = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Shapes:", *(part.shape for part in split_parts))


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report,
    log_loss
)

# Candidate classifiers, all fitted on the same split. Estimators that accept
# a random_state are seeded.
models = {
    'KNN': KNeighborsClassifier(n_neighbors=6),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, solver='liblinear'),
    'Naive Bayes': GaussianNB(),
    'Neural Network': MLPClassifier(max_iter=500, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# One dict per model: metrics plus the predictions, scores and fitted estimator.
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score for the class in column 1 of predict_proba; stays None when the
    # model offers neither predict_proba nor a usable decision_function.
    y_proba = None
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        try:
            raw_scores = model.decision_function(X_test)
            # Min-max rescale to [0, 1]; the epsilon avoids division by zero
            # when all scores are equal.
            # NOTE(review): rescaled scores are not calibrated probabilities,
            # so log-loss on them is only indicative - confirm intent.
            y_proba = (raw_scores - raw_scores.min()) / (raw_scores.max() - raw_scores.min() + 1e-8)
        except Exception as exc:
            # Still best-effort, but report the failure instead of hiding it.
            # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
            print(f"Could not compute decision scores for {name}: {exc}")
            y_proba = None

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    # Score-based metrics are only defined when a score vector is available.
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    ll = log_loss(y_test, y_proba) if y_proba is not None else None

    results.append({
        'model': name,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'auc': auc,
        'log_loss': ll,
        'y_pred': y_pred,
        'y_proba': y_proba,
        'estimator': model
    })

# Scalar metrics only (no arrays or estimators), indexed by model name.
metric_columns = ['model', 'accuracy', 'precision', 'recall', 'f1', 'auc', 'log_loss']
metrics_df = pd.DataFrame(
    [{key: r[key] for key in metric_columns} for r in results]
).set_index('model')

display(metrics_df.sort_values('accuracy', ascending=False))


In [None]:
# Bar chart of accuracy per model, with the y-axis fixed to [0, 1].
fig, ax = plt.subplots(figsize=(10, 5))
metrics_df['accuracy'].plot(kind='bar', ax=ax)
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy Comparison')
ax.set_ylim(0, 1)
ax.grid(alpha=0.2)
plt.show()


In [None]:
# Grouped bars: precision and recall side by side for each model.
ind = np.arange(len(metrics_df))
width = 0.35

fig, ax = plt.subplots(figsize=(10,5))

# Each series is shifted by half a bar width on either side of the tick.
bar_series = (
    (ind - width/2, 'precision', 'Precision'),
    (ind + width/2, 'recall', 'Recall'),
)
for positions, metric, legend_label in bar_series:
    ax.bar(positions, metrics_df[metric], width, label=legend_label)

ax.set_xticks(ind)
ax.set_xticklabels(metrics_df.index, rotation=45, ha='right')
ax.set_ylim(0, 1)
ax.set_ylabel('Score')
ax.set_title('Precision and Recall by Model')
ax.legend()
plt.tight_layout()
plt.show()


In [None]:
# ROC curve for every model that produced a score vector.
fig, ax = plt.subplots(figsize=(8, 6))

for r in results:
    name, y_proba = r['model'], r['y_proba']
    if y_proba is None:
        print(f"Skipping ROC for {name} (no probability scores available)")
        continue
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc = roc_auc_score(y_test, y_proba)
    ax.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")

# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], linestyle='--', alpha=0.6)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves')
ax.legend(loc='lower right')
ax.grid(alpha=0.2)
plt.show()


In [None]:
# For each model: confusion-matrix heatmap followed by the text report.
for r in results:
    name, y_pred = r['model'], r['y_pred']
    cm = confusion_matrix(y_test, y_pred)

    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix: {name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

    print(f"Classification report for {name}:\n")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("-"*60)


In [None]:
# Metrics table rounded to four decimal places for presentation.
metrics_df_rounded = metrics_df.round(decimals=4)
display(metrics_df_rounded)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# NOTE(review): this fits ordinary least squares to the class label, i.e. it
# treats the encoded target as a continuous value - confirm this is intended.
X = df.drop('class', axis=1)
y = df['class']

# Encode any feature column that is still object-typed, then the target.
encoder = LabelEncoder()
for column in X.columns:
    if X[column].dtype == 'object':
      X[column] = encoder.fit_transform(X[column].astype(str))

y = encoder.fit_transform(y.astype(str))


# This rebinds the split variables used by the classification cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# First pass: fit on all training rows and flag rows whose absolute residual
# exceeds three standard deviations of the residuals.
linreg = LinearRegression().fit(X_train, y_train)
residuals = y_train - linreg.predict(X_train)
outlier_threshold = 3*np.std(residuals)
outliers = np.abs(residuals) > outlier_threshold

# Defaults so both summary names exist whichever branch runs below
# (linreg_mae was previously only bound in the else branch).
linreg_score = 0
linreg_mae = None

if np.all(outliers):
    print("All samples are outliers, cannot fit a linear regression model")
else:
    # Second pass: refit without the flagged rows and evaluate on the test set.
    X_train_cleaned, y_train_cleaned = X_train[~outliers], y_train[~outliers]

    linreg_cleaned = LinearRegression().fit(X_train_cleaned, y_train_cleaned)

    y_pred_cleaned = linreg_cleaned.predict(X_test)

    mse_cleaned = mean_squared_error(y_test, y_pred_cleaned)
    rmse_cleaned = np.sqrt(mse_cleaned)
    mae = mean_absolute_error(y_test, y_pred_cleaned)

    # R-squared expressed as a percentage.
    r2_score_cleaned = r2_score(y_test, y_pred_cleaned) * 100

    linreg_score = r2_score_cleaned
    linreg_mae = mae

    print(f"Mean Squared Error (MSE):{mse_cleaned:.2f}")
    print(f"Root Mean Squared Error (RMSE):{rmse_cleaned:.2f}")
    print(f"Mean Absolute Error (MAE) {mae:.2f}")
    print(" ")
    # Repaired the mis-encoded superscript two ("RÂ²") in the printed label.
    print(f"R-squared (R²):{r2_score_cleaned:.6f}%")