# Mushroom Classification - Machine Learning Assignment 2

**Student Details:**
- **BITS ID:** 2025AA05835
- **Name:** JANET DEVARAJ
- **Email:** 2025aa05835@wilp.bits-pilani.ac.in

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score, 
                             recall_score, f1_score, matthews_corrcoef,
                             confusion_matrix, classification_report)
import warnings
# Suppress every warning category from this point on so that library
# notices do not clutter the cell output.
warnings.filterwarnings('ignore')

# Confirmation banner: reaching this line means all imports above resolved.
print("✅ All libraries imported successfully!")

## Load and Explore Dataset

In [None]:
# Read the mushroom data file directly from the UCI repository. The column
# names are supplied here and the file is read as having no header row.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
columns = [
    'class',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

df = pd.read_csv(url, header=None, names=columns)

print(f"Dataset shape: {df.shape}")
print(f"\nFirst few rows:")
# Last expression of the cell: the notebook renders the first five rows.
df.head(5)

In [None]:
# Print the frame's structural summary, followed by the number of rows
# carrying each value of the 'class' column.
print("Dataset Info:")
df.info()

print("\nClass Distribution:", df['class'].value_counts(), sep="\n")

## Data Preprocessing

In [None]:
# Missing-value clean-up: '?' entries are converted to NaN and every row
# that then contains a NaN is discarded.
print(f"Missing values before cleaning: {df.isna().sum().sum()}")

# Map the '?' placeholder to NaN so pandas counts it as missing
df = df.replace({'?': np.nan})
print(f"Missing values (marked as ?): {df.isna().sum().sum()}")

# Drop each row that has at least one missing entry
df = df.dropna(axis=0, how='any')
print(f"Shape after removing missing values: {df.shape}")

In [None]:
# Integer-encode every column of the frame (the 'class' target included).
# One fitted encoder per column is kept so the codes can be mapped back to
# the original category values later.
label_encoders = {column: LabelEncoder().fit(df[column]) for column in df.columns}

# Overwrite each column with its integer codes
for column, le in label_encoders.items():
    df[column] = le.transform(df[column])

print("✅ All features encoded successfully!")
df.head(5)

In [None]:
# Pull the target column out of the encoded frame; every remaining column
# is treated as a predictor.
y = df['class']
X = df.drop(columns=['class'])

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nNumber of features: {len(X.columns)}")
print(f"Number of samples: {len(X)}")

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the
# target and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# Per-class row counts in each partition
print(f"\nTraining set class distribution:", y_train.value_counts(), sep="\n")
print(f"\nTesting set class distribution:", y_test.value_counts(), sep="\n")

## Model Training and Evaluation

In [None]:
def evaluate_model(y_true, y_pred, y_proba=None):
    """Calculate all required evaluation metrics.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        y_proba: Optional probability estimates. With exactly two distinct
            labels in ``y_true`` this may be a 2-D array (column 1 is
            scored) or a 1-D array of scores for the positive class;
            otherwise it is passed to ``roc_auc_score`` as-is for a
            weighted one-vs-rest AUC.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``mcc`` and ``auc`` (in that order). Precision, recall and F1 use
        ``average='weighted'``. ``auc`` is 0.0 when ``y_proba`` is omitted
        or when the AUC cannot be computed.
    """
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1': f1_score(y_true, y_pred, average='weighted'),
        'mcc': matthews_corrcoef(y_true, y_pred)
    }

    # 0.0 is the fallback so the 'auc' key is always present for callers.
    auc = 0.0
    if y_proba is not None:
        # Coerce once so lists and DataFrames support ndim and [:, 1].
        proba = np.asarray(y_proba)
        try:
            if len(np.unique(y_true)) == 2:
                scores = proba if proba.ndim == 1 else proba[:, 1]
                auc = roc_auc_score(y_true, scores)
            else:
                auc = roc_auc_score(y_true, proba, multi_class='ovr', average='weighted')
        except (ValueError, IndexError) as exc:
            # Only the failures expected from unusable labels/probabilities
            # are handled; anything else (e.g. KeyboardInterrupt) propagates.
            # Reported with print() so the reason for the fallback is shown.
            print(f"AUC could not be computed ({exc}); reporting 0.0")
    metrics['auc'] = auc

    return metrics

### 1. Logistic Regression

In [None]:
# Fit logistic regression on the training split, then score its label and
# probability predictions for the test split.
print("Training Logistic Regression...")
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
lr_pred, lr_proba = lr_model.predict(X_test), lr_model.predict_proba(X_test)
lr_metrics = evaluate_model(y_test, lr_pred, lr_proba)

# One "NAME: value" line per metric, four decimal places
print("\nLogistic Regression Metrics:")
for metric, value in lr_metrics.items():
    print(f"{metric.upper()}: {value:.4f}")

### 2. Decision Tree Classifier

In [None]:
# Fit a decision tree (seeded) on the training split, then score its label
# and probability predictions for the test split.
print("Training Decision Tree...")
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_pred, dt_proba = dt_model.predict(X_test), dt_model.predict_proba(X_test)
dt_metrics = evaluate_model(y_test, dt_pred, dt_proba)

# One "NAME: value" line per metric, four decimal places
print("\nDecision Tree Metrics:")
for metric, value in dt_metrics.items():
    print(f"{metric.upper()}: {value:.4f}")

### 3. K-Nearest Neighbors Classifier

In [None]:
# Fit a 5-neighbour KNN classifier on the training split, then score its
# label and probability predictions for the test split.
print("Training K-Nearest Neighbors...")
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn_pred, knn_proba = knn_model.predict(X_test), knn_model.predict_proba(X_test)
knn_metrics = evaluate_model(y_test, knn_pred, knn_proba)

# One "NAME: value" line per metric, four decimal places
print("\nK-Nearest Neighbors Metrics:")
for metric, value in knn_metrics.items():
    print(f"{metric.upper()}: {value:.4f}")

### 4. Naive Bayes Classifier

In [None]:
# Fit Gaussian naive Bayes on the training split, then score its label and
# probability predictions for the test split.
print("Training Naive Bayes...")
nb_model = GaussianNB().fit(X_train, y_train)
nb_pred, nb_proba = nb_model.predict(X_test), nb_model.predict_proba(X_test)
nb_metrics = evaluate_model(y_test, nb_pred, nb_proba)

# One "NAME: value" line per metric, four decimal places
print("\nNaive Bayes Metrics:")
for metric, value in nb_metrics.items():
    print(f"{metric.upper()}: {value:.4f}")

### 5. Random Forest Classifier (Ensemble)

In [None]:
# Fit a 100-tree random forest (seeded) on the training split, then score
# its label and probability predictions for the test split.
print("Training Random Forest...")
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
rf_pred, rf_proba = rf_model.predict(X_test), rf_model.predict_proba(X_test)
rf_metrics = evaluate_model(y_test, rf_pred, rf_proba)

# One "NAME: value" line per metric, four decimal places
print("\nRandom Forest Metrics:")
for metric, value in rf_metrics.items():
    print(f"{metric.upper()}: {value:.4f}")

### 6. XGBoost Classifier (Ensemble)

In [None]:
# Fit an XGBoost classifier (seeded, log-loss eval metric) on the training
# split, then score its label and probability predictions for the test split.
print("Training XGBoost...")
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42).fit(X_train, y_train)
xgb_pred, xgb_proba = xgb_model.predict(X_test), xgb_model.predict_proba(X_test)
xgb_metrics = evaluate_model(y_test, xgb_pred, xgb_proba)

# One "NAME: value" line per metric, four decimal places
print("\nXGBoost Metrics:")
for metric, value in xgb_metrics.items():
    print(f"{metric.upper()}: {value:.4f}")

## Results Comparison

In [None]:
# Compile all results: model display name -> its metrics dict
results = {
    'Logistic Regression': lr_metrics,
    'Decision Tree': dt_metrics,
    'K-Nearest Neighbors': knn_metrics,
    'Naive Bayes': nb_metrics,
    'Random Forest': rf_metrics,
    'XGBoost': xgb_metrics
}

# Metric key -> column heading. Renaming by key (instead of assigning a
# positional list of headings) keeps every column correctly labelled even if
# the order of the keys inside the metrics dicts changes.
metric_labels = {
    'accuracy': 'Accuracy',
    'auc': 'AUC',
    'precision': 'Precision',
    'recall': 'Recall',
    'f1': 'F1 Score',
    'mcc': 'MCC',
}

# Create comparison dataframe: one row per model, one column per metric,
# with the columns arranged in display order.
comparison_df = pd.DataFrame(results).T.rename(columns=metric_labels)
comparison_df = comparison_df[['Accuracy', 'AUC', 'Precision', 'Recall', 'F1 Score', 'MCC']]

print("\n" + "="*80)
print("FINAL RESULTS - All Models Comparison")
print("="*80)
print(comparison_df.round(4))
print("="*80)

In [None]:
# Draw one bar chart per metric on a 2x3 grid, with one bar per model.
metrics_list = ['Accuracy', 'AUC', 'Precision', 'Recall', 'F1 Score', 'MCC']

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

# axes.flat walks the grid row by row, which pairs the k-th metric with the
# panel at row k // 3, column k % 3.
for ax, metric in zip(axes.flat, metrics_list):
    comparison_df[metric].plot.bar(ax=ax, color='skyblue', edgecolor='black')
    ax.set_title(metric, fontweight='bold')
    ax.set(ylabel='Score', xlabel='Model', ylim=(0, 1.05))
    ax.grid(axis='y', alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Save Models and Results

In [None]:
# Collect the fitted estimators under their display names
trained_models = {
    'Logistic Regression': lr_model,
    'Decision Tree': dt_model,
    'K-Nearest Neighbors': knn_model,
    'Naive Bayes': nb_model,
    'Random Forest': rf_model,
    'XGBoost': xgb_model
}

# Pickle the models, the metrics and the fitted encoders, each to its own
# file in the current working directory.
for filename, artefact in (
    ('trained_models.pkl', trained_models),
    ('metrics.pkl', results),
    ('label_encoders.pkl', label_encoders),
):
    with open(filename, 'wb') as f:
        pickle.dump(artefact, f)

# Write the test features and test labels as CSV files without the index
X_test.to_csv('test_data.csv', index=False)
y_test.to_frame(name='class').to_csv('test_labels.csv', index=False)

print("✅ All models, metrics, and data saved successfully!")

## Conclusion

All six models have been successfully trained and evaluated on the mushroom classification dataset. The results show excellent performance across most models on the held-out test set, with the ensemble methods (Random Forest and XGBoost) achieving perfect classification.