In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

# Clean Data

In [None]:
def encode_data(df, drop_features=None, keep_features=None):
    '''
    One-hot encode the feature columns of ``df`` and binarise its target.

    - every selected feature is dummy-encoded with ``drop_first=True``
      and cast to int
    - the ``class`` column is mapped 'e' -> 0 and 'p' -> 1
    - ``drop_features`` / ``keep_features`` restrict the feature set, to
      explore the predictive power of limited feature sets without editing
      this function

    Feature sets tried so far (pass them as arguments):
    - drop the most predictive features:
        drop_features=['odor', 'gill-size', 'bruises', 'ring-type',
                       'stalk-surface-above-ring', 'spore-print-color',
                       'population', 'stalk-surface-below-ring']
    - drop just odor:
        drop_features=['odor']
    - use only the least predictive features:
        keep_features=['cap-shape', 'veil-color', 'cap-surface',
                       'stalk-color-above-ring', 'gill-color', 'cap-color']
    - use only odor:
        keep_features=['odor']

    Parameters
    ----------
    df : DataFrame containing a 'class' column plus the feature columns.
    drop_features : str or iterable of str, optional
        Feature columns to exclude (in addition to 'class').
    keep_features : str or iterable of str, optional
        If given, only these feature columns are encoded.

    Returns
    -------
    (encoded_df, encoded_X, encoded_y)
        encoded_df is the encoded features with the numeric 'class' column
        appended; encoded_X is the features alone; encoded_y is the target.

    Raises
    ------
    ValueError
        If both ``drop_features`` and ``keep_features`` are supplied.
    '''
    if drop_features is not None and keep_features is not None:
        raise ValueError("pass either drop_features or keep_features, not both")

    def _as_list(names):
        # a bare string would otherwise be iterated character by character
        return [names] if isinstance(names, str) else list(names)

    if keep_features is not None:
        # never let the target leak into the feature matrix
        X = df[[col for col in _as_list(keep_features) if col != 'class']]
    else:
        extra = [] if drop_features is None else _as_list(drop_features)
        X = df.drop(['class'] + [col for col in extra if col != 'class'], axis=1)
    y = df['class'].map({'e':0,'p':1})

    encoded_X = pd.get_dummies(X, drop_first=True).astype(int)

    # Add target back (assignment aligns on the shared row index)
    encoded_df = encoded_X.copy()
    encoded_df['class'] = y
    return encoded_df, encoded_df.drop('class', axis=1), encoded_df['class']

def match_columns(train_X, other_X):
    '''
    Align a validation/test feature frame to the training feature columns.

    - columns the training data has but ``other_X`` lacks are added, filled with 0
    - columns ``other_X`` has but the training data lacks are dropped, because a
      model fitted on ``train_X`` cannot be given features it never saw
    - the result uses the column order of ``train_X``

    ``other_X`` itself is left untouched; a new DataFrame is returned.
    '''
    # one reindex replaces the column-by-column insertion and also removes extras
    return other_X.reindex(columns=train_X.columns, fill_value=0)

In [None]:
# load the raw mushroom table from the working directory
data = pd.read_csv(filepath_or_buffer="mushrooms.csv")

In [None]:
# split the data: 70% train, then the held-out 30% is halved into validation and test
train_df, temp_df = train_test_split(data, test_size=0.3, stratify=data['class'], random_state = 9292, shuffle=True)
val_df, test_df = train_test_split(temp_df, test_size=.5, stratify=temp_df['class'], random_state = 1, shuffle=True)

# Pin every non-numeric feature's category vocabulary to what the training split
# contains. Each split is dummy-encoded separately with drop_first=True, so without
# a shared vocabulary a split that happens to lack a feature's first category would
# drop a different baseline column, and the zero-fill in match_columns would then
# silently mis-encode those rows. Values unseen in training become NaN and encode
# as all-zero dummies for that feature.
feature_dtypes = {
    col: pd.CategoricalDtype(sorted(train_df[col].dropna().unique()))
    for col in train_df.columns
    if col != 'class' and not pd.api.types.is_numeric_dtype(train_df[col])
}

# encode each dataset (train_df / val_df / test_df themselves keep their original dtypes)
encoded_train, train_X, train_y = encode_data(train_df.astype(feature_dtypes))
encoded_val, val_X, val_y = encode_data(val_df.astype(feature_dtypes))
encoded_test, test_X, test_y = encode_data(test_df.astype(feature_dtypes))

# match the training columns in val and test data
val_X = match_columns(train_X, val_X)
test_X = match_columns(train_X, test_X)

# sort all columns the same way
val_X = val_X.sort_index(axis=1)
test_X = test_X.sort_index(axis=1)
train_X = train_X.sort_index(axis=1)

In [None]:
# summary statistics of the training split (default column selection)
train_df.describe(include=None)

In [None]:
# number of missing values in each column of the training split
train_df.isnull().sum()

In [None]:
# visualize how each feature contributes to poisonous/edible
# batch size is used for readability (puts batch_size variables on the same chart)

batch_size = 6
# Get list of column names excluding 'class'
feature_cols = [col for col in encoded_train.columns if col != 'class']
n_batches = (len(feature_cols) + batch_size - 1) // batch_size

# Per-class totals of every dummy column, computed once up front instead of
# re-filtering the whole frame twice for each column inside the loop
poison_totals = encoded_train.loc[encoded_train['class'] == 1, feature_cols].sum()
edible_totals = encoded_train.loc[encoded_train['class'] == 0, feature_cols].sum()

for i in range(n_batches):
    # Slice the list of column names instead of the DataFrame
    batch_cols = feature_cols[i*batch_size : (i+1)*batch_size]

    count_df = pd.DataFrame({
        'feature': batch_cols,
        'poison': poison_totals[batch_cols].to_numpy(),
        'edible': edible_totals[batch_cols].to_numpy(),
    })

    # Draw on an explicit axes: DataFrame.plot opens its own figure when no ax is
    # given, which left the sized figure empty and the chart at the default size
    fig, ax = plt.subplots(figsize=(12, 6))
    count_df.plot(x='feature', y=['poison', 'edible'], kind='bar', ax=ax)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_title(f'Feature Counts by Class (Batch {i+1} of {n_batches})')
    fig.tight_layout()
    plt.show()
    # release the figure so a long feature list does not accumulate open figures
    plt.close(fig)

# Create and Train Model

In [None]:
# build the random forest: 100 trees, no depth cap, fixed seed
rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=103)

# fit it on the training split
rf_model.fit(X=train_X, y=train_y)

# Validate Model

In [None]:
# predict on the validation split and report per-class metrics
y_pred = rf_model.predict(X=val_X)
print(classification_report(y_true=val_y, y_pred=y_pred))

In [None]:
# pair every training column with the forest's importance score;
# flip `ascending` below to list the least important features instead
feature_importance = pd.DataFrame(dict(
    feature=train_X.columns,
    importance=rf_model.feature_importances_,
))
print("\nTop 10 most important features:")
print(feature_importance.sort_values(by='importance', ascending=False).iloc[:10])

# Test Model

In [None]:
# final check: predict on the held-out test split and report per-class metrics
test_pred = rf_model.predict(X=test_X)
print(classification_report(y_true=test_y, y_pred=test_pred))