# Load libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Clone full repo from GitHub
Note: If the repo has already been cloned but you need the latest files, open a Terminal session, run `cd data-innovation-days-2025` to change to the repo directory, then run `git pull origin` to pull all files. Afterwards, close the Colab session and restart it to ensure everything refreshes.


In [None]:
if not os.path.exists('data-innovation-days-2025'):
  !git clone https://github.com/jeffjaylacs/data-innovation-days-2025.git
  print("Directory 'data-innovation-days-2025' created successfully.")
else:
  print("Directory 'data-innovation-days-2025' already exists. Skipping clone.")

# Read and preview data

In [None]:
# Load the training subset shipped with the cloned repo and preview its first ten rows.
csv_path = 'data-innovation-days-2025/data/train_subset.csv'
df = pd.read_csv(csv_path)
display(df.head(n=10))

# Exploratory Data Analysis

## Univariate analysis

In [None]:
# Univariate analysis of the 'cap-diameter' column:
# summary statistics, then a histogram and a box plot side by side.
cap_diameter = df['cap-diameter']

# Summary statistics
print("\nDescriptive Statistics:")
print(cap_diameter.describe())

# One figure, two panels: distribution on the left, box plot on the right
fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# Left panel: histogram with a kernel density estimate overlaid
sns.histplot(cap_diameter, bins=20, kde=True, ax=hist_ax)
hist_ax.set_title('cap-diameter Distribution')
hist_ax.set_xlabel('cap-diameter')
hist_ax.set_ylabel('Frequency')

# Right panel: box plot of the same values
sns.boxplot(x=cap_diameter, ax=box_ax)
box_ax.set_title('Box Plot of cap-diameter')
box_ax.set_xlabel('cap-diameter')

# Tidy the spacing between panels and render
fig.tight_layout()
plt.show()

In [None]:
# Univariate analysis of the categorical 'gill-spacing' column.

# 1. Label missing values as 'Missing' in a working copy only, so they appear as
#    their own category in the counts and the plot. `df` itself keeps its NaNs;
#    overwriting the column would make the missing-value summary computed later
#    in the notebook report zero missing values for 'gill-spacing'.
gill_spacing = df['gill-spacing'].fillna('Missing')

# 2. Frequency distribution
frequency_distribution = gill_spacing.value_counts()
print("\nFrequency Distribution:")
print(frequency_distribution)

# 3. Descriptive statistics for categorical data
descriptive_stats = gill_spacing.describe()
print("\nDescriptive Statistics:")
print(descriptive_stats)

# 4. Visualization: Bar Plot
plt.figure(figsize=(8, 5))
# seaborn >= 0.13 deprecates `palette` without `hue`; assigning the x variable to
# `hue` with legend=False is the documented replacement and colours the bars the same way.
sns.countplot(x=gill_spacing, hue=gill_spacing, palette='deep', legend=False)
plt.title('Count of gill-spacing')
plt.xlabel('gill-spacing')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Summarise missing data per column: absolute count and share of all rows (in percent).
missing_values_count = df.isna().sum()
missing_values_percent = missing_values_count / len(df) * 100

# Place both measures side by side, one row per column of df
missing_summary = pd.concat(
    [missing_values_count, missing_values_percent],
    axis=1,
    keys=['Missing Count', 'Missing Percentage'],
)

# Report
print("\nMissing Values Count and Percentage in Each Column:")
print(missing_summary)

## Weight of Evidence  
WoE provides insights into the relationship between a predictor variable (i.e. a feature) and the target variable. It is frequently used in logistic regression for understanding how each bin relates to the target, but can be helpful as part of EDA for any type of classification model.

In [None]:
# Weight of Evidence (WoE) of each 'gill-spacing' category against the target:
#   WoE = ln(share of all edible rows in the category / share of all poisonous rows in the category)
# Values above 0 lean edible, values below 0 lean poisonous.

# Total number of edible (good) and poisonous (bad) outcomes
class_totals = df['class'].value_counts()
total_edible = class_totals.get('e', 0)
total_poisonous = class_totals.get('p', 0)
if total_edible == 0 or total_poisonous == 0:
    # Happens e.g. when 'class' has already been recoded to 0/1; the ratios below
    # would otherwise silently turn into inf/NaN.
    raise ValueError(
        "WoE needs both 'e' and 'p' labels in df['class']; "
        "re-load the data if the target has already been recoded."
    )

# Count edible / poisonous rows per category. Missing gill-spacing values are
# grouped under 'Missing' here (groupby would otherwise drop them) without
# modifying df.
gill_spacing = df['gill-spacing'].fillna('Missing')
woe_df = (
    df.groupby(gill_spacing)['class']
    .value_counts()
    .unstack(fill_value=0)
    # Select the two classes by label rather than by position, so an extra or
    # absent class label cannot mislabel the columns.
    .reindex(columns=['e', 'p'], fill_value=0)
    .rename(columns={'e': 'edible', 'p': 'poisonous'})
    .reset_index()
)
woe_df.columns.name = None  # Remove the name of the column index

# Share of all edible / poisonous rows that fall in each category
woe_df['edible_proportion'] = woe_df['edible'] / total_edible
woe_df['poisonous_proportion'] = woe_df['poisonous'] / total_poisonous

# A category with no edible or no poisonous rows has no finite WoE (log of 0 or
# division by 0); report NaN for it instead of +/-inf.
has_both_classes = (woe_df['edible'] > 0) & (woe_df['poisonous'] > 0)
proportion_ratio = woe_df['edible_proportion'] / woe_df['poisonous_proportion']
woe_df['WoE'] = np.log(proportion_ratio.where(has_both_classes))

# Display the WoE DataFrame
print("\nWeight of Evidence DataFrame:")
print(woe_df[['gill-spacing', 'edible', 'poisonous', 'WoE']])

# Visualization
plt.figure(figsize=(8, 5))
# seaborn >= 0.13 deprecates `palette` without `hue`; assigning the x variable to
# `hue` with legend=False is the documented replacement.
sns.barplot(data=woe_df, x='gill-spacing', y='WoE', hue='gill-spacing', palette='pastel', legend=False)
plt.title('Weight of Evidence by gill-spacing')
plt.xlabel('gill-spacing')
plt.ylabel('Weight of Evidence (WoE)')
plt.axhline(0, color='red', linestyle='--')  # Reference line at WoE = 0
plt.show()

# Feature Engineering

In [None]:
# Engineer 'stem-area': the element-wise product of 'stem-height' and 'stem-width'.
df['stem-area'] = df['stem-height'].mul(df['stem-width'])

# Preview the first ten rows, now including the new column
print("\nUpdated DataFrame with 'stem-area':")
display(df.head(n=10))

# Model Training

## Target Definition  
Many classification model algorithms require the binary target to be represented by a 1 or 0.  We will call "poisonous" our positive class and represent it with a 1, and "edible" will be represented by a 0. Calling a poisonous mushroom the "positive" class sounds counter-intuitive, but that type of terminology is common (e.g. a patient tests positive for a disease, or a customer defaults on a loan and "hits the target").

In [None]:
# Encode the target: edible ('e') -> 0, poisonous ('p') -> 1.
# The mapping also accepts 0 and 1 so that re-running this cell is harmless;
# mapping only 'e'/'p' would turn an already-encoded column into all NaN.
class_codes = {"e": 0, "p": 1, 0: 0, 1: 1}

# Fail loudly on labels the mapping does not know, instead of silently
# converting them to NaN.
unexpected_labels = set(df["class"].dropna().unique()) - set(class_codes)
if unexpected_labels:
    raise ValueError(
        f"Unexpected values in 'class' column: {sorted(unexpected_labels, key=str)}"
    )

df["class"] = df["class"].map(class_codes)

# Show how many rows fall in each class
frequency = df["class"].value_counts()
print(frequency)


## Create train and test data sets  
The model will be built and tuned on the training data.  The test data will be set completely off to the side and only used for final model performance evaluation.

In [None]:
# Hold out 30% of the rows as the test set. Fixing random_state seeds the
# shuffle so the same split is produced on every run.
split_options = dict(test_size=0.3, random_state=42)
train_df, test_df = train_test_split(df, **split_options)

## Train the model

In [None]:
# Target from train_df
y = train_df["class"]

# Features from train_df. RandomForestClassifier only accepts numeric input, but
# the frame still holds text columns (e.g. 'gill-spacing'), so one-hot encode
# them. dummy_na=True keeps missing categorical values as their own indicator
# column instead of dropping that information.
# NOTE(review): NaNs in numeric columns are passed through unchanged; native NaN
# support in random forests needs scikit-learn >= 1.4 - impute first on older versions.
X = pd.get_dummies(train_df.drop(columns=["class"]), dummy_na=True)

# Initialize the model with default hyperparameters; the seed makes the
# cross-validation scores reproducible, matching the seeded train/test split.
rf_model = RandomForestClassifier(random_state=42)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean accuracy:", cv_scores.mean())


# Model Evaluation

In [None]:
# Model evaluation placeholder