Imports

In [13]:
import os
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

Load Stacked Dataset

In [14]:
data_path = "../data/processed/stacked_dataset.parquet"
df = pd.read_parquet(data_path)

print("Loaded dataset:", df.shape)

Loaded dataset: (35954029, 12)


Confusion Matrix Plot Function

In [15]:
def plot_confusion(cm, classes, title, save_path):
    plt.figure(figsize=(6,5))
    plt.imshow(cm, interpolation="nearest")
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], "d"),
                     ha="center", va="center")

    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.tight_layout()

    plt.savefig(save_path)
    plt.close()

Train/Val/Test Split

In [None]:
# Features and target
X = df.drop(columns=["severity"])
y = df["severity"]

# Train/Temp Split (stratified)
train_idx, temp_idx = train_test_split(
    np.arange(len(X)),
    test_size=0.3,          
    stratify=y,
    random_state=42
)

X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_temp,  y_temp  = X.iloc[temp_idx],  y.iloc[temp_idx]

# Temp â†’ Validation / Test Split (stratified)
val_idx, test_idx = train_test_split(
    np.arange(len(X_temp)),
    test_size=0.5, 
    stratify=y_temp,
    random_state=42
)

X_val,  y_val  = X_temp.iloc[val_idx],  y_temp.iloc[val_idx]
X_test, y_test = X_temp.iloc[test_idx], y_temp.iloc[test_idx]

print("Train:", len(X_train))
print("Val:", len(X_val))
print("Test:", len(X_test))

Train: 25167820
Val: 5393104
Test: 5393105


Model Definitions

In [17]:
models = {
    "logistic_regression": LogisticRegression(
        max_iter=2000,
        class_weight="balanced"
    ),
    "random_forest": RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        n_jobs=-1,
        class_weight="balanced"
    ),
    "xgboost": XGBClassifier(
        n_estimators=400,
        max_depth=10,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="mlogloss"
    ),
    "mlp": MLPClassifier(
        hidden_layer_sizes=(128, 64),
        activation="relu",
        max_iter=50
    )
}

os.makedirs("../data/models", exist_ok=True)
os.makedirs("../figures", exist_ok=True)

Train and Evaluate Models

In [18]:
results = []

for name, model in models.items():
    print(f"Training {name}")

    # Train
    model.fit(X_train, y_train)

    # Validation prediction
    val_pred = model.predict(X_val)
    test_pred = model.predict(X_test)

    # Validation metrics
    val_f1 = f1_score(y_val, val_pred, average="weighted")
    test_f1 = f1_score(y_test, test_pred, average="weighted")

    print(f"{name} validation F1:", val_f1)
    print(f"{name} test F1:", test_f1)

    # Save model
    model_path = f"../data/models/{name}.pkl"
    joblib.dump(model, model_path)
    print(f"Saved model: {model_path}")

    # Confusion matrix
    cm_val = confusion_matrix(y_val, val_pred)
    cm_test = confusion_matrix(y_test, test_pred)

    plot_confusion(
        cm_val,
        classes=np.unique(y_train),
        title=f"{name} - Validation Confusion Matrix",
        save_path=f"../figures/{name}_val_cm.png"
    )

    plot_confusion(
        cm_test,
        classes=np.unique(y_train),
        title=f"{name} - Test Confusion Matrix",
        save_path=f"../figures/{name}_test_cm.png"
    )

    results.append({
        "model": name,
        "val_f1": val_f1,
        "test_f1": test_f1
    })

    # Print classification report
    print("\nClassification Report (Validation):")
    print(classification_report(y_val, val_pred))

    print("\nClassification Report (Test):")
    print(classification_report(y_test, test_pred))


Training logistic_regression


ValueError: could not convert string to float: 'east_troublesome_fire'