# Wildfire Baseline Modeling — Feature Scaling, Training & Evaluation

In [14]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# 1. LOAD CLEAN DATA

In [15]:
INPUT_PATH = '../data/outputs/combined_dataset.parquet'
df = pd.read_parquet(INPUT_PATH)
print("Loaded:", INPUT_PATH)
print(df.dtypes)  # inspect column types
print(df[target_col].value_counts())  # make sure label is balanced
print("Checking unique classes:", np.unique(y))
print("Checking numeric cols:", numeric_cols)

print("Columns:", df.columns.tolist())

# Determine target column (fallback to 'severity' if 'label' not present)
target_col = "label" if "label" in df.columns else ("severity" if "severity" in df.columns else None)
if target_col is None:
    raise SystemExit("No target column found. Expected 'label' or 'severity' in the dataset.")

print("Using target column:", target_col)

Loaded: ../data/outputs/combined_dataset.parquet
y                          float64
x                          float64
band                         int64
Caldor_dNBR                float32
Caldor_SPI                 float64
Caldor_VCI                 float32
Caldor_veg_indices         float64
spatial_ref                  int64
fire_name                   object
severity                    object
Camp_dNBR                  float32
Camp_SPI                   float64
Camp_VCI                   float32
Camp_veg_indices           float64
Dixie_dNBR                 float32
Dixie_SPI                  float64
Dixie_VCI                  float32
Dixie_veg_indices          float64
Troublesome_dNBR           float32
Troublesome_SPI            float64
Troublesome_VCI            float32
Troublesome_veg_indices    float64
dtype: object
severity
Unburned    19615810
Low          1067812
Moderate      248388
High          200712
Name: count, dtype: int64
Checking unique classes: []
Checking numeric col

# 2. FEATURE / LABEL SPLIT


In [16]:
if df[target_col].dtype == object or not np.issubdtype(df[target_col].dtype, np.number):
    le = LabelEncoder()
    df[target_col] = le.fit_transform(df[target_col].astype(str))
    label_map = {i: c for i, c in enumerate(le.classes_)}
    print("Label mapping:", label_map)
else:
    label_map = None

# Select numeric feature columns only, exclude coordinates / identifiers and the target
exclude = {target_col, "fire_name", "x", "y", "lon", "lat"}
numeric_cols = [c for c in df.columns if c not in exclude and np.issubdtype(df[c].dtype, np.number)]
if not numeric_cols:
    raise SystemExit("No numeric feature columns detected. Check dataframe columns.")

print("Using numeric features (n={}): {}".format(len(numeric_cols), numeric_cols[:10]))

df = df[numeric_cols + [target_col]].dropna()
X = df[numeric_cols].values
y = df[target_col].values


Label mapping: {0: 'High', 1: 'Low', 2: 'Moderate', 3: 'Unburned'}
Using numeric features (n=18): ['band', 'Caldor_dNBR', 'Caldor_SPI', 'Caldor_VCI', 'Caldor_veg_indices', 'spatial_ref', 'Camp_dNBR', 'Camp_SPI', 'Camp_VCI', 'Camp_veg_indices']


# 3. TRAIN / TEST SPLIT

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

# 4. FEATURE NORMALIZATION

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# 5. BASELINE MODEL — Logistic Regression

In [None]:
print("\n=== Logistic Regression ===")
log_reg = LogisticRegression(max_iter=2000, class_weight='balanced', multi_class='multinomial', solver='saga')
log_reg.fit(X_train_scaled, y_train)
y_pred_lr = log_reg.predict(X_test_scaled)
print(classification_report(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))


# 6. BASELINE MODEL — Random Forest

In [None]:
print("\n=== Random Forest ===")
rf = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


# 7. FEATURE IMPORTANCE (RF ONLY)

In [None]:
importances = rf.feature_importances_
idx = np.argsort(importances)[::-1]
top_n = min(10, len(numeric_cols))
plt.figure(figsize=(10, 4))
plt.bar([numeric_cols[i] for i in idx[:top_n]], importances[idx[:top_n]])
plt.xticks(rotation=45, ha='right')
plt.title("Top {} Feature Importances (Random Forest)".format(top_n))
plt.tight_layout()
plt.show()