# Data Processing

In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, precision_score, recall_score, f1_score
)

# Read the cleaned fire-occurrence table and print its column summary.
DATA_PATH = "FPA_FOD_west_cleaned.csv"
df = pd.read_csv(DATA_PATH, low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 519689 entries, 0 to 519688
Data columns (total 40 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   DISCOVERY_DOY         519689 non-null  int64  
 1   FIRE_YEAR             519689 non-null  int64  
 2   STATE                 519689 non-null  object 
 3   FIPS_CODE             519689 non-null  float64
 4   NWCG_GENERAL_CAUSE    519689 non-null  object 
 5   Annual_etr            519689 non-null  int64  
 6   Annual_precipitation  519689 non-null  int64  
 7   Annual_tempreture     519689 non-null  float64
 8   pr                    519689 non-null  float64
 9   tmmn                  519689 non-null  float64
 10  vs                    519689 non-null  float64
 11  fm100                 519689 non-null  float64
 12  fm1000                519689 non-null  float64
 13  bi                    519689 non-null  float64
 14  vpd                   519689 non-null  float64
 15  

In [68]:
# Keep only the fires whose cause is known; rows carrying the
# "Missing data/not specified/undetermined" placeholder are excluded.
mask_known = df["NWCG_GENERAL_CAUSE"] != "Missing data/not specified/undetermined"
df_known = df[mask_known].copy()

X = df_known.drop(columns=["NWCG_GENERAL_CAUSE"])
y_text = df_known["NWCG_GENERAL_CAUSE"]

# Encode the target (cause names) as integer class ids; `le` is kept so the
# ids can be turned back into names for reporting.
le = LabelEncoder()
y = le.fit_transform(y_text)

# Columns fed to the model. Columns of X that appear in neither list are not
# passed on (ColumnTransformer's default is remainder="drop").
numeric_features = [
    'pr', 'tmmn', 'Annual_tempreture', 'Annual_precipitation',
    'vpd', 'bi', 'erc', 'fm100', 'fm1000',
    'Elevation_1km', 'Slope_1km', 'Aspect_1km', 'TPI_1km',
    'SDI', 'No_FireStation_5.0km', 'GACC_PL', 'GAP_Sts',
    'GDP', 'GHM', 'NDVI-1day', 'NPL', 'Popo_1km',
    'RPL_THEMES', 'RPL_THEME1', 'RPL_THEME2',
    'RPL_THEME3', 'RPL_THEME4',
    'erc_Percentile', 'EVC', 'FRG',
    'DISCOVERY_DOY', 'FIRE_YEAR', 'Distance2road'
]
categorical_features = ['STATE', 'Mang_Name']

# K-Means assigns points by Euclidean distance, so numeric columns must share
# a common scale; otherwise the columns with the largest raw magnitudes decide
# the clustering. Impute first (median), then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        # handle_unknown="ignore": categories unseen during fit encode as all zeros.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

In [69]:
# Train–test split: hold out 20% of the rows, stratified on the encoded cause
# so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# K-Means Clustering

In [70]:
# Use one cluster per distinct cause id present in the training labels.
n_classes = np.unique(y_train).size
n_clusters = n_classes

# K-Means with k-means++ seeding, 20 restarts and a fixed seed.
kmeans = KMeans(
    n_clusters=n_clusters,
    init="k-means++",
    n_init=20,
    max_iter=300,
    random_state=42,
)

# Chain feature preprocessing and clustering so both are fitted together.
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("kmeans", kmeans),
    ]
)

In [None]:
# Fit the preprocessing + k-means pipeline on X_train only. y_train is not
# passed, so the clustering itself never sees the cause labels.
model.fit(X_train)

In [None]:
# Assign a cluster id to every training and test row with the fitted pipeline.
train_clusters = model.predict(X_train)
test_clusters = model.predict(X_test)

# Show which cluster ids actually received training rows.
print("Distinct clusters in train:", np.unique(train_clusters))

# Learn the cluster -> cause mapping by majority vote
def learn_cluster_mapping_numeric(y_true_train, clusters_train):
    """Map each cluster id to the most frequent true label among its members.

    Parameters
    ----------
    y_true_train : array of non-negative integer labels (required by
        ``np.bincount``), aligned element-wise with ``clusters_train``.
    clusters_train : array of cluster ids, one per training sample.

    Returns
    -------
    dict
        ``{cluster_id: majority_label}`` for every id present in
        ``clusters_train``. On a tie, the smallest label wins because
        ``argmax`` returns the first maximum.
    """
    return {
        cluster_id: np.bincount(y_true_train[clusters_train == cluster_id]).argmax()
        for cluster_id in np.unique(clusters_train)
    }

# Build the cluster -> encoded-cause lookup from the training split and print it.
cluster_to_label = learn_cluster_mapping_numeric(y_train, train_clusters)
print("Cluster -> label mapping:", cluster_to_label)

def apply_mapping(clusters, mapping, default=None):
    """Translate cluster ids into labels using a learned lookup.

    Parameters
    ----------
    clusters : iterable of cluster ids.
    mapping : dict mapping cluster id -> label.
    default : optional fallback label for cluster ids missing from
        ``mapping``. When left as ``None``, a missing id raises ``KeyError``.

    Returns
    -------
    numpy.ndarray
        One label per element of ``clusters``, in the same order.

    Raises
    ------
    KeyError
        If a cluster id is not in ``mapping`` and no ``default`` was given.
    """
    labels = []
    for c in clusters:
        if c in mapping:
            labels.append(mapping[c])
        elif default is not None:
            # Cluster never seen when the mapping was learned: use the fallback.
            labels.append(default)
        else:
            raise KeyError(
                f"cluster id {c!r} has no learned label; "
                "pass `default` to assign a fallback label"
            )
    return np.array(labels)

In [None]:
# Turn cluster ids into pseudo-class predictions via the majority-vote mapping.
y_train_pred = apply_mapping(train_clusters, cluster_to_label)
y_test_pred  = apply_mapping(test_clusters, cluster_to_label)

# Confusion matrix over the encoder's full label set. Passing `labels`
# explicitly fixes the matrix shape and row/column order, so it always lines up
# with the tick labels even if some class is absent from both y_test and the
# predictions.
class_ids = np.arange(len(le.classes_))
cm = confusion_matrix(y_test, y_test_pred, labels=class_ids)

fig, ax = plt.subplots(figsize=(9, 9), dpi=150)

# le.classes_[i] is the cause name that the encoder mapped to id i.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=le.classes_,
)
disp.plot(cmap="Blues", ax=ax, colorbar=True, values_format=".0f")
plt.xticks(rotation=90, fontsize=8)
plt.yticks(fontsize=8)
plt.tight_layout()
plt.show()

In [None]:
# Score how well the majority-vote cluster labels agree with the true causes
# on the test split. Precision, recall and F1 are macro-averaged, and
# zero_division=0 makes an undefined score count as 0.
macro_opts = {"average": "macro", "zero_division": 0}

acc = accuracy_score(y_test, y_test_pred)
prec = precision_score(y_test, y_test_pred, **macro_opts)
rec = recall_score(y_test, y_test_pred, **macro_opts)
f1 = f1_score(y_test, y_test_pred, **macro_opts)

report_lines = [
    f"Alignment accuracy: {acc:.4f}",
    f"Precision (macro): {prec:.4f}",
    f"Recall (macro):    {rec:.4f}",
    f"F1 (macro):        {f1:.4f}",
]
print("\n".join(report_lines))