In [10]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Fetch the wildfire table through kagglehub and load it as a pandas DataFrame.
# NOTE(review): the next cell re-reads "Wildfire_Dataset.csv" from the working
# directory with pd.read_csv instead of using this frame -- confirm that the
# file is also available locally, or reuse `df` from here.
file_path = "./Wildfire_Dataset.csv"  # presumably the file's path inside the Kaggle dataset

df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS, "firecastrl/us-wildfire-dataset", file_path
)

In [None]:
import pandas as pd
import numpy as np
import time

# Build fixed-length sequences per location, label each one, and save a random
# sample of WHOLE sequences (sampling is per sequence, never per individual row).
INPUT_FILE = "Wildfire_Dataset.csv"
OUTPUT_FILE = "wildfire_sequence_sample.csv"

SEQ_LEN = 75                  # rows per sequence
FIRE_RATIO_THRESHOLD = 0.20   # sequence label is 1 when >= this share of its rows is "Yes"
N_SEQUENCES_SAMPLE = 20000    # can be adjusted
RANDOM_STATE = 42

FEATURES = [
    "pr","rmax","rmin","sph","srad",
    "tmmn","tmmx","vs","bi",
    "fm100","fm1000","erc","etr","pet","vpd"
]

USE_COLS = ["latitude","longitude","datetime","Wildfire"] + FEATURES

df = pd.read_csv(INPUT_FILE, usecols=USE_COLS)
df = df.dropna()
df["datetime"] = pd.to_datetime(df["datetime"])

# --- SEQ_LEN-row sequences per location, built without a Python loop ---------
# Sorting once by (location, datetime) makes every location a contiguous,
# chronologically ordered run of rows, in the same location order that a
# sorted groupby would visit.
LOCATION_KEYS = ["latitude", "longitude"]
ordered = df.sort_values(LOCATION_KEYS + ["datetime"])

by_location = ordered.groupby(LOCATION_KEYS, sort=False)
row_pos = by_location.cumcount()                                # 0, 1, 2, ... within each location
rows_at_location = by_location["datetime"].transform("size")    # location size, repeated per row

# Keep only rows that fall inside a complete SEQ_LEN block; the trailing
# remainder of each location (and any location shorter than SEQ_LEN) is dropped.
in_full_block = row_pos < (rows_at_location // SEQ_LEN) * SEQ_LEN
seq_df = ordered[in_full_block].reset_index(drop=True)
del ordered, by_location, row_pos, rows_at_location, in_full_block  # release the large temporaries

if seq_df.empty:
    raise ValueError(
        f"No location has at least {SEQ_LEN} rows in {INPUT_FILE}; no sequences could be built."
    )

# Every surviving location contributes a multiple of SEQ_LEN consecutive rows,
# so the frame is a plain concatenation of SEQ_LEN-row blocks and the block
# number is simply the row number divided by SEQ_LEN.
n_sequences = len(seq_df) // SEQ_LEN
seq_df["SeqID"] = np.arange(len(seq_df)) // SEQ_LEN

# Share of "Yes" rows per sequence -> binary sequence label.
fire_rows = (seq_df["Wildfire"] == "Yes").to_numpy().reshape(n_sequences, SEQ_LEN).sum(axis=1)
labels = (fire_rows / SEQ_LEN >= FIRE_RATIO_THRESHOLD).astype(int)
seq_df["SeqLabel"] = np.repeat(labels, SEQ_LEN)

seq_label_df = pd.DataFrame({"SeqID": np.arange(n_sequences), "Label": labels})

print("Total sequences built:", len(seq_label_df))
print("Label distribution:\n", seq_label_df["Label"].value_counts(normalize=True))

#take sample
rng = np.random.default_rng(RANDOM_STATE)

available_seq_ids = seq_label_df["SeqID"].values
n_sample = min(N_SEQUENCES_SAMPLE, len(available_seq_ids))

sampled_seq_ids = rng.choice(
    available_seq_ids,
    size=n_sample,
    replace=False
)

sample_df = seq_df[seq_df["SeqID"].isin(sampled_seq_ids)].copy()
sample_df = sample_df.sort_values(["SeqID", "datetime"])
# The helper columns are not written out; a consumer of the file has to
# re-derive the blocks from (latitude, longitude, datetime).
sample_df = sample_df.drop(columns=["SeqID", "SeqLabel"])

sample_df.to_csv(OUTPUT_FILE, index=False)

print("Saved sample to:", OUTPUT_FILE)
print("Sample rows:", len(sample_df))
print("Sample sequences:", len(sample_df) // SEQ_LEN)


KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    balanced_accuracy_score,
    roc_auc_score
)

# Modelling works from the pre-sampled file to keep training and testing fast.
SAMPLE_FILE = "wildfire_sequence_sample.csv"

SEQ_LEN = 75         # rows per sequence block
THRESHOLD = 0.20     # share of "Yes" rows at or above which a block is labelled 1
RANDOM_STATE = 42

# Columns summarised for every sequence block.
FEATURES = [
    "pr", "rmax", "rmin",
    "sph", "srad", "tmmn",
    "tmmx", "vs", "bi",
    "fm100", "fm1000", "erc",
    "etr", "pet", "vpd",
]

In [4]:
# Read the sampled rows, discard any row with a missing value, and parse the
# timestamp column so rows can be ordered chronologically.
df = pd.read_csv(SAMPLE_FILE)
df = df.dropna().copy()
df["datetime"] = pd.to_datetime(df["datetime"])

# Accumulators: one summary vector and one label are appended per sequence.
seq_feature_rows, seq_labels = [], []

def summarize_block(block: pd.DataFrame, features=None) -> np.ndarray:
    """Collapse one sequence block into a flat vector of summary statistics.

    For every feature column six statistics are computed over the rows of
    ``block``: mean, standard deviation (population, ddof=0), minimum,
    maximum, last value, and the least-squares slope against the row
    position 0..n-1 (trend).

    Parameters
    ----------
    block : pd.DataFrame
        Rows of one sequence, already in the desired (chronological) order.
    features : sequence of str, optional
        Columns to summarise. Defaults to the module-level ``FEATURES``.

    Returns
    -------
    np.ndarray
        1-D array of length ``6 * len(features)`` laid out as
        ``[means..., stds..., mins..., maxs..., lasts..., slopes...]``.

    Raises
    ------
    ValueError
        If ``block`` has no rows.
    """
    cols = FEATURES if features is None else list(features)
    X = block[cols].to_numpy(dtype=float)

    n_rows = X.shape[0]
    if n_rows == 0:
        raise ValueError("summarize_block() needs at least one row")

    mean = X.mean(axis=0)
    std  = X.std(axis=0)
    mn   = X.min(axis=0)
    mx   = X.max(axis=0)
    last = X[-1, :]

    # OLS slope of each feature against row position: cov(t, x) / var(t).
    t = np.arange(n_rows)
    var_t = t.var()
    if var_t == 0:
        # A single row has no trend; avoid the 0/0 division that would yield NaN.
        slope = np.zeros(X.shape[1])
    else:
        slope = ((t - t.mean())[:, None] * (X - mean)).mean(axis=0) / var_t

    return np.concatenate([mean, std, mn, mx, last, slope])

# Walk every location, cut its chronologically ordered rows into consecutive
# SEQ_LEN-row windows, and record one summary vector plus one label per window.
seq_id = 0
for (lat, lon), g in df.groupby(["latitude", "longitude"]):
    ordered = g.sort_values("datetime")
    n_full = len(ordered) // SEQ_LEN

    # Rows beyond the last complete window are ignored. A location with fewer
    # than SEQ_LEN rows produces an empty range and therefore contributes nothing.
    for start in range(0, n_full * SEQ_LEN, SEQ_LEN):
        block = ordered.iloc[start:start + SEQ_LEN]

        # Label is 1 when the share of "Yes" rows reaches THRESHOLD.
        fire_ratio = (block["Wildfire"] == "Yes").mean()

        seq_feature_rows.append(summarize_block(block))
        seq_labels.append(1 if fire_ratio >= THRESHOLD else 0)
        seq_id += 1

# One row per sequence in X, matching entry in y.
X = np.vstack(seq_feature_rows)
y = np.asarray(seq_labels)

print("Sequences built:", len(y))
print("Summary feature matrix shape:", X.shape)
print("Label distribution:\n", pd.Series(y).value_counts())

Sequences built: 20000
Summary feature matrix shape: (20000, 90)
Label distribution:
 0    14878
1     5122
Name: count, dtype: int64


In [5]:
# Hold out 20% of the sequences; stratify=y keeps the class proportions the
# same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y
)

# Scaling statistics come from the training part only and are then reused
# unchanged on the test part.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

print("train:", X_train_s.shape, "Test:", X_test_s.shape)
print("train labels:\n", pd.Series(y_train).value_counts())
print("test labels:\n", pd.Series(y_test).value_counts())

train: (16000, 90) Test: (4000, 90)
train labels:
 0    11902
1     4098
Name: count, dtype: int64
test labels:
 0    2976
1    1024
Name: count, dtype: int64


In [None]:
# RBF-kernel SVM on the standardised summary features. class_weight="balanced"
# is used because the two classes are not equally frequent.
svm = SVC(
    kernel="rbf",
    C=1.0, #maybe use bigger C
    gamma="scale",
    class_weight="balanced"
)

# Time the fit so the training cost is visible next to the scores.
t0 = time.time()
svm.fit(X_train_s, y_train)
fit_seconds = time.time() - t0
print("fit time (s):", round(fit_seconds, 1))

y_pred = svm.predict(X_test_s)

print("---classification report---")
print(classification_report(y_test, y_pred, digits=3))

print("---confusion matrix---")
print(confusion_matrix(y_test, y_pred))

acc = accuracy_score(y_test, y_pred)
bal_acc = balanced_accuracy_score(y_test, y_pred)
# ROC-AUC is computed from the continuous decision values, not the hard predictions.
scores = svm.decision_function(X_test_s)
auc = roc_auc_score(y_test, scores)

print("\naccuracy:", round(acc, 3))
print("balanced accuracy:", round(bal_acc, 3))
print("ROC-AUC:", round(auc, 3))

---classification report---
              precision    recall  f1-score   support

           0      0.811     0.605     0.693      2976
           1      0.339     0.589     0.430      1024

    accuracy                          0.601      4000
   macro avg      0.575     0.597     0.562      4000
weighted avg      0.690     0.601     0.626      4000

---confusion matrix---
[[1801 1175]
 [ 421  603]]

Accuracy: 0.601
Balanced accuracy: 0.597
ROC-AUC: 0.653
