In [13]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)


from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


In [14]:
import kagglehub

# Download latest version
data_path = kagglehub.dataset_download("firecastrl/us-wildfire-dataset")

print("Path to dataset files:", data_path)

print(os.listdir(data_path))

file_path = os.path.join(data_path, "Wildfire_Dataset.csv")
wild_fires = pd.read_csv(file_path)
wild_fires.head()

Using Colab cache for faster access to the 'us-wildfire-dataset' dataset.
Path to dataset files: /kaggle/input/us-wildfire-dataset
['Wildfire_Dataset.csv']


Unnamed: 0,latitude,longitude,datetime,Wildfire,pr,rmax,rmin,sph,srad,tmmn,tmmx,vs,bi,fm100,fm1000,erc,etr,pet,vpd
0,48.128431,-97.276685,2018-08-15,No,0.0,78.6,14.9,0.00582,272.6,282.0,301.6,3.0,40.0,10.2,12.2,54.0,7.5,5.5,1.59
1,48.128431,-97.276685,2018-08-16,No,0.0,80.4,13.9,0.00676,264.0,283.9,304.9,3.0,40.0,9.7,12.0,56.0,8.2,5.9,1.93
2,48.128431,-97.276685,2018-08-17,No,0.0,70.9,20.4,0.00672,265.6,285.8,300.7,3.1,40.0,9.2,11.9,56.0,7.2,5.3,1.51
3,48.128431,-97.276685,2018-08-18,No,5.0,65.2,19.4,0.00756,261.4,289.0,303.3,5.1,0.0,9.9,12.0,40.0,10.0,6.9,1.85
4,48.128431,-97.276685,2018-08-19,No,0.0,100.0,42.2,0.00895,166.8,283.8,296.5,4.7,41.0,11.8,12.1,47.0,4.6,3.4,0.66


In [15]:
wild_fires.columns

Index(['latitude', 'longitude', 'datetime', 'Wildfire', 'pr', 'rmax', 'rmin',
       'sph', 'srad', 'tmmn', 'tmmx', 'vs', 'bi', 'fm100', 'fm1000', 'erc',
       'etr', 'pet', 'vpd'],
      dtype='object')

In [16]:
wild_fires['Wildfire'] = wild_fires['Wildfire'].map({'Yes': 1, 'No': 0})
wild_fires['Wildfire'].value_counts()

Unnamed: 0_level_0,count
Wildfire,Unnamed: 1_level_1
0,9007860
1,502065


Colab crashes due to there not being enough RAM, therefore I would use stratified sampling to get a sample of the dataset.

In [17]:
# --------------------------
# 1. Sample Data (5% Stratified)
# --------------------------
sampled_df = wild_fires.groupby('Wildfire', group_keys=False).apply(
    lambda x: x.sample(frac=0.05, random_state=42)
).reset_index(drop=True)

print("Sampled dataset shape:", sampled_df.shape)
print(sampled_df['Wildfire'].value_counts())

# --------------------------
# 2. Create a copy for transformation (so original stays untouched)
# --------------------------
df_copy = sampled_df.copy()  # <--- Work on a copy

# --------------------------
# 3. Define y (target)
# --------------------------
y = df_copy['Wildfire']   # safe, no change to original

# --------------------------
# 4. Feature Selection
# --------------------------
features = ['latitude', 'longitude', 'datetime', 'rmax', 'tmmx', 'vs', 'fm100']
X_df = df_copy[features].copy()

# Rename features (optional, cosmetic)
X_df = X_df.rename(columns={
    'latitude': 'lat',
    'longitude': 'lon',
    'datetime': 'date',
    'rmax': 'max_humidity',
    'tmmx': 'max_temp',
    'vs': 'vapor_pressure',
    'fm100': 'fuel_moisture_100h'
})

# Convert date and extract components
X_df['date'] = pd.to_datetime(X_df['date'])
X_df['year'] = X_df['date'].dt.year
X_df['month'] = X_df['date'].dt.month
X_df['day'] = X_df['date'].dt.day
X_df.drop(columns=['date'], inplace=True)

# --------------------------
# 5. Train-Test Split (only rows where y is not NaN)
# --------------------------
valid_idx = y.dropna().index

x_train, x_test, y_train, y_test = train_test_split(
    X_df.loc[valid_idx], y.loc[valid_idx],
    test_size=0.2, random_state=42, stratify=y.loc[valid_idx]
)


  sampled_df = wild_fires.groupby('Wildfire', group_keys=False).apply(


Sampled dataset shape: (475496, 19)
Wildfire
0    450393
1     25103
Name: count, dtype: int64


In [18]:
# Check cleaned feature DataFrame
X_df.head()

Unnamed: 0,lat,lon,max_humidity,max_temp,vapor_pressure,fuel_moisture_100h,year,month,day
0,34.420939,-117.172528,78.6,287.1,6.3,12.7,2019,5,22
1,33.86928,-118.21317,92.3,298.9,1.5,14.2,2023,11,14
2,48.916944,-112.795,90.2,296.9,2.2,14.7,2018,5,22
3,37.923969,-120.628081,48.9,310.1,3.3,5.9,2018,7,31
4,44.101254,-76.219031,88.6,276.0,2.3,18.9,2020,2,2


In [19]:

y.head()

Unnamed: 0,Wildfire
0,0
1,0
2,0
3,0
4,0


In [20]:
X_df.columns

Index(['lat', 'lon', 'max_humidity', 'max_temp', 'vapor_pressure',
       'fuel_moisture_100h', 'year', 'month', 'day'],
      dtype='object')

In [21]:
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 475496 entries, 0 to 475495
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   lat                 475496 non-null  float64
 1   lon                 475496 non-null  float64
 2   max_humidity        475496 non-null  float64
 3   max_temp            475496 non-null  float64
 4   vapor_pressure      475496 non-null  float64
 5   fuel_moisture_100h  475496 non-null  float64
 6   year                475496 non-null  int32  
 7   month               475496 non-null  int32  
 8   day                 475496 non-null  int32  
dtypes: float64(6), int32(3)
memory usage: 27.2 MB


In [22]:
# --------------------------
# 2. Preprocessing & Pipeline
# --------------------------
num_vals = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

preprocess = ColumnTransformer([
    ("num_process", num_vals, X_df.columns)
])

pipeline = Pipeline([
    ("preprocess", preprocess),
    ("model", RandomForestClassifier(random_state=42, class_weight='balanced'))
])

# --------------------------
# 3. Baseline
# --------------------------
pipeline.fit(x_train, y_train)
print("Baseline Accuracy:", pipeline.score(x_test, y_test))

# --------------------------
# 4. Light Grid Search
# --------------------------
search_space_rf = [{
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 20],
    'model__min_samples_split': [2],
    'model__min_samples_leaf': [1],
    'model__max_features': ['sqrt']
}]


gs = GridSearchCV(pipeline, search_space_rf, cv=3, verbose=1, n_jobs=-1)
gs.fit(x_train, y_train)

best_model = gs.best_estimator_
print("Best Model:", best_model)
print("Best Model Accuracy:", best_model.score(x_test, y_test))

# --------------------------
# 5. Save Model
# --------------------------
joblib.dump(best_model, "wildfire_rf_best_model.pkl")
print("Model saved as wildfire_rf_best_model.pkl")

from google.colab import files
files.download("wildfire_rf_best_model.pkl")

# Save to Google Drive (optional)
joblib.dump(best_model, '/content/drive/MyDrive/best_wildfire_model.pkl')



Baseline Accuracy: 0.9468664563617245
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best Model: Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num_process',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['lat', 'lon', 'max_humidity', 'max_temp', 'vapor_pressure',
       'fuel_moisture_100h', 'year', 'month', 'day'],
      dtype='object'))])),
                ('model',
                 RandomForestClassifier(class_weight='balanced',
                                        random_state=42))])
Best Model Accuracy: 0.9468664563617245
Model saved as wildfire_rf_best_model.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/best_wildfire_model.pkl'

In [23]:
# --------------------------
# 6. Evaluation
# --------------------------
y_pred = best_model.predict(x_test)
y_proba = best_model.predict_proba(x_test)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     90079
           1       0.43      0.02      0.04      5021

    accuracy                           0.95     95100
   macro avg       0.69      0.51      0.50     95100
weighted avg       0.92      0.95      0.92     95100

Precision: 0.42592592592592593
Recall: 0.018323043218482375
ROC-AUC: 0.771153170361366


## Version with a Different sample 50/50 split of fire vs no fire

In [28]:
# --------------------------
# 0. Make a copy for sampling & transformation
# --------------------------
df_copy = wild_fires.copy()

# Keep only rows where 'Wildfire' is not NaN
df_copy = df_copy[df_copy['Wildfire'].notna()]

# --------------------------
# 1. Separate majority and minority classes
# --------------------------
df_fire = df_copy[df_copy['Wildfire'] == 1]
df_no_fire = df_copy[df_copy['Wildfire'] == 0]

# --------------------------
# 2. Balance classes 50/50
# --------------------------
n_minority = len(df_fire)
df_no_fire_sample = df_no_fire.sample(n=n_minority, random_state=42)

balanced_df = pd.concat([df_fire, df_no_fire_sample]).sample(frac=1, random_state=42)
print("Balanced dataset shape:", balanced_df.shape)
print(balanced_df['Wildfire'].value_counts())

# --------------------------
# 3. Sample 5% from the balanced dataset
# --------------------------
sampled_df = balanced_df.groupby('Wildfire', group_keys=False).apply(
    lambda x: x.sample(frac=0.05, random_state=42)
).reset_index(drop=True)

print("Sampled dataset shape:", sampled_df.shape)
print(sampled_df['Wildfire'].value_counts())

# --------------------------
# 4. Features and Target
# --------------------------
X = ['latitude', 'longitude', 'datetime', 'rmax', 'tmmx', 'vs', 'fm100']
X_df = sampled_df[X].copy()

# Rename columns for clarity
X_df = X_df.rename(columns={
    'latitude': 'lat',
    'longitude': 'lon',
    'datetime': 'date',
    'rmax': 'max_humidity',
    'tmmx': 'max_temp',
    'vs': 'vapor_pressure',
    'fm100': 'fuel_moisture_100h'
})

# Extract date features
X_df['date'] = pd.to_datetime(X_df['date'])
X_df['year'] = X_df['date'].dt.year
X_df['month'] = X_df['date'].dt.month
X_df['day'] = X_df['date'].dt.day
X_df.drop(columns=['date'], inplace=True)

# Target variable (binary)
y = sampled_df['Wildfire']

# --------------------------
# 5. Train-Test Split
# --------------------------
x_train, x_val, y_train, y_val = train_test_split(
    X_df, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set shape:", x_train.shape)
print("Validation set shape:", x_val.shape)
print("Wildfire distribution in training set:\n", y_train.value_counts())
print("Wildfire distribution in validation set:\n", y_val.value_counts())


Balanced dataset shape: (1004130, 19)
Wildfire
0    502065
1    502065
Name: count, dtype: int64
Sampled dataset shape: (50206, 19)
Wildfire
0    25103
1    25103
Name: count, dtype: int64
Training set shape: (40164, 9)
Validation set shape: (10042, 9)
Wildfire distribution in training set:
 Wildfire
0    20082
1    20082
Name: count, dtype: int64
Wildfire distribution in validation set:
 Wildfire
0    5021
1    5021
Name: count, dtype: int64


  sampled_df = balanced_df.groupby('Wildfire', group_keys=False).apply(


In [29]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, precision_score, recall_score, roc_auc_score
import joblib
from google.colab import files

# --------------------------
# 2. Preprocessing & Pipeline
# --------------------------
num_vals = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

preprocess = ColumnTransformer([
    ("num_process", num_vals, X_df.columns)
])

pipeline = Pipeline([
    ("preprocess", preprocess),
    ("model", RandomForestClassifier(random_state=42, class_weight='balanced'))
])

# --------------------------
# 3. Baseline
# --------------------------
pipeline.fit(x_train, y_train)
print("Baseline Accuracy:", pipeline.score(x_val, y_val))

# --------------------------
# 4. Light Grid Search
# --------------------------
search_space_rf = [{
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 20],
    'model__min_samples_split': [2],
    'model__min_samples_leaf': [1],
    'model__max_features': ['sqrt']
}]

gs = GridSearchCV(pipeline, search_space_rf, cv=3, verbose=1, n_jobs=-1)
gs.fit(x_train, y_train)

best_model = gs.best_estimator_
print("Best Model:", best_model)
print("Best Model Accuracy:", best_model.score(x_val, y_val))

# --------------------------
# 5. Save Model
# --------------------------
joblib.dump(best_model, "wildfire_rf_balanced_classes_best_model.pkl")
print("Model saved as wildfire_rf_balanced_classes_best_model.pkl")

# Download to local machine
files.download("wildfire_rf_balanced_classes_best_model.pkl")

# --------------------------
# 6. Evaluation
# --------------------------
y_pred = best_model.predict(x_val)
y_proba = best_model.predict_proba(x_val)[:, 1]

print("Classification Report:")
print(classification_report(y_val, y_pred))
print("Precision:", precision_score(y_val, y_pred))
print("Recall:", recall_score(y_val, y_pred))
print("ROC-AUC:", roc_auc_score(y_val, y_proba))


Baseline Accuracy: 0.700955984863573
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best Model: Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num_process',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['lat', 'lon', 'max_humidity', 'max_temp', 'vapor_pressure',
       'fuel_moisture_100h', 'year', 'month', 'day'],
      dtype='object'))])),
                ('model',
                 RandomForestClassifier(class_weight='balanced',
                                        n_estimators=200, random_state=42))])
Best Model Accuracy: 0.6985660227046405
Model saved as wildfire_rf_balanced_classes_best_model.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.70      0.70      5021
           1       0.70      0.70      0.70      5021

    accuracy                           0.70     10042
   macro avg       0.70      0.70      0.70     10042
weighted avg       0.70      0.70      0.70     10042

Precision: 0.6991610067918498
Recall: 0.6970722963553077
ROC-AUC: 0.7711966244461967
