In [1]:
import pandas as pd
# Read the cleaned CSV. ID_UEV is forced to str so pandas does not infer a
# numeric dtype for the identifier column; empty strings are treated as NA.
csv_path = "datasets/cleaned/evaluation_with_fire_and_coordinates_and_date.csv"
df = pd.read_csv(
    csv_path,
    na_values="",
    dtype={"ID_UEV": str},
)
# Display the dtype pandas assigned to each column.
df.dtypes

ID_UEV                  object
CIVIQUE_DEBUT            int64
CIVIQUE_FIN              int64
NOM_RUE                 object
SUITE_DEBUT             object
MUNICIPALITE             int64
ETAGE_HORS_SOL         float64
NOMBRE_LOGEMENT        float64
ANNEE_CONSTRUCTION     float64
CODE_UTILISATION         int64
LETTRE_DEBUT            object
LETTRE_FIN              object
LIBELLE_UTILISATION     object
CATEGORIE_UEF           object
MATRICULE83             object
SUPERFICIE_TERRAIN       int64
SUPERFICIE_BATIMENT    float64
NO_ARROND_ILE_CUM       object
NOM_RUE_CLEAN           object
fire_date               object
fire                      bool
ADDR_DE                float64
LONGITUDE              float64
LATITUDE               float64
dtype: object

In [2]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,ID_UEV,CIVIQUE_DEBUT,CIVIQUE_FIN,NOM_RUE,SUITE_DEBUT,MUNICIPALITE,ETAGE_HORS_SOL,NOMBRE_LOGEMENT,ANNEE_CONSTRUCTION,CODE_UTILISATION,...,MATRICULE83,SUPERFICIE_TERRAIN,SUPERFICIE_BATIMENT,NO_ARROND_ILE_CUM,NOM_RUE_CLEAN,fire_date,fire,ADDR_DE,LONGITUDE,LATITUDE
0,1038405,3577,3577,avenue Atwater (MTL+WMT),,50,1.0,1.0,1983.0,1921,...,9739-83-9737-8-001-0431,2,16.0,REM19,avenue atwater,,False,3577.0,-73.588602,45.493711
1,5213144,5211,5211,rue du Sureau (PFD),105.0,50,1.0,1.0,2012.0,1000,...,7941-06-3037-4-001-0004,69,68.0,REM31,rue du sureau,,False,,,
2,1036349,3550,3550,rue de la Montagne (MTL),109.0,50,3.0,1.0,1983.0,1000,...,9840-31-8010-6-001-0005,133,127.0,REM19,rue de la montagne,,False,,,
3,5189527,1200,1200,rue Saint-Alexandre (MTL),511.0,50,1.0,1.0,1963.0,1000,...,9940-55-8522-7-001-0257,16,62.0,REM19,rue saint-alexandre,,False,,,
4,1037334,1254,1254,rue Saint-Marc (MTL),61.0,50,1.0,1.0,1914.0,1000,...,9839-51-6255-9-001-0013,16,82.0,REM19,rue saint-marc,2023-01-07 01:45:48,True,1254.0,-73.579815,45.492286


In [3]:
# Parse the fire timestamps; values that cannot be parsed (or are missing)
# become NaT because of errors="coerce".
parsed_fire_dates = pd.to_datetime(df["fire_date"], errors="coerce")
df["fire_date"] = parsed_fire_dates

# Calendar month of the fire (1-12); rows whose date is NaT get the sentinel 13.
fire_month_raw = parsed_fire_dates.dt.month
df["fire_month"] = fire_month_raw.fillna(13).astype(int)

df.head()

Unnamed: 0,ID_UEV,CIVIQUE_DEBUT,CIVIQUE_FIN,NOM_RUE,SUITE_DEBUT,MUNICIPALITE,ETAGE_HORS_SOL,NOMBRE_LOGEMENT,ANNEE_CONSTRUCTION,CODE_UTILISATION,...,SUPERFICIE_TERRAIN,SUPERFICIE_BATIMENT,NO_ARROND_ILE_CUM,NOM_RUE_CLEAN,fire_date,fire,ADDR_DE,LONGITUDE,LATITUDE,fire_month
0,1038405,3577,3577,avenue Atwater (MTL+WMT),,50,1.0,1.0,1983.0,1921,...,2,16.0,REM19,avenue atwater,NaT,False,3577.0,-73.588602,45.493711,13
1,5213144,5211,5211,rue du Sureau (PFD),105.0,50,1.0,1.0,2012.0,1000,...,69,68.0,REM31,rue du sureau,NaT,False,,,,13
2,1036349,3550,3550,rue de la Montagne (MTL),109.0,50,3.0,1.0,1983.0,1000,...,133,127.0,REM19,rue de la montagne,NaT,False,,,,13
3,5189527,1200,1200,rue Saint-Alexandre (MTL),511.0,50,1.0,1.0,1963.0,1000,...,16,62.0,REM19,rue saint-alexandre,NaT,False,,,,13
4,1037334,1254,1254,rue Saint-Marc (MTL),61.0,50,1.0,1.0,1914.0,1000,...,16,82.0,REM19,rue saint-marc,2023-01-07 01:45:48,True,1254.0,-73.579815,45.492286,1


In [4]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier

# --- Load dataset ---
# Read ID_UEV as a string (same options as the first load in this notebook) so
# the identifier column has the same dtype in every cell that reloads the file.
df = pd.read_csv(
    "datasets/cleaned/evaluation_with_fire_and_coordinates_and_date.csv",
    na_values="",
    dtype={"ID_UEV": str},
)

# --- Assign fire_month (13 = no fire) ---
# Sentinel class for rows whose fire_date is missing or cannot be parsed.
NO_FIRE_MONTH = 13
df["fire_date"] = pd.to_datetime(df["fire_date"], errors="coerce")
df["fire_month"] = df["fire_date"].dt.month.fillna(NO_FIRE_MONTH).astype(int)  # 1–12 = fire month, 13 = no fire

# --- Feature selection ---
features = [
    "ETAGE_HORS_SOL", "NOMBRE_LOGEMENT", "ANNEE_CONSTRUCTION",
    "SUPERFICIE_TERRAIN", "SUPERFICIE_BATIMENT", "LONGITUDE", "LATITUDE"
]

# Missing/unparseable construction years are encoded as 0. Only this column is
# imputed here; the other features are passed to the model with their NaNs.
df["ANNEE_CONSTRUCTION"] = pd.to_numeric(df["ANNEE_CONSTRUCTION"], errors="coerce").fillna(0).astype(int)
X = df[features]
y = df["fire_month"]

# --- Split data ---
# Stratify on the target so each class keeps its share in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# --- Train model ---
# Derive the class count from the target instead of hard-coding it.
model = LGBMClassifier(objective="multiclass", num_class=y.nunique(), random_state=42)
model.fit(X_train, y_train)

# --- Evaluate ---
y_pred = model.predict(X_test)
# zero_division=0 reports 0 for classes that are never predicted (the same
# value the default produces) without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, digits=3, zero_division=0))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001907 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1409
[LightGBM] [Info] Number of data points in the train set: 531026, number of used features: 7
[LightGBM] [Info] Start training from score -3.201423
[LightGBM] [Info] Start training from score -3.333060
[LightGBM] [Info] Start training from score -3.197038
[LightGBM] [Info] Start training from score -3.129453
[LightGBM] [Info] Start training from score -3.057336
[LightGBM] [Info] Start training from score -3.274240
[LightGBM] [Info] Start training from score -3.397299
[LightGBM] [Info] Start training from score -3.431007
[LightGBM] [Info] Start training from score -3.488936
[LightGBM] [Info] Start training from score -3.360506
[LightGBM] [Info] Start training from score -3.393592
[LightGBM] [Info] Start training from score -3.394939

In [5]:
# Relative frequency of each fire_month label, listed in label order.
month_share = df["fire_month"].value_counts(normalize=True)
print(month_share.sort_index())


fire_month
1     0.040705
2     0.035683
3     0.040884
4     0.043742
5     0.047012
6     0.037845
7     0.033464
8     0.032354
9     0.030534
10    0.034718
11    0.033588
12    0.033543
13    0.555929
Name: proportion, dtype: float64


## Results above are terrible.

## Try dropping non-fire rows from the dataset before training

In [6]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier

# --- Load dataset ---
# Read ID_UEV as a string (same options as the first load in this notebook) so
# the identifier column has the same dtype in every cell that reloads the file.
df = pd.read_csv(
    "datasets/cleaned/evaluation_with_fire_and_coordinates_and_date.csv",
    na_values="",
    dtype={"ID_UEV": str},
)

# --- Keep only rows where a fire occurred ---
# Rows whose fire_date is missing or unparseable become NaT and are dropped.
# The filter runs before the train/test split, so both splits contain fire rows only.
df["fire_date"] = pd.to_datetime(df["fire_date"], errors="coerce")
df = df[df["fire_date"].notna()].copy()  # keep only fire rows

# --- Extract fire month as target ---
df["fire_month"] = df["fire_date"].dt.month.astype(int)  # 1–12 only

# --- Feature selection ---
features = [
    "ETAGE_HORS_SOL", "NOMBRE_LOGEMENT", "ANNEE_CONSTRUCTION",
    "SUPERFICIE_TERRAIN", "SUPERFICIE_BATIMENT", "LONGITUDE", "LATITUDE"
]

# --- Ensure clean numeric types ---
# Missing/unparseable construction years are encoded as 0.
df["ANNEE_CONSTRUCTION"] = pd.to_numeric(df["ANNEE_CONSTRUCTION"], errors="coerce").fillna(0).astype(int)
# NOTE(review): fillna(0) replaces NaNs in every feature, including LONGITUDE
# and LATITUDE, whereas the 13-class experiment leaves those NaNs in place.
# Confirm that zero-filling coordinates is intended.
X = df[features].fillna(0)
y = df["fire_month"]

# --- Train-test split ---
# Stratify on the target so each month keeps its share in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# --- Train model ---
model = LGBMClassifier(
    objective="multiclass",
    num_class=y.nunique(),  # months present in the fire-only target
    random_state=42
)
model.fit(X_train, y_train)

# --- Evaluate ---
y_pred = model.predict(X_test)
# zero_division=0 reports 0 for months that are never predicted (the same
# value the default produces) without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, digits=3, zero_division=0))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000747 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1421
[LightGBM] [Info] Number of data points in the train set: 235813, number of used features: 7
[LightGBM] [Info] Start training from score -2.389652
[LightGBM] [Info] Start training from score -2.521288
[LightGBM] [Info] Start training from score -2.385266
[LightGBM] [Info] Start training from score -2.317681
[LightGBM] [Info] Start training from score -2.245564
[LightGBM] [Info] Start training from score -2.462469
[LightGBM] [Info] Start training from score -2.585527
[LightGBM] [Info] Start training from score -2.619235
[LightGBM] [Info] Start training from score -2.677164
[LightGBM] [Info] Start training from score -2.548734
[LightGBM] [Info] Start training from score -2.581820
[LightGBM] [Info] Start training from score -2.583167