In [3]:
import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [6]:

import os
print("Working directory:", os.getcwd())
print("File exists:", os.path.exists(INPUT_CSV))


Working directory: C:\Users\mirei\OneDrive\Desktop\all-capstone-project-summer-2025-team-6-main\datamodel
File exists: False


In [7]:
import os

# Adjusted path for use in notebook
INPUT_CSV = os.path.join("..", "datasets", "cleaned", "evaluation_with_fire_and_coordinates_and_date.csv")
OUTPUT_PANEL = os.path.join("building_month_fire_panel.csv")  # Output is okay to save in datamodel

# Load the dataset
import pandas as pd
df = pd.read_csv(INPUT_CSV)


In [8]:
df.head()

Unnamed: 0,ID_UEV,CIVIQUE_DEBUT,CIVIQUE_FIN,NOM_RUE,SUITE_DEBUT,MUNICIPALITE,ETAGE_HORS_SOL,NOMBRE_LOGEMENT,ANNEE_CONSTRUCTION,CODE_UTILISATION,...,MATRICULE83,SUPERFICIE_TERRAIN,SUPERFICIE_BATIMENT,NO_ARROND_ILE_CUM,NOM_RUE_CLEAN,fire_date,fire,ADDR_DE,LONGITUDE,LATITUDE
0,1038405,3577,3577,avenue Atwater (MTL+WMT),,50,1.0,1.0,1983.0,1921,...,9739-83-9737-8-001-0431,2,16.0,REM19,avenue atwater,,False,3577.0,-73.588602,45.493711
1,5213144,5211,5211,rue du Sureau (PFD),105.0,50,1.0,1.0,2012.0,1000,...,7941-06-3037-4-001-0004,69,68.0,REM31,rue du sureau,,False,,,
2,1036349,3550,3550,rue de la Montagne (MTL),109.0,50,3.0,1.0,1983.0,1000,...,9840-31-8010-6-001-0005,133,127.0,REM19,rue de la montagne,,False,,,
3,5189527,1200,1200,rue Saint-Alexandre (MTL),511.0,50,1.0,1.0,1963.0,1000,...,9940-55-8522-7-001-0257,16,62.0,REM19,rue saint-alexandre,,False,,,
4,1037334,1254,1254,rue Saint-Marc (MTL),61.0,50,1.0,1.0,1914.0,1000,...,9839-51-6255-9-001-0013,16,82.0,REM19,rue saint-marc,2023-01-07 01:45:48,True,1254.0,-73.579815,45.492286


In [9]:
df.shape


(663783, 24)

In [10]:
df["fire_date"] = pd.to_datetime(df["fire_date"], errors="coerce")
df["month"] = df["fire_date"].dt.to_period("M")
df = df.dropna(subset=["LONGITUDE", "LATITUDE", "ID_UEV"])
df["geometry"] = df.apply(lambda row: Point(row["LONGITUDE"], row["LATITUDE"]), axis=1)
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326").to_crs("EPSG:32188")


In [12]:

# 🔸 Construct panel: building × month
unique_buildings = gdf[["ID_UEV", "LATITUDE", "LONGITUDE"]].drop_duplicates()
all_months = pd.period_range(start=gdf["month"].min(), end=gdf["month"].max(), freq="M")
panel = pd.MultiIndex.from_product([unique_buildings["ID_UEV"], all_months],
                                   names=["ID_UEV", "month"]).to_frame(index=False)
panel = panel.merge(unique_buildings, on="ID_UEV", how="left")

In [13]:
# 🔸 Label fire presence
fires = gdf[gdf["fire"] == True][["ID_UEV", "month"]].drop_duplicates()
fires["HAS_FIRE_THIS_MONTH"] = 1
panel = panel.merge(fires, on=["ID_UEV", "month"], how="left")
panel["HAS_FIRE_THIS_MONTH"] = panel["HAS_FIRE_THIS_MONTH"].fillna(0).astype(int)

In [14]:

# 🔸 Add time-based features
panel["month_num"] = panel["month"].dt.month
panel["year"] = panel["month"].dt.year


In [15]:
panel.shape

(20582705, 7)

In [16]:
print(unique_buildings["ID_UEV"].nunique())  # Is it really 663,783?


310387


In [17]:
print(gdf["month"].min(), gdf["month"].max())
print(len(all_months))  # How many months?


2020-01 2025-05
65


 So what's wrong?
This panel assumes every building existed or was active during all 65 months, which is likely unrealistic.

In reality:

Some buildings might have fire incidents only in a few months.

Some buildings may not appear at all during some years.

✅ Recommendation: Build a sparse panel (only active building-month pairs)
Instead of using from_product, use real building × month pairs from the dataset:

✅ Efficient replacement for dense panel

In [19]:
# 🔹 Use only real building-months
panel = gdf[["ID_UEV", "month"]].drop_duplicates()

# 🔹 Add back LAT/LON for geolocation
panel = panel.merge(unique_buildings, on="ID_UEV", how="left")

# 🔹 Label fire presence
fires = gdf[gdf["fire"] == True][["ID_UEV", "month"]].drop_duplicates()
fires["HAS_FIRE_THIS_MONTH"] = 1

panel = panel.merge(fires, on=["ID_UEV", "month"], how="left")
panel["HAS_FIRE_THIS_MONTH"] = panel["HAS_FIRE_THIS_MONTH"].fillna(0).astype(int)

# 🔹 Add time features
panel["month_num"] = panel["month"].dt.month
panel["year"] = panel["month"].dt.year


In [21]:
panel.shape

(456761, 7)

✅ Next Suggested Steps
Now that your panel looks good:

1. Join Feature Engineering Columns
You likely had engineered features (e.g., RATIO_SURFACE, AGE_BATIMENT, FIRE_RATE_ZONE, etc.) in your original evaluation dataset.

To bring them in:

python
Copy
Edit


In [None]:
# Keep only static + spatial features per building
features_static = gdf.drop_duplicates("ID_UEV")[[
    "ID_UEV", "MUNICIPALITE", "ETAGE_HORS_SOL", "NOMBRE_LOGEMENT", "AGE_BATIMENT",
    "CODE_UTILISATION", "CATEGORIE_UEF", "SUPERFICIE_TERRAIN", "SUPERFICIE_BATIMENT",
    "NO_ARROND_ILE_CUM", "RATIO_SURFACE", "DENSITE_LOGEMENT", "HAS_MULTIPLE_LOGEMENTS",
    "FIRE_FREQUENCY_ZONE", "FIRE_RATE_ZONE", "FIRE_COUNT_LAST_YEAR_ZONE", "BUILDING_COUNT",
    "FIRE_RATE_ZONE_NORM", "FIRE_COUNT_LAST_YEAR_ZONE_NORM"
]]

# Merge with panel
panel = panel.merge(features_static, on="ID_UEV", how="left")
