In [1]:
!pip install pandas geopandas shapely xgboost scikit-learn



In [2]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import numpy as np

# ðŸ”¹ Load CSV from local disk
INPUT_CSV = r"D:\McGill\Final Course Images\FINAL COURSE STUFF\evaluation_fire_coordinates_date_feat_eng_2.csv"
print("Loading CSV...")
df = pd.read_csv(INPUT_CSV)
print(f"âœ… Loaded: {df.shape[0]:,} rows Ã— {df.shape[1]:,} columns")

# ðŸ”¹ Convert dates
df["fire_date"] = pd.to_datetime(df["fire_date"], errors="coerce")
df["month"] = df["fire_date"].dt.to_period("M")

# ðŸ”¹ Drop rows without key identifiers
df = df.dropna(subset=["LONGITUDE", "LATITUDE", "ID_UEV"])

# ðŸ”¹ Build geometry
df["geometry"] = df.apply(lambda row: Point(row["LONGITUDE"], row["LATITUDE"]), axis=1)
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326").to_crs("EPSG:32188")

# ðŸ”¹ Prepare unique buildings and months
unique_buildings = gdf[["ID_UEV", "LATITUDE", "LONGITUDE"]].drop_duplicates()
all_months = pd.period_range(start=gdf["month"].min(), end=gdf["month"].max(), freq="M")

print(f"Expanding dataset into building Ã— month panel...")
panel = pd.MultiIndex.from_product(
    [unique_buildings["ID_UEV"].unique(), all_months],
    names=["ID_UEV", "month"]
).to_frame(index=False)
panel = panel.merge(unique_buildings, on="ID_UEV", how="left")

print(f"âœ… Base panel created: {panel.shape}")

# ðŸ”¹ Static features
static_cols = [
    "ID_UEV", "LATITUDE", "LONGITUDE", "MUNICIPALITE", "ETAGE_HORS_SOL",
    "NOMBRE_LOGEMENT", "AGE_BATIMENT", "CODE_UTILISATION", "CATEGORIE_UEF",
    "SUPERFICIE_TERRAIN", "SUPERFICIE_BATIMENT", "NO_ARROND_ILE_CUM", "RATIO_SURFACE",
    "DENSITE_LOGEMENT", "HAS_MULTIPLE_LOGEMENTS", "FIRE_FREQUENCY_ZONE",
    "FIRE_RATE_ZONE", "FIRE_COUNT_LAST_YEAR_ZONE", "BUILDING_COUNT",
    "FIRE_RATE_ZONE_NORM", "FIRE_COUNT_LAST_YEAR_ZONE_NORM"
]

print("Merging static building features...")
static_features = gdf[static_cols].drop_duplicates(subset=["ID_UEV"])
valid_ids = gdf["ID_UEV"].unique()
static_features = static_features[static_features["ID_UEV"].isin(valid_ids)]

panel = panel.merge(static_features, on="ID_UEV", how="left")
print(f"âœ… Static features merged: {panel.shape}")

# ðŸ”¹ Label fire presence (optimized approach to reduce memory)
print("Labelling fire occurrence...")
fires = gdf.loc[gdf["fire"] == True, ["ID_UEV", "month"]].drop_duplicates()
fires["HAS_FIRE_THIS_MONTH"] = 1
panel = panel.merge(fires, on=["ID_UEV", "month"], how="left")
panel["HAS_FIRE_THIS_MONTH"] = panel["HAS_FIRE_THIS_MONTH"].fillna(0).astype(np.int8)

# ðŸ”¹ Lag features
print("Adding lag features...")
panel = panel.sort_values(["ID_UEV", "month"])
panel["fire_last_1m"] = panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"].shift(1).fillna(0).astype(np.int8)
panel["fire_last_2m"] = panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"].shift(2).fillna(0).astype(np.int8)
panel["fire_last_3m"] = panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"].shift(3).fillna(0).astype(np.int8)

# ðŸ”¹ Cumulative and rolling counts
print("Adding cumulative fire features...")
panel["fire_cumcount"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
    .transform(lambda x: x.shift().cumsum())
    .fillna(0)
)

panel["fire_rolling_3m"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
    .transform(lambda x: x.shift().rolling(window=3, min_periods=1).sum())
    .fillna(0)
)

panel["fire_rolling_6m"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
    .transform(lambda x: x.shift().rolling(window=6, min_periods=1).sum())
    .fillna(0)
)

panel["fire_rolling_12m"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
    .transform(lambda x: x.shift().rolling(window=12, min_periods=1).sum())
    .fillna(0)
)

# ðŸ”¹ Time-based features
panel["month_num"] = panel["month"].dt.month
panel["year"] = panel["month"].dt.year

print(f"âœ… Final panel shape: {panel.shape}")
print(panel.columns)

# ðŸ”¹ Save output to local file
OUTPUT_PATH = r"D:\McGill\Final Course Images\FINAL COURSE STUFF\building_month_fire_panel_feat_eng.csv"
panel.to_csv(OUTPUT_PATH, index=False)
print(f"âœ… Saved to: {OUTPUT_PATH}")

Loading CSV...
âœ… Loaded: 663,783 rows Ã— 31 columns
Expanding dataset into building Ã— month panel...
âœ… Base panel created: (20300865, 4)
Merging static building features...
âœ… Static features merged: (20300865, 24)
Labelling fire occurrence...
Adding lag features...
Adding cumulative fire features...
âœ… Final panel shape: (20300865, 34)
Index(['ID_UEV', 'month', 'LATITUDE_x', 'LONGITUDE_x', 'LATITUDE_y',
       'LONGITUDE_y', 'MUNICIPALITE', 'ETAGE_HORS_SOL', 'NOMBRE_LOGEMENT',
       'AGE_BATIMENT', 'CODE_UTILISATION', 'CATEGORIE_UEF',
       'SUPERFICIE_TERRAIN', 'SUPERFICIE_BATIMENT', 'NO_ARROND_ILE_CUM',
       'RATIO_SURFACE', 'DENSITE_LOGEMENT', 'HAS_MULTIPLE_LOGEMENTS',
       'FIRE_FREQUENCY_ZONE', 'FIRE_RATE_ZONE', 'FIRE_COUNT_LAST_YEAR_ZONE',
       'BUILDING_COUNT', 'FIRE_RATE_ZONE_NORM',
       'FIRE_COUNT_LAST_YEAR_ZONE_NORM', 'HAS_FIRE_THIS_MONTH', 'fire_last_1m',
       'fire_last_2m', 'fire_last_3m', 'fire_cumcount', 'fire_rolling_3m',
       'fire_rolling_6m', '