In [1]:
# =====================================================
# 📂 Upload your dataset
# =====================================================
from google.colab import files
uploaded = files.upload()

Saving evaluation_fire_coordinates_date_feat_eng_2.csv to evaluation_fire_coordinates_date_feat_eng_2 (1).csv


In [1]:
import os
print("Files in /content:", os.listdir("/content"))

Files in /content: ['.config', 'evaluation_fire_coordinates_date_feat_eng_2.csv', 'evaluation_fire_coordinates_date_feat_eng_2 (1).csv', 'sample_data']


In [2]:
# =====================================================
# 🧹 Dense Panel Building Month Creation
# =====================================================
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

INPUT_CSV = "/content/evaluation_fire_coordinates_date_feat_eng_2.csv"
OUTPUT_PANEL = "/content/building_month_fire_panel_feat_eng.csv"

# Load fire dataset
df = pd.read_csv(INPUT_CSV)
print(f"✅ Loaded CSV: {df.shape[0]:,} rows × {df.shape[1]} columns")

✅ Loaded CSV: 663,783 rows × 31 columns


In [3]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# 🟡 Load your big dataset again if needed (assuming df is already loaded and cleaned)

# ✅ Convert fire_date to period
df["fire_date"] = pd.to_datetime(df["fire_date"], errors="coerce")
df["month"] = df["fire_date"].dt.to_period("M")
df = df.dropna(subset=["LONGITUDE", "LATITUDE", "ID_UEV"])

# ✅ Create GeoDataFrame
df["geometry"] = df.apply(lambda row: Point(row["LONGITUDE"], row["LATITUDE"]), axis=1)
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326").to_crs("EPSG:32188")

# ✅ Get unique buildings
unique_buildings = gdf[["ID_UEV", "LATITUDE", "LONGITUDE"]].drop_duplicates()

# ✅ Subsample: take 40,000 random buildings
sample_size = 40000
unique_buildings_sample = unique_buildings.sample(n=sample_size, random_state=42).reset_index(drop=True)

# ✅ Limit time range if you want (or comment this out to keep full range)
# all_months = pd.period_range(start="2019-01", end=gdf["month"].max(), freq="M")
all_months = pd.period_range(start=gdf["month"].min(), end=gdf["month"].max(), freq="M")

print(f"Building sample size: {len(unique_buildings_sample)}")
print(f"Total months: {len(all_months)}")

# ✅ Create the panel
print("Creating building x month panel...")
panel = pd.MultiIndex.from_product(
    [unique_buildings_sample["ID_UEV"].unique(), all_months],
    names=["ID_UEV", "month"]
).to_frame(index=False)

panel = panel.merge(unique_buildings_sample, on="ID_UEV", how="left")
print(f"Panel shape after merge: {panel.shape}")

# ✅ Label fire presence (more memory efficient)
fires = gdf[gdf["fire"] == True][["ID_UEV", "month"]].drop_duplicates()
fires["HAS_FIRE_THIS_MONTH"] = 1

panel = panel.merge(fires, on=["ID_UEV", "month"], how="left")
panel["HAS_FIRE_THIS_MONTH"] = panel["HAS_FIRE_THIS_MONTH"].fillna(0).astype(int)

print("✅ Fire presence labeled!")

# ✅ You can now continue with lag/cumulative features

Building sample size: 40000
Total months: 65
Creating building x month panel...
Panel shape after merge: (2600000, 4)
✅ Fire presence labeled!


In [4]:
# Clean data
df["fire_date"] = pd.to_datetime(df["fire_date"], errors="coerce")
df["month"] = df["fire_date"].dt.to_period("M")
df = df.dropna(subset=["LONGITUDE", "LATITUDE", "ID_UEV"])
df["geometry"] = df.apply(lambda row: Point(row["LONGITUDE"], row["LATITUDE"]), axis=1)
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326").to_crs("EPSG:32188")

In [5]:
# Create unique buildings and months
unique_buildings = gdf[["ID_UEV", "LATITUDE", "LONGITUDE"]].drop_duplicates()
all_months = pd.period_range(start=gdf["month"].min(), end=gdf["month"].max(), freq="M")
print(f"Expanding dataset into building × month panel...")
panel = pd.MultiIndex.from_product(
    [unique_buildings["ID_UEV"].unique(), all_months],
    names=["ID_UEV", "month"]
).to_frame(index=False)
panel = panel.merge(unique_buildings, on="ID_UEV", how="left")
print(f"Base panel shape: {panel.shape}")

Expanding dataset into building × month panel...
Base panel shape: (20300865, 4)


In [6]:
# Define static features (optional: adjust if any are missing)
static_cols = [
    "ID_UEV", "LATITUDE", "LONGITUDE", "MUNICIPALITE", "ETAGE_HORS_SOL",
    "NOMBRE_LOGEMENT", "AGE_BATIMENT", "CODE_UTILISATION", "CATEGORIE_UEF",
    "SUPERFICIE_TERRAIN", "SUPERFICIE_BATIMENT", "NO_ARROND_ILE_CUM",
    "RATIO_SURFACE", "DENSITE_LOGEMENT", "HAS_MULTIPLE_LOGEMENTS",
    "FIRE_FREQUENCY_ZONE", "FIRE_RATE_ZONE", "FIRE_COUNT_LAST_YEAR_ZONE",
    "BUILDING_COUNT", "FIRE_RATE_ZONE_NORM", "FIRE_COUNT_LAST_YEAR_ZONE_NORM"
]
existing_cols = [col for col in static_cols if col in gdf.columns]
print("Using static columns:", existing_cols)

Using static columns: ['ID_UEV', 'LATITUDE', 'LONGITUDE', 'MUNICIPALITE', 'ETAGE_HORS_SOL', 'NOMBRE_LOGEMENT', 'AGE_BATIMENT', 'CODE_UTILISATION', 'CATEGORIE_UEF', 'SUPERFICIE_TERRAIN', 'SUPERFICIE_BATIMENT', 'NO_ARROND_ILE_CUM', 'RATIO_SURFACE', 'DENSITE_LOGEMENT', 'HAS_MULTIPLE_LOGEMENTS', 'FIRE_FREQUENCY_ZONE', 'FIRE_RATE_ZONE', 'FIRE_COUNT_LAST_YEAR_ZONE', 'BUILDING_COUNT', 'FIRE_RATE_ZONE_NORM', 'FIRE_COUNT_LAST_YEAR_ZONE_NORM']


In [7]:
# Filter static features
static_features = gdf[existing_cols].drop_duplicates(subset=["ID_UEV"])

# Filter to valid IDs
valid_ids = gdf["ID_UEV"].unique()
static_features = static_features[static_features["ID_UEV"].isin(valid_ids)]

In [8]:
# Rebuild panel
panel = pd.MultiIndex.from_product(
    [static_features["ID_UEV"].unique(), all_months],
    names=["ID_UEV", "month"]
).to_frame(index=False)
panel = panel.merge(static_features, on="ID_UEV", how="left")
print(f"Static features merged: {panel.shape}")

Static features merged: (20175155, 22)


In [None]:
# Label fire presence (memory-efficient version)
print("Labelling HAS_FIRE_THIS_MONTH efficiently...")
fires = gdf[gdf["fire"] == True][["ID_UEV", "month"]].drop_duplicates()
fires_index = pd.MultiIndex.from_frame(fires)

# Create a boolean mask indicating presence of fire
panel["HAS_FIRE_THIS_MONTH"] = (
    panel.set_index(["ID_UEV", "month"]).index.isin(fires_index)
).astype(int)

Labelling HAS_FIRE_THIS_MONTH efficiently...


In [1]:
# Lag features
panel = panel.sort_values(by=["ID_UEV", "month"])
panel["fire_last_1m"] = panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"].shift(1).fillna(0)
panel["fire_last_2m"] = panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"].shift(2).fillna(0)
panel["fire_last_3m"] = panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"].shift(3).fillna(0)

NameError: name 'panel' is not defined

In [None]:
# Cumulative fire count
panel["fire_cumcount"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
    .transform(lambda x: x.shift().cumsum())
    .fillna(0)
)

In [None]:
# Rolling counts
panel["fire_rolling_3m"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
    .transform(lambda x: x.shift().rolling(window=3, min_periods=1).sum())
    .fillna(0)
)
panel["fire_rolling_6m"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
    .transform(lambda x: x.shift().rolling(window=6, min_periods=1).sum())
    .fillna(0)
)
panel["fire_rolling_12m"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
    .transform(lambda x: x.shift().rolling(window=12, min_periods=1).sum())
    .fillna(0)
)

In [None]:
# Months since last fire
panel["has_fire_last_month"] = panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"].shift(1).fillna(0)
panel["months_since_last_fire"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
    .transform(lambda x: x.shift().apply(lambda val: 0 if val == 1 else None).ffill(limit=None).cumsum())
    .fillna(999)
)

In [None]:
# Add month/year columns
panel["month_num"] = panel["month"].dt.month
panel["year"] = panel["month"].dt.year

print(f"✅ Final panel shape: {panel.shape}")

In [None]:
# Save output
panel.to_csv(OUTPUT_PANEL, index=False)
print(f"✅ Panel saved to: {OUTPUT_PANEL}")

In [1]:
# 🚀 Import libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# ✅ Load the CSV
df = pd.read_csv("evaluation_fire_coordinates_date_feat_eng_2.csv")
print(f"✅ Loaded data: {df.shape}")

# ✅ Clean / Parse dates
df["fire_date"] = pd.to_datetime(df["fire_date"], errors="coerce")

# ✅ Drop rows with missing essential data
df = df.dropna(subset=["LONGITUDE", "LATITUDE", "ID_UEV"])

# ✅ Create geometry
df["geometry"] = df.apply(lambda row: Point(row["LONGITUDE"], row["LATITUDE"]), axis=1)

# ✅ Convert to GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326").to_crs("EPSG:32188")

# ✅ OPTIONAL: Subset to avoid memory errors (take 40,000 buildings)
unique_building_ids = gdf["ID_UEV"].drop_duplicates().sample(n=40000, random_state=42)
gdf_subset = gdf[gdf["ID_UEV"].isin(unique_building_ids)]
print(f"✅ Subset size: {gdf_subset.shape}")

# ✅ Build all months range
all_months = pd.period_range(start=gdf_subset["fire_date"].dt.to_period("M").min(),
                             end=gdf_subset["fire_date"].dt.to_period("M").max(),
                             freq="M")

# ✅ Create the base panel
panel = pd.MultiIndex.from_product(
    [gdf_subset["ID_UEV"].unique(), all_months],
    names=["ID_UEV", "month"]
).to_frame(index=False)

# ✅ Merge static building info
static_cols = [
    "ID_UEV", "LATITUDE", "LONGITUDE", "MUNICIPALITE", "ETAGE_HORS_SOL",
    "NOMBRE_LOGEMENT", "AGE_BATIMENT", "CODE_UTILISATION", "CATEGORIE_UEF",
    "SUPERFICIE_TERRAIN", "SUPERFICIE_BATIMENT", "NO_ARROND_ILE_CUM",
    "RATIO_SURFACE", "DENSITE_LOGEMENT", "HAS_MULTIPLE_LOGEMENTS",
    "FIRE_FREQUENCY_ZONE", "FIRE_RATE_ZONE", "FIRE_COUNT_LAST_YEAR_ZONE",
    "BUILDING_COUNT", "FIRE_RATE_ZONE_NORM", "FIRE_COUNT_LAST_YEAR_ZONE_NORM"
]
static_features = (
    gdf_subset[static_cols]
    .drop_duplicates(subset=["ID_UEV"])
)

panel = panel.merge(static_features, on="ID_UEV", how="left")
print(f"✅ Panel after merging static features: {panel.shape}")

# ✅ Label fire presence
fires = (
    gdf_subset[gdf_subset["fire"] == True][["ID_UEV", "fire_date"]]
    .drop_duplicates()
    .assign(month=lambda d: d["fire_date"].dt.to_period("M"))
    [["ID_UEV", "month"]]
)
fires["HAS_FIRE_THIS_MONTH"] = 1

panel = panel.merge(fires, on=["ID_UEV", "month"], how="left")
panel["HAS_FIRE_THIS_MONTH"] = panel["HAS_FIRE_THIS_MONTH"].fillna(0).astype(int)
print("✅ Fire presence labeled")

# ✅ Lag features
panel = panel.sort_values(["ID_UEV", "month"]).reset_index(drop=True)
panel["fire_last_1m"] = panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"].shift(1).fillna(0)
panel["fire_last_2m"] = panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"].shift(2).fillna(0)
panel["fire_last_3m"] = panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"].shift(3).fillna(0)

# ✅ Cumulative and rolling fire counts
panel["fire_cumcount"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
    .transform(lambda x: x.shift().cumsum())
    .fillna(0)
)
panel["fire_rolling_3m"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
    .transform(lambda x: x.shift().rolling(3, min_periods=1).sum())
    .fillna(0)
)
panel["fire_rolling_6m"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
    .transform(lambda x: x.shift().rolling(6, min_periods=1).sum())
    .fillna(0)
)
panel["fire_rolling_12m"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
    .transform(lambda x: x.shift().rolling(12, min_periods=1).sum())
    .fillna(0)
)

# ✅ Months since last fire
panel["has_fire_last_month"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"].transform(lambda x: x.shift(1).fillna(0))
)
panel["months_since_last_fire"] = (
    panel.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
    .transform(lambda x: x.shift().apply(lambda v: 0 if v == 1 else None).ffill(limit=None).cumsum())
    .fillna(999)
)

# ✅ Add month and year numeric
panel["month_num"] = panel["month"].dt.month
panel["year"] = panel["month"].dt.year

# ✅ Show info
print(f"✅ Final panel shape: {panel.shape}")
print("✅ Columns:", list(panel.columns))

# ✅ Download without saving to Colab directory
from google.colab import files

# Save to a temporary CSV
panel.to_csv("building_month_fire_panel_feat_eng_SUBSET.csv", index=False)
files.download("building_month_fire_panel_feat_eng_SUBSET.csv")

✅ Loaded data: (663783, 31)
✅ Subset size: (59341, 32)
✅ Panel after merging static features: (2600000, 22)
✅ Fire presence labeled


  .fillna(999)


✅ Final panel shape: (2600618, 34)
✅ Columns: ['ID_UEV', 'month', 'LATITUDE', 'LONGITUDE', 'MUNICIPALITE', 'ETAGE_HORS_SOL', 'NOMBRE_LOGEMENT', 'AGE_BATIMENT', 'CODE_UTILISATION', 'CATEGORIE_UEF', 'SUPERFICIE_TERRAIN', 'SUPERFICIE_BATIMENT', 'NO_ARROND_ILE_CUM', 'RATIO_SURFACE', 'DENSITE_LOGEMENT', 'HAS_MULTIPLE_LOGEMENTS', 'FIRE_FREQUENCY_ZONE', 'FIRE_RATE_ZONE', 'FIRE_COUNT_LAST_YEAR_ZONE', 'BUILDING_COUNT', 'FIRE_RATE_ZONE_NORM', 'FIRE_COUNT_LAST_YEAR_ZONE_NORM', 'HAS_FIRE_THIS_MONTH', 'fire_last_1m', 'fire_last_2m', 'fire_last_3m', 'fire_cumcount', 'fire_rolling_3m', 'fire_rolling_6m', 'fire_rolling_12m', 'has_fire_last_month', 'months_since_last_fire', 'month_num', 'year']


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>