In [1]:
# 📦 Install dependencies (if needed)
!pip install geopandas shapely pyproj --quiet

# 📂 Upload your files interactively
from google.colab import files
uploaded = files.upload()

# ✅ Confirm files are in /content
import os
print("Files in /content:")
print(os.listdir("/content"))

# 🌟 Define input paths
EVAL_PATH = "/content/eval_cleaned_feat_eng.csv"
ADDR_PATH = "/content/adresses.csv"
INC_PATH = "/content/interventions_cleaned_with_has_fire.csv"
OUTPUT_PATH = "/content/evaluation_fire_coordinates_date_feat_eng_2.csv"

# ✅ Check existence
for path in [EVAL_PATH, ADDR_PATH, INC_PATH]:
    if not os.path.exists(path):
        raise FileNotFoundError(f"❌ File not found: {path}")

# 🚀 Load data
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from sklearn.preprocessing import MinMaxScaler

eval_df = pd.read_csv(EVAL_PATH, dtype=str)
addr_df = pd.read_csv(ADDR_PATH, dtype=str)
inc_df  = pd.read_csv(INC_PATH)

# 🧹 Preprocessing
eval_df["CIVIQUE_DEBUT"] = eval_df["CIVIQUE_DEBUT"].str.strip().astype(int)
eval_df["NOM_RUE_CLEAN"] = eval_df["NOM_RUE"].str.extract(r"^(.*?)(?:\s+\(.*)?$")[0].str.lower().str.strip()
original_eval_df = eval_df.copy()

addr_df["ADDR_DE"] = addr_df["ADDR_DE"].astype(int)
addr_df["NOM_RUE_CLEAN"] = (
    addr_df["GENERIQUE"].str.lower().str.strip() + " " +
    addr_df["SPECIFIQUE"].str.lower().str.strip()
)

# 🗺️ Merge for coordinates
eval_with_coords = pd.merge(
    eval_df,
    addr_df,
    left_on=["CIVIQUE_DEBUT", "NOM_RUE_CLEAN"],
    right_on=["ADDR_DE", "NOM_RUE_CLEAN"],
    how="left"
).dropna(subset=["LONGITUDE", "LATITUDE"])

eval_gdf = gpd.GeoDataFrame(
    eval_with_coords,
    geometry=gpd.points_from_xy(
        eval_with_coords["LONGITUDE"].astype(float),
        eval_with_coords["LATITUDE"].astype(float)
    ),
    crs="EPSG:4326"
)

# 🚒 Clean fire incidents
inc_df = inc_df[inc_df["DESCRIPTION_GROUPE"].str.contains("INCENDIE", case=False, na=False)]
inc_df["CREATION_DATE_TIME"] = pd.to_datetime(inc_df["CREATION_DATE_TIME"], errors="coerce")

incident_gdf = gpd.GeoDataFrame(
    inc_df,
    geometry=gpd.points_from_xy(inc_df["LONGITUDE"], inc_df["LATITUDE"]),
    crs="EPSG:4326"
)

# 🗺️ Project to meters for spatial join
eval_gdf = eval_gdf.to_crs(epsg=32188)
incident_gdf = incident_gdf.to_crs(epsg=32188)
incident_gdf["buffer"] = incident_gdf.geometry.buffer(100)
incident_buffer_gdf = incident_gdf.set_geometry("buffer")

# 🔄 Spatial join
joined = gpd.sjoin(eval_gdf, incident_buffer_gdf, predicate="within", how="inner")
joined = joined.rename(columns={"CREATION_DATE_TIME": "fire_date"})
joined["fire"] = True

# 🔻 Drop irrelevant columns
drop_cols = [
    "CIVIQUE_DEBUT","CIVIQUE_FIN","NOM_RUE","LETTRE_DEBUT","LETTRE_FIN",
    "MATRICULE83","NOM_RUE_CLEAN","ADDR_DE","X","Y","geometry",
    "geometry_right","index_right","DESCRIPTION_GROUPE","INCIDENT_TYPE_DESC",
    "DIVISION","NOM_VILLE","NOM_ARROND"
]
joined.drop(columns=drop_cols, inplace=True, errors="ignore")

# 🔄 Merge fire info back
fire_records = joined[["ID_UEV","fire_date","NOMBRE_UNITES","CASERNE"]].copy()
fire_records["fire"] = True

data = pd.merge(original_eval_df, fire_records, on="ID_UEV", how="left")
data["fire"] = data["fire"].fillna(False)
data["fire_date"] = pd.to_datetime(data["fire_date"], errors="coerce")

# 🗺️ Reattach coordinates
addr_coords = addr_df[["ADDR_DE","NOM_RUE_CLEAN","LONGITUDE","LATITUDE"]]
data = pd.merge(
    data,
    addr_coords,
    left_on=["CIVIQUE_DEBUT","NOM_RUE_CLEAN"],
    right_on=["ADDR_DE","NOM_RUE_CLEAN"],
    how="left"
)

# 🕒 Time features
data["fire_month"] = data["fire_date"].dt.month
data["fire_year"] = data["fire_date"].dt.year
data["year_month"] = data["fire_date"].dt.to_period("M").astype(str)

def get_season(month):
    if pd.isnull(month): return None
    if month in [12,1,2]: return "Winter"
    if month in [3,4,5]: return "Spring"
    if month in [6,7,8]: return "Summer"
    return "Fall"

data["fire_season"] = data["fire_month"].apply(get_season)

# 📊 Zone aggregates
data["NO_ARROND_ILE_CUM"] = data["NO_ARROND_ILE_CUM"].astype(str)
fires_2024 = data[(data["fire"]==True) & (data["fire_date"].dt.year==2024)]
fire_count = fires_2024.groupby("NO_ARROND_ILE_CUM").size().reset_index(name="FIRE_COUNT_LAST_YEAR_ZONE")
building_count = data.groupby("NO_ARROND_ILE_CUM").size().reset_index(name="BUILDING_COUNT")

data = data.merge(fire_count, on="NO_ARROND_ILE_CUM", how="left")
data = data.merge(building_count, on="NO_ARROND_ILE_CUM", how="left")
data["FIRE_COUNT_LAST_YEAR_ZONE"] = data["FIRE_COUNT_LAST_YEAR_ZONE"].fillna(0)
data["FIRE_RATE_ZONE"] = (data["FIRE_COUNT_LAST_YEAR_ZONE"] / data["BUILDING_COUNT"]).fillna(0)

# 🔄 Normalize
scaler = MinMaxScaler()
data[["FIRE_COUNT_LAST_YEAR_ZONE_NORM","FIRE_RATE_ZONE_NORM"]] = scaler.fit_transform(
    data[["FIRE_COUNT_LAST_YEAR_ZONE","FIRE_RATE_ZONE"]]
)

# ✅ Final cleanup
data["had_fire"] = data["fire_date"].notna().astype(int)
data["missing_coords"] = data[["LATITUDE","LONGITUDE"]].isna().any(axis=1)

# Drop unwanted columns
columns_to_drop = [
    "CIVIQUE_DEBUT","CIVIQUE_FIN","NOM_RUE","NOM_RUE_CLEAN","ADDR_DE",
    "MATRICULE83","LETTRE_DEBUT","LETTRE_FIN","SUITE_DEBUT","CASERNE",
    "ANNEE_CONSTRUCTION"
]
cleaned_data = data.drop(columns=columns_to_drop, errors="ignore")

# 💾 Save output
cleaned_data.to_csv(OUTPUT_PATH, index=False)
print(f"✅ File saved to {OUTPUT_PATH}")

Saving adresses.csv to adresses.csv
Saving eval_cleaned_feat_eng.csv to eval_cleaned_feat_eng.csv
Saving interventions_cleaned_with_has_fire.csv to interventions_cleaned_with_has_fire.csv
Files in /content:
['.config', 'eval_cleaned_feat_eng.csv', 'interventions_cleaned_with_has_fire.csv', 'adresses.csv', 'sample_data']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inc_df["CREATION_DATE_TIME"] = pd.to_datetime(inc_df["CREATION_DATE_TIME"], errors="coerce")
  data["fire"] = data["fire"].fillna(False)


✅ File saved to /content/evaluation_fire_coordinates_date_feat_eng_2.csv


In [2]:
from google.colab import files
files.download("/content/evaluation_fire_coordinates_date_feat_eng_2.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>