In [3]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import joblib

# ✅ Load your panel dataset
panel_path = r"D:\McGill\Final Course Images\FINAL COURSE STUFF\building_month_fire_panel_feat_eng.csv"
df = pd.read_csv(panel_path, parse_dates=["month"])

# ✅ Sort by building & time
df = df.sort_values(["ID_UEV", "month"]).reset_index(drop=True)
df["year"] = df["month"].dt.year

# ✅ Recreate lag features if needed
for lag in range(1, 4):
    df[f"fire_last_{lag}m"] = (
        df.groupby("ID_UEV")["HAS_FIRE_THIS_MONTH"]
        .shift(lag)
        .fillna(0)
        .astype(int)
    )

# ✅ Encode categorical columns
for col in ["CATEGORIE_UEF", "NO_ARROND_ILE_CUM"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# ✅ Define features
features = [
    "MUNICIPALITE", "ETAGE_HORS_SOL", "NOMBRE_LOGEMENT", "AGE_BATIMENT",
    "CODE_UTILISATION", "CATEGORIE_UEF", "SUPERFICIE_TERRAIN", "SUPERFICIE_BATIMENT",
    "NO_ARROND_ILE_CUM", "RATIO_SURFACE", "DENSITE_LOGEMENT", "HAS_MULTIPLE_LOGEMENTS",
    "FIRE_FREQUENCY_ZONE", "FIRE_RATE_ZONE", "FIRE_COUNT_LAST_YEAR_ZONE",
    "BUILDING_COUNT", "FIRE_RATE_ZONE_NORM", "FIRE_COUNT_LAST_YEAR_ZONE_NORM",
    "fire_last_1m", "fire_last_2m", "fire_last_3m",
    "fire_cumcount", "fire_rolling_3m", "fire_rolling_6m", "fire_rolling_12m",
    "month_num", "year"
]
target = "HAS_FIRE_THIS_MONTH"

# ✅ Split train/test
train_df = df[df["year"] <= 2023]
test_df = df[df["year"] == 2024]

X_train = train_df[features].copy()
y_train = train_df[target]

# ✅ Convert categorical columns
categorical_cols = ["CATEGORIE_UEF", "NO_ARROND_ILE_CUM"]
for col in categorical_cols:
    X_train[col] = X_train[col].astype("category")

# ✅ Compute scale_pos_weight for imbalance
scale_pos_weight = (len(y_train) - y_train.sum()) / y_train.sum()

# ✅ Train the model
model = XGBClassifier(
    enable_categorical=True,
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric="logloss"
)

model.fit(X_train, y_train)

# ✅ Save the model
model_path = r"D:\McGill\Final Course Images\FINAL COURSE STUFF\xgb_fire_model.pkl"
joblib.dump(model, model_path)
print(f"✅ Model saved successfully at:\n{model_path}")

Parameters: { "use_label_encoder" } are not used.



✅ Model saved successfully at:
D:\McGill\Final Course Images\FINAL COURSE STUFF\xgb_fire_model.pkl
