In [1]:
import pandas as pd 
from matplotlib import pyplot as pyplot
import seaborn as sns
import os
import geopandas as gpd
from shapely.geometry import Point
import numpy as np
import pickle

# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score,cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,recall_score,f1_score,make_scorer,precision_score
from sklearn.preprocessing import LabelEncoder

In [3]:
#os.path.realpath(__file__)
# current_path = os.getcwd()
# dataset_path = os.path.dirname(current_path) # up 1 level
# dataset_path = os.path.dirname(dataset_path) # up 2 level
# dataset_path = os.path.join(dataset_path,'datasets')

dirname = os.getcwd()

raw_dataset_path = os.path.join(dirname,'..','..','datasets','raw')
cleaned_dataset_path = os.path.join(dirname,'..','..','datasets','cleaned')

In [4]:
#df = pd.read_csv(os.path.join(DIRECTORY,'evaluation-fonciere-with-fire.csv'))

df = pd.read_csv(os.path.join(cleaned_dataset_path,'evaluation_with_fire_and_coordinates.csv'))
df.drop(['fire','LETTRE_DEBUT','LETTRE_FIN'],axis=1,inplace=True)

#build geo dataframe from evaluation df
eval_gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df["LONGITUDE"].astype(float),
                                 df["LATITUDE"].astype(float)),
    crs="EPSG:4326"
)

# Create list of months (1-12)
months = list(range(1, 13))

# Duplicate each row 12 times and add month column
df = df.loc[df.index.repeat(12)].assign(month=np.tile(months, len(df))).reset_index(drop=True)

In [5]:
df.shape

(6171384, 21)

In [6]:
df.columns

Index(['ID_UEV', 'CIVIQUE_DEBUT', 'CIVIQUE_FIN', 'NOM_RUE', 'SUITE_DEBUT',
       'MUNICIPALITE', 'ETAGE_HORS_SOL', 'NOMBRE_LOGEMENT',
       'ANNEE_CONSTRUCTION', 'CODE_UTILISATION', 'LIBELLE_UTILISATION',
       'CATEGORIE_UEF', 'MATRICULE83', 'SUPERFICIE_TERRAIN',
       'SUPERFICIE_BATIMENT', 'NO_ARROND_ILE_CUM', 'NOM_RUE_CLEAN', 'ADDR_DE',
       'LONGITUDE', 'LATITUDE', 'month'],
      dtype='object')

In [7]:
df.head()

Unnamed: 0,ID_UEV,CIVIQUE_DEBUT,CIVIQUE_FIN,NOM_RUE,SUITE_DEBUT,MUNICIPALITE,ETAGE_HORS_SOL,NOMBRE_LOGEMENT,ANNEE_CONSTRUCTION,CODE_UTILISATION,...,CATEGORIE_UEF,MATRICULE83,SUPERFICIE_TERRAIN,SUPERFICIE_BATIMENT,NO_ARROND_ILE_CUM,NOM_RUE_CLEAN,ADDR_DE,LONGITUDE,LATITUDE,month
0,1038405,3577,3577,avenue Atwater (MTL+WMT),,50,,,1983,1921,...,Condominium,9739-83-9737-8-001-0431,2,,REM19,avenue atwater,3577.0,-73.588602,45.493711,1
1,1038405,3577,3577,avenue Atwater (MTL+WMT),,50,,,1983,1921,...,Condominium,9739-83-9737-8-001-0431,2,,REM19,avenue atwater,3577.0,-73.588602,45.493711,2
2,1038405,3577,3577,avenue Atwater (MTL+WMT),,50,,,1983,1921,...,Condominium,9739-83-9737-8-001-0431,2,,REM19,avenue atwater,3577.0,-73.588602,45.493711,3
3,1038405,3577,3577,avenue Atwater (MTL+WMT),,50,,,1983,1921,...,Condominium,9739-83-9737-8-001-0431,2,,REM19,avenue atwater,3577.0,-73.588602,45.493711,4
4,1038405,3577,3577,avenue Atwater (MTL+WMT),,50,,,1983,1921,...,Condominium,9739-83-9737-8-001-0431,2,,REM19,avenue atwater,3577.0,-73.588602,45.493711,5


In [9]:
incidents = pd.read_csv(os.path.join(cleaned_dataset_path,'interventions_cleaned.csv'),parse_dates=['CREATION_DATE_TIME'])
# -- Project both to meters for spatial operations ---
eval_gdf = eval_gdf.to_crs(epsg=32188)
incident_gdf = gpd.GeoDataFrame(
    incidents,
    geometry=gpd.points_from_xy(incidents["LONGITUDE"], incidents["LATITUDE"]),
    crs="EPSG:4326"
)

# --- Project both to meters for spatial operations ---

incident_gdf = incident_gdf.to_crs(epsg=32188)


df['fire']=False #sets all values to False by default
print(f"Starting incident matching")
print(f"df['fire'] contains {df['fire'].value_counts()}")
for m in months:
    
    monthly_incidents_gdf=incident_gdf[incident_gdf['CREATION_DATE_TIME'].dt.month==m].copy()
    print(monthly_incidents_gdf.head(5))
    print(f"month {m} : {monthly_incidents_gdf.shape}")
    
    
    # --- Buffer fire incidents by 100 meters ---
    monthly_incidents_gdf["buffer"] = monthly_incidents_gdf.geometry.buffer(100)
    incident_buffer_gdf = monthly_incidents_gdf.set_geometry("buffer")
    
    # --- Spatial join: find properties within 100m of a fire incident ---
    joined = gpd.sjoin(eval_gdf, incident_buffer_gdf, predicate='within', how='inner')
    # print(f"joined length : {joined.shape}")
    # --- Use unique matched ID_UEV set to mark fires ---
    matched_ids = set(joined["ID_UEV"])
    # print(f"matched ids: {len(matched_ids)} - first {list(matched_ids)[0]}")
    #rint('-'.join(sorted(list(str(id) for id in matched_ids))))
    # --- Back to original eval_df (including unmatched rows) ---
    # Assign fire flag based on ID_UEV
    # print(df["ID_UEV"].isin(matched_ids) & df['month']==m)
    #df["fire"] = (df["ID_UEV"].isin(matched_ids) & df['month']==m) | df['fire']==True # added the condition that if fire is already set to true we don't overwrite it
    df["fire"] = (df["ID_UEV"].isin(matched_ids) & (df['month']==m)) | (df['fire']==True)
    print(f"Finished processing month {m}")
    print(f"df['fire'] contains {df['fire'].value_counts()}")

Starting incident matching
df['fire'] contains fire
False    6171384
Name: count, dtype: int64
   INCIDENT_NBR  CREATION_DATE_TIME         INCIDENT_TYPE_DESC  \
0             1 2025-01-01 00:02:31  Appel de Cie de détection   
1             5 2024-01-01 00:22:14  Appel de Cie de détection   
2             8 2024-01-01 00:27:34  Appel de Cie de détection   
3             9 2025-01-01 00:13:59     Alarme privé ou locale   
4            10 2025-01-01 00:16:26        Alarme vérification   

  DESCRIPTION_GROUPE  CASERNE NOM_VILLE                            NOM_ARROND  \
0  Alarmes-incendies       64  Montréal                               Lachine   
1  Alarmes-incendies       34  Montréal   Côte-des-Neiges-Notre-Dame-de-Grâce   
2  Alarmes-incendies       27  Montréal   Côte-des-Neiges-Notre-Dame-de-Grâce   
3  Alarmes-incendies       41  Montréal  Villeray-Saint-Michel-Parc-Extension   
4  Alarmes-incendies       10  Montréal                           Ville-Marie   

   DIVISION  LONGITUD

In [9]:
df[df['fire']]['month'].value_counts()

month
5     150364
6     149704
7     149182
4     148935
8     146802
9     143995
10    140680
3     140391
1     136082
2     133656
11    129923
12    123989
Name: count, dtype: int64

In [None]:
incidents['DESCRIPTION_GROUPE'].unique()

array(['Alarmes-incendies', 'AUTREFEU', 'INCENDIE'], dtype=object)

In [None]:
df.head(10)['fire']

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: fire, dtype: bool

In [None]:
addresses.head()

Unnamed: 0,ID_ADRESSE,TEXTE,SPECIFIQUE,ORIENTATION,LIEN,HAUTEUR,GENERIQUE,ANGLE,ADDR_DE,ADDR_A,X,Y,LONGITUDE,LATITUDE
0,72,#5664,1re,X,,2.0,avenue,22.6058,5664,5664,298491.88,5045167.13,-73.580784,45.546389
1,261,#6653-55,1re,X,,2.0,avenue,33.6547,6653,6655,297562.19,5045750.57,-73.592699,45.55163
2,294,#6777-79,1re,X,,2.0,avenue,33.7086,6777,6779,297390.43,5045863.43,-73.5949,45.552644
3,296,#6781-85,1re,X,,2.0,avenue,33.7086,6781,6785,297383.87,5045867.68,-73.594984,45.552682
4,305,#6812-14,1re,X,,2.0,avenue,33.5571,6812,6814,297337.81,5045854.24,-73.595574,45.552561


In [None]:
df.columns

Index(['ID_UEV', 'CIVIQUE_DEBUT', 'CIVIQUE_FIN', 'NOM_RUE', 'SUITE_DEBUT',
       'MUNICIPALITE', 'ETAGE_HORS_SOL', 'NOMBRE_LOGEMENT',
       'ANNEE_CONSTRUCTION', 'CODE_UTILISATION', 'LIBELLE_UTILISATION',
       'CATEGORIE_UEF', 'MATRICULE83', 'SUPERFICIE_TERRAIN',
       'SUPERFICIE_BATIMENT', 'NO_ARROND_ILE_CUM'],
      dtype='object')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512288 entries, 0 to 512287
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID_UEV               512288 non-null  int64  
 1   CIVIQUE_DEBUT        512288 non-null  int64  
 2   CIVIQUE_FIN          512288 non-null  int64  
 3   NOM_RUE              512288 non-null  object 
 4   SUITE_DEBUT          134705 non-null  object 
 5   MUNICIPALITE         512288 non-null  int64  
 6   ETAGE_HORS_SOL       462458 non-null  float64
 7   NOMBRE_LOGEMENT      465930 non-null  float64
 8   ANNEE_CONSTRUCTION   512288 non-null  int64  
 9   CODE_UTILISATION     512288 non-null  int64  
 10  LIBELLE_UTILISATION  512288 non-null  object 
 11  CATEGORIE_UEF        512288 non-null  object 
 12  MATRICULE83          512288 non-null  object 
 13  SUPERFICIE_TERRAIN   512288 non-null  int64  
 14  SUPERFICIE_BATIMENT  477455 non-null  float64
 15  NO_ARROND_ILE_CUM

In [10]:


# Encode categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
le = LabelEncoder()

for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# split into X,y
X=df.iloc[:, :-1]
y=df['fire']

In [14]:
df[df['fire']].head(20)

Unnamed: 0,ID_UEV,CIVIQUE_DEBUT,CIVIQUE_FIN,NOM_RUE,SUITE_DEBUT,MUNICIPALITE,ETAGE_HORS_SOL,NOMBRE_LOGEMENT,ANNEE_CONSTRUCTION,CODE_UTILISATION,...,MATRICULE83,SUPERFICIE_TERRAIN,SUPERFICIE_BATIMENT,NO_ARROND_ILE_CUM,NOM_RUE_CLEAN,ADDR_DE,LONGITUDE,LATITUDE,month,fire
48,1037334,1254,1254,5632,169,50,1.0,1.0,1914,1000,...,425609,16,82.0,7,5411,1254.0,-73.579815,45.492286,1,True
49,1037334,1254,1254,5632,169,50,1.0,1.0,1914,1000,...,425609,16,82.0,7,5411,1254.0,-73.579815,45.492286,2,True
50,1037334,1254,1254,5632,169,50,1.0,1.0,1914,1000,...,425609,16,82.0,7,5411,1254.0,-73.579815,45.492286,3,True
51,1037334,1254,1254,5632,169,50,1.0,1.0,1914,1000,...,425609,16,82.0,7,5411,1254.0,-73.579815,45.492286,4,True
52,1037334,1254,1254,5632,169,50,1.0,1.0,1914,1000,...,425609,16,82.0,7,5411,1254.0,-73.579815,45.492286,5,True
53,1037334,1254,1254,5632,169,50,1.0,1.0,1914,1000,...,425609,16,82.0,7,5411,1254.0,-73.579815,45.492286,6,True
54,1037334,1254,1254,5632,169,50,1.0,1.0,1914,1000,...,425609,16,82.0,7,5411,1254.0,-73.579815,45.492286,7,True
55,1037334,1254,1254,5632,169,50,1.0,1.0,1914,1000,...,425609,16,82.0,7,5411,1254.0,-73.579815,45.492286,8,True
56,1037334,1254,1254,5632,169,50,1.0,1.0,1914,1000,...,425609,16,82.0,7,5411,1254.0,-73.579815,45.492286,9,True
57,1037334,1254,1254,5632,169,50,1.0,1.0,1914,1000,...,425609,16,82.0,7,5411,1254.0,-73.579815,45.492286,10,True


In [16]:
X.head(2)

Unnamed: 0,ID_UEV,CIVIQUE_DEBUT,CIVIQUE_FIN,NOM_RUE,SUITE_DEBUT,MUNICIPALITE,ETAGE_HORS_SOL,NOMBRE_LOGEMENT,ANNEE_CONSTRUCTION,CODE_UTILISATION,...,CATEGORIE_UEF,MATRICULE83,SUPERFICIE_TERRAIN,SUPERFICIE_BATIMENT,NO_ARROND_ILE_CUM,NOM_RUE_CLEAN,ADDR_DE,LONGITUDE,LATITUDE,month
0,1038405,3577,3577,482,5893,50,,,1983,1921,...,0,387877,2,,7,228,3577.0,-73.588602,45.493711,1
1,1038405,3577,3577,482,5893,50,,,1983,1921,...,0,387877,2,,7,228,3577.0,-73.588602,45.493711,2


In [None]:
# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define multiple scoring metrics
scoring_metrics = ['recall_macro', 'precision_macro', 'f1_macro']

#define a random forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform k-fold cross-validation on training set
scores = cross_validate(
    estimator=rf,
    X=X_train,
    y=y_train,
    cv=5,
    scoring=scoring_metrics,
    return_train_score=True,
    return_estimator=True,
    verbose=2 # to show progress
)

# Print scores for each metric
print("K-fold Cross Validation Scores:")
for metric in scoring_metrics:
    print(f"\n{metric}:")
    for i, score in enumerate(scores[f'test_{metric}']):
        print(f"Fold {i+1}: {score:.2%}")
    print(f"Average {metric}: {scores[f'test_{metric}'].mean():.2%}")


# Get the best model based on average test score
# best_model_idx = np.argmax([np.mean(scores[f'test_{metric}']) 
#                           for metric in scoring_metrics])
# Gets the best model based on f1_score
#print(scores)
best_model_idx = np.argmax(scores['test_f1_macro']) 


best_model = scores['estimator'][best_model_idx]

# Save the best model
model_path = 'best_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(best_model, f)

# validate model performance on test set
y_test_predictions = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test,y_test_predictions)
test_precision = precision_score(y_test,y_test_predictions)
test_recall = recall_score(y_test,y_test_predictions)
test_f1 = f1_score(y_test,y_test_predictions)


print(f"\nTest Set Accuracy: {test_accuracy:.2%}")
print(f"\nTest Set Precision: {test_precision:.2%}")
print(f"\nTest Set Recall: {test_recall:.2%}")
print(f"\nTest Set F1-score: {test_f1:.2%}")



[CV] END .................................................... total time=44.3min
[CV] END .................................................... total time=86.8min
[CV] END .................................................... total time=43.6min
[CV] END .................................................... total time=44.4min
[CV] END .................................................... total time=44.5min
K-fold Cross Validation Scores:

recall_macro:
Fold 1: 83.40%
Fold 2: 83.37%
Fold 3: 83.49%
Fold 4: 83.47%
Fold 5: 83.46%
Average recall_macro: 83.44%

precision_macro:
Fold 1: 84.44%
Fold 2: 84.44%
Fold 3: 84.52%
Fold 4: 84.50%
Fold 5: 84.52%
Average precision_macro: 84.48%

f1_macro:
Fold 1: 83.89%
Fold 2: 83.88%
Fold 3: 83.98%
Fold 4: 83.96%
Fold 5: 83.96%
Average f1_macro: 83.93%


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 276.6min finished



Test Set Accuracy: 87.41%

Test Set Precision: 78.41%

Test Set Recall: 74.52%

Test Set F1-score: 76.41%


In [4]:
model_path = 'best_model.pkl'
# Verify the save worked
with open(model_path, 'rb') as f:
    loaded_model = pickle.load(f)

# validate model performance on test set
y_test_predictions = loaded_model.predict(X_test)
test_accuracy = accuracy_score(y_test,y_test_predictions)
test_precision = precision_score(y_test,y_test_predictions)
test_recall = recall_score(y_test,y_test_predictions)
test_f1 = f1_score(y_test,y_test_predictions)


print(f"\nTest Set Accuracy: {test_accuracy:.2%}")
print(f"\nTest Set Precision: {test_precision:.2%}")
print(f"\nTest Set Recall: {test_recall:.2%}")
print(f"\nTest Set F1-score: {test_f1:.2%}")

NameError: name 'X_test' is not defined

In [5]:
loaded_model.feature_importances_

array([0.04358553, 0.02891045, 0.02935102, 0.02141013, 0.01581384,
       0.00229381, 0.01279384, 0.00823966, 0.02583329, 0.00135314,
       0.00125172, 0.00961647, 0.04432779, 0.04745686, 0.02714322,
       0.01064602, 0.01890073, 0.11895232, 0.12810088, 0.12561429,
       0.27840498])

In [11]:
# Built-in feature importance (Gini Importance)
feature_names=list(X.columns)
importances = loaded_model.feature_importances_
feature_imp_df = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False) 
print(feature_imp_df)

                Feature  Gini Importance
20                month         0.278405
18            LONGITUDE         0.128101
19             LATITUDE         0.125614
17              ADDR_DE         0.118952
13   SUPERFICIE_TERRAIN         0.047457
12          MATRICULE83         0.044328
0                ID_UEV         0.043586
2           CIVIQUE_FIN         0.029351
1         CIVIQUE_DEBUT         0.028910
14  SUPERFICIE_BATIMENT         0.027143
8    ANNEE_CONSTRUCTION         0.025833
3               NOM_RUE         0.021410
16        NOM_RUE_CLEAN         0.018901
4           SUITE_DEBUT         0.015814
6        ETAGE_HORS_SOL         0.012794
15    NO_ARROND_ILE_CUM         0.010646
11        CATEGORIE_UEF         0.009616
7       NOMBRE_LOGEMENT         0.008240
5          MUNICIPALITE         0.002294
9      CODE_UTILISATION         0.001353
10  LIBELLE_UTILISATION         0.001252


In [32]:
print(kfold_scores)

[0.92507202 0.92395196 0.92439041 0.92425226 0.92271704]


- filtering redundant geographical information
- summarizing alarmes over time
- summarizing fire incidents over time
- try lasso ?




# Next steps
- how do we deal with building construction year 9999
- how do we evaluate ? recall, AUC-PR, f1 score?
- how do we handle the month component ? replace target by 12 target variables 'fire in month' run 12 predictions per address ? or create 10 lines per address with a single binary target variable ?
- visualization map + slider for month ?