In [23]:
import pandas as pd 
from matplotlib import pyplot as pyplot
import seaborn as sns
import os
import geopandas as gpd
from shapely.geometry import Point
import numpy as np
import pickle

# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score,cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,recall_score,f1_score,make_scorer,precision_score
from sklearn.preprocessing import LabelEncoder

In [24]:
#os.path.realpath(__file__)
# current_path = os.getcwd()
# dataset_path = os.path.dirname(current_path) # up 1 level
# dataset_path = os.path.dirname(dataset_path) # up 2 level
# dataset_path = os.path.join(dataset_path,'datasets')

dirname = os.getcwd()

raw_dataset_path = os.path.join(dirname,'..','..','datasets','raw')
cleaned_dataset_path = os.path.join(dirname,'..','..','datasets','cleaned')

In [25]:
#df = pd.read_csv(os.path.join(DIRECTORY,'evaluation-fonciere-with-fire.csv'))

df = pd.read_csv(os.path.join(cleaned_dataset_path,'evaluation_with_fire_and_coordinates.csv'))
df.drop(['fire','LETTRE_DEBUT','LETTRE_FIN'],axis=1,inplace=True)

#build geo dataframe from evaluation df
eval_gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df["LONGITUDE"].astype(float),
                                 df["LATITUDE"].astype(float)),
    crs="EPSG:4326"
)

# Create list of months (1-12)
months = list(range(1, 13))

# Duplicate each row 12 times and add month column
df = df.loc[df.index.repeat(12)].assign(month=np.tile(months, len(df))).reset_index(drop=True)

In [26]:
df.shape

(6171384, 21)

In [27]:
df.columns

Index(['ID_UEV', 'CIVIQUE_DEBUT', 'CIVIQUE_FIN', 'NOM_RUE', 'SUITE_DEBUT',
       'MUNICIPALITE', 'ETAGE_HORS_SOL', 'NOMBRE_LOGEMENT',
       'ANNEE_CONSTRUCTION', 'CODE_UTILISATION', 'LIBELLE_UTILISATION',
       'CATEGORIE_UEF', 'MATRICULE83', 'SUPERFICIE_TERRAIN',
       'SUPERFICIE_BATIMENT', 'NO_ARROND_ILE_CUM', 'NOM_RUE_CLEAN', 'ADDR_DE',
       'LONGITUDE', 'LATITUDE', 'month'],
      dtype='object')

In [28]:
df.head()

Unnamed: 0,ID_UEV,CIVIQUE_DEBUT,CIVIQUE_FIN,NOM_RUE,SUITE_DEBUT,MUNICIPALITE,ETAGE_HORS_SOL,NOMBRE_LOGEMENT,ANNEE_CONSTRUCTION,CODE_UTILISATION,...,CATEGORIE_UEF,MATRICULE83,SUPERFICIE_TERRAIN,SUPERFICIE_BATIMENT,NO_ARROND_ILE_CUM,NOM_RUE_CLEAN,ADDR_DE,LONGITUDE,LATITUDE,month
0,1038405,3577,3577,avenue Atwater (MTL+WMT),,50,1.0,1.0,1983.0,1921,...,Condominium,9739-83-9737-8-001-0431,2,16.0,REM19,avenue atwater,3577.0,-73.588602,45.493711,1
1,1038405,3577,3577,avenue Atwater (MTL+WMT),,50,1.0,1.0,1983.0,1921,...,Condominium,9739-83-9737-8-001-0431,2,16.0,REM19,avenue atwater,3577.0,-73.588602,45.493711,2
2,1038405,3577,3577,avenue Atwater (MTL+WMT),,50,1.0,1.0,1983.0,1921,...,Condominium,9739-83-9737-8-001-0431,2,16.0,REM19,avenue atwater,3577.0,-73.588602,45.493711,3
3,1038405,3577,3577,avenue Atwater (MTL+WMT),,50,1.0,1.0,1983.0,1921,...,Condominium,9739-83-9737-8-001-0431,2,16.0,REM19,avenue atwater,3577.0,-73.588602,45.493711,4
4,1038405,3577,3577,avenue Atwater (MTL+WMT),,50,1.0,1.0,1983.0,1921,...,Condominium,9739-83-9737-8-001-0431,2,16.0,REM19,avenue atwater,3577.0,-73.588602,45.493711,5


In [29]:
incidents = pd.read_csv(os.path.join(cleaned_dataset_path,'interventions_cleaned.csv'),parse_dates=['CREATION_DATE_TIME'])
# -- Project both to meters for spatial operations ---
eval_gdf = eval_gdf.to_crs(epsg=32188)
incident_gdf = gpd.GeoDataFrame(
    incidents,
    geometry=gpd.points_from_xy(incidents["LONGITUDE"], incidents["LATITUDE"]),
    crs="EPSG:4326"
)

# --- Project both to meters for spatial operations ---

incident_gdf = incident_gdf.to_crs(epsg=32188)


df['fire']=False #sets all values to False by default
print(f"Starting incident matching")
print(f"df['fire'] contains {df['fire'].value_counts()}")
for m in months:
    
    monthly_incidents_gdf=incident_gdf[(incident_gdf['CREATION_DATE_TIME'].dt.month==m) & (incident_gdf['DESCRIPTION_GROUPE'].isin(['INCENDIE','AUTREFEU']))].copy()
    print(monthly_incidents_gdf.head(5))
    print(f"month {m} : {monthly_incidents_gdf.shape}")
    
    
    # --- Buffer fire incidents by 100 meters ---
    monthly_incidents_gdf["buffer"] = monthly_incidents_gdf.geometry.buffer(100)
    incident_buffer_gdf = monthly_incidents_gdf.set_geometry("buffer")
    
    # --- Spatial join: find properties within 100m of a fire incident ---
    joined = gpd.sjoin(eval_gdf, incident_buffer_gdf, predicate='within', how='inner')
    # print(f"joined length : {joined.shape}")
    # --- Use unique matched ID_UEV set to mark fires ---
    matched_ids = set(joined["ID_UEV"])
    # print(f"matched ids: {len(matched_ids)} - first {list(matched_ids)[0]}")
    #rint('-'.join(sorted(list(str(id) for id in matched_ids))))
    # --- Back to original eval_df (including unmatched rows) ---
    # Assign fire flag based on ID_UEV
    # print(df["ID_UEV"].isin(matched_ids) & df['month']==m)
    #df["fire"] = (df["ID_UEV"].isin(matched_ids) & df['month']==m) | df['fire']==True # added the condition that if fire is already set to true we don't overwrite it
    df["fire"] = (df["ID_UEV"].isin(matched_ids) & (df['month']==m)) | (df['fire']==True)
    print(f"Finished processing month {m}")
    print(f"df['fire'] contains {df['fire'].value_counts()}")

Starting incident matching
df['fire'] contains fire
False    6171384
Name: count, dtype: int64
    INCIDENT_NBR  CREATION_DATE_TIME         INCIDENT_TYPE_DESC  \
5             25 2025-01-01 01:02:04             Déchets en feu   
7             45 2024-01-01 02:16:54             Déchets en feu   
14            91 2025-01-01 04:02:02  Feu de véhicule extérieur   
29           247 2025-01-01 14:59:03            Feu de bâtiment   
37           330 2024-01-02 00:47:05             Déchets en feu   

   DESCRIPTION_GROUPE  CASERNE NOM_VILLE  \
5            AUTREFEU       31  Montréal   
7            AUTREFEU       26  Montréal   
14           AUTREFEU       44  Montréal   
29           INCENDIE       66  Montréal   
37           AUTREFEU       20  Montréal   

                                  NOM_ARROND  DIVISION  LONGITUDE   LATITUDE  \
5                  Rosemont-La Petite-Patrie         5 -73.611566  45.539967   
7                      Le Plateau-Mont-Royal         5 -73.575126  45.531628 

In [30]:
df[df['fire']]['month'].value_counts()

month
5     76957
6     68522
4     64034
7     57656
9     51007
8     50222
10    49806
3     46894
11    46545
1     40874
12    39111
2     37243
Name: count, dtype: int64

In [31]:
incidents['DESCRIPTION_GROUPE'].unique()

array(['Alarmes-incendies', 'AUTREFEU', 'INCENDIE'], dtype=object)

In [32]:
df.head(10)['fire']

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: fire, dtype: bool

In [33]:
df.columns

Index(['ID_UEV', 'CIVIQUE_DEBUT', 'CIVIQUE_FIN', 'NOM_RUE', 'SUITE_DEBUT',
       'MUNICIPALITE', 'ETAGE_HORS_SOL', 'NOMBRE_LOGEMENT',
       'ANNEE_CONSTRUCTION', 'CODE_UTILISATION', 'LIBELLE_UTILISATION',
       'CATEGORIE_UEF', 'MATRICULE83', 'SUPERFICIE_TERRAIN',
       'SUPERFICIE_BATIMENT', 'NO_ARROND_ILE_CUM', 'NOM_RUE_CLEAN', 'ADDR_DE',
       'LONGITUDE', 'LATITUDE', 'month', 'fire'],
      dtype='object')

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6171384 entries, 0 to 6171383
Data columns (total 22 columns):
 #   Column               Dtype  
---  ------               -----  
 0   ID_UEV               int64  
 1   CIVIQUE_DEBUT        int64  
 2   CIVIQUE_FIN          int64  
 3   NOM_RUE              object 
 4   SUITE_DEBUT          object 
 5   MUNICIPALITE         int64  
 6   ETAGE_HORS_SOL       float64
 7   NOMBRE_LOGEMENT      float64
 8   ANNEE_CONSTRUCTION   object 
 9   CODE_UTILISATION     int64  
 10  LIBELLE_UTILISATION  object 
 11  CATEGORIE_UEF        object 
 12  MATRICULE83          object 
 13  SUPERFICIE_TERRAIN   int64  
 14  SUPERFICIE_BATIMENT  float64
 15  NO_ARROND_ILE_CUM    object 
 16  NOM_RUE_CLEAN        object 
 17  ADDR_DE              float64
 18  LONGITUDE            float64
 19  LATITUDE             float64
 20  month                int64  
 21  fire                 bool   
dtypes: bool(1), float64(6), int64(7), object(8)
memory usage: 994.6+ M

In [35]:
# remove redundant geographical information
df.drop(['CIVIQUE_DEBUT', 'CIVIQUE_FIN', 'NOM_RUE', 'SUITE_DEBUT','MATRICULE83','NO_ARROND_ILE_CUM','NOM_RUE_CLEAN', 'ADDR_DE','LIBELLE_UTILISATION'],axis=1,inplace=True)


In [43]:


# Encode categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
le = LabelEncoder()

for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# split into X,y
X=df.drop(['fire','ID_UEV'],axis=1)
y=df['fire']

In [37]:
df[df['fire']].head(20)

Unnamed: 0,ID_UEV,MUNICIPALITE,ETAGE_HORS_SOL,NOMBRE_LOGEMENT,ANNEE_CONSTRUCTION,CODE_UTILISATION,CATEGORIE_UEF,SUPERFICIE_TERRAIN,SUPERFICIE_BATIMENT,LONGITUDE,LATITUDE,month,fire
48,1037334,50,1.0,1.0,100,1000,0,16,82.0,-73.579815,45.492286,1,True
50,1037334,50,1.0,1.0,100,1000,0,16,82.0,-73.579815,45.492286,3,True
52,1037334,50,1.0,1.0,100,1000,0,16,82.0,-73.579815,45.492286,5,True
53,1037334,50,1.0,1.0,100,1000,0,16,82.0,-73.579815,45.492286,6,True
54,1037334,50,1.0,1.0,100,1000,0,16,82.0,-73.579815,45.492286,7,True
55,1037334,50,1.0,1.0,100,1000,0,16,82.0,-73.579815,45.492286,8,True
56,1037334,50,1.0,1.0,100,1000,0,16,82.0,-73.579815,45.492286,9,True
58,1037334,50,1.0,1.0,100,1000,0,16,82.0,-73.579815,45.492286,11,True
59,1037334,50,1.0,1.0,100,1000,0,16,82.0,-73.579815,45.492286,12,True
64,1008952,50,1.0,1.0,135,1000,2,411,105.0,-73.586974,45.456788,5,True


In [38]:
X.head(2)

Unnamed: 0,ID_UEV,MUNICIPALITE,ETAGE_HORS_SOL,NOMBRE_LOGEMENT,ANNEE_CONSTRUCTION,CODE_UTILISATION,CATEGORIE_UEF,SUPERFICIE_TERRAIN,SUPERFICIE_BATIMENT,LONGITUDE,LATITUDE,month
0,1038405,50,1.0,1.0,169,1921,0,2,16.0,-73.588602,45.493711,1
1,1038405,50,1.0,1.0,169,1921,0,2,16.0,-73.588602,45.493711,2


In [44]:
# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define multiple scoring metrics
scoring_metrics = ['recall_macro', 'precision_macro', 'f1_macro']

#define a random forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform k-fold cross-validation on training set
scores = cross_validate(
    estimator=rf,
    X=X_train,
    y=y_train,
    cv=5,
    scoring=scoring_metrics,
    return_train_score=True,
    return_estimator=True,
    verbose=2 # to show progress
)

# Print scores for each metric
print("K-fold Cross Validation Scores:")
for metric in scoring_metrics:
    print(f"\n{metric}:")
    for i, score in enumerate(scores[f'test_{metric}']):
        print(f"Fold {i+1}: {score:.2%}")
    print(f"Average {metric}: {scores[f'test_{metric}'].mean():.2%}")


# Get the best model based on average test score
# best_model_idx = np.argmax([np.mean(scores[f'test_{metric}']) 
#                           for metric in scoring_metrics])
# Gets the best model based on f1_score
#print(scores)
best_model_idx = np.argmax(scores['test_f1_macro']) 


best_model = scores['estimator'][best_model_idx]

# Save the best model
model_path = 'best_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(best_model, f)

# validate model performance on test set
y_test_predictions = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test,y_test_predictions)
test_precision = precision_score(y_test,y_test_predictions)
test_recall = recall_score(y_test,y_test_predictions)
test_f1 = f1_score(y_test,y_test_predictions)


print(f"\nTest Set Accuracy: {test_accuracy:.2%}")
print(f"\nTest Set Precision: {test_precision:.2%}")
print(f"\nTest Set Recall: {test_recall:.2%}")
print(f"\nTest Set F1-score: {test_f1:.2%}")



[CV] END .................................................... total time=26.0min
[CV] END .................................................... total time=25.0min
[CV] END .................................................... total time=25.1min
[CV] END .................................................... total time=25.0min
[CV] END .................................................... total time=24.9min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 136.3min finished


K-fold Cross Validation Scores:

recall_macro:
Fold 1: 80.89%
Fold 2: 80.97%
Fold 3: 80.85%
Fold 4: 80.98%
Fold 5: 81.12%
Average recall_macro: 80.96%

precision_macro:
Fold 1: 90.63%
Fold 2: 90.86%
Fold 3: 90.45%
Fold 4: 90.85%
Fold 5: 90.89%
Average precision_macro: 90.73%

f1_macro:
Fold 1: 84.91%
Fold 2: 85.04%
Fold 3: 84.82%
Fold 4: 85.04%
Fold 5: 85.16%
Average f1_macro: 84.99%

Test Set Accuracy: 95.22%

Test Set Precision: 85.63%

Test Set Recall: 63.75%

Test Set F1-score: 73.09%


In [56]:
model_path = 'best_model.pkl'
# Verify the save worked
with open(model_path, 'rb') as f:
    loaded_model = pickle.load(f)

# validate model performance on test set
y_test_predictions = loaded_model.predict(X_test)
test_accuracy = accuracy_score(y_test,y_test_predictions)
test_precision = precision_score(y_test,y_test_predictions)
test_recall = recall_score(y_test,y_test_predictions)
test_f1 = f1_score(y_test,y_test_predictions)


print(f"\nTest Set Accuracy: {test_accuracy:.2%}")
print(f"\nTest Set Precision: {test_precision:.2%}")
print(f"\nTest Set Recall: {test_recall:.2%}")
print(f"\nTest Set F1-score: {test_f1:.2%}")


Test Set Accuracy: 95.22%

Test Set Precision: 85.63%

Test Set Recall: 63.75%

Test Set F1-score: 73.09%


In [46]:
X_test.shape

(1234277, 11)

In [54]:
X_test.iloc[0:1].shape

(1, 11)

In [57]:
loaded_model.predict(X_test.iloc[0:1])

array([False])

In [60]:
loaded_model.predict_proba(X_test.iloc[0:50])

array([[1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.84, 0.16],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.99, 0.01],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.97, 0.03],
       [0.72, 0.28],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.06, 0.94],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.86, 0.14],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.97, 0.03],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.99, 0.01],
       [1.  , 0.  ],
       [0.96, 0.04],
       [0.46, 0.54],
       [1.  , 0.  ],
       [0.32, 0.68],
       [1.  , 0.  ],
       [0.97, 0.03],
       [1.  ,

In [42]:
# Built-in feature importance (Gini Importance)
feature_names=list(X.columns)
importances = loaded_model.feature_importances_
feature_imp_df = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False) 
print(feature_imp_df)

                Feature  Gini Importance
11                month         0.307070
9             LONGITUDE         0.180070
10             LATITUDE         0.157395
0                ID_UEV         0.117517
7    SUPERFICIE_TERRAIN         0.079576
8   SUPERFICIE_BATIMENT         0.062990
4    ANNEE_CONSTRUCTION         0.057466
3       NOMBRE_LOGEMENT         0.013723
2        ETAGE_HORS_SOL         0.010460
5      CODE_UTILISATION         0.005433
1          MUNICIPALITE         0.004311
6         CATEGORIE_UEF         0.003989


In [32]:
print(kfold_scores)

[0.92507202 0.92395196 0.92439041 0.92425226 0.92271704]


- filtering redundant geographical information
- summarizing alarmes over time
- summarizing fire incidents over time
- try lasso ?




# Next steps
- how do we deal with building construction year 9999
- how do we evaluate ? recall, AUC-PR, f1 score?
- how do we handle the month component ? replace target by 12 target variables 'fire in month' run 12 predictions per address ? or create 10 lines per address with a single binary target variable ?
- visualization map + slider for month ?