In [77]:
import sys, os
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import pickle
from pathlib import Path



In [78]:
ROOT = Path.cwd().parents[0]
model_name="xgboost_panel_with_feat"
FEATURE_LIST_FILE = ROOT / "datamodel" / f"{model_name}_features.pkl"
MODEL_FILE = ROOT / "datamodel" / f"{model_name}.pkl"
RESULT_FILE = ROOT / 'datamodel' / f"{model_name}_pred.csv"
ENCODER_FILE  = ROOT / "datamodel" / f"{model_name}_label_enc.pkl"







In [79]:
# Reload categorical encoders from file
with open(ENCODER_FILE, 'rb') as f:
    categorical_encoders = pickle.load(f)

# Reload model from file
with open(MODEL_FILE, 'rb') as f:
    model = pickle.load(f)

# Reload features from file
with open(FEATURE_LIST_FILE, 'rb') as f:
    features = pickle.load(f)


In [80]:
df=pd.read_csv(RESULT_FILE)

In [95]:
df.columns

Index(['ID_UEV', 'month', 'LATITUDE', 'LONGITUDE', 'MUNICIPALITE',
       'ETAGE_HORS_SOL', 'NOMBRE_LOGEMENT', 'AGE_BATIMENT', 'CODE_UTILISATION',
       'CATEGORIE_UEF', 'SUPERFICIE_TERRAIN', 'SUPERFICIE_BATIMENT',
       'NO_ARROND_ILE_CUM', 'RATIO_SURFACE', 'DENSITE_LOGEMENT',
       'HAS_MULTIPLE_LOGEMENTS', 'FIRE_FREQUENCY_ZONE', 'FIRE_RATE_ZONE',
       'FIRE_COUNT_LAST_YEAR_ZONE', 'BUILDING_COUNT', 'FIRE_RATE_ZONE_NORM',
       'FIRE_COUNT_LAST_YEAR_ZONE_NORM', 'HAS_FIRE_THIS_MONTH', 'fire_last_1m',
       'fire_last_2m', 'fire_last_3m', 'fire_cumcount', 'fire_rolling_3m',
       'fire_rolling_6m', 'fire_rolling_12m', 'has_fire_last_month',
       'months_since_last_fire', 'month_num', 'year', 'predicted_result',
       'predicted_proba', 'target'],
      dtype='object')

In [None]:
pred_year=2024
top_predictions = df[df['year']==pred_year].sort_values(['predicted_proba', 'month_num'], 
                                        ascending=[False, False]) \
                             .groupby('ID_UEV') \
                             .first() \
                             .reset_index()
    
# Sort by probability and select top 7500
result = top_predictions.sort_values('predicted_proba', 
                                       ascending=False) \
                           .head(7500)

prioritization_list = result.merge(top_predictions)
actual_fires=df[(df['year']==pred_year) & (df['target']==1)][['ID_UEV']].drop_duplicates().copy(deep=True)
actual_fires['actual_fire']=1
print(actual_fires)

result=result.merge(actual_fires,how='left')
result['actual_fire'].fillna(0,inplace=True)
result['actual_fire'].value_counts()


          ID_UEV  actual_fire
73       1000038            1
85       1000039            1
109      1000041            1
133      1000043            1
157      1000045            1
...          ...          ...
3724275  5307816            1
3724287  5307817            1
3724299  5307818            1
3724311  5307819            1
3724637  5308940            1

[42788 rows x 2 columns]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  result['actual_fire'].fillna(0,inplace=True)


actual_fire
0.0    3912
1.0    3588
Name: count, dtype: int64

In [93]:
np.random.seed(42) #(for repeatability)

random_pick=pd.DataFrame()
#randomly pick 7500 buildings
random_pick['ID_UEV']=np.random.choice(df['ID_UEV'].unique(),size=7500)
random_pick.head()
#merge with actual fires
random_pick=random_pick.merge(actual_fires,how='left')
random_pick['actual_fire']=random_pick['actual_fire'].fillna(0)
random_pick['actual_fire'].value_counts()
int(random_pick['actual_fire'].value_counts()[1])



1058

In [94]:
print("predicted high probability value with actual fires")
result_df=pd.DataFrame()
result_df['Method']=['Random pick','Highest predicted risk']
result_df['Actual fire*']=[int(random_pick['actual_fire'].value_counts()[1]),int(result['actual_fire'].value_counts()[1])]
result_df['Precision@K']=result_df['Actual fire*'].apply(lambda x:f"{x/7500:.2%}")
result_df

predicted high probability value with actual fires


Unnamed: 0,Method,Actual fire*,Precision@K
0,Random pick,1058,14.11%
1,Highest predicted risk,3588,47.84%


In [96]:
len(actual_fires['ID_UEV'])

42788

In [88]:
len(df['ID_UEV'].unique())

310387

In [None]:
len(df[(df['year']==pred_year) & (df['target']==1)][['ID_UEV']])

In [91]:
len(df[(df['year']==2024) & (df['target'])]['ID_UEV'].unique())

42788

In [None]:
#this method is extracting highest risks in predicted set vs random but may return multiple times the same building
# so it's probably best not to use it

print("Random pick: ")
print(df.iloc[np.random.choice(df.index.to_list(),size=7500)]['target'].value_counts())
print("Predicted :")
print(df.nlargest(7500,columns='predicted_proba')['target'].value_counts())

Random pick: 
target
0    7406
1      94
Name: count, dtype: int64
Predicted :
target
0    6881
1     619
Name: count, dtype: int64


In [45]:
#feature_names=list(set(list(df.columns)) -set(['target,predicted_result','predicted_proba']))
importances = model.feature_importances_
print(importances) 
print(len(importances),len(features))
print(features)
feature_imp_df = pd.DataFrame({'Feature': features, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False) 
print(feature_imp_df)


[0.02402052 0.02489714 0.13272934 0.04110989 0.01749733 0.03444773
 0.02249078 0.01440536 0.09380012 0.3459575  0.0495928  0.
 0.         0.         0.         0.         0.05008211 0.03246946
 0.02193386 0.03720846 0.02561954 0.03173818]
22 22
['MUNICIPALITE', 'ETAGE_HORS_SOL', 'NOMBRE_LOGEMENT', 'AGE_BATIMENT', 'CODE_UTILISATION', 'CATEGORIE_UEF', 'SUPERFICIE_TERRAIN', 'SUPERFICIE_BATIMENT', 'NO_ARROND_ILE_CUM', 'RATIO_SURFACE', 'DENSITE_LOGEMENT', 'HAS_MULTIPLE_LOGEMENTS', 'FIRE_FREQUENCY_ZONE', 'BUILDING_COUNT', 'FIRE_RATE_ZONE_NORM', 'FIRE_COUNT_LAST_YEAR_ZONE_NORM', 'fire_cumcount', 'fire_rolling_3m', 'fire_rolling_6m', 'fire_rolling_12m', 'month_num', 'year']
                           Feature  Gini Importance
9                    RATIO_SURFACE         0.345957
2                  NOMBRE_LOGEMENT         0.132729
8                NO_ARROND_ILE_CUM         0.093800
16                   fire_cumcount         0.050082
10                DENSITE_LOGEMENT         0.049593
3            