# Preprocessing of the Portuguese ICNF wildfire dataset 

## Resolve imports and load dataset

In [136]:
import polars as pl

# Read the raw ICNF export into a Polars DataFrame.
# The path is relative, so it is resolved against the current working directory.
file = "ICNF_2013_2022_full.csv"
icnf_data = pl.read_csv(source=file)

## Clean dataset

In [137]:
# Structural overview of the raw frame, one item per line:
# (rows, columns), the column names, and the dtypes Polars inferred.
print(icnf_data.shape, icnf_data.columns, icnf_data.dtypes, sep="\n")

(143403, 77)
['DISTRITO', 'TIPO', 'ANO', 'AREAPOV', 'AREAMATO', 'AREAAGRIC', 'AREATOTAL', 'REACENDIMENTOS', 'QUEIMADA', 'FALSOALARME', 'FOGACHO', 'INCENDIO', 'AGRICOLA', 'NCCO', 'NOMECCO', 'DATAALERTA', 'HORAALERTA', 'LOCAL', 'CONCELHO', 'FREGUESIA', 'FONTEALERTA', 'INE', 'X', 'Y', 'DIA', 'MES', 'HORA', 'OPERADOR', 'PERIMETRO', 'APS', 'CAUSA', 'TIPOCAUSA', 'DHINICIO', 'DHFIM', 'DURACAO', 'HAHORA', 'DATAEXTINCAO', 'HORAEXTINCAO', 'DATA1INTERVENCAO', 'HORA1INTERVENCAO', 'QUEIMA', 'LAT', 'LON', 'CAUSAFAMILIA', 'TEMPERATURA', 'HUMIDADERELATIVA', 'VENTOINTENSIDADE', 'VENTOINTENSIDADE_VETOR', 'VENTODIRECAO_VETOR', 'PRECEPITACAO', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'DSR', 'THC', 'MODFARSITE', 'ALTITUDEMEDIA', 'DECLIVEMEDIO', 'HORASEXPOSICAOMEDIA', 'DENDIDADERV', 'COSN5VARIEDADE', 'AREAMANCHAMODFARSITE', 'AREASFICHEIROS_GNR', 'AREASFICHEIROS_GTF', 'FICHEIROIMAGEM_GNR', 'AREASFICHEIROSHP_GTF', 'AREASFICHEIROSHPXML_GTF', 'AREASFICHEIRODBF_GTF', 'AREASFICHEIROPRJ_GTF', 'AREASFICHEIROSBN_GT

Certain measurements only started appearing later in the dataset and are therefore missing for most rows, so columns with at least 80% missing values are removed.

In [138]:
# A column is discarded when its share of missing values reaches this fraction.
threshold = 0.8

# `null_count()` yields a one-row frame; read that row as {column name: null count}.
nulls_per_column = icnf_data.null_count().row(0, named=True)

columns_to_drop = []
for column_name, null_total in nulls_per_column.items():
    if null_total / icnf_data.height >= threshold:
        columns_to_drop.append(column_name)
print(columns_to_drop)

# Build a new frame so the raw `icnf_data` stays available unchanged.
icnf_data_cleaned = icnf_data.drop(columns_to_drop)
print(
    icnf_data_cleaned.shape,
    icnf_data_cleaned.columns,
    icnf_data_cleaned.dtypes,
    sep="\n",
)

['PERIMETRO', 'APS', 'HORASEXPOSICAOMEDIA', 'AREASFICHEIROS_GNR', 'AREASFICHEIROS_GTF', 'FICHEIROIMAGEM_GNR', 'AREASFICHEIROSHP_GTF', 'AREASFICHEIROSHPXML_GTF', 'AREASFICHEIRODBF_GTF', 'AREASFICHEIROPRJ_GTF', 'AREASFICHEIROSBN_GTF', 'AREASFICHEIROSBX_GTF', 'AREASFICHEIROSHX_GTF', 'AREASFICHEIROZIP_SAA', 'index']
(143403, 62)
['DISTRITO', 'TIPO', 'ANO', 'AREAPOV', 'AREAMATO', 'AREAAGRIC', 'AREATOTAL', 'REACENDIMENTOS', 'QUEIMADA', 'FALSOALARME', 'FOGACHO', 'INCENDIO', 'AGRICOLA', 'NCCO', 'NOMECCO', 'DATAALERTA', 'HORAALERTA', 'LOCAL', 'CONCELHO', 'FREGUESIA', 'FONTEALERTA', 'INE', 'X', 'Y', 'DIA', 'MES', 'HORA', 'OPERADOR', 'CAUSA', 'TIPOCAUSA', 'DHINICIO', 'DHFIM', 'DURACAO', 'HAHORA', 'DATAEXTINCAO', 'HORAEXTINCAO', 'DATA1INTERVENCAO', 'HORA1INTERVENCAO', 'QUEIMA', 'LAT', 'LON', 'CAUSAFAMILIA', 'TEMPERATURA', 'HUMIDADERELATIVA', 'VENTOINTENSIDADE', 'VENTOINTENSIDADE_VETOR', 'VENTODIRECAO_VETOR', 'PRECEPITACAO', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'DSR', 'THC', 'MODFARSITE', 'ALT

In [139]:
# Collect the columns that hold at most one distinct value across all rows.
columns_to_drop_unique = []
for column_name in icnf_data_cleaned.columns:
    if icnf_data_cleaned.get_column(column_name).n_unique() < 2:
        columns_to_drop_unique.append(column_name)
print(columns_to_drop_unique)

# Remove them and report the resulting structure.
icnf_data_cleaned = icnf_data_cleaned.drop(columns_to_drop_unique)
print(
    icnf_data_cleaned.shape,
    icnf_data_cleaned.columns,
    icnf_data_cleaned.dtypes,
    sep="\n",
)

['QUEIMADA', 'FALSOALARME']
(143403, 60)
['DISTRITO', 'TIPO', 'ANO', 'AREAPOV', 'AREAMATO', 'AREAAGRIC', 'AREATOTAL', 'REACENDIMENTOS', 'FOGACHO', 'INCENDIO', 'AGRICOLA', 'NCCO', 'NOMECCO', 'DATAALERTA', 'HORAALERTA', 'LOCAL', 'CONCELHO', 'FREGUESIA', 'FONTEALERTA', 'INE', 'X', 'Y', 'DIA', 'MES', 'HORA', 'OPERADOR', 'CAUSA', 'TIPOCAUSA', 'DHINICIO', 'DHFIM', 'DURACAO', 'HAHORA', 'DATAEXTINCAO', 'HORAEXTINCAO', 'DATA1INTERVENCAO', 'HORA1INTERVENCAO', 'QUEIMA', 'LAT', 'LON', 'CAUSAFAMILIA', 'TEMPERATURA', 'HUMIDADERELATIVA', 'VENTOINTENSIDADE', 'VENTOINTENSIDADE_VETOR', 'VENTODIRECAO_VETOR', 'PRECEPITACAO', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'DSR', 'THC', 'MODFARSITE', 'ALTITUDEMEDIA', 'DECLIVEMEDIO', 'DENDIDADERV', 'COSN5VARIEDADE', 'AREAMANCHAMODFARSITE']
[String, String, Int64, Float64, Float64, Float64, Float64, Int64, Int64, Int64, Int64, Int64, Int64, String, String, String, String, String, String, Int64, Int64, Int64, Int64, Int64, Int64, String, Float64, String, String, St

In [140]:
# Preview the first 5 rows of the cleaned frame (rendered as a table by the notebook).
icnf_data_cleaned.head(5)

DISTRITO,TIPO,ANO,AREAPOV,AREAMATO,AREAAGRIC,AREATOTAL,REACENDIMENTOS,FOGACHO,INCENDIO,AGRICOLA,NCCO,NOMECCO,DATAALERTA,HORAALERTA,LOCAL,CONCELHO,FREGUESIA,FONTEALERTA,INE,X,Y,DIA,MES,HORA,OPERADOR,CAUSA,TIPOCAUSA,DHINICIO,DHFIM,DURACAO,HAHORA,DATAEXTINCAO,HORAEXTINCAO,DATA1INTERVENCAO,HORA1INTERVENCAO,QUEIMA,LAT,LON,CAUSAFAMILIA,TEMPERATURA,HUMIDADERELATIVA,VENTOINTENSIDADE,VENTOINTENSIDADE_VETOR,VENTODIRECAO_VETOR,PRECEPITACAO,FFMC,DMC,DC,ISI,BUI,FWI,DSR,THC,MODFARSITE,ALTITUDEMEDIA,DECLIVEMEDIO,DENDIDADERV,COSN5VARIEDADE,AREAMANCHAMODFARSITE
str,str,i64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,str,f64,str,str,str,f64,f64,str,str,str,str,i64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Coimbra""","""Florestal""",2013,0.0,0.001,0.0,0.001,0,1,0,0,2013060000295,6,"""03-01-2013""","""17:54""","""CUNHEDO""","""Penacova""","""Oliveira do Mondego""","""Populares""",61305,192399,373499,3,1,17,"""sigo""",449.0,"""Intencional""","""03-01-2013 17:54:00""","""03-01-2013 18:29:00""",35.0,0.001714,"""03-01-2013""","""18:29""","""03-01-2013""","""18:10""",0,40.330172,-8.222549,"""Incendiarismo - Imputáveis""",15.3,54.0,14.0,-999.0,-999.0,0.0,76.300003,2.2,35.200001,1.7,3.8,0.7,0.01,26.7735,,99.8913,17.5053,120.767,22.49,
"""Coimbra""","""Florestal""",2013,0.0,0.001,0.0,0.001,0,1,0,0,2013060000445,6,"""04-01-2013""","""23:56""","""Comareira""","""Góis""","""Góis""","""Populares""",60604,198300,351000,4,1,23,"""sigo""",14.0,"""Negligente""","""04-01-2013 23:56:00""","""05-01-2013 01:40:00""",104.0,0.000577,"""05-01-2013""","""01:40""","""05-01-2013""","""00:15""",0,40.127576,-8.153059,"""Uso do fogo - Fogueiras""",14.6,41.0,10.0,-999.0,-999.0,0.0,85.400002,2.9,47.400002,3.7,5.1,2.6,0.15,15.8476,,597.479,49.9571,183.455,21.73,
"""Coimbra""","""Florestal""",2013,0.0,0.05,0.0,0.05,0,1,0,0,2013060000377,6,"""04-01-2013""","""13:09""","""Chãs de Égua""","""Arganil""","""Piódão""","""Populares""",60111,228799,363099,4,1,13,"""sigo""",124.0,"""Negligente""","""04-01-2013 13:09:00""","""04-01-2013 14:30:00""",81.0,0.037037,"""04-01-2013""","""14:30""","""04-01-2013""","""13:45""",0,40.236057,-7.794691,"""Queimas amontoados de sobrante…",9.6,51.0,28.0,-999.0,-999.0,0.0,81.099998,1.7,25.0,5.3,3.0,3.2,0.21,20.834101,227.0,748.766,50.4426,224.046,11.95,10.8147
"""Leiria""","""Florestal""",2013,0.1,0.1,0.0,0.2,0,1,0,0,2013100000560,10,"""04-01-2013""","""13:13""","""Sobreiro""","""Pedrógão Grande""","""Pedrógão Grande""","""Outros""",101302,197499,325299,4,1,13,"""2020419""",124.0,"""Negligente""","""04-01-2013 13:13:00""","""04-01-2013 13:53:00""",40.0,0.3,"""04-01-2013""","""13:53""","""04-01-2013""","""13:27""",0,39.896117,-8.162342,"""Queimas amontoados de sobrante…",9.6,51.0,28.0,-999.0,-999.0,0.0,81.099998,1.7,25.0,5.3,3.0,3.2,0.21,20.834101,,343.081,10.0806,138.011,18.89,
"""Leiria""","""Agrícola""",2013,0.0,0.0,6.5,6.5,0,0,0,1,2013100000546,10,"""04-01-2013""","""12:00""","""PNSAC - ALVADOS""","""Porto de Mós""","""Alvados""","""Populares""",101603,145400,287000,4,1,12,"""2020419""",129.0,"""Negligente""","""04-01-2013 12:00:00""","""04-01-2013 15:08:00""",188.0,2.074468,"""04-01-2013""","""15:08""","""04-01-2013""","""12:10""",0,39.549428,-8.768329,"""Queimadas de sobrantes florest…",14.9,42.0,14.0,-999.0,-999.0,0.0,85.800003,4.0,40.200001,4.8,6.5,4.1,0.33,15.3988,,264.889,11.7616,261.708,15.32,


## Translate dataset from Portuguese to English

Translate feature names

In [141]:
from dicts import translation_dict

# Rename the columns with the mapping from the project-local `dicts` module
# (presumably Portuguese -> English column names; the mapping itself is not shown here).
icnf_data_cleaned = icnf_data_cleaned.rename(mapping=translation_dict)

Translate the values of the categorical columns that are still in Portuguese

In [142]:
# Show the distinct values and their frequencies for each categorical column
# whose values are still in Portuguese.
for column_name in ("WILDFIRE_TYPE", "CAUSE_TYPE", "ALERT_SOURCE"):
    print(icnf_data_cleaned[column_name].value_counts())

shape: (3, 2)
┌───────────────┬────────┐
│ WILDFIRE_TYPE ┆ count  │
│ ---           ┆ ---    │
│ str           ┆ u32    │
╞═══════════════╪════════╡
│ Florestal     ┆ 113669 │
│ Agrícola      ┆ 29718  │
│ Queima        ┆ 16     │
└───────────────┴────────┘
shape: (6, 2)
┌───────────────┬───────┐
│ CAUSE_TYPE    ┆ count │
│ ---           ┆ ---   │
│ str           ┆ u32   │
╞═══════════════╪═══════╡
│ Negligente    ┆ 42505 │
│ Natural       ┆ 1166  │
│ Desconhecida  ┆ 43359 │
│ null          ┆ 22382 │
│ Reacendimento ┆ 10037 │
│ Intencional   ┆ 23954 │
└───────────────┴───────┘
shape: (8, 2)
┌──────────────┬───────┐
│ ALERT_SOURCE ┆ count │
│ ---          ┆ ---   │
│ str          ┆ u32   │
╞══════════════╪═══════╡
│ PV           ┆ 10390 │
│ 117          ┆ 16924 │
│ 112          ┆ 13336 │
│ Outros       ┆ 18553 │
│ Sapadores    ┆ 1811  │
│ Populares    ┆ 62587 │
│ null         ┆ 12321 │
│ CCO          ┆ 7481  │
└──────────────┴───────┘


In [143]:
from dicts import (
    wildfire_type_translation_dict,
    cause_type_translation_dict,
    alert_source_translation_dict,
)

# Translate the values of the "WILDFIRE_TYPE", "CAUSE_TYPE" and "ALERT_SOURCE" columns.
# Each column is paired with its own value mapping from the project-local `dicts` module;
# the translated column replaces the original under the same name.
value_translations = {
    "WILDFIRE_TYPE": wildfire_type_translation_dict,
    "CAUSE_TYPE": cause_type_translation_dict,
    "ALERT_SOURCE": alert_source_translation_dict,
}
icnf_data_cleaned = icnf_data_cleaned.with_columns(
    [
        pl.col(column_name).replace(mapping).alias(column_name)
        for column_name, mapping in value_translations.items()
    ]
)

In [144]:
# Preview the first 5 rows to check the renamed columns and translated category labels.
icnf_data_cleaned.head(5)

DISTRICT,WILDFIRE_TYPE,YEAR,BURNED_POPULATIONAL_AREA,BURNED_BRUSHLAND_AREA,BURNED_AGRICULTURAL_AREA,BURNED_TOTAL_AREA,IS_A_REIGNITION,IS_A_SMALL_FIRE,IS_A_WILDFIRE,IS_A_AGRICULTURAL_FIRE,LOCAL_COMMAND_CENTER_CODE,REGIONAL_COMMAND_CENTER_CODE,ALERT_DATE,ALERT_TIME,LOCATION,MUNICIPALITY,PARISH,ALERT_SOURCE,MUNICIPALITY_CODES,X_PORT_COORD,Y_PORT_COORD,DAY,MONTH,HOUR,OPERATOR,CAUSE,CAUSE_TYPE,START_DATETIME,END_DATETIME,DURATION_MINUTES,HECTARES_PER_HOUR,EXTINCTION_DATE,EXTINCTION_TIME,FIRST_INTERVENTION_DATE,FIRST_INTERVENTION_TIME,IS_A_CONTROLLED_FIRE,LATITUDE,LONGITUDE,CAUSE_FAMILY,TEMPERATURE_CELSIUS,RELATIVE_HUMIDITY_PERCENT,WIND_SPEED_MS,WIND_SPEED_VECTOR,WIND_DIRECTION_VECTOR,PRECIPITATION_MM,FFMC,DMC,DC,ISI,BUI,FWI,DSR,THC,MODELED_BURNED_AREA_FARSITE_HA,MEAN_ALTITUDE_M,MEAN_SLOPE_DEG,VEGETATION_DENSITY,VEGETATION_VARIETY_INDEX,BURNED_PATCH_AREA_FARSITE_HA
str,str,i64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,str,f64,str,str,str,f64,f64,str,str,str,str,i64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Coimbra""","""Wildfire""",2013,0.0,0.001,0.0,0.001,0,1,0,0,2013060000295,6,"""03-01-2013""","""17:54""","""CUNHEDO""","""Penacova""","""Oliveira do Mondego""","""Public""",61305,192399,373499,3,1,17,"""sigo""",449.0,"""Intentional""","""03-01-2013 17:54:00""","""03-01-2013 18:29:00""",35.0,0.001714,"""03-01-2013""","""18:29""","""03-01-2013""","""18:10""",0,40.330172,-8.222549,"""Incendiarismo - Imputáveis""",15.3,54.0,14.0,-999.0,-999.0,0.0,76.300003,2.2,35.200001,1.7,3.8,0.7,0.01,26.7735,,99.8913,17.5053,120.767,22.49,
"""Coimbra""","""Wildfire""",2013,0.0,0.001,0.0,0.001,0,1,0,0,2013060000445,6,"""04-01-2013""","""23:56""","""Comareira""","""Góis""","""Góis""","""Public""",60604,198300,351000,4,1,23,"""sigo""",14.0,"""Negligent""","""04-01-2013 23:56:00""","""05-01-2013 01:40:00""",104.0,0.000577,"""05-01-2013""","""01:40""","""05-01-2013""","""00:15""",0,40.127576,-8.153059,"""Uso do fogo - Fogueiras""",14.6,41.0,10.0,-999.0,-999.0,0.0,85.400002,2.9,47.400002,3.7,5.1,2.6,0.15,15.8476,,597.479,49.9571,183.455,21.73,
"""Coimbra""","""Wildfire""",2013,0.0,0.05,0.0,0.05,0,1,0,0,2013060000377,6,"""04-01-2013""","""13:09""","""Chãs de Égua""","""Arganil""","""Piódão""","""Public""",60111,228799,363099,4,1,13,"""sigo""",124.0,"""Negligent""","""04-01-2013 13:09:00""","""04-01-2013 14:30:00""",81.0,0.037037,"""04-01-2013""","""14:30""","""04-01-2013""","""13:45""",0,40.236057,-7.794691,"""Queimas amontoados de sobrante…",9.6,51.0,28.0,-999.0,-999.0,0.0,81.099998,1.7,25.0,5.3,3.0,3.2,0.21,20.834101,227.0,748.766,50.4426,224.046,11.95,10.8147
"""Leiria""","""Wildfire""",2013,0.1,0.1,0.0,0.2,0,1,0,0,2013100000560,10,"""04-01-2013""","""13:13""","""Sobreiro""","""Pedrógão Grande""","""Pedrógão Grande""","""Other""",101302,197499,325299,4,1,13,"""2020419""",124.0,"""Negligent""","""04-01-2013 13:13:00""","""04-01-2013 13:53:00""",40.0,0.3,"""04-01-2013""","""13:53""","""04-01-2013""","""13:27""",0,39.896117,-8.162342,"""Queimas amontoados de sobrante…",9.6,51.0,28.0,-999.0,-999.0,0.0,81.099998,1.7,25.0,5.3,3.0,3.2,0.21,20.834101,,343.081,10.0806,138.011,18.89,
"""Leiria""","""Agricultural Fire""",2013,0.0,0.0,6.5,6.5,0,0,0,1,2013100000546,10,"""04-01-2013""","""12:00""","""PNSAC - ALVADOS""","""Porto de Mós""","""Alvados""","""Public""",101603,145400,287000,4,1,12,"""2020419""",129.0,"""Negligent""","""04-01-2013 12:00:00""","""04-01-2013 15:08:00""",188.0,2.074468,"""04-01-2013""","""15:08""","""04-01-2013""","""12:10""",0,39.549428,-8.768329,"""Queimadas de sobrantes florest…",14.9,42.0,14.0,-999.0,-999.0,0.0,85.800003,4.0,40.200001,4.8,6.5,4.1,0.33,15.3988,,264.889,11.7616,261.708,15.32,


## Remove redundant columns

In [145]:
# Columns removed in this step, dropped in a single call.
# YEAR is dropped because the year is already contained in ALERT_DATE.
redundant_columns = [
    "LOCATION",
    "MUNICIPALITY",
    "PARISH",
    "CAUSE_FAMILY",
    "YEAR",
]
icnf_data_cleaned = icnf_data_cleaned.drop(redundant_columns)

In [146]:
# Preview the first 5 rows after removing the redundant columns.
icnf_data_cleaned.head(5)

DISTRICT,WILDFIRE_TYPE,BURNED_POPULATIONAL_AREA,BURNED_BRUSHLAND_AREA,BURNED_AGRICULTURAL_AREA,BURNED_TOTAL_AREA,IS_A_REIGNITION,IS_A_SMALL_FIRE,IS_A_WILDFIRE,IS_A_AGRICULTURAL_FIRE,LOCAL_COMMAND_CENTER_CODE,REGIONAL_COMMAND_CENTER_CODE,ALERT_DATE,ALERT_TIME,ALERT_SOURCE,MUNICIPALITY_CODES,X_PORT_COORD,Y_PORT_COORD,DAY,MONTH,HOUR,OPERATOR,CAUSE,CAUSE_TYPE,START_DATETIME,END_DATETIME,DURATION_MINUTES,HECTARES_PER_HOUR,EXTINCTION_DATE,EXTINCTION_TIME,FIRST_INTERVENTION_DATE,FIRST_INTERVENTION_TIME,IS_A_CONTROLLED_FIRE,LATITUDE,LONGITUDE,TEMPERATURE_CELSIUS,RELATIVE_HUMIDITY_PERCENT,WIND_SPEED_MS,WIND_SPEED_VECTOR,WIND_DIRECTION_VECTOR,PRECIPITATION_MM,FFMC,DMC,DC,ISI,BUI,FWI,DSR,THC,MODELED_BURNED_AREA_FARSITE_HA,MEAN_ALTITUDE_M,MEAN_SLOPE_DEG,VEGETATION_DENSITY,VEGETATION_VARIETY_INDEX,BURNED_PATCH_AREA_FARSITE_HA
str,str,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,str,str,str,i64,i64,i64,i64,i64,i64,str,f64,str,str,str,f64,f64,str,str,str,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Coimbra""","""Wildfire""",0.0,0.001,0.0,0.001,0,1,0,0,2013060000295,6,"""03-01-2013""","""17:54""","""Public""",61305,192399,373499,3,1,17,"""sigo""",449.0,"""Intentional""","""03-01-2013 17:54:00""","""03-01-2013 18:29:00""",35.0,0.001714,"""03-01-2013""","""18:29""","""03-01-2013""","""18:10""",0,40.330172,-8.222549,15.3,54.0,14.0,-999.0,-999.0,0.0,76.300003,2.2,35.200001,1.7,3.8,0.7,0.01,26.7735,,99.8913,17.5053,120.767,22.49,
"""Coimbra""","""Wildfire""",0.0,0.001,0.0,0.001,0,1,0,0,2013060000445,6,"""04-01-2013""","""23:56""","""Public""",60604,198300,351000,4,1,23,"""sigo""",14.0,"""Negligent""","""04-01-2013 23:56:00""","""05-01-2013 01:40:00""",104.0,0.000577,"""05-01-2013""","""01:40""","""05-01-2013""","""00:15""",0,40.127576,-8.153059,14.6,41.0,10.0,-999.0,-999.0,0.0,85.400002,2.9,47.400002,3.7,5.1,2.6,0.15,15.8476,,597.479,49.9571,183.455,21.73,
"""Coimbra""","""Wildfire""",0.0,0.05,0.0,0.05,0,1,0,0,2013060000377,6,"""04-01-2013""","""13:09""","""Public""",60111,228799,363099,4,1,13,"""sigo""",124.0,"""Negligent""","""04-01-2013 13:09:00""","""04-01-2013 14:30:00""",81.0,0.037037,"""04-01-2013""","""14:30""","""04-01-2013""","""13:45""",0,40.236057,-7.794691,9.6,51.0,28.0,-999.0,-999.0,0.0,81.099998,1.7,25.0,5.3,3.0,3.2,0.21,20.834101,227.0,748.766,50.4426,224.046,11.95,10.8147
"""Leiria""","""Wildfire""",0.1,0.1,0.0,0.2,0,1,0,0,2013100000560,10,"""04-01-2013""","""13:13""","""Other""",101302,197499,325299,4,1,13,"""2020419""",124.0,"""Negligent""","""04-01-2013 13:13:00""","""04-01-2013 13:53:00""",40.0,0.3,"""04-01-2013""","""13:53""","""04-01-2013""","""13:27""",0,39.896117,-8.162342,9.6,51.0,28.0,-999.0,-999.0,0.0,81.099998,1.7,25.0,5.3,3.0,3.2,0.21,20.834101,,343.081,10.0806,138.011,18.89,
"""Leiria""","""Agricultural Fire""",0.0,0.0,6.5,6.5,0,0,0,1,2013100000546,10,"""04-01-2013""","""12:00""","""Public""",101603,145400,287000,4,1,12,"""2020419""",129.0,"""Negligent""","""04-01-2013 12:00:00""","""04-01-2013 15:08:00""",188.0,2.074468,"""04-01-2013""","""15:08""","""04-01-2013""","""12:10""",0,39.549428,-8.768329,14.9,42.0,14.0,-999.0,-999.0,0.0,85.800003,4.0,40.200001,4.8,6.5,4.1,0.33,15.3988,,264.889,11.7616,261.708,15.32,


## Merge date and hour columns to have a timestamp

In [147]:
# Merge each DATE/TIME column pair into a single *_TIMESTAMP column holding epoch seconds:
#   ALERT_DATE + ALERT_TIME                           -> ALERT_TIMESTAMP
#   FIRST_INTERVENTION_DATE + FIRST_INTERVENTION_TIME -> FIRST_INTERVENTION_TIMESTAMP
#   EXTINCTION_DATE + EXTINCTION_TIME                 -> EXTINCTION_TIMESTAMP
# The source DATE/TIME columns are dropped afterwards.
# `pl` is the Polars import from the first cell of the notebook.
# NOTE(review): the parsed datetimes are timezone-naive; confirm which timezone the
# source times are recorded in before comparing against other data sources.

# Layout of the concatenated "<date> <time>" strings (day-month-year, minute precision).
_DATE_TIME_FORMAT = "%d-%m-%Y %H:%M"


def _normalized_time(time_column: str) -> pl.Expr:
    """Return an expression bringing the values of `time_column` to "HH:MM".

    Hour-only values get ":00" appended ("17" -> "17:00") and a trailing seconds
    part is stripped ("13:25:00" -> "13:25"). Values matching neither pattern
    are passed through unchanged.
    """
    return (
        pl.col(time_column)
        .str.replace(r"^(\d{2})$", r"$1:00")  # "17" -> "17:00"
        .str.replace(r"^(\d{2}:\d{2}):\d{2}$", r"$1")  # "13:25:00" -> "13:25"
    )


def _epoch_seconds(date_column: str, time_expr: pl.Expr, name: str) -> pl.Expr:
    """Return an expression named `name` with the date/time pair as epoch seconds.

    The date column and the time expression are joined with a single space,
    parsed with `_DATE_TIME_FORMAT` into a millisecond-precision datetime,
    truncated to the minute, and converted from epoch milliseconds to seconds
    (the division makes the result a float). A null date or time yields null.
    """
    return (
        (pl.col(date_column) + " " + time_expr)
        .str.strptime(pl.Datetime("ms"), _DATE_TIME_FORMAT)
        .dt.truncate("1m")
        .dt.timestamp("ms")
        / 1_000
    ).alias(name)


icnf_data_cleaned = icnf_data_cleaned.with_columns(
    [
        # ALERT_TIME is parsed as-is; only the other two time columns are normalized first.
        _epoch_seconds("ALERT_DATE", pl.col("ALERT_TIME"), "ALERT_TIMESTAMP"),
        _epoch_seconds(
            "FIRST_INTERVENTION_DATE",
            _normalized_time("FIRST_INTERVENTION_TIME"),
            "FIRST_INTERVENTION_TIMESTAMP",
        ),
        _epoch_seconds(
            "EXTINCTION_DATE",
            _normalized_time("EXTINCTION_TIME"),
            "EXTINCTION_TIMESTAMP",
        ),
    ]
).drop(
    [
        "ALERT_DATE",
        "ALERT_TIME",
        "FIRST_INTERVENTION_DATE",
        "FIRST_INTERVENTION_TIME",
        "EXTINCTION_DATE",
        "EXTINCTION_TIME",
    ]
)
icnf_data_cleaned.head()

DISTRICT,WILDFIRE_TYPE,BURNED_POPULATIONAL_AREA,BURNED_BRUSHLAND_AREA,BURNED_AGRICULTURAL_AREA,BURNED_TOTAL_AREA,IS_A_REIGNITION,IS_A_SMALL_FIRE,IS_A_WILDFIRE,IS_A_AGRICULTURAL_FIRE,LOCAL_COMMAND_CENTER_CODE,REGIONAL_COMMAND_CENTER_CODE,ALERT_SOURCE,MUNICIPALITY_CODES,X_PORT_COORD,Y_PORT_COORD,DAY,MONTH,HOUR,OPERATOR,CAUSE,CAUSE_TYPE,START_DATETIME,END_DATETIME,DURATION_MINUTES,HECTARES_PER_HOUR,IS_A_CONTROLLED_FIRE,LATITUDE,LONGITUDE,TEMPERATURE_CELSIUS,RELATIVE_HUMIDITY_PERCENT,WIND_SPEED_MS,WIND_SPEED_VECTOR,WIND_DIRECTION_VECTOR,PRECIPITATION_MM,FFMC,DMC,DC,ISI,BUI,FWI,DSR,THC,MODELED_BURNED_AREA_FARSITE_HA,MEAN_ALTITUDE_M,MEAN_SLOPE_DEG,VEGETATION_DENSITY,VEGETATION_VARIETY_INDEX,BURNED_PATCH_AREA_FARSITE_HA,ALERT_TIMESTAMP,FIRST_INTERVENTION_TIMESTAMP,EXTINCTION_TIMESTAMP
str,str,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,str,i64,i64,i64,i64,i64,i64,str,f64,str,str,str,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Coimbra""","""Wildfire""",0.0,0.001,0.0,0.001,0,1,0,0,2013060000295,6,"""Public""",61305,192399,373499,3,1,17,"""sigo""",449.0,"""Intentional""","""03-01-2013 17:54:00""","""03-01-2013 18:29:00""",35.0,0.001714,0,40.330172,-8.222549,15.3,54.0,14.0,-999.0,-999.0,0.0,76.300003,2.2,35.200001,1.7,3.8,0.7,0.01,26.7735,,99.8913,17.5053,120.767,22.49,,1357200000.0,1357200000.0,1357200000.0
"""Coimbra""","""Wildfire""",0.0,0.001,0.0,0.001,0,1,0,0,2013060000445,6,"""Public""",60604,198300,351000,4,1,23,"""sigo""",14.0,"""Negligent""","""04-01-2013 23:56:00""","""05-01-2013 01:40:00""",104.0,0.000577,0,40.127576,-8.153059,14.6,41.0,10.0,-999.0,-999.0,0.0,85.400002,2.9,47.400002,3.7,5.1,2.6,0.15,15.8476,,597.479,49.9571,183.455,21.73,,1357300000.0,1357300000.0,1357400000.0
"""Coimbra""","""Wildfire""",0.0,0.05,0.0,0.05,0,1,0,0,2013060000377,6,"""Public""",60111,228799,363099,4,1,13,"""sigo""",124.0,"""Negligent""","""04-01-2013 13:09:00""","""04-01-2013 14:30:00""",81.0,0.037037,0,40.236057,-7.794691,9.6,51.0,28.0,-999.0,-999.0,0.0,81.099998,1.7,25.0,5.3,3.0,3.2,0.21,20.834101,227.0,748.766,50.4426,224.046,11.95,10.8147,1357300000.0,1357300000.0,1357300000.0
"""Leiria""","""Wildfire""",0.1,0.1,0.0,0.2,0,1,0,0,2013100000560,10,"""Other""",101302,197499,325299,4,1,13,"""2020419""",124.0,"""Negligent""","""04-01-2013 13:13:00""","""04-01-2013 13:53:00""",40.0,0.3,0,39.896117,-8.162342,9.6,51.0,28.0,-999.0,-999.0,0.0,81.099998,1.7,25.0,5.3,3.0,3.2,0.21,20.834101,,343.081,10.0806,138.011,18.89,,1357300000.0,1357300000.0,1357300000.0
"""Leiria""","""Agricultural Fire""",0.0,0.0,6.5,6.5,0,0,0,1,2013100000546,10,"""Public""",101603,145400,287000,4,1,12,"""2020419""",129.0,"""Negligent""","""04-01-2013 12:00:00""","""04-01-2013 15:08:00""",188.0,2.074468,0,39.549428,-8.768329,14.9,42.0,14.0,-999.0,-999.0,0.0,85.800003,4.0,40.200001,4.8,6.5,4.1,0.33,15.3988,,264.889,11.7616,261.708,15.32,,1357300000.0,1357300000.0,1357300000.0


In [None]:
# Convert the START_DATETIME / END_DATETIME strings to epoch seconds (float),
# keeping both columns at their current position in the frame.
datetime_columns = ["START_DATETIME", "END_DATETIME"]

icnf_data_cleaned = icnf_data_cleaned.with_columns(
    (
        pl.col(datetime_columns)
        # Date-only values are padded with midnight so they fit the parse format below.
        .str.replace(r"^(\d{2}-\d{2}-\d{4})$", r"$1 00:00:00")
        .str.strptime(pl.Datetime("ms"), "%d-%m-%Y %H:%M:%S")
        .dt.timestamp("ms")
        # Epoch milliseconds -> epoch seconds.
        / 1_000
    )
)

# The columns now hold timestamps, so rename *_DATETIME to *_TIMESTAMP
# (START_DATETIME -> START_TIMESTAMP, END_DATETIME -> END_TIMESTAMP).
icnf_data_cleaned = icnf_data_cleaned.rename(
    {name: name.replace("_DATETIME", "_TIMESTAMP") for name in datetime_columns}
)

icnf_data_cleaned.head()

DISTRICT,WILDFIRE_TYPE,BURNED_POPULATIONAL_AREA,BURNED_BRUSHLAND_AREA,BURNED_AGRICULTURAL_AREA,BURNED_TOTAL_AREA,IS_A_REIGNITION,IS_A_SMALL_FIRE,IS_A_WILDFIRE,IS_A_AGRICULTURAL_FIRE,LOCAL_COMMAND_CENTER_CODE,REGIONAL_COMMAND_CENTER_CODE,ALERT_SOURCE,MUNICIPALITY_CODES,X_PORT_COORD,Y_PORT_COORD,DAY,MONTH,HOUR,OPERATOR,CAUSE,CAUSE_TYPE,START_DATETIME,END_DATETIME,DURATION_MINUTES,HECTARES_PER_HOUR,IS_A_CONTROLLED_FIRE,LATITUDE,LONGITUDE,TEMPERATURE_CELSIUS,RELATIVE_HUMIDITY_PERCENT,WIND_SPEED_MS,WIND_SPEED_VECTOR,WIND_DIRECTION_VECTOR,PRECIPITATION_MM,FFMC,DMC,DC,ISI,BUI,FWI,DSR,THC,MODELED_BURNED_AREA_FARSITE_HA,MEAN_ALTITUDE_M,MEAN_SLOPE_DEG,VEGETATION_DENSITY,VEGETATION_VARIETY_INDEX,BURNED_PATCH_AREA_FARSITE_HA,ALERT_TIMESTAMP,FIRST_INTERVENTION_TIMESTAMP,EXTINCTION_TIMESTAMP
str,str,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,str,i64,i64,i64,i64,i64,i64,str,f64,str,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Coimbra""","""Wildfire""",0.0,0.001,0.0,0.001,0,1,0,0,2013060000295,6,"""Public""",61305,192399,373499,3,1,17,"""sigo""",449.0,"""Intentional""",1357200000.0,1357200000.0,35.0,0.001714,0,40.330172,-8.222549,15.3,54.0,14.0,-999.0,-999.0,0.0,76.300003,2.2,35.200001,1.7,3.8,0.7,0.01,26.7735,,99.8913,17.5053,120.767,22.49,,1357200000.0,1357200000.0,1357200000.0
"""Coimbra""","""Wildfire""",0.0,0.001,0.0,0.001,0,1,0,0,2013060000445,6,"""Public""",60604,198300,351000,4,1,23,"""sigo""",14.0,"""Negligent""",1357300000.0,1357400000.0,104.0,0.000577,0,40.127576,-8.153059,14.6,41.0,10.0,-999.0,-999.0,0.0,85.400002,2.9,47.400002,3.7,5.1,2.6,0.15,15.8476,,597.479,49.9571,183.455,21.73,,1357300000.0,1357300000.0,1357400000.0
"""Coimbra""","""Wildfire""",0.0,0.05,0.0,0.05,0,1,0,0,2013060000377,6,"""Public""",60111,228799,363099,4,1,13,"""sigo""",124.0,"""Negligent""",1357300000.0,1357300000.0,81.0,0.037037,0,40.236057,-7.794691,9.6,51.0,28.0,-999.0,-999.0,0.0,81.099998,1.7,25.0,5.3,3.0,3.2,0.21,20.834101,227.0,748.766,50.4426,224.046,11.95,10.8147,1357300000.0,1357300000.0,1357300000.0
"""Leiria""","""Wildfire""",0.1,0.1,0.0,0.2,0,1,0,0,2013100000560,10,"""Other""",101302,197499,325299,4,1,13,"""2020419""",124.0,"""Negligent""",1357300000.0,1357300000.0,40.0,0.3,0,39.896117,-8.162342,9.6,51.0,28.0,-999.0,-999.0,0.0,81.099998,1.7,25.0,5.3,3.0,3.2,0.21,20.834101,,343.081,10.0806,138.011,18.89,,1357300000.0,1357300000.0,1357300000.0
"""Leiria""","""Agricultural Fire""",0.0,0.0,6.5,6.5,0,0,0,1,2013100000546,10,"""Public""",101603,145400,287000,4,1,12,"""2020419""",129.0,"""Negligent""",1357300000.0,1357300000.0,188.0,2.074468,0,39.549428,-8.768329,14.9,42.0,14.0,-999.0,-999.0,0.0,85.800003,4.0,40.200001,4.8,6.5,4.1,0.33,15.3988,,264.889,11.7616,261.708,15.32,,1357300000.0,1357300000.0,1357300000.0


## Replace -999 values in the VECTOR columns with the average

In [149]:
# In the two wind VECTOR columns, -999 marks a missing measurement. Replace both the
# -999 entries and nulls with the mean of the remaining (valid) values of that column.
# NOTE(review): an arithmetic mean may not be meaningful for WIND_DIRECTION_VECTOR if it
# is an angle — confirm the column's units before relying on the imputed values.
for col in ("WIND_SPEED_VECTOR", "WIND_DIRECTION_VECTOR"):
    # Mean over the rows that are neither null nor the sentinel (the filter discards both).
    mean_value = icnf_data_cleaned.select(
        pl.col(col).filter(pl.col(col) != -999).mean()
    ).item()
    is_valid = pl.col(col).is_not_null() & (pl.col(col) != -999)
    icnf_data_cleaned = icnf_data_cleaned.with_columns(
        pl.when(is_valid).then(pl.col(col)).otherwise(mean_value).alias(col)
    )

## Replace null values in numerical columns with the average

In [150]:
# CAUSE: missing entries (null as well as NaN) are set to 0 instead of the column mean.
# NOTE(review): presumably because CAUSE is a code rather than a measurement — confirm.
icnf_data_cleaned = icnf_data_cleaned.with_columns(
    pl.col("CAUSE").fill_null(0).fill_nan(0)
)

In [151]:
# Numerical columns whose null entries are replaced by the column mean.
numerical_columns = [
    "TEMPERATURE_CELSIUS",
    "RELATIVE_HUMIDITY_PERCENT",
    "WIND_SPEED_MS",
    "PRECIPITATION_MM",
    "FFMC",
    "DMC",
    "DC",
    "ISI",
    "BUI",
    "FWI",
    "DSR",
    "THC",
    "MEAN_ALTITUDE_M",
    "MEAN_SLOPE_DEG",
    "VEGETATION_DENSITY",
    "VEGETATION_VARIETY_INDEX",
]

# Mean of the non-null values of every listed column, computed before any filling.
column_means = {
    name: icnf_data_cleaned[name].drop_nulls().mean() for name in numerical_columns
}

# Keep each existing value and fall back to the column mean only where it is null.
icnf_data_cleaned = icnf_data_cleaned.with_columns(
    [
        pl.coalesce(pl.col(name), pl.lit(fill_value)).alias(name)
        for name, fill_value in column_means.items()
    ]
)

In [152]:
# List every column that still contains nulls, together with its null count.
remaining_nulls = icnf_data_cleaned.null_count().row(0, named=True)
for col, null_total in remaining_nulls.items():
    if null_total > 0:
        print(col, null_total)

END_DATETIME 331
DURATION_MINUTES 331
HECTARES_PER_HOUR 325
MODELED_BURNED_AREA_FARSITE_HA 65574
BURNED_PATCH_AREA_FARSITE_HA 66239
FIRST_INTERVENTION_TIMESTAMP 179
EXTINCTION_TIMESTAMP 331


## Export the cleaned dataset as CSV

In [153]:
# Write the cleaned frame to a CSV file (relative path, resolved against the working directory).
icnf_data_cleaned.write_csv(file="ICNF_2013_2022_cleaned.csv")