In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw CSV into memory. low_memory=False makes pandas parse the file
# in one pass instead of in chunks, so each column gets a single inferred dtype.
raw_csv_path = "FPA_FOD_Plus.csv"
df = pd.read_csv(raw_csv_path, low_memory=False)

# Preview the first rows as a sanity check on the load.
df.head()

Unnamed: 0,FOD_ID,FPA_ID,SOURCE_SYSTEM_TYPE,SOURCE_SYSTEM,NWCG_REPORTING_AGENCY,NWCG_REPORTING_UNIT_ID,NWCG_REPORTING_UNIT_NAME,SOURCE_REPORTING_UNIT,SOURCE_REPORTING_UNIT_NAME,LOCAL_FIRE_REPORT_ID,...,erc_Percentile,NDVI-1day,NDVI_min,NDVI_max,NDVI_mean,CheatGrass,ExoticAnnualGrass,Medusahead,PoaSecunda,geometry
0,300300629,HIWMO-OA2805,INTERAGCY,IA-HIWMO,ST/C&L,USHIHNLX,City and County of Honolulu,HICNTY,Honolulu Fire Dept,,...,,0.27,'-0.03' '-0.01' '-0.07' '-0.09' '0.0' '-0.01' ...,'0.64' '0.7' '0.59' '0.71' '0.67' '0.74' '0.67...,'0.3' '0.28' '0.19' '0.27' '0.28' '0.27' '0.28...,,,,,
1,687128,SFO-TX02240707-86179,NONFED,ST-NASF,ST/C&L,USTXTXS,Texas A & M Forest Service,TXTXS,Texas Forest Service,,...,30-50%,0.35,'-0.1' '-0.03' '-0.05' '-0.02' '0.02' '0.0' '0...,'0.61' '0.61' '0.66' '0.59' '0.68' '0.48' '0.5...,'0.25' '0.32' '0.35' '0.31' '0.26' '0.28' '0.3...,,,,,
2,319387,W-570451,FED,DOI-WFMI,BLM,USCACND,Central California District,CABBD,Bakersfield District,,...,70-90%,,'-0.06' '-0.04' '0.04' '0.12' '0.23' '0.33' '0...,'0.63' '0.61' '0.66' '0.64' '0.72' '0.68' '0.6...,'0.28' '0.25' '0.39' '0.47' '0.53' '0.53' '0.4...,,,,,
3,686959,SFO-TX02240707-84214,NONFED,ST-NASF,ST/C&L,USTXTXS,Texas A & M Forest Service,TXTXS,Texas Forest Service,,...,50-70%,0.28,'-0.09' '-0.01' '-0.03' '-0.01' '0.04' '0.02' ...,'0.61' '0.59' '0.65' '0.65' '0.65' '0.68' '0.4...,'0.13' '0.28' '0.26' '0.23' '0.23' '0.26' '0.2...,,,,,
4,429152,SFO-2007FLFLS2007160001,NONFED,ST-NASF,ST/C&L,USFLFLS,Florida Forest Service,FLFLS,Florida Forest Service,,...,<10%,0.08,'-0.08' '-0.06' '-0.06' '0.07' '-0.06' '-0.06'...,'0.53' '0.64' '0.64' '0.62' '0.56' '0.59' '0.5...,'0.18' '0.28' '0.31' '0.28' '0.22' '0.21' '0.2...,,,,,


In [4]:
# Print a summary of the loaded frame: row range, column count, dtype counts
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2302521 entries, 0 to 2302520
Columns: 308 entries, FOD_ID to geometry
dtypes: float64(239), int64(10), object(59)
memory usage: 5.3+ GB


In [7]:
# Fraction of null values per column, sparsest columns first.
missing = df.isnull().mean().sort_values(ascending=False)

# Drop every column whose null fraction exceeds 0.8 (more than 80% missing).
df = df.drop(columns=missing[missing > 0.8].index)

# Show the 30 sparsest columns (fractions computed before the drop above).
# This must be the cell's last expression: a bare expression in the middle of
# a cell is evaluated and thrown away, so nothing was displayed before.
missing.head(30)

In [8]:
# Columns holding exactly one distinct value carry no signal for a model.
# nunique() ignores NaN by default, so a column with a single value plus
# nulls is also counted as constant here.
distinct_counts = df.nunique()
constant_cols = distinct_counts.index[distinct_counts == 1].tolist()

In [10]:
# Remove the single-valued columns collected in constant_cols.
df = df.drop(constant_cols, axis="columns")  # dropping constants

In [12]:
# Names of columns to discard, assembled group by group. Entries that are not
# present in df are harmless as long as the consumer filters on membership.
drop_cols = (
    # Identifiers and names (non-informative)
    [
        "FOD_ID",
        "FPA_ID",
        "LOCAL_FIRE_REPORT_ID",
        "LOCAL_INCIDENT_ID",
        "FIRE_NAME",
        "FIRE_CODE",
        "OWNER_DESCR",
        "SOURCE_SYSTEM_TYPE",
        "SOURCE_SYSTEM",
        "SOURCE_REPORTING_UNIT",
        "SOURCE_REPORTING_UNIT_NAME",
        "NWCG_REPORTING_AGENCY",
        "NWCG_REPORTING_UNIT_ID",
        "NWCG_REPORTING_UNIT_NAME",
        "FIPS_NAME",
        "LatLong_State",
        "LatLong_County",
        "NAME",
        "geometry",
    ]
    # Management or jurisdiction tags
    + ["NPL", "Mang_Type", "Mang_Name", "Des_Tp", "GAP_Sts", "GAP_Prity"]
    # Multi-scale duplicates (1 km vs main versions)
    + [
        "EVH_1km",
        "EVT_1km",
        "EVC_1km",
        "Land_Cover_1km",
        "rpms_1km",
        "FRG_1km",
        "TRI_1km",
        "Aspect_1km",
        "Elevation_1km",
        "Slope_1km",
        "TPI_1km",
    ]
    # Repeated GACC administrative stats (regional aggregates)
    + [
        "GACCAbbrev",
        "GACC_PL",
        "GACC_New fire",
        "GACC_New LF",
        "GACC_Uncont LF",
        "GACC_Type 1 IMTs",
        "GACC_Type 2 IMTs",
        "GACC_NIMO Teams",
        "GACC_Area Command Teams",
        "GACC_Fire Use Teams",
    ]
    # Percentile and 5-day summary variables, matched by substring against the
    # columns currently in df (keep only if you study short-term dynamics)
    + [
        name
        for name in df.columns
        if any(tag in name for tag in ("_5D_", "_Percentile"))
    ]
    # Excess NDVI variations (min/max/day)
    + ["NDVI-1day", "NDVI_min", "NDVI_max"]
)

In [13]:
# Drop the listed columns into a new frame; errors="ignore" skips names in
# drop_cols that df does not contain instead of raising.
df_reduced = df.drop(columns=drop_cols, errors="ignore")

In [14]:
# Summarise the reduced frame (column count, dtype counts, memory usage) to
# see the effect of the column drop.
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2302521 entries, 0 to 2302520
Columns: 192 entries, FIRE_YEAR to NDVI_mean
dtypes: float64(170), int64(9), object(13)
memory usage: 3.3+ GB


In [16]:
# Name of the label column.
target = "NWCG_GENERAL_CAUSE"

# Environmental predictors, concatenated from thematic sub-groups.
climate_vars = (
    # Daily/mean meteorological vars
    ["pr", "tmmn", "tmmx", "rmin", "rmax", "sph", "srad", "etr"]
    + ["vpd", "bi", "erc", "fm100", "fm1000"]
    # Vegetation / dryness
    + ["NDVI_mean", "EVC", "EVT", "EVH"]
    # Topography
    + ["Elevation", "Slope", "Aspect", "TRI", "TPI"]
    # Aridity and evapotranspiration
    + ["Aridity_index", "PET", "AET"]
)

# Socio-economic / human-presence predictors.
socio_vars = [
    "Population",
    "GDP",
    "GDP_per_capita",
    "Road_Density",
    "Nightlights",
    "Urban_Cover",
    "Human_Footprint",
    "Distance_to_Road",
    "Distance_to_Urban",
    "Agriculture_Area",
    "Industrial_Area",
]

# Where and when each fire was discovered.
context_vars = (
    ["LATITUDE", "LONGITUDE", "STATE", "COUNTY"]
    + ["FIRE_YEAR", "DISCOVERY_DOY", "DISCOVERY_TIME"]
)

# NOTE(review): this rebinds the drop_cols defined in an earlier cell, and the
# cells visible after this one select columns via keep_cols instead; confirm
# whether this list is still needed.
drop_cols = (
    # Unique IDs and text
    [
        "FOD_ID",
        "FPA_ID",
        "FIRE_NAME",
        "LOCAL_FIRE_REPORT_ID",
        "LOCAL_INCIDENT_ID",
        "FIRE_CODE",
        "OWNER_DESCR",
        "SOURCE_SYSTEM_TYPE",
        "SOURCE_SYSTEM",
        "SOURCE_REPORTING_UNIT",
        "SOURCE_REPORTING_UNIT_NAME",
        "NWCG_REPORTING_UNIT_ID",
        "NWCG_REPORTING_UNIT_NAME",
        "NWCG_REPORTING_AGENCY",
    ]
    # Geometry and duplicates
    + ["geometry", "FIPS_NAME", "FIPS_CODE"]
    # Fire outcome fields (leak info about cause)
    + ["FIRE_SIZE", "FIRE_SIZE_CLASS", "CONT_DATE", "CONT_DOY", "CONT_TIME"]
    # Administrative or GACC region stats: any column prefixed "GACC_"
    + [name for name in df.columns if name.startswith("GACC_")]
    # 1km duplicates or suffix variations: any column suffixed "_1km"
    + [name for name in df.columns if name.endswith("_1km")]
    # NDVI day-based summaries
    + ["NDVI_max", "NDVI_min", "NDVI-1day"]
)

In [17]:
# Columns wanted in the modelling table: all predictor groups plus the label.
keep_cols = climate_vars + socio_vars + context_vars + [target]

# Requested names that df does not contain are skipped below. Report them so
# the gap between the requested feature list and the resulting table is
# visible instead of silent.
absent_cols = [c for c in keep_cols if c not in df.columns]
if absent_cols:
    print(
        f"{len(absent_cols)} of {len(keep_cols)} requested columns "
        f"are not in df and were skipped: {absent_cols}"
    )

# Select the available columns in keep_cols order; .copy() detaches the result
# from df so later edits to df_model do not touch the source frame.
df_model = df[[c for c in keep_cols if c in df.columns]].copy()


In [20]:
# Summarise the modelling table. memory_usage='deep' asks pandas to inspect
# object-dtype columns so the reported memory figure reflects their real size.
df_model.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2302521 entries, 0 to 2302520
Data columns (total 33 columns):
 #   Column              Dtype  
---  ------              -----  
 0   pr                  float64
 1   tmmn                float64
 2   tmmx                float64
 3   rmin                float64
 4   rmax                float64
 5   sph                 float64
 6   srad                float64
 7   etr                 float64
 8   vpd                 float64
 9   bi                  float64
 10  erc                 float64
 11  fm100               float64
 12  fm1000              float64
 13  NDVI_mean           object 
 14  EVC                 float64
 15  EVT                 float64
 16  EVH                 float64
 17  Elevation           int64  
 18  Slope               int64  
 19  Aspect              int64  
 20  TRI                 float64
 21  TPI                 float64
 22  Aridity_index       float64
 23  Population          float64
 24  GDP                 floa

In [23]:
# Persist the modelling table as CSV in the working directory.
# NOTE(review): with the default index=True the row index is written as an
# extra unnamed first column; pass index=False if no consumer relies on it.
df_model.to_csv('FPA-FOD_reduced.csv')  