In [7]:
import os
import re
import numpy as np
import pandas as pd
import rioxarray as rxr
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow as pa
import pyarrow.parquet as pq

from glob import glob
from pathlib import Path

Define Paths

In [2]:
processed_data_dir = "../data/processed"
output_data_dir = "../data/outputs"

os.makedirs(output_data_dir, exist_ok=True)

Load/Align Rasters Helper Function

In [3]:
def load_and_align_rasters(file_list, reference_raster=None):
    """
    Loads a list of rasters and aligns them to a common resolution/extent.
    If reference_raster is provided, aligns all to that raster.
    Returns an xarray.Dataset with all rasters stacked as variables.
    """
    rasters = []
    names = []

    for file in file_list:
        arr = rxr.open_rasterio(file, masked=True).squeeze()
        names.append(os.path.splitext(os.path.basename(file))[0])

        # Reproject/align if reference is given
        if reference_raster is not None:
            arr = arr.rio.reproject_match(reference_raster)

        rasters.append(arr)

    # Stack into dataset
    ds = xr.merge([rasters[i].to_dataset(name=names[i]) for i in range(len(rasters))])
    return ds

Severity Classification Function

In [4]:
def classify_severity(dnbr):
    if dnbr < 0.1:
        return "Unburned"
    elif dnbr < 0.27:
        return "Low"
    elif dnbr < 0.44:
        return "Moderate"
    else:
        return "High"

Process All Fires

In [5]:
all_fire_dfs = []

# Detect fires based on available files
fires = sorted(set(os.path.basename(f).split("_")[0] for f in glob(os.path.join(processed_data_dir, "*.tif"))))

for fire_name in fires:
    print(f"Processing {fire_name} fire ...")

    fire_files = glob(os.path.join(processed_data_dir, f"{fire_name}_*.tif"))
    dnbr_path = os.path.join(processed_data_dir, f"{fire_name}_dNBR.tif")

    if not os.path.exists(dnbr_path):
        print(f"Skipping {fire_name} — no dNBR available.")
        continue

    try:
        reference = rxr.open_rasterio(dnbr_path, masked=True).squeeze()
        fire_ds = load_and_align_rasters(fire_files, reference_raster=reference)

        # Mask no-data areas based on dNBR
        fire_ds = fire_ds.where(~np.isnan(fire_ds[f"{fire_name}_dNBR"]), drop=True)

        # Convert to dataframe
        df = fire_ds.to_dataframe().reset_index()
        df = df.dropna()
        df["fire_name"] = fire_name
        df["severity"] = df[f"{fire_name}_dNBR"].apply(classify_severity)

        # Save individual file
        fire_output = os.path.join(output_data_dir, f"{fire_name}_dataset.parquet")
        df.to_parquet(fire_output, index=False)
        print(f"Saved {fire_name} dataset with {len(df)} pixels → {fire_output}")

        all_fire_dfs.append(df)

    except Exception as e:
        print(f"Error processing {fire_name}: {e}")



Processing Bootleg fire ...
Skipping Bootleg — no dNBR available.
Processing Caldor fire ...


  ds = xr.merge([rasters[i].to_dataset(name=names[i]) for i in range(len(rasters))])


Saved Caldor dataset with 5517951 pixels → ../data/outputs\Caldor_dataset.parquet
Processing Camp fire ...


  ds = xr.merge([rasters[i].to_dataset(name=names[i]) for i in range(len(rasters))])


Saved Camp dataset with 5609430 pixels → ../data/outputs\Camp_dataset.parquet
Processing Carr fire ...


  ds = xr.merge([rasters[i].to_dataset(name=names[i]) for i in range(len(rasters))])


Saved Carr dataset with 5680136 pixels → ../data/outputs\Carr_dataset.parquet
Processing Creek fire ...


  ds = xr.merge([rasters[i].to_dataset(name=names[i]) for i in range(len(rasters))])


Saved Creek dataset with 5427398 pixels → ../data/outputs\Creek_dataset.parquet
Processing Dixie fire ...


  ds = xr.merge([rasters[i].to_dataset(name=names[i]) for i in range(len(rasters))])


Saved Dixie dataset with 5625635 pixels → ../data/outputs\Dixie_dataset.parquet
Processing Glass fire ...


  ds = xr.merge([rasters[i].to_dataset(name=names[i]) for i in range(len(rasters))])


Saved Glass dataset with 3832992 pixels → ../data/outputs\Glass_dataset.parquet
Processing Thomas fire ...


  ds = xr.merge([rasters[i].to_dataset(name=names[i]) for i in range(len(rasters))])


Saved Thomas dataset with 4458250 pixels → ../data/outputs\Thomas_dataset.parquet
Processing Troublesome fire ...


  ds = xr.merge([rasters[i].to_dataset(name=names[i]) for i in range(len(rasters))])


Saved Troublesome dataset with 5576320 pixels → ../data/outputs\Troublesome_dataset.parquet
Processing Woolsey fire ...


  ds = xr.merge([rasters[i].to_dataset(name=names[i]) for i in range(len(rasters))])


Saved Woolsey dataset with 2091670 pixels → ../data/outputs\Woolsey_dataset.parquet


Combine Fires

In [8]:
fire_datasets = list(Path(output_data_dir).glob('*_dataset.parquet'))

output_file = Path("combined_dataset.parquet")

if output_file.exists():
    output_file.unlink()

writer = None
total_rows = 0

print(f"Found {len(fire_datasets)} fire datasets to combine.\n")

for i, file in enumerate(fire_datasets, start=1):
    fire_name = file.stem.replace('_dataset', '')
    
    df = pd.read_parquet(file)

    df.columns = [re.sub(f"^{fire_name}_", "", c) for c in df.columns]
    df = df.rename(columns={
        'x': 'longitude',
        'y': 'latitude',
        'veg_indices': 'NDVI'
    })

    if 'fire_name' not in df.columns:
        df['fire_name'] = fire_name
    else:
        df['fire_name'] = df['fire_name'].fillna(fire_name)

    expected_cols = ['latitude', 'longitude', 'dNBR', 'SPI', 'VCI', 'NDVI', 'severity', 'fire_name']
    available_cols = [c for c in expected_cols if c in df.columns]
    df = df[available_cols]

    for col in ['dNBR', 'SPI', 'VCI', 'NDVI', 'severity']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce', downcast='float')

    table = pa.Table.from_pandas(df)
    if writer is None:
        writer = pq.ParquetWriter(output_file, table.schema)
    writer.write_table(table)

    total_rows += len(df)
    print(f"[{i}/{len(fire_datasets)}] Processed '{fire_name}' — {len(df):,} rows (Total: {total_rows:,})")

if writer:
    writer.close()


Found 10 fire datasets to combine.

[1/10] Processed 'Caldor' — 5,517,951 rows (Total: 5,517,951)
[2/10] Processed 'Camp' — 5,609,430 rows (Total: 11,127,381)
[3/10] Processed 'Carr' — 5,680,136 rows (Total: 16,807,517)
[4/10] Processed 'combined' — 42,265,444 rows (Total: 59,072,961)
[5/10] Processed 'Creek' — 5,427,398 rows (Total: 64,500,359)
[6/10] Processed 'Dixie' — 5,625,635 rows (Total: 70,125,994)
[7/10] Processed 'Glass' — 3,832,992 rows (Total: 73,958,986)
[8/10] Processed 'Thomas' — 4,458,250 rows (Total: 78,417,236)
[9/10] Processed 'Troublesome' — 5,576,320 rows (Total: 83,993,556)
[10/10] Processed 'Woolsey' — 2,091,670 rows (Total: 86,085,226)
