In [1]:
import geopandas as gpd

geo_df = gpd.read_file("data/hms_fire2020.kml")


  result = read_func(


In [5]:
import xml.etree.ElementTree as ET
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

def parse_hms_fire_kml_to_geodf(path, max_points=None):
    ns = "{http://www.opengis.net/kml/2.2}"
    rows = []
    count = 0

    context = ET.iterparse(path, events=("end",))

    for event, elem in context:
        # We only care about Placemark elements
        if not elem.tag.endswith("Placemark"):
            continue

        # <description> block
        desc = elem.find(f"{ns}description")
        if desc is None or desc.text is None:
            elem.clear()
            continue

        desc_text = desc.text.replace("<br>", "\n")

        meta = {}
        for line in desc_text.split("\n"):
            if ":" in line:
                key, val = line.split(":", 1)
                meta[key.strip()] = val.strip()

        point = elem.find(f".//{ns}Point")
        if point is None:
            elem.clear()
            continue

        coords_tag = point.find(f"{ns}coordinates")
        if coords_tag is None or coords_tag.text is None:
            elem.clear()
            continue

        try:
            lon_str, lat_str, *_ = coords_tag.text.strip().split(",")
            lon = float(lon_str)
            lat = float(lat_str)
        except Exception:
            elem.clear()
            continue

        meta["lon"] = lon
        meta["lat"] = lat

        rows.append(meta)
        count += 1

        # Optional: stop early for testing
        if max_points is not None and count >= max_points:
            break

        # Free memory for processed element
        elem.clear()

    df = pd.DataFrame(rows)

    if "FRP" in df.columns:
        df["FRP"] = (
            df["FRP"]
            .astype(str)
            .str.replace("MW", "", regex=False)
            .replace("-999.000", pd.NA)
        )
        df["FRP"] = pd.to_numeric(df["FRP"], errors="coerce")

    if "YearDay" in df.columns:
        df["YearDay"] = pd.to_numeric(df["YearDay"], errors="coerce")

    # Build geometry
    geometry = [Point(xy) for xy in zip(df["lon"], df["lat"])]
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

    return gdf


In [7]:
fire2020_gdf = parse_hms_fire_kml_to_geodf("data/hms_fire2020.kml", max_points=None)
len(fire2020_gdf), fire2020_gdf.head()


(2491671,
            Lon        Lat  YearDay     Time  Satellite    Method Ecosystem  \
 0   -89.007000  43.250000  2020001  0001UTC  GOES-EAST  ANALYSIS        30   
 1  -115.107000  32.665000  2020001  0020UTC  GOES-WEST  ANALYSIS        51   
 2  -117.667000  47.512000  2020001  0106UTC  GOES-WEST  ANALYSIS        40   
 3   -86.205000  32.640000  2020001  0120UTC    METOP-A  ANALYSIS        27   
 4   -78.546000  35.440000  2020001  0120UTC    METOP-A  ANALYSIS        31   
 
    FRP      lon     lat                 geometry  
 0  NaN  -89.007  43.250    POINT (-89.007 43.25)  
 1  NaN -115.107  32.665  POINT (-115.107 32.665)  
 2  NaN -117.667  47.512  POINT (-117.667 47.512)  
 3  NaN  -86.205  32.640    POINT (-86.205 32.64)  
 4  NaN  -78.546  35.440    POINT (-78.546 35.44)  )

In [9]:
states = gpd.read_file("data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp")
ca = states[states["NAME"] == "California"].to_crs("EPSG:4326")

ca_fire = gpd.sjoin(fire2020_gdf, ca, how="inner", predicate="intersects")


In [10]:
import pandas as pd

def parse_hms_datetime(yd, time_str):
    date = pd.to_datetime(str(yd), format="%Y%j")

    hhmm = time_str.replace("UTC", "")
    hour = int(hhmm[:2])
    minute = int(hhmm[2:])

    return date + pd.Timedelta(hours=hour, minutes=minute)

ca_fire["timestamp"] = ca_fire.apply(
    lambda row: parse_hms_datetime(row["YearDay"], row["Time"]),
    axis=1
)
ca_fire["lon_round"] = ca_fire["lon"].round(3)
ca_fire["lat_round"] = ca_fire["lat"].round(3)
ca_fire["pixel_id"] = (ca_fire["lon_round"].astype(str) + "_" + ca_fire["lat_round"].astype(str))

ca_fire = ca_fire.sort_values(["pixel_id", "timestamp"])


In [11]:
from datetime import timedelta

gap = pd.Timedelta(hours=2)

ca_fire["prev_time"] = ca_fire.groupby("pixel_id")["timestamp"].shift(1)
ca_fire["gap"] = ca_fire["timestamp"] - ca_fire["prev_time"]

ca_fire["new_episode"] = (ca_fire["gap"] > gap) | ca_fire["gap"].isna()

ca_fire["episode_id"] = ca_fire.groupby("pixel_id")["new_episode"].cumsum()

episodes = (
    ca_fire.groupby(["pixel_id", "episode_id"])
    .agg(
        lon=("lon_round", "first"),
        lat=("lat_round", "first"),
        start_time=("timestamp", "min"),
        end_time=("timestamp", "max"),
        n_detections=("timestamp", "count"),
        satellites=("Satellite", lambda x: sorted(x.unique())),
        ecosystems=("Ecosystem", lambda x: sorted(x.unique())),
    )
    .reset_index()
)


In [12]:
from shapely.geometry import Point

episodes["geometry"] = episodes.apply(
    lambda r: Point(r["lon"], r["lat"]), axis=1
)

episodes_gdf = gpd.GeoDataFrame(episodes, geometry="geometry", crs="EPSG:4326")


In [16]:
print(f"Number of fire pixels: {len(episodes_gdf)}")

num_unique_pixels = ca_fire["episode_id"].nunique()
print(f"Number of unique fire events: {num_unique_pixels}")

episodes_gdf.head()

Number of fire pixels: 482158
Number of unique fire events: 13


Unnamed: 0,pixel_id,episode_id,lon,lat,start_time,end_time,n_detections,satellites,ecosystems,geometry
0,-114.141_34.281,1,-114.141,34.281,2020-02-18 08:20:00,2020-02-18 08:20:00,1,[NOAA-20],[51],POINT (-114.141 34.281)
1,-114.148_34.279,1,-114.148,34.279,2020-02-18 06:30:00,2020-02-18 06:36:00,2,[GOES-EAST],[51],POINT (-114.148 34.279)
2,-114.14_34.279,1,-114.14,34.279,2020-02-18 10:00:00,2020-02-18 10:00:00,1,[NOAA-20],[51],POINT (-114.14 34.279)
3,-114.464_32.896,1,-114.464,32.896,2020-02-25 19:31:00,2020-02-25 19:31:00,1,[GOES-EAST],[51],POINT (-114.464 32.896)
4,-114.486_32.914,1,-114.486,32.914,2020-02-23 19:44:00,2020-02-23 19:44:00,1,[NOAA-20],[51],POINT (-114.486 32.914)


In [22]:
import re
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import xml.etree.ElementTree as ET

def parse_hms_fire_kml_to_geodf(filepath):

    ns = {"kml": "http://www.opengis.net/kml/2.2"}
    tree = ET.parse(filepath)
    root = tree.getroot()

    placemarks = root.findall(".//kml:Placemark", ns)

    rows = []
    for pm in placemarks:
        desc = pm.find("kml:description", ns)
        coords_tag = pm.find(".//kml:Point/kml:coordinates", ns)

        if desc is None or coords_tag is None or coords_tag.text is None:
            continue

        # description text
        text = desc.text.replace("<br>", "\n")

        def get(field):
            m = re.search(fr"{field}:\s*([^\n]+)", text)
            return m.group(1).strip() if m else None

        # metadata from description
        YearDay   = get("YearDay")
        Time      = get("Time")
        Satellite = get("Satellite")
        Method    = get("Method")
        Ecosystem = get("Ecosystem")
        FRP_raw   = get("FRP")

        # coordinates from <Point><coordinates>
        try:
            lon_str, lat_str, *_ = coords_tag.text.strip().split(",")
            lon = float(lon_str)
            lat = float(lat_str)
        except Exception:
            continue

        # clean FRP
        FRP = None
        if FRP_raw:
            val = FRP_raw.replace("MW", "").strip()
            if val != "-999.000":
                try:
                    FRP = float(val)
                except ValueError:
                    FRP = None

        rows.append({
            "lon": lon,
            "lat": lat,
            "YearDay": YearDay,
            "Time": Time,
            "Satellite": Satellite,
            "Method": Method,
            "Ecosystem": Ecosystem,
            "FRP": FRP
        })

    df = pd.DataFrame(rows)

    # enforce numeric
    df["lon"] = pd.to_numeric(df["lon"], errors="coerce")
    df["lat"] = pd.to_numeric(df["lat"], errors="coerce")
    df["YearDay"] = pd.to_numeric(df["YearDay"], errors="coerce")

    df = df.dropna(subset=["lon", "lat", "YearDay", "Time"])

    # geometry column
    geometry = [Point(xy) for xy in zip(df["lon"], df["lat"])]
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

    return gdf

def add_timestamp_from_yearday_time(gdf):
    """
    Add a 'timestamp' column combining YearDay (YYYYDDD) and Time (HHMMUTC)
    """
    def _to_ts(row):
        yd = int(row["YearDay"])
        t = str(row["Time"])

        # date from YearDay (YYYYDDD)
        date = pd.to_datetime(str(yd), format="%Y%j")

        # time from HHMMUTC
        t = t.replace("UTC", "")
        hh = int(t[:2])
        mm = int(t[2:])
        return date + pd.Timedelta(hours=hh, minutes=mm)

    gdf = gdf.copy()
    gdf["timestamp"] = gdf.apply(_to_ts, axis=1)
    return gdf

def fire_pixels_to_episodes(gdf, round_decimals=3, max_gap_hours=2):
    """
    From point detections (with 'lon', 'lat', 'timestamp'),
    compute fire episodes per rounded pixel.
    """
    gdf = gdf.copy()

    # ensure numeric
    gdf["lon"] = pd.to_numeric(gdf["lon"], errors="coerce")
    gdf["lat"] = pd.to_numeric(gdf["lat"], errors="coerce")
    gdf = gdf.dropna(subset=["lon", "lat", "timestamp"])

    # round coordinates -> pixel grid
    gdf["lon_round"] = gdf["lon"].round(round_decimals)
    gdf["lat_round"] = gdf["lat"].round(round_decimals)

    # pixel id
    gdf["pixel_id"] = (
        gdf["lon_round"].astype(str) + "_" + gdf["lat_round"].astype(str)
    )

    # sort for episode grouping
    gdf = gdf.sort_values(["pixel_id", "timestamp"])

    # compute gaps in hours
    gdf["prev_time"] = gdf.groupby("pixel_id")["timestamp"].shift(1)
    gdf["gap_hours"] = (
        gdf["timestamp"] - gdf["prev_time"]
    ).dt.total_seconds() / 3600.0

    # new episode when gap > threshold or no prev
    gdf["new_episode"] = gdf["gap_hours"].isna() | (gdf["gap_hours"] > max_gap_hours)

    # cumulative sum per pixel gives episode id
    gdf["episode_id"] = gdf.groupby("pixel_id")["new_episode"].cumsum()

    # aggregate episodes
    episodes = (
        gdf.groupby(["pixel_id", "episode_id"])
        .agg(
            lon=("lon_round", "first"),
            lat=("lat_round", "first"),
            start_time=("timestamp", "min"),
            end_time=("timestamp", "max"),
            detections=("timestamp", "count"),
            satellites=("Satellite", lambda x: sorted(x.dropna().unique())),
            max_frp=("FRP", "max"),
            mean_frp=("FRP", "mean"),
        )
        .reset_index()
    )

    episodes["duration_hours"] = (
        episodes["end_time"] - episodes["start_time"]
    ).dt.total_seconds() / 3600.0
    episodes["duration_days"] = episodes["duration_hours"] / 24.0

    # back to GeoDataFrame
    episodes_gdf = gpd.GeoDataFrame(
        episodes,
        geometry=[Point(xy) for xy in zip(episodes["lon"], episodes["lat"])],
        crs="EPSG:4326",
    )

    return episodes_gdf

def process_fire_year_for_ca(kml_path, ca_polygon_gdf):
    """
    Full pipeline for a single year:
    - parse KML
    - add timestamp
    - filter to California only
    - create episodes
    """
    # 1. parse
    gdf = parse_hms_fire_kml_to_geodf(kml_path)

    # 2. add timestamp
    gdf = add_timestamp_from_yearday_time(gdf)

    # 3. filter to California via spatial join
    gdf_ca = gpd.sjoin(
        gdf,
        ca_polygon_gdf[["geometry"]],
        how="inner",
        predicate="intersects",
    ).drop(columns=["index_right"])

    # 4. build episodes (CA only)
    episodes_ca = fire_pixels_to_episodes(gdf_ca)

    return episodes_ca


In [23]:
all_years = []

for year in range(2020, 2026):
    kml_file = f"data/hms_fire{year}.kml"
    print(f"\nProcessing {kml_file} ...")

    episodes_ca = process_fire_year_for_ca(kml_file, ca)
    episodes_ca["year"] = year

    print(f"  → {len(episodes_ca)} episodes in CA")
    all_years.append(episodes_ca)



Processing data/hms_fire2020.kml ...
  → 482158 episodes in CA

Processing data/hms_fire2021.kml ...
  → 403359 episodes in CA

Processing data/hms_fire2022.kml ...
  → 57605 episodes in CA

Processing data/hms_fire2023.kml ...
  → 67568 episodes in CA

Processing data/hms_fire2024.kml ...
  → 120342 episodes in CA

Processing data/hms_fire2025.kml ...


FileNotFoundError: [Errno 2] No such file or directory: 'data/hms_fire2025.kml'

In [24]:
final_fire_episodes_ca = pd.concat(all_years, ignore_index=True)

print("\nFINAL SHAPE:", final_fire_episodes_ca.shape)

final_fire_episodes_ca["episode_key"] = (
    final_fire_episodes_ca["year"].astype(str)
    + "_"
    + final_fire_episodes_ca["episode_id"].astype(str)
)



FINAL SHAPE: (1131032, 14)


In [31]:
final_fire_episodes_ca.to_parquet("data/fire_episodes_2020_2025.parquet", index=False)