Here you will need to integrate the data I pulled from NASA FIRMS into your overall master CSV.

You must write a script that takes a fire coordinate and finds the closest weather station.

You must generate "No Fire" rows (random dates/locations where no fire occurred) so the model has something to compare against.

FINAL NOTE: Whatever columns you include, record them in the Google Doc in this same folder, listing where each column came from and what
its values represent. This will help with documentation and help the model team along as well.'''

In [3]:
!pip install "meteostat<2.0"
import pandas as pd
import numpy as np
from meteostat import Stations, Daily, Point #this is used to help you guys pull all the data from the relevant stations within California



In [4]:
# This script is used to clean the Archive FIRMS Data
#
# Overview: load INPUT_CSV, normalize the header names, drop duplicate and
# low-quality detections, build a single `timestamp_utc` column from
# acq_date + acq_time, drop columns that are not needed afterwards, and
# write the result to OUTPUT_CSV.

# -----------------------------
# CONFIG
# -----------------------------
INPUT_CSV = "fire_archive_SV-C2_710372.csv"
OUTPUT_CSV = "fire_archive_cleaned.csv"

# -----------------------------
# LOAD DATA
# -----------------------------
df = pd.read_csv(INPUT_CSV)

# -----------------------------
# STANDARDIZE COLUMN NAMES
# -----------------------------
# Lowercase and strip the headers so every later lookup can use the
# lowercase column names.
df.columns = df.columns.str.lower().str.strip()

# -----------------------------
# DROP DUPLICATES
# -----------------------------
df = df.drop_duplicates()

# -----------------------------
# QUALITY FILTERS
# -----------------------------
# Each filter is skipped when its column is absent from the input.
# Keep only nominal & high-confidence detections
if "confidence" in df.columns:
    df = df[df["confidence"].isin(["n", "h"])]

# Remove obviously bad FRP values
if "frp" in df.columns:
    df = df[df["frp"] > 0]

# Remove bad brightness values
if "bright_ti4" in df.columns:
    df = df[df["bright_ti4"] > 0]

# -----------------------------
# DATE / TIME HANDLING
# -----------------------------
# Convert acq_date to datetime
# errors="coerce" turns unparseable dates into NaT; those rows are kept.
df["acq_date"] = pd.to_datetime(df["acq_date"], errors="coerce")

# Convert acq_time to HH:MM format
# Zero-pad to 4 characters so the first two characters are the hour and
# the rest the minute (assumes acq_time is an HHMM value stored without
# leading zeros -- TODO confirm against the FIRMS file).
df["acq_time"] = df["acq_time"].astype(str).str.zfill(4)
df["acq_hour"] = df["acq_time"].str[:2].astype(int)
df["acq_minute"] = df["acq_time"].str[2:].astype(int)

# Combine into a single timestamp
# NOTE(review): the column is named *_utc; this assumes the FIRMS
# acquisition date/time are UTC -- confirm.
df["timestamp_utc"] = (
    df["acq_date"]
    + pd.to_timedelta(df["acq_hour"], unit="h")
    + pd.to_timedelta(df["acq_minute"], unit="m")
)

# -----------------------------
# GEOGRAPHIC SANITY CHECKS
# -----------------------------
# Only rejects coordinates outside the valid global lat/lon ranges; it
# does not restrict rows to any particular region.
df = df[
    (df["latitude"].between(-90, 90)) &
    (df["longitude"].between(-180, 180))
]

# -----------------------------
# DROP UNUSED / REDUNDANT COLUMNS
# -----------------------------
columns_to_drop = [
    "acq_time",        # replaced by timestamp
    "bright_ti5",      # often redundant
    "version",         # metadata
    "confidence"       # already filtered
]

# Only drop the columns that actually exist, so a missing one is not an error.
df = df.drop(columns=[c for c in columns_to_drop if c in df.columns])

# -----------------------------
# SORT + RESET INDEX
# -----------------------------
df = df.sort_values("timestamp_utc").reset_index(drop=True)

# -----------------------------
# SAVE CLEAN FILE
# -----------------------------
df.to_csv(OUTPUT_CSV, index=False)

print(f"Cleaned data saved to: {OUTPUT_CSV}")
print(f"Final row count: {len(df)}")

Cleaned data saved to: fire_archive_cleaned.csv
Final row count: 574297


In [5]:
# Because the NRT data is more sensitive to clean, this cell is dedicated
# to the NRT dataset.
#
# Overview: unlike the archive cell, questionable rows are flagged rather
# than dropped (frp_valid / bright_valid), confidence is mapped to a
# numeric level, and only acq_time is removed before saving.

# -----------------------------
# CONFIG
# -----------------------------
INPUT_CSV = "fire_nrt_SV-C2_710372.csv"
OUTPUT_CSV = "fire_nrt_cleaned.csv"

# -----------------------------
# LOAD DATA
# -----------------------------
df = pd.read_csv(INPUT_CSV)
# Lowercase and strip the headers so later lookups use lowercase names.
df.columns = df.columns.str.lower().str.strip()

# -----------------------------
# DROP DUPLICATES
# -----------------------------
df = df.drop_duplicates()

# -----------------------------
# BASIC SANITY CHECKS (NOT AGGRESSIVE)
# -----------------------------
# Only rows with coordinates outside the valid lat/lon ranges are removed.
df = df[
    (df["latitude"].between(-90, 90)) &
    (df["longitude"].between(-180, 180))
]

# -----------------------------
# FRP / BRIGHTNESS SANITY
# (flag, don't drop)
# -----------------------------
# NOTE(review): "frp" is accessed without the column-existence check used
# for "bright_ti4" below; this raises KeyError if the column is missing.
df["frp_valid"] = True
df.loc[df["frp"] <= 0, "frp_valid"] = False

if "bright_ti4" in df.columns:
    df["bright_valid"] = True
    df.loc[df["bright_ti4"] <= 0, "bright_valid"] = False

# -----------------------------
# CONFIDENCE HANDLING
# -----------------------------
# Convert confidence to numeric scale
confidence_map = {
    "l": 0,
    "n": 1,
    "h": 2
}

# Values that are not keys of confidence_map become NaN after .map().
if "confidence" in df.columns:
    df["confidence_level"] = df["confidence"].map(confidence_map)
else:
    df["confidence_level"] = None

# -----------------------------
# DATE / TIME HANDLING
# -----------------------------
# errors="coerce" turns unparseable dates into NaT; those rows are kept.
df["acq_date"] = pd.to_datetime(df["acq_date"], errors="coerce")

# Zero-pad to 4 characters so the first two characters are the hour and
# the rest the minute (assumes acq_time is an HHMM value -- TODO confirm).
df["acq_time"] = df["acq_time"].astype(str).str.zfill(4)
df["acq_hour"] = df["acq_time"].str[:2].astype(int)
df["acq_minute"] = df["acq_time"].str[2:].astype(int)

# NOTE(review): named *_utc on the assumption that FIRMS times are UTC.
df["timestamp_utc"] = (
    df["acq_date"]
    + pd.to_timedelta(df["acq_hour"], unit="h")
    + pd.to_timedelta(df["acq_minute"], unit="m")
)

# -----------------------------
# SATELLITE / INSTRUMENT FLAGS
# -----------------------------
# True when the instrument string contains "VIIRS"; missing values -> False.
if "instrument" in df.columns:
    df["is_viirs"] = df["instrument"].str.contains("VIIRS", na=False)

# -----------------------------
# DROP ONLY TRUE REDUNDANCY
# -----------------------------
columns_to_drop = [
    "acq_time",   # replaced by timestamp
]

df = df.drop(columns=[c for c in columns_to_drop if c in df.columns])

# -----------------------------
# SORT + SAVE
# -----------------------------
df = df.sort_values("timestamp_utc").reset_index(drop=True)
df.to_csv(OUTPUT_CSV, index=False)

print(f"NRT cleaned data saved to: {OUTPUT_CSV}")
print(f"Final row count: {len(df)}")

NRT cleaned data saved to: fire_nrt_cleaned.csv
Final row count: 4652


In [6]:
# Cross-checking FIRMS Archive Data with 5 most recent fires from CAL FIRE
''' (I didn't know how to convert the data from CAL FIRE into a CSV file,
so I wrote it by hand. I added it to the Raw Data folder on the Drive.
The title is "cal_fire_top_5.csv")
-Arjun '''

# Import geodesic for easier distance calculation
from geopy.distance import geodesic

# -----------------------------
# CONFIG
# -----------------------------
# For each CAL FIRE incident, collect every cleaned FIRMS detection whose
# timestamp lies within TIME_WINDOW_DAYS of the incident start date and
# whose geodesic distance to the incident is at most MAX_DISTANCE_KM.
# One output row is written per matching detection.
FIRMS_CSV = "fire_archive_cleaned.csv"
CALFIRE_CSV = "cal_fire_top_5.csv"
OUTPUT_CSV = "firms_calfire_cross_reference.csv"

MAX_DISTANCE_KM = 10      # spatial tolerance
TIME_WINDOW_DAYS = 1      # ± days around start date

# -----------------------------
# LOAD DATA
# -----------------------------
firms = pd.read_csv(FIRMS_CSV, parse_dates=["timestamp_utc"])
calfire = pd.read_csv(CALFIRE_CSV, parse_dates=["start_date"])

# One dict per (incident, FIRMS detection) match.
results = []

# -----------------------------
# CROSS-REFERENCE
# -----------------------------
for _, fire in calfire.iterrows():
    # geodesic() is given (latitude, longitude) tuples.
    fire_point = (fire["latitude"], fire["longitude"])

    # time window
    start = fire["start_date"] - pd.Timedelta(days=TIME_WINDOW_DAYS)
    end = fire["start_date"] + pd.Timedelta(days=TIME_WINDOW_DAYS)

    # FIRMS detections in time window
    firms_subset = firms[
        (firms["timestamp_utc"] >= start) &
        (firms["timestamp_utc"] <= end)
    ]

    # The distance is computed for every detection in the time window;
    # only those within the spatial tolerance are recorded.
    for _, f in firms_subset.iterrows():
        firms_point = (f["latitude"], f["longitude"])
        distance_km = geodesic(fire_point, firms_point).km

        if distance_km <= MAX_DISTANCE_KM:
            results.append({
                "incident_name": fire["incident_name"],
                "incident_lat": fire["latitude"],
                "incident_lon": fire["longitude"],
                "firms_lat": f["latitude"],
                "firms_lon": f["longitude"],
                "distance_km": round(distance_km, 2),
                "firms_time": f["timestamp_utc"]
            })

# -----------------------------
# SAVE RESULTS
# -----------------------------
matches_df = pd.DataFrame(results)
matches_df.to_csv(OUTPUT_CSV, index=False)

print(f"Cross-reference saved to: {OUTPUT_CSV}")
print(f"Total matches found: {len(matches_df)}")


Cross-reference saved to: firms_calfire_cross_reference.csv
Total matches found: 39


In [7]:
# This script adds the seed zones into the cleaned up FIRMS Archive File
''' I got a bit of help for this because I'm not the best with geopandas.
If anything seems off after running this script, please let me know and
I'll fix it asap.
-Arjun '''

# Importing geopandas library due to geojson file of seed zones
import geopandas as gpd

# -----------------------------
# LOAD DATA
# -----------------------------
# Tags every cleaned FIRMS detection with the seed-zone polygon it falls
# in and writes the result to fire_firms_with_seed_zones.csv.
firms = pd.read_csv("fire_archive_cleaned.csv")

# Convert FIRMS to GeoDataFrame
# points_from_xy takes x (longitude) first, then y (latitude).
firms_gdf = gpd.GeoDataFrame(
    firms,
    geometry=gpd.points_from_xy(firms.longitude, firms.latitude),
    crs="EPSG:4326"
)

# Load seed zone polygons
seed_zones = gpd.read_file("California_Seed_Zones_3280520806235389701.geojson")

# Ensure same CRS
seed_zones = seed_zones.to_crs(firms_gdf.crs)

# -----------------------------
# SPATIAL JOIN
# -----------------------------
# how="left" keeps detections that fall in no polygon; their seed-zone
# columns are left empty.
# NOTE(review): sjoin also adds an `index_right` column, which is not
# removed below and therefore ends up in the CSV -- confirm it is wanted.
firms_with_seed = gpd.sjoin(
    firms_gdf,
    seed_zones[["SEED_ZONE", "REGION", "SUBREGION", "SUBZONE", "geometry"]],
    how="left",
    predicate="within"
)

# Drop geometry for CSV output
firms_with_seed = firms_with_seed.drop(columns="geometry")

firms_with_seed.to_csv("fire_firms_with_seed_zones.csv", index=False)

# If you want to locally download the finished file, delete the hashtag below:
# NOTE(review): `files` is not imported in this cell; presumably it comes
# from the hosted-notebook environment -- confirm before uncommenting.
# files.download("fire_firms_with_seed_zones.csv")

In [None]:
'''
Used the seed zone data to sample random points in California with no fire, each at least 5 km away from the nearest fire on that day. Longitude/latitude are first converted into metric units, since distance calculations work better that way. Ratio of fire rows to no-fire rows: 1:0.5
-Will
'''

from shapely.geometry import Point
from pyproj import Transformer
import random

# Load the fire detections (already tagged with seed zones) and derive a
# day-level `date` column that replaces `acq_date`.
fires = pd.read_csv("fire_firms_with_seed_zones.csv")
fires["date"] = pd.to_datetime(fires["acq_date"]).dt.normalize()
fires = fires.drop(columns=["acq_date"], errors="ignore")

# Label column: 1 marks a fire row (the sampled no-fire rows use 0).
fires["fire"] = 1

#load seed zones
seed_zones = gpd.read_file("California_Seed_Zones_3280520806235389701.geojson")

seed_cols = ["SEED_ZONE", "REGION", "SUBREGION", "SUBZONE", "geometry"]
seed_zones = seed_zones[seed_cols].to_crs(epsg=4326)

# Projected copy of the zones; polygon areas computed in this CRS are used
# as sampling weights, so larger zones are picked more often.
seed_zones_5070 = seed_zones.to_crs(epsg=5070)
seed_zones_5070["area"] = seed_zones_5070.geometry.area

zone_weights = seed_zones_5070["area"].to_numpy()

# projections
# always_xy=True fixes the axis order to (x, y), i.e. (lon, lat) on the
# EPSG:4326 side, regardless of the CRS's own axis-order definition.
to_5070 = Transformer.from_crs("EPSG:4326", "EPSG:5070", always_xy=True)
to_4326 = Transformer.from_crs("EPSG:5070", "EPSG:4326", always_xy=True)

def lonlat_to_5070(lon, lat):
    """Project a lon/lat position (EPSG:4326) into EPSG:5070.

    Delegates to the module-level ``to_5070`` transformer and returns the
    resulting ``(x, y)`` pair. Arguments are passed longitude first.
    """
    x, y = to_5070.transform(lon, lat)
    return x, y

def xy5070_to_lonlat(x, y):
    """Convert an EPSG:5070 ``(x, y)`` position back to EPSG:4326.

    Delegates to the module-level ``to_4326`` transformer and returns the
    resulting ``(lon, lat)`` pair.
    """
    lon, lat = to_4326.transform(x, y)
    return lon, lat

# add projected coords for fire rows
# The whole longitude/latitude columns are transformed in a single call:
# the transformer accepts arrays, so this avoids a Python-level call per
# row (DataFrame.apply with axis=1) on a table with hundreds of thousands
# of rows. The resulting x/y columns hold the same values as before.
fires["x"], fires["y"] = lonlat_to_5070(
    fires["longitude"].to_numpy(),
    fires["latitude"].to_numpy(),
)

def random_point(poly):
    """Return a random ``Point`` lying inside ``poly`` (rejection sampling).

    Candidates are drawn uniformly from the polygon's bounding box and
    redrawn until ``poly.contains`` accepts one.
    """
    x_lo, y_lo, x_hi, y_hi = poly.bounds
    candidate = Point(random.uniform(x_lo, x_hi), random.uniform(y_lo, y_hi))
    while not poly.contains(candidate):
        candidate = Point(random.uniform(x_lo, x_hi), random.uniform(y_lo, y_hi))
    return candidate
        
# Target number of no-fire rows: half the number of fire rows.
n = int(len(fires) * 0.5)
no_fire_rows = []

# Group once so each iteration can look up that day's fires directly.
fires_by_date = fires.groupby("date")
unique_dates = fires["date"].dropna().unique()

# Minimum allowed distance to any same-day fire, in the units of the
# projected x/y columns (meters, per the cell note above).
MIN_DIST_M = 5_000

# Rejection sampling: draw (date, zone, point) candidates and keep only
# those far enough from every fire detected on that date.
# NOTE(review): no random seed is set, so the sampled rows differ between
# runs, and the loop has no iteration cap if candidates keep being rejected.
while len(no_fire_rows) < n:
    #choose random day from fire dataset
    date = random.choice(unique_dates)
    day_fire = fires_by_date.get_group(date)

    #choose seed zone
    # Zones are weighted by area, so points are spread roughly evenly by
    # area rather than evenly per zone.
    zone_idx = random.choices(range(len(seed_zones_5070)), weights=zone_weights, k=1)[0]
    zone = seed_zones_5070.iloc[zone_idx]

    #sample point inside chosen zone
    p_xy = random_point(zone.geometry)
    x, y = p_xy.x, p_xy.y

    #convert back to lon/lat
    lon, lat = xy5070_to_lonlat(x, y)

    # distance to nearest same-day fire
    # Straight-line distance in the projected plane to each same-day fire.
    dist = np.sqrt((day_fire["x"] - x)**2 + (day_fire["y"] - y)**2)

    if dist.min() > MIN_DIST_M:
        no_fire_rows.append({
            "latitude": lat,
            "longitude": lon,
            "date": date,
            "fire": 0,
            "SEED_ZONE": zone["SEED_ZONE"],
            "REGION": zone["REGION"],
            "SUBREGION": zone["SUBREGION"],
            "SUBZONE": zone["SUBZONE"],
        })

no_fire = pd.DataFrame(no_fire_rows)
print("fires:", fires.shape, "no_fire:", no_fire.shape)
no_fire.head()

fires: (574297, 23) no_fire: (287148, 8)


Unnamed: 0,latitude,longitude,date,fire,SEED_ZONE,REGION,SUBREGION,SUBZONE
0,36.98241,-117.899236,2023-07-20,0,981,9,8,1
1,40.823712,-120.702148,2025-09-07,0,732,7,3,2
2,39.382897,-121.096158,2020-07-16,0,525,5,2,5
3,34.619255,-117.88868,2022-01-09,0,982,9,8,2
4,37.065257,-118.419327,2021-11-09,0,784,7,8,4


In [None]:


# Stack the fire rows and the sampled no-fire rows into one draft table.
# Columns that exist only in `fires` are left empty for the no-fire rows.
master_draft = pd.concat([fires, no_fire], ignore_index=True, sort=False)

# Put the label and key columns first; all remaining columns keep their
# existing relative order.
front = ["fire", "date", "latitude", "longitude", "SEED_ZONE", "REGION", "SUBREGION", "SUBZONE"]
cols = front + [c for c in master_draft.columns if c not in front]
master_draft = master_draft[cols]

# Save and report on `master_draft`, the frame built in this cell. No
# variable called `master` exists yet when the notebook runs top to bottom.
master_draft.to_csv("master_draft.csv", index=False)
print("master:", master_draft.shape)
master_draft.head()

master: (861445, 23)


Unnamed: 0,fire,date,latitude,longitude,SEED_ZONE,REGION,SUBREGION,SUBZONE,brightness,scan,...,bright_t31,frp,daynight,type,acq_hour,acq_minute,timestamp_utc,index_right,x,y
0,1,2020-01-01,37.54193,-120.73184,962.0,9.0,6.0,2.0,323.38,0.34,...,267.36,1.38,N,0.0,9.0,11.0,2020-01-01 09:11:00,41.0,-2140218.0,1891557.0
1,1,2020-01-01,36.35637,-114.91145,,,,,310.17,0.5,...,273.5,1.7,N,2.0,9.0,11.0,2020-01-01 09:11:00,,-1670516.0,1645415.0
2,1,2020-01-01,37.54229,-120.73358,962.0,9.0,6.0,2.0,322.64,0.34,...,266.92,1.75,N,0.0,9.0,11.0,2020-01-01 09:11:00,41.0,-2140355.0,1891635.0
3,1,2020-01-01,33.09716,-116.12562,986.0,9.0,8.0,6.0,296.45,0.52,...,275.92,0.8,N,0.0,9.0,12.0,2020-01-01 09:12:00,87.0,-1852828.0,1311066.0
4,1,2020-01-01,33.61496,-117.82188,995.0,9.0,9.0,5.0,303.05,0.43,...,282.23,1.09,N,2.0,9.0,12.0,2020-01-01 09:12:00,79.0,-1993187.0,1401808.0


In [None]:
from meteostat import Stations, Daily
from datetime import datetime

# COMPILING NEAREST WEATHER STATIONS
#
# Loads the draft master table and builds a three-column station catalog
# (station_id, station_lat, station_lon) used for nearest-station lookup.

# CONFIG
INPUT_CSV = "master_draft.csv"
OUTPUT_CSV = "master_with_stations.csv"

# load master csv
master_draft = pd.read_csv(INPUT_CSV, parse_dates=["date"])
master_draft["date"] = pd.to_datetime(master_draft["date"]).dt.date #join key

# GET STATION CATALOG (CALIFORNIA)
stations = (
    Stations()
    .region('US', 'CA')
    .inventory('daily')
    .fetch()
)

# Move the station identifier out of the index into a regular column.
# After reset_index it is named either "id" or "index", depending on
# whether the index was named; both cases are mapped to "station_id".
stations = stations.reset_index()
if "id" in stations.columns:
    stations = stations.rename(columns={"id": "station_id"})
elif "index" in stations.columns:
    stations = stations.rename(columns={"index": "station_id"})

# Prefix the coordinate columns so they cannot clash with the master
# table's own latitude/longitude columns.
stations = stations.rename(
    columns={
        "latitude": "station_lat",
        "longitude": "station_lon",
    }
)

stations = stations[["station_id", "station_lat", "station_lon"]]
# HAVERSINE DISTANCE (KM)
def haversine(lat1, lon1, lat2, lon2):
    """
    Haversine (great-circle) distance in kilometres on a sphere of
    radius 6371 km. All inputs are in degrees.

    lat2/lon2 may be numpy arrays, in which case an array with one
    distance per element is returned.
    """
    earth_radius_km = 6371.0
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))

    half_dphi = (phi2 - phi1) / 2.0
    half_dlam = (lam2 - lam1) / 2.0

    # Haversine term: squared half-chord length between the two points.
    chord = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    central_angle = 2 * np.arcsin(np.sqrt(chord))

    return earth_radius_km * central_angle

# ASSIGN NEAREST STATION TO UNIQUE LOCATIONS
# Work on distinct coordinate pairs only, so each location is looked up once.
coord_pairs = master_draft[["latitude", "longitude"]]
locs = coord_pairs.drop_duplicates()
locs = locs.reset_index(drop=True)

def find_nearest_station(row):
    """
    Return the id of, and distance to, the station closest to one location.

    `row` must provide "latitude" and "longitude" in degrees. The distance
    from that point to every entry of the module-level `stations` table is
    computed in one vectorized `haversine` call, and the smallest is chosen.

    Returns a pandas Series with two entries:
    "nearest_station_id" and "nearest_station_distance_km".
    """
    lat = row["latitude"]
    lon = row["longitude"]
    dists = haversine(lat, lon, stations["station_lat"].values, stations["station_lon"].values)
    # argmin is a position in the distance array, so the id is looked up by
    # position (.iloc) rather than by index label (.loc); this stays correct
    # even if `stations` does not carry a default 0..n-1 index.
    idx = int(dists.argmin())
    return pd.Series(
        {
            "nearest_station_id": stations["station_id"].iloc[idx],
            "nearest_station_distance_km": float(dists[idx]),
        }
    )

# Look up the nearest station once per unique location, then attach the
# two result columns next to the coordinates they belong to.
nearest_info = locs.apply(find_nearest_station, axis=1)
locs = pd.concat([locs, nearest_info], axis=1)

# Broadcast the per-location result back to every master row that shares
# the same coordinates.
master_draft = master_draft.merge(
    locs,
    on=["latitude", "longitude"],
    how="left",
)

master_draft.to_csv(OUTPUT_CSV, index=False)
# Report the row count of the frame that was just written (`master_draft`);
# `master` is not defined by this cell or any cell above it.
print("Saved:", OUTPUT_CSV, "rows:", len(master_draft))
master_draft[
    ["latitude", "longitude", "nearest_station_id", "nearest_station_distance_km"]].head()




Saved: master_with_stations.csv rows: 861445


Unnamed: 0,latitude,longitude,nearest_station_id,nearest_station_distance_km
0,37.54193,-120.73184,KMOD0,21.715945
1,36.35637,-114.91145,72380,178.718331
2,37.54229,-120.73358,KMOD0,21.560266
3,33.09716,-116.12562,KL080,25.582935
4,33.61496,-117.82188,KSNA0,8.000062


In [None]:


# FETCH OTHER METEOSTAT COLUMNS (DAILY WEATHER)
#
# For every station referenced in master_with_stations.csv, download the
# Meteostat daily series covering the full date range of the master table,
# rename the weather columns to wx_*, and left-join them onto the master
# rows by (nearest_station_id, date).

INPUT_CSV = "master_with_stations.csv"
OUTPUT_CSV = "master_with_weather.csv"

# Read the station ids explicitly as strings. Some ids are purely numeric
# (e.g. "72380") while others are not (e.g. "KMOD0"); without a dtype the
# CSV parser may infer integers for part of the column, which would no
# longer be the same value that was written out.
master = pd.read_csv(
    INPUT_CSV,
    parse_dates=["date"],
    dtype={"nearest_station_id": str},
)
master["date"] = pd.to_datetime(master["date"]).dt.date

#datetime bounds
start_date = min(master["date"])
end_date = max(master["date"])

# Daily() is given datetime objects, so the date bounds are widened to
# midnight datetimes.
start = datetime.combine(start_date, datetime.min.time())
end = datetime.combine(end_date, datetime.min.time())

all_daily = []
station_ids = master["nearest_station_id"].dropna().unique()

# One request per station for the whole period; stations that return no
# data are skipped and their rows simply get empty wx_* values after the
# left join below.
for sid in station_ids:
    daily = Daily(sid, start, end).fetch()
    if daily.empty:
        continue

    daily = daily.reset_index().rename(columns={"time": "date"})
    daily["nearest_station_id"] = sid

    daily = daily[
        [
            "nearest_station_id",
            "date",
            "tavg",
            "tmin",
            "tmax",
            "prcp",
            "wdir",
            "wspd",
            "wpgt",
            "pres",
            "tsun"
        ]
    ]
    all_daily.append(daily)

if not all_daily:
    raise RuntimeError("No Meteostat daily data for any station in use.")

weather = pd.concat(all_daily, ignore_index=True)

# RENAME TO wx_* AND ALIGN TYPES
weather = weather.rename(
    columns={
        "tavg": "wx_tavg_c",
        "tmin": "wx_tmin_c",
        "tmax": "wx_tmax_c",
        "prcp": "wx_prcp_mm",
        "wdir": "wx_wdir_deg",
        "wspd": "wx_wspd_kmh",
        "wpgt": "wx_wpgt_kmh",
        "pres": "wx_pres_hpa",
        "tsun": "wx_tsun_min"
    }
)

# Same plain-date type as master["date"], so the join keys compare equal.
weather["date"] = pd.to_datetime(weather["date"]).dt.date


# JOIN WEATHER ONTO FIRMS

master_weather = master.merge(
    weather,
    on=["nearest_station_id", "date"],
    how="left",
)

master_weather.to_csv(OUTPUT_CSV, index=False)
print("Saved:", OUTPUT_CSV, "rows:", len(master_weather))

master_weather.head()



Saved: master_with_weather.csv rows: 861445


Unnamed: 0,fire,date,latitude,longitude,SEED_ZONE,REGION,SUBREGION,SUBZONE,brightness,scan,...,wx_tavg_c,wx_tmin_c,wx_tmax_c,wx_prcp_mm,snow,wx_wdir_deg,wx_wspd_ms,wx_wpgt_kmh,wx_pres_hpa,wx_tsun_min
0,1,2020-01-01,37.54193,-120.73184,962.0,9.0,6.0,2.0,323.38,0.34,...,9.2,3.9,15.6,0.0,,,6.0,,1019.8,
1,1,2020-01-01,36.35637,-114.91145,,,,,310.17,0.5,...,8.7,4.4,13.9,0.0,,,6.3,,1014.9,
2,1,2020-01-01,37.54229,-120.73358,962.0,9.0,6.0,2.0,322.64,0.34,...,9.2,3.9,15.6,0.0,,,6.0,,1019.8,
3,1,2020-01-01,33.09716,-116.12562,986.0,9.0,8.0,6.0,296.45,0.52,...,,,,,,,,,,
4,1,2020-01-01,33.61496,-117.82188,995.0,9.0,9.0,5.0,303.05,0.43,...,14.8,10.0,19.4,,,,5.6,,1015.9,


In [None]:
'''
Check percentage of NA and NaN just in case
'''

# Share of missing values per selected weather column, followed by the
# table size and the number of distinct stations referenced.
wx_check_cols = ["wx_tmax_c", "wx_prcp_mm", "wx_wspd_kmh", "wx_pres_hpa", "wx_tsun_min"]
na_share = master_weather[wx_check_cols].isna().mean()
print(na_share)

row_count = len(master_weather)
print("rows:", row_count)

station_count = master_weather["nearest_station_id"].nunique()
print("stations used:", station_count)

wx_tmax_c      0.136060
wx_prcp_mm     0.263890
wx_wspd_kmh    0.147582
wx_pres_hpa    0.215555
wx_tsun_min    1.000000
dtype: float64
rows: 861445
stations used: 153


In [12]:
''' Got my data from LANDFIRE: extracted the CONUS .tif rasters for Existing Vegetation Type, Height, and Cover (denoted evt, evh, evc respectively) and filtered them to contain only California data (all done in a separate doc, since I couldn't download California-only data directly). The code below was mostly done with the help of chat, since I've never worked with rasters and .tif files
-Will '''

import rasterio

# Load the weather-enriched master table and give every row a point
# geometry (longitude as x, latitude as y) in EPSG:4326.
df = pd.read_csv("master_with_weather.csv")

point_geometry = gpd.points_from_xy(df["longitude"], df["latitude"])
gdf = gpd.GeoDataFrame(df, geometry=point_geometry, crs="EPSG:4326")

def add_raster_values(gdf, raster_path, out_col):
    """Sample the first band of a raster at every point of ``gdf``.

    The points are reprojected to the raster's CRS for sampling only; the
    sampled values are written to ``gdf[out_col]`` on the frame that was
    passed in, which is also returned. Cells equal to the raster's nodata
    value (when one is defined) are stored as ``None``.
    """
    with rasterio.open(raster_path) as src:
        projected = gdf.to_crs(src.crs)
        xy_pairs = [(pt.x, pt.y) for pt in projected.geometry]

        # src.sample yields one array per point; index 0 is the first band.
        sampled = [band_values[0] for band_values in src.sample(xy_pairs)]

        nodata = src.nodata
        if nodata is not None:
            cleaned = []
            for value in sampled:
                cleaned.append(None if value == nodata else value)
            sampled = cleaned

        gdf[out_col] = sampled
    return gdf

# Sample the three LANDFIRE rasters in turn, adding one column per raster.
raster_columns = (
    ("evt_ca.tif", "lf_evt"),
    ("evc_ca.tif", "lf_evc"),
    ("evh_ca.tif", "lf_evh"),
)
for raster_path, out_col in raster_columns:
    gdf = add_raster_values(gdf, raster_path, out_col)




In [None]:
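'''Merged the LANDFIRE EVT lookup table onto the master data to provide context on the vegetation values.
Columns present after this step: 'lf_evc', 'lf_evh',
       'EVT_NAME', 'EVT_FUEL', 'EVT_FUEL_N', 'EVT_LF', 'EVT_PHYS',
       'EVT_GP', 'EVT_GP_N', 'EVT_CLASS', 'EVT_SBCLS'.
('LFRDB', 'SAF_SRM' and 'EVT_ORDER' also come in with the lookup table, but are dropped again before saving.)
-Will 
'''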


# Open question: should rows without any sampled vegetation value be
# removed? Left disabled for now.
# gdf = gdf.dropna(subset=["lf_evt", "lf_evc", "lf_evh"]) # input?

# Attach the descriptive EVT attributes by matching the sampled lf_evt
# code against the lookup table's VALUE column.
evt_lookup = pd.read_csv("LF2024_EVT.csv")
# Cast the key to float before the merge.
# NOTE(review): presumably this is to match the dtype of lf_evt, which can
# contain missing values -- confirm.
evt_lookup["VALUE"] = evt_lookup["VALUE"].astype(float)
# The colour columns are not needed in the master table.
evt_lookup = evt_lookup.drop(columns=["R", "G", "B", "RED", "GREEN", "BLUE"])
gdf = gdf.merge(
    evt_lookup,
    left_on = "lf_evt",
    right_on = "VALUE",
    how="left"
)
# Remove the join keys, the unused lookup columns, and the geometry column
# before writing the CSV.
gdf = gdf.drop(columns=["lf_evt", "VALUE", "LFRDB", "EVT_ORDER", "SAF_SRM", "geometry"])

gdf.to_csv("master_final.csv", index=False)
print("Saved master_final.csv", gdf.shape)

Saved master_final.csv (861445, 46)


## New Columns Added
lf_evc : (Categorical) % of ground covered by live vegetation (categorical bins defined by LANDFIRE)

lf_evh : (Categorical) vertical structure of vegetation

EVT_NAME : Name of vegetation type

EVT_FUEL : Describes how the vegetation typically behaves as fuel source in wildfire contexts

EVT_FUEL_N : numeric counterpart to EVT_FUEL

EVT_LF : lifeform classification

EVT_PHYS : Describing physical structure of vegetation (e.g., forest, woodland, shrubland)

EVT_GP : numeric code representing vegetation group that aggregates multiple EVT classes into categories 

EVT_GP_N : descriptive name of the vegetation group coded in EVT_GP

EVT_CLASS : higher-level vegetation class grouping EVT types into generalized categories



In [27]:
# Reload the saved final table and display its shape as a sanity check
# that the file on disk matches what was written.
df = pd.read_csv("master_final.csv")
df.shape



(861445, 46)