# WiDS Datathon 2026 Notebook
This is where your analysis begins. Use this notebook for EDA, modeling, and explanations.

# WiDS Datathon 2026 – Route 1: Accelerating Equitable Evacuations

**Goal:** Identify evacuation alert delays and build an early-trigger recommendation baseline using WatchDuty data.

**Core KPI:** Minutes of lead time gained (how much earlier we could trigger protective action).



In [4]:
import pandas as pd
import numpy as np
import json

pd.set_option("display.max_columns", 200)




In [6]:
events = pd.read_csv("geo_events_geoevent.csv", low_memory=False)
changelog = pd.read_csv("geo_events_geoeventchangelog.csv", low_memory=False)

events = events[events["geo_event_type"] == "wildfire"].copy()

events["date_created"] = pd.to_datetime(events["date_created"], errors="coerce")
events["date_modified"] = pd.to_datetime(events["date_modified"], errors="coerce")
changelog["date_created"] = pd.to_datetime(changelog["date_created"], errors="coerce")

print("events:", events.shape, "changelog:", changelog.shape)
events.head(2)

FileNotFoundError: [Errno 2] No such file or directory: 'geo_events_geoevent.csv'

In [1]:
# Citim changelog-ul evenimentelor (timeline de modificări + semnale radio)
ge_changelog = pd.read_csv("geo_events_geoeventchangelog.csv", low_memory=False)

NameError: name 'pd' is not defined

In [None]:
# Citim maparea dintre incendii (geo_event_id) și zone (evac_zone_id)
map_event_zone = pd.read_csv("evac_zone_status_geo_event_map.csv", low_memory=False)

In [None]:
import sys
import csv

# Increase the CSV field size limit to handle potentially large fields
csv.field_size_limit(sys.maxsize)

# Citim changelog-ul zonelor de evacuare (status changes: warning -> order etc.)
evac_changelog = pd.read_csv(
    "evac_zones_gis_evaczonechangelog.csv",
    engine="python",
    on_bad_lines='skip' # Skip malformed lines
)


In [None]:
#  Citim perimetrele focului; nu e obligatoriu în MVP
fire_perimeters = pd.read_csv(
    "fire_perimeters_gis_fireperimeter.csv",
    engine="python",
    on_bad_lines='skip' # Skip malformed lines
)

In [None]:
# Filtrăm doar evenimentele de tip "wildfire"
geo_events = geo_events[geo_events["geo_event_type"] == "wildfire"].copy()

# Convertim timpii în datetime ca să putem face calcule pe timp
geo_events["date_created"] = pd.to_datetime(geo_events["date_created"], errors="coerce")
geo_events["date_modified"] = pd.to_datetime(geo_events["date_modified"], errors="coerce")

# Pentru changelog: convertim timestamp-ul
ge_changelog["date_created"] = pd.to_datetime(ge_changelog["date_created"], errors="coerce")

# Pentru evac changelog: convertim timestamp-ul
evac_changelog["date_created"] = pd.to_datetime(evac_changelog["date_created"], errors="coerce")

print("Wildfires:", geo_events.shape)
geo_events.head(3)

Wildfires: (61779, 17)


Unnamed: 0,id,date_created,date_modified,geo_event_type,name,is_active,description,address,lat,lng,data,notification_type,external_id,external_source,incident_id,reporter_managed,is_visible
0,76,2021-08-11 00:09:56.481066,2023-02-09 20:34:24.180117,wildfire,Todd Fire,0,,"Llano Rd & Todd Rd, Santa Rosa, California 95...",38.3861,-122.769299,"{""is_fps"": false, ""acreage"": 50, ""containment""...",normal,1008375748,pulsepoint,76.0,1,0
1,77,2021-08-11 07:21:46.054995,2023-02-09 20:34:24.225186,wildfire,Vegetation Fire,0,,"Guerneville Rd & W Steele Ln, Santa Rosa, CA 9...",38.459967,-122.728927,"{""is_fps"": false, ""acreage"": 0, ""containment"":...",normal,1008583077,pulsepoint,77.0,1,0
2,78,2021-08-11 21:02:16.301416,2023-02-09 20:34:24.266124,wildfire,Ford Fire,0,,"CA-1 & Valley Ford Estero Rd, Valley Ford, CA ...",38.318328,-122.925698,"{""is_fps"": false, ""acreage"": 0, ""containment"":...",normal,1008957703,pulsepoint,78.0,1,0


In [None]:
# Transformăm string-ul JSON din "changes" în dict Python, ca să putem citi cheile ușor
ge_changelog["changes_dict"] = ge_changelog["changes"].apply(
    lambda x: json.loads(x) if pd.notnull(x) else {}
)

# Verificăm primele rânduri
ge_changelog[["date_created", "geo_event_id", "changes_dict"]].head()

Unnamed: 0,date_created,geo_event_id,changes_dict
0,2024-06-19 00:30:52.819841,22339.0,"{'name': ['Vegetation Fire', 'Charlotte Fire']}"
1,2024-06-19 00:31:10.439467,22338.0,"{'data.links': [[], [{'label': 'Pulsepoint Inc..."
2,2024-06-19 00:31:25.085624,22339.0,"{'address': ['W Ave C & 110th St W, Antelope A..."
3,2024-06-19 00:31:45.603726,22339.0,{}
4,2024-06-19 00:31:51.368137,22339.0,"{'data.acreage': [None, 3]}"


In [None]:
# Numărăm cheile din changes_dict ca să vedem ce semnale avem în date
key_counter = Counter()
for d in ge_changelog["changes_dict"]:
    key_counter.update(d.keys())

# Listăm cheile radio traffic existente
radio_keys = [k for k in key_counter.keys() if "radio_traffic" in k]

print("Radio keys found:", radio_keys)

# Procentul de update-uri care conțin radio traffic (semnale rare, dar puternice)
radio_share = ge_changelog["changes"].str.contains("radio_traffic", na=False).mean()
print("Share of updates with radio_traffic:", radio_share)

Radio keys found: ['radio_traffic_indicates_structure_threat', 'radio_traffic_indicates_spotting', 'radio_traffic_indicates_rate_of_spread']
Share of updates with radio_traffic: 0.05538425379273295


In [None]:
def is_credible_signal(change):
    """
    Returnează True dacă un update conține un semnal credibil de escaladare.
    """
    # Dacă nu e dict, nu putem căuta chei
    if not isinstance(change, dict):
        return False

    # Semnal puternic: amenințare la structuri
    if "radio_traffic_indicates_structure_threat" in change:
        return True

    # Semnal puternic: spotting (focuri secundare)
    if "radio_traffic_indicates_spotting" in change:
        return True

    # Semnal de viteză de propagare
    if "radio_traffic_indicates_rate_of_spread" in change:
        val = change.get("radio_traffic_indicates_rate_of_spread")  # luăm valoarea
        if isinstance(val, list):  # uneori e [vechi, nou]
            val = val[-1]          # luăm noul
        if val in ["moderate", "rapid", "very_rapid", "extreme"]:
            return True

    # Escaladare în notification_type = normal
    if "notification_type" in change:
        val = change.get("notification_type")
        if isinstance(val, list) and val[-1] == "normal":
            return True

    return False


# Aplicăm funcția pe fiecare rând -> coloană booleană
ge_changelog["credible_signal"] = ge_changelog["changes_dict"].apply(is_credible_signal)

# Calculăm T0 = primul moment cu semnal credibil per incendiu
T0 = (
    ge_changelog[ge_changelog["credible_signal"]]
    .groupby("geo_event_id")["date_created"]
    .min()
    .reset_index()
    .rename(columns={"date_created": "T0_first_signal"})
)

print("T0 computed for events:", T0.shape)
T0.head()

T0 computed for events: (8451, 2)


Unnamed: 0,geo_event_id,T0_first_signal
0,88.0,2021-08-13 19:18:23.648715
1,89.0,2021-08-14 00:25:21.551458
2,99.0,2021-08-18 19:40:59.623819
3,103.0,2021-08-21 21:20:54.254949
4,117.0,2021-08-27 01:52:13.609321


In [None]:
# Transformăm changes în dict pentru a detecta schimbări de status
evac_changelog["changes_dict"] = evac_changelog["changes"].apply(
    lambda x: json.loads(x) if pd.notnull(x) else {}
)

def is_evacuation_order(change):
    """
    Returnează True dacă un update setează statusul zonei la 'order'.
    """
    if not isinstance(change, dict):
        return False

    # În dataset, statusul e urmărit prin cheia 'status' (conform dicționarului)
    if "status" in change:
        val = change.get("status")
        if isinstance(val, list):
            val = val[-1]
        if val == "order":
            return True

    return False


# Marcăm rândurile care sunt "Evacuation Order"
evac_changelog["is_evac_order"] = evac_changelog["changes_dict"].apply(is_evacuation_order)

# T2 = primul moment când o zonă primește "order"
T2 = (
    evac_changelog[evac_changelog["is_evac_order"]]
    .groupby("evac_zone_id")["date_created"]
    .min()
    .reset_index()
    .rename(columns={"date_created": "T2_evac_order"})
)

print("Evac orders (zones):", T2.shape)
T2.head()

Evac orders (zones): (0, 2)


Unnamed: 0,evac_zone_id,T2_evac_order


In [None]:
# Ne uităm la coloanele din map ca să știm cum se numesc exact
print(map_event_zone.columns)

# În mod normal ai geo_event_id și evac_zone_id (dacă diferă, îmi spui cum apar)
map_event_zone.head(3)

Index(['date_created', 'uid_v2', 'geo_event_id'], dtype='object')


Unnamed: 0,date_created,uid_v2,geo_event_id
0,2023-10-25 11:28:15.254517,sierra-CA_US-SIE-E062-778dddb3cad71e9f,14107
1,2023-10-25 11:28:15.254550,sierra-CA_US-SIE-E063-f59f54f1edf93b15,14107
2,2023-10-25 11:28:15.254565,sierra-CA_US-SIE-E060-64dbd0e15d47d9bb,14107


In [None]:
from collections import Counter

k = Counter()
for d in evac_changelog["changes_dict"]:
    k.update(d.keys())

k.most_common(30)

[('geom', 3112), ('is_active', 1488), ('status', 1479)]

In [None]:
def is_evacuation_order(change):
    # dacă nu e dict, sigur nu e order
    if not isinstance(change, dict):
        return False

    # verificăm mai multe câmpuri posibile
    possible_keys = ["status", "external_status"]

    for key in possible_keys:
        if key in change:
            val = change.get(key)

            # dacă e listă, luăm ultima valoare (noul status)
            if isinstance(val, list):
                val = val[-1]

            # normalizăm la string mic
            if isinstance(val, str):
                v = val.lower()

                # căutăm "order" în text (ex: "Evacuation Order", "ORDER", etc.)
                if "order" in v:
                    return True

    return False

In [None]:
evac_changelog["is_evac_order"] = evac_changelog["changes_dict"].apply(is_evacuation_order)
evac_changelog["is_evac_order"].sum()

np.int64(208)

In [None]:
print(evac_changelog.columns)

Index(['id', 'date_created', 'changes', 'evac_zone_id', 'changes_dict',
       'is_evac_order'],
      dtype='object')


In [None]:
# Afișăm numele coloanelor ca să știm ce ID de zonă există în evac_changelog
print(evac_changelog.columns)

Index(['id', 'date_created', 'changes', 'evac_zone_id', 'changes_dict',
       'is_evac_order'],
      dtype='object')


In [None]:
# Convertim timpul în datetime (dacă nu e deja)
evac_changelog["date_created"] = pd.to_datetime(evac_changelog["date_created"], errors="coerce")

# Păstrăm doar rândurile care sunt evacuation orders
orders = evac_changelog[evac_changelog["is_evac_order"]].copy()

# Pentru fiecare zonă (evac_zone_id), luăm primul moment când apare order (T2)
T2 = (
    orders.groupby("evac_zone_id")["date_created"]
    .min()
    .reset_index()
    .rename(columns={"date_created": "T2_evac_order"})
)

print("T2 shape:", T2.shape)
T2.head()


T2 shape: (186, 2)


Unnamed: 0,evac_zone_id,T2_evac_order
0,4602,2025-06-13 00:30:51.576399+00:00
1,4616,2025-06-13 00:47:27.312292+00:00
2,4851,2025-06-28 02:04:23.204358+00:00
3,5482,2025-06-04 00:37:53.270287+00:00
4,6288,2025-06-18 20:45:46.776645+00:00


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
print(evac_changelog.columns)

Index(['id', 'date_created', 'changes', 'evac_zone_id', 'changes_dict',
       'is_evac_order'],
      dtype='object')


In [None]:
print(T2.shape)
T2.head()

(186, 2)


Unnamed: 0,evac_zone_id,T2_evac_order
0,4602,2025-06-13 00:30:51.576399+00:00
1,4616,2025-06-13 00:47:27.312292+00:00
2,4851,2025-06-28 02:04:23.204358+00:00
3,5482,2025-06-04 00:37:53.270287+00:00
4,6288,2025-06-18 20:45:46.776645+00:00


In [None]:
# Citim doar coloanele necesare (id, uid_v2) ca să evităm problemele din geom
evac_zones = pd.read_csv(
    "evac_zones_gis_evaczone.csv",
    sep=",",
    engine="python",
    quoting=3,
    usecols=["id", "uid_v2"]   # citim DOAR ce ne trebuie
)

# Construim tabelul de legătură evac_zone_id -> uid_v2
zone_link = evac_zones.rename(columns={"id": "evac_zone_id"}).copy()

print("zone_link loaded ✅", zone_link.shape)
zone_link.head()

zone_link loaded ✅ (5697, 2)


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,evac_zone_id,uid_v2
1,2025-04-10 13:22:22.896704+00,2025-08-19 01:58:01.009206+00,boulder-CO_US-BO-030-cc87104888faec63,True,BO-030,boulder-CO_US
2,2025-04-10 13:22:22.896921+00,2025-08-19 01:58:01.00957+00,boulder-CO_US-BO-023-bf0c7318a31e33c2,True,BO-023,boulder-CO_US
3,2025-04-10 13:22:22.897002+00,2025-08-19 01:58:01.009741+00,boulder-CO_US-BO-031-d365d7b1d6a30d16,True,BO-031,boulder-CO_US
4,2025-04-10 13:22:22.897075+00,2025-08-19 01:58:01.009867+00,boulder-CO_US-BO-005-ccd5de89e7c9e57a,True,BO-005,boulder-CO_US
5,2025-04-10 13:22:22.897156+00,2025-08-19 01:58:01.010002+00,boulder-CO_US-BO-028-2693ee3d2f362647,True,BO-028,boulder-CO_US
