# WiDS Datathon 2026 Notebook
This is where your analysis begins. Use this notebook for EDA, modeling, and explanations.

# WiDS Datathon 2026 – Route 1: Accelerating Equitable Evacuations

**Goal:** Identify evacuation alert delays and build an early-trigger recommendation baseline using WatchDuty data.

**Core KPI:** Minutes of lead time gained (how much earlier we could trigger protective action).



In [None]:
# Your code goes here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json          # pentru a interpreta string-uri JSON din coloana "changes"
from collections import Counter  # numărăm frecvențe de chei

# Setări pentru afișare (ca să vedem mai multe coloane)
pd.set_option("display.max_columns", 200)

# Setăm un seed pentru reproducibilitate (același random mereu)
np.random.seed(42)

print("Setup ready ✅")




Setup ready ✅


In [33]:
# Citim fișierul cu evenimente (incendii + locații); noi vom filtra doar wildfire
geo_events = pd.read_csv("geo_events_geoevent.csv", low_memory=False)

In [34]:
# Citim changelog-ul evenimentelor (timeline de modificări + semnale radio)
ge_changelog = pd.read_csv("geo_events_geoeventchangelog.csv", low_memory=False)

In [39]:
# Citim maparea dintre incendii (geo_event_id) și zone (evac_zone_id)
map_event_zone = pd.read_csv("evac_zone_status_geo_event_map.csv", low_memory=False)

In [43]:
import sys
import csv

# Increase the CSV field size limit to handle potentially large fields
csv.field_size_limit(sys.maxsize)

# Citim changelog-ul zonelor de evacuare (status changes: warning -> order etc.)
evac_changelog = pd.read_csv(
    "evac_zones_gis_evaczonechangelog.csv",
    engine="python",
    on_bad_lines='skip' # Skip malformed lines
)


In [45]:
#  Citim perimetrele focului; nu e obligatoriu în MVP
fire_perimeters = pd.read_csv(
    "fire_perimeters_gis_fireperimeter.csv",
    engine="python",
    on_bad_lines='skip' # Skip malformed lines
)

In [46]:
# Filtrăm doar evenimentele de tip "wildfire"
geo_events = geo_events[geo_events["geo_event_type"] == "wildfire"].copy()

# Convertim timpii în datetime ca să putem face calcule pe timp
geo_events["date_created"] = pd.to_datetime(geo_events["date_created"], errors="coerce")
geo_events["date_modified"] = pd.to_datetime(geo_events["date_modified"], errors="coerce")

# Pentru changelog: convertim timestamp-ul
ge_changelog["date_created"] = pd.to_datetime(ge_changelog["date_created"], errors="coerce")

# Pentru evac changelog: convertim timestamp-ul
evac_changelog["date_created"] = pd.to_datetime(evac_changelog["date_created"], errors="coerce")

print("Wildfires:", geo_events.shape)
geo_events.head(3)

Wildfires: (61779, 17)


Unnamed: 0,id,date_created,date_modified,geo_event_type,name,is_active,description,address,lat,lng,data,notification_type,external_id,external_source,incident_id,reporter_managed,is_visible
0,76,2021-08-11 00:09:56.481066,2023-02-09 20:34:24.180117,wildfire,Todd Fire,0,,"Llano Rd & Todd Rd, Santa Rosa, California 95...",38.3861,-122.769299,"{""is_fps"": false, ""acreage"": 50, ""containment""...",normal,1008375748,pulsepoint,76.0,1,0
1,77,2021-08-11 07:21:46.054995,2023-02-09 20:34:24.225186,wildfire,Vegetation Fire,0,,"Guerneville Rd & W Steele Ln, Santa Rosa, CA 9...",38.459967,-122.728927,"{""is_fps"": false, ""acreage"": 0, ""containment"":...",normal,1008583077,pulsepoint,77.0,1,0
2,78,2021-08-11 21:02:16.301416,2023-02-09 20:34:24.266124,wildfire,Ford Fire,0,,"CA-1 & Valley Ford Estero Rd, Valley Ford, CA ...",38.318328,-122.925698,"{""is_fps"": false, ""acreage"": 0, ""containment"":...",normal,1008957703,pulsepoint,78.0,1,0


In [47]:
# Transformăm string-ul JSON din "changes" în dict Python, ca să putem citi cheile ușor
ge_changelog["changes_dict"] = ge_changelog["changes"].apply(
    lambda x: json.loads(x) if pd.notnull(x) else {}
)

# Verificăm primele rânduri
ge_changelog[["date_created", "geo_event_id", "changes_dict"]].head()

Unnamed: 0,date_created,geo_event_id,changes_dict
0,2024-06-19 00:30:52.819841,22339.0,"{'name': ['Vegetation Fire', 'Charlotte Fire']}"
1,2024-06-19 00:31:10.439467,22338.0,"{'data.links': [[], [{'label': 'Pulsepoint Inc..."
2,2024-06-19 00:31:25.085624,22339.0,"{'address': ['W Ave C & 110th St W, Antelope A..."
3,2024-06-19 00:31:45.603726,22339.0,{}
4,2024-06-19 00:31:51.368137,22339.0,"{'data.acreage': [None, 3]}"


In [48]:
# Numărăm cheile din changes_dict ca să vedem ce semnale avem în date
key_counter = Counter()
for d in ge_changelog["changes_dict"]:
    key_counter.update(d.keys())

# Listăm cheile radio traffic existente
radio_keys = [k for k in key_counter.keys() if "radio_traffic" in k]

print("Radio keys found:", radio_keys)

# Procentul de update-uri care conțin radio traffic (semnale rare, dar puternice)
radio_share = ge_changelog["changes"].str.contains("radio_traffic", na=False).mean()
print("Share of updates with radio_traffic:", radio_share)

Radio keys found: ['radio_traffic_indicates_structure_threat', 'radio_traffic_indicates_spotting', 'radio_traffic_indicates_rate_of_spread']
Share of updates with radio_traffic: 0.05538425379273295


In [49]:
def is_credible_signal(change):
    """
    Returnează True dacă un update conține un semnal credibil de escaladare.
    """
    # Dacă nu e dict, nu putem căuta chei
    if not isinstance(change, dict):
        return False

    # Semnal puternic: amenințare la structuri
    if "radio_traffic_indicates_structure_threat" in change:
        return True

    # Semnal puternic: spotting (focuri secundare)
    if "radio_traffic_indicates_spotting" in change:
        return True

    # Semnal de viteză de propagare
    if "radio_traffic_indicates_rate_of_spread" in change:
        val = change.get("radio_traffic_indicates_rate_of_spread")  # luăm valoarea
        if isinstance(val, list):  # uneori e [vechi, nou]
            val = val[-1]          # luăm noul
        if val in ["moderate", "rapid", "very_rapid", "extreme"]:
            return True

    # Escaladare în notification_type = normal
    if "notification_type" in change:
        val = change.get("notification_type")
        if isinstance(val, list) and val[-1] == "normal":
            return True

    return False


# Aplicăm funcția pe fiecare rând -> coloană booleană
ge_changelog["credible_signal"] = ge_changelog["changes_dict"].apply(is_credible_signal)

# Calculăm T0 = primul moment cu semnal credibil per incendiu
T0 = (
    ge_changelog[ge_changelog["credible_signal"]]
    .groupby("geo_event_id")["date_created"]
    .min()
    .reset_index()
    .rename(columns={"date_created": "T0_first_signal"})
)

print("T0 computed for events:", T0.shape)
T0.head()

T0 computed for events: (8451, 2)


Unnamed: 0,geo_event_id,T0_first_signal
0,88.0,2021-08-13 19:18:23.648715
1,89.0,2021-08-14 00:25:21.551458
2,99.0,2021-08-18 19:40:59.623819
3,103.0,2021-08-21 21:20:54.254949
4,117.0,2021-08-27 01:52:13.609321


In [50]:
# Transformăm changes în dict pentru a detecta schimbări de status
evac_changelog["changes_dict"] = evac_changelog["changes"].apply(
    lambda x: json.loads(x) if pd.notnull(x) else {}
)

def is_evacuation_order(change):
    """
    Returnează True dacă un update setează statusul zonei la 'order'.
    """
    if not isinstance(change, dict):
        return False

    # În dataset, statusul e urmărit prin cheia 'status' (conform dicționarului)
    if "status" in change:
        val = change.get("status")
        if isinstance(val, list):
            val = val[-1]
        if val == "order":
            return True

    return False


# Marcăm rândurile care sunt "Evacuation Order"
evac_changelog["is_evac_order"] = evac_changelog["changes_dict"].apply(is_evacuation_order)

# T2 = primul moment când o zonă primește "order"
T2 = (
    evac_changelog[evac_changelog["is_evac_order"]]
    .groupby("evac_zone_id")["date_created"]
    .min()
    .reset_index()
    .rename(columns={"date_created": "T2_evac_order"})
)

print("Evac orders (zones):", T2.shape)
T2.head()

Evac orders (zones): (0, 2)


Unnamed: 0,evac_zone_id,T2_evac_order


In [51]:
# Ne uităm la coloanele din map ca să știm cum se numesc exact
print(map_event_zone.columns)

# În mod normal ai geo_event_id și evac_zone_id (dacă diferă, îmi spui cum apar)
map_event_zone.head(3)

Index(['date_created', 'uid_v2', 'geo_event_id'], dtype='object')


Unnamed: 0,date_created,uid_v2,geo_event_id
0,2023-10-25 11:28:15.254517,sierra-CA_US-SIE-E062-778dddb3cad71e9f,14107
1,2023-10-25 11:28:15.254550,sierra-CA_US-SIE-E063-f59f54f1edf93b15,14107
2,2023-10-25 11:28:15.254565,sierra-CA_US-SIE-E060-64dbd0e15d47d9bb,14107


In [52]:
from collections import Counter

k = Counter()
for d in evac_changelog["changes_dict"]:
    k.update(d.keys())

k.most_common(30)

[('geom', 3112), ('is_active', 1488), ('status', 1479)]

In [53]:
def is_evacuation_order(change):
    # dacă nu e dict, sigur nu e order
    if not isinstance(change, dict):
        return False

    # verificăm mai multe câmpuri posibile
    possible_keys = ["status", "external_status"]

    for key in possible_keys:
        if key in change:
            val = change.get(key)

            # dacă e listă, luăm ultima valoare (noul status)
            if isinstance(val, list):
                val = val[-1]

            # normalizăm la string mic
            if isinstance(val, str):
                v = val.lower()

                # căutăm "order" în text (ex: "Evacuation Order", "ORDER", etc.)
                if "order" in v:
                    return True

    return False

In [54]:
evac_changelog["is_evac_order"] = evac_changelog["changes_dict"].apply(is_evacuation_order)
evac_changelog["is_evac_order"].sum()

np.int64(208)

In [55]:
print(evac_changelog.columns)

Index(['id', 'date_created', 'changes', 'evac_zone_id', 'changes_dict',
       'is_evac_order'],
      dtype='object')


In [56]:
# Afișăm numele coloanelor ca să știm ce ID de zonă există în evac_changelog
print(evac_changelog.columns)

Index(['id', 'date_created', 'changes', 'evac_zone_id', 'changes_dict',
       'is_evac_order'],
      dtype='object')


In [57]:
# Convertim timpul în datetime (dacă nu e deja)
evac_changelog["date_created"] = pd.to_datetime(evac_changelog["date_created"], errors="coerce")

# Păstrăm doar rândurile care sunt evacuation orders
orders = evac_changelog[evac_changelog["is_evac_order"]].copy()

# Pentru fiecare zonă (evac_zone_id), luăm primul moment când apare order (T2)
T2 = (
    orders.groupby("evac_zone_id")["date_created"]
    .min()
    .reset_index()
    .rename(columns={"date_created": "T2_evac_order"})
)

print("T2 shape:", T2.shape)
T2.head()


T2 shape: (186, 2)


Unnamed: 0,evac_zone_id,T2_evac_order
0,4602,2025-06-13 00:30:51.576399+00:00
1,4616,2025-06-13 00:47:27.312292+00:00
2,4851,2025-06-28 02:04:23.204358+00:00
3,5482,2025-06-04 00:37:53.270287+00:00
4,6288,2025-06-18 20:45:46.776645+00:00


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [58]:
print(evac_changelog.columns)

Index(['id', 'date_created', 'changes', 'evac_zone_id', 'changes_dict',
       'is_evac_order'],
      dtype='object')


In [60]:
print(T2.shape)
T2.head()

(186, 2)


Unnamed: 0,evac_zone_id,T2_evac_order
0,4602,2025-06-13 00:30:51.576399+00:00
1,4616,2025-06-13 00:47:27.312292+00:00
2,4851,2025-06-28 02:04:23.204358+00:00
3,5482,2025-06-04 00:37:53.270287+00:00
4,6288,2025-06-18 20:45:46.776645+00:00


In [62]:
# Citim doar coloanele necesare (id, uid_v2) ca să evităm problemele din geom
evac_zones = pd.read_csv(
    "evac_zones_gis_evaczone.csv",
    sep=",",
    engine="python",
    quoting=3,
    usecols=["id", "uid_v2"]   # citim DOAR ce ne trebuie
)

# Construim tabelul de legătură evac_zone_id -> uid_v2
zone_link = evac_zones.rename(columns={"id": "evac_zone_id"}).copy()

print("zone_link loaded ✅", zone_link.shape)
zone_link.head()

zone_link loaded ✅ (5697, 2)


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,evac_zone_id,uid_v2
1,2025-04-10 13:22:22.896704+00,2025-08-19 01:58:01.009206+00,boulder-CO_US-BO-030-cc87104888faec63,True,BO-030,boulder-CO_US
2,2025-04-10 13:22:22.896921+00,2025-08-19 01:58:01.00957+00,boulder-CO_US-BO-023-bf0c7318a31e33c2,True,BO-023,boulder-CO_US
3,2025-04-10 13:22:22.897002+00,2025-08-19 01:58:01.009741+00,boulder-CO_US-BO-031-d365d7b1d6a30d16,True,BO-031,boulder-CO_US
4,2025-04-10 13:22:22.897075+00,2025-08-19 01:58:01.009867+00,boulder-CO_US-BO-005-ccd5de89e7c9e57a,True,BO-005,boulder-CO_US
5,2025-04-10 13:22:22.897156+00,2025-08-19 01:58:01.010002+00,boulder-CO_US-BO-028-2693ee3d2f362647,True,BO-028,boulder-CO_US


In [63]:
# verificăm dacă avem valori lipsă în uid_v2 și câteva exemple
print("Missing uid_v2:", zone_link["uid_v2"].isna().sum())
print("Missing evac_zone_id:", zone_link["evac_zone_id"].isna().sum())

zone_link.sample(5, random_state=42)

Missing uid_v2: 0
Missing evac_zone_id: 0


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,evac_zone_id,uid_v2
1437,2025-04-10 13:22:30.3843+00,2025-07-23 20:50:20.116941+00,butte-CA_US-BUT-PAL-868-6383fbdd2e93ef63,True,BUT-PAL-868,butte-CA_US
749,2025-04-10 13:22:24.955749+00,2025-07-18 02:37:04.956999+00,alameda-CA_US-HAY-027-41416535154c0d1d,True,HAY-027,alameda-CA_US
4596,2025-04-10 13:22:52.944735+00,2025-09-03 18:46:48.387493+00,fresno-CA_US-P18-d12c33b076e3da6b,False,P18,fresno-CA_US
1449,2025-04-10 13:22:30.38541+00,2025-07-23 20:50:20.11886+00,butte-CA_US-BUT-PAL-883-4a856e0ba2bc7c33,True,BUT-PAL-883,butte-CA_US
2544,2025-04-10 13:22:38.783675+00,2025-07-18 02:36:33.600425+00,contra_costa-CA_US-PNL-E010-eda220596a0592ea,True,PNL-E010,contra_costa-CA_US


In [65]:
# Forțăm evac_zone_id să fie string în ambele tabele

T2["evac_zone_id"] = T2["evac_zone_id"].astype(str)
zone_link["evac_zone_id"] = zone_link["evac_zone_id"].astype(str)

# Acum merge join-ul
T2_uid = T2.merge(zone_link, on="evac_zone_id", how="left")

print("T2_uid shape:", T2_uid.shape)
print("Missing uid_v2 in T2_uid:", T2_uid["uid_v2"].isna().sum())

T2_uid.head()

T2_uid shape: (186, 3)
Missing uid_v2 in T2_uid: 186


Unnamed: 0,evac_zone_id,T2_evac_order,uid_v2
0,4602,2025-06-13 00:30:51.576399+00:00,
1,4616,2025-06-13 00:47:27.312292+00:00,
2,4851,2025-06-28 02:04:23.204358+00:00,
3,5482,2025-06-04 00:37:53.270287+00:00,
4,6288,2025-06-18 20:45:46.776645+00:00,


In [66]:
import pandas as pd

# Citim DOAR primele 0 rânduri ca să obținem header-ul real
evac_header = pd.read_csv(
    "evac_zones_gis_evaczone.csv",
    engine="python",
    quoting=3,
    nrows=0
)

print(list(evac_header.columns))


['id', 'date_created', 'date_modified', 'uid_v2', 'is_active', 'display_name', 'region_id', 'source_attribution', 'dataset_name', 'source_extra_data', 'geom', 'status', 'geom_label', 'is_pending_review', 'pending_updates', 'external_status']


In [70]:
# Vom extrage doar: id (prima coloană) și uid_v2 (a 4-a coloană)
rows = []

# Deschidem fișierul ca text (asta evită complet problemele de CSV quoting)
with open("evac_zones_gis_evaczone.csv", "r", encoding="utf-8", errors="ignore") as f:
    header = next(f)  # sărim peste header

    for line in f:
        # Tăiem doar primele 3 virgule:
        # parts = [id, date_created, date_modified, rest...]
        parts = line.rstrip("\n").split(",", 3)

        # Dacă linia e prea scurtă / stricată, o sărim
        if len(parts) < 4:
            continue

        # id e prima coloană
        evac_zone_id = parts[0]

        # uid_v2 este primul câmp din "rest" (până la următoarea virgulă)
        uid_v2 = parts[3].split(",", 1)[0]

        # Curățăm ghilimelele dacă apar
        evac_zone_id = evac_zone_id.strip().strip('"')
        uid_v2 = uid_v2.strip().strip('"')

        # Păstrăm doar dacă id e numeric
        if evac_zone_id.isdigit():
            rows.append((int(evac_zone_id), uid_v2))

# Construim zone_link corect
zone_link = pd.DataFrame(rows, columns=["evac_zone_id", "uid_v2"]).drop_duplicates()

print("zone_link built ✅", zone_link.shape)
zone_link.head()

zone_link built ✅ (5697, 2)


Unnamed: 0,evac_zone_id,uid_v2
0,1,boulder-CO_US-BO-030-cc87104888faec63
1,2,boulder-CO_US-BO-023-bf0c7318a31e33c2
2,3,boulder-CO_US-BO-031-d365d7b1d6a30d16
3,4,boulder-CO_US-BO-005-ccd5de89e7c9e57a
4,5,boulder-CO_US-BO-028-2693ee3d2f362647


In [71]:
# Asigurăm tip numeric pentru cheia de join
T2["evac_zone_id"] = pd.to_numeric(T2["evac_zone_id"], errors="coerce")

T2_uid = T2.merge(zone_link, on="evac_zone_id", how="left")

print("T2_uid shape:", T2_uid.shape)
print("Missing uid_v2 in T2_uid:", T2_uid["uid_v2"].isna().sum())
T2_uid.head()

T2_uid shape: (186, 3)
Missing uid_v2 in T2_uid: 182


Unnamed: 0,evac_zone_id,T2_evac_order,uid_v2
0,4602,2025-06-13 00:30:51.576399+00:00,fresno-CA_US-P19-f38e86879b25efa5
1,4616,2025-06-13 00:47:27.312292+00:00,fresno-CA_US-P20-2cd717a1b0e8f8c5
2,4851,2025-06-28 02:04:23.204358+00:00,fresno-CA_US-K29-7f5f239fcd2b72a6
3,5482,2025-06-04 00:37:53.270287+00:00,inyo-CA_US-ICU-123-725dac4b91488a8c
4,6288,2025-06-18 20:45:46.776645+00:00,


In [72]:
rows = []
with open("evac_zones_gis_evaczone.csv", "r", encoding="utf-8", errors="ignore") as f:
    reader = csv.reader(f, delimiter=",", quotechar='"', escapechar="\\")
    header = next(reader)

    # indexurile pentru id și uid_v2 în header
    id_idx = header.index("id")
    uid_idx = header.index("uid_v2")

    for r in reader:
        # dacă rândul e prea scurt, îl sărim
        if len(r) <= max(id_idx, uid_idx):
            continue

        evac_id = r[id_idx]
        uid = r[uid_idx]

        if evac_id and evac_id.isdigit() and uid:
            rows.append((int(evac_id), uid))

zone_link2 = pd.DataFrame(rows, columns=["evac_zone_id", "uid_v2"]).drop_duplicates()

print("zone_link2 ✅", zone_link2.shape)
zone_link2.head()

zone_link2 ✅ (5697, 2)


Unnamed: 0,evac_zone_id,uid_v2
0,1,boulder-CO_US-BO-030-cc87104888faec63
1,2,boulder-CO_US-BO-023-bf0c7318a31e33c2
2,3,boulder-CO_US-BO-031-d365d7b1d6a30d16
3,4,boulder-CO_US-BO-005-ccd5de89e7c9e57a
4,5,boulder-CO_US-BO-028-2693ee3d2f362647


In [73]:
T2_uid2 = T2.merge(zone_link2, on="evac_zone_id", how="left")
print("Missing uid_v2:", T2_uid2["uid_v2"].isna().sum())
T2_uid2.head()

Missing uid_v2: 182


Unnamed: 0,evac_zone_id,T2_evac_order,uid_v2
0,4602,2025-06-13 00:30:51.576399+00:00,fresno-CA_US-P19-f38e86879b25efa5
1,4616,2025-06-13 00:47:27.312292+00:00,fresno-CA_US-P20-2cd717a1b0e8f8c5
2,4851,2025-06-28 02:04:23.204358+00:00,fresno-CA_US-K29-7f5f239fcd2b72a6
3,5482,2025-06-04 00:37:53.270287+00:00,inyo-CA_US-ICU-123-725dac4b91488a8c
4,6288,2025-06-18 20:45:46.776645+00:00,


In [76]:

# citim changelog-ul GIS (acesta e cel bun pentru uid_v2)
evac_gis_changelog = pd.read_csv(
    "evac_zones_gis_evaczonechangelog.csv",
    sep=",",
    engine="python",
    quoting=3
)

print("evac_gis_changelog loaded ✅", evac_gis_changelog.shape)
print(evac_gis_changelog.columns)
evac_gis_changelog.head()

ParserError: Expected 56 fields in line 9, saw 69