In [9]:
from openai import OpenAI
import os
import pandas as pd
import ast
import time

# Read the API key from the OPENAI_API_KEY environment variable so that no
# credential is stored in (or leaked through) the notebook file itself.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


In [10]:
# Catalogue of the tags the annotator is allowed to assign.
# Maps each tag name to a dict with two string fields:
#   "description" - one-sentence definition of what the tag covers
#   "keywords"    - comma-separated example phrases that hint at the tag
tag_info = {
    "death": {
        "description": "Human fatalities, including drowning, crush injuries, electrocution or other disaster-related deaths",
        "keywords": "death, deaths, dead, drowned, fatality, fatalities, crushed, electrocution"
    },
    "injury": {
        "description": "Physical injuries such as fractures, lacerations, contusions, burns, head trauma, requiring on-site first aid or hospitalization",
        "keywords": "injury, injuries, injured, fracture, laceration, burn, trauma, contusion, head injury"
    },
    "missing_persons": {
        "description": "Individuals reported as missing, unaccounted for, or trapped in debris or flooded areas",
        "keywords": "missing, unaccounted, trapped, debris, search ongoing"
    },
    "evacuation": {
        "description": "Voluntary or forced population displacement to safe locations, including organized mass transport or shelter referral",
        "keywords": "evacuated, evacuation, relocated, displaced, moved out, transported, shelter referral"
    },
    "rescue": {
        "description": "Search and rescue operations by land, water or air, including swift-water rescue, helicopter extraction, or canine teams",
        "keywords": "rescued, rescue, swift-water, helicopter, extraction, canine team"
    },
    "shelter_activated": {
        "description": "Opening and use of emergency shelters, community centers or temporary housing facilities",
        "keywords": "shelter opened, shelter activated, community center, emergency housing, temporary shelter"
    },
    "emergency_declared": {
        "description": "Official declaration of a state of emergency by government bodies or emergency management agencies",
        "keywords": "state of emergency, emergency declared, emergency order, disaster declaration"
    },
    "car_crash": {
        "description": "Vehicle collisions or traffic accidents, including multi-vehicle pileups and single-vehicle crashes",
        "keywords": "car crash, vehicle collision, accident, pileup, traffic incident"
    },
    "infrastructure_damage": {
        "description": "Damage to infrastructure including roads, bridges, tunnels, subway stations, pumping stations, levees and seawalls",
        "keywords": "road damage, bridge damage, tunnel, subway, pumping station, levee, seawall, infrastructure damaged"
    },
    "transportation_disruption": {
        "description": "Service interruptions such as rail or metro line suspensions, bus cancellations, blocked or closed roads and intersections",
        "keywords": "transport disrupted, metro canceled, bus service halted, road closed, intersection blocked"
    },
    "power_outage": {
        "description": "Loss of electrical service due to downed lines, damaged substations or transformer failures",
        "keywords": "power outage, blackout, downed line, transformer failure, electricity loss"
    },
    "water_disruption": {
        "description": "Water supply interruptions from broken mains, pipe bursts or contaminated sources",
        "keywords": "water disruption, broken main, pipe burst, contaminated water"
    },
    "gas_disruption": {
        "description": "Interruptions or leaks in natural gas pipelines or distribution networks",
        "keywords": "gas leak, pipeline rupture, gas disruption, natural gas outage"
    },
    "telecom_disruption": {
        "description": "Communication network outages affecting mobile, landline or broadband services",
        "keywords": "telecom outage, cell service lost, mobile network down, landline failure, broadband outage"
    },
    "sewer_overflow": {
        "description": "Sewage pipeline overflows or backups leading to street flooding or facility contamination",
        "keywords": "sewer overflow, sewage backup, sewer spill, contamination"
    },
    "landslide_triggered": {
        "description": "Occurrence of landslides or mudslides that block or damage terrain",
        "keywords": "landslide, mudslide, slope failure, hillside collapse"
    },
    "hazardous_materials_released": {
        "description": "Release of hazardous substances such as industrial chemicals, petroleum fuels, solvents or pesticides",
        "keywords": "hazardous material, chemical spill, fuel release, solvent leak, pesticide contamination"
    },
    "tree_damage": {
        "description": "Uprooted, snapped or broken trees causing road blockages or property damage",
        "keywords": "tree damage, downed tree, snapped tree, broken limb"
    },
    "campground_damage": {
        "description": "Damage to campgrounds, hiking trails, picnic areas or outdoor recreation sites",
        "keywords": "campground damage, trail erosion, picnic area closed, park trail washed out"
    },
    "home_damage": {
        "description": "Structural damage to residences, including roof collapse, wall breaches, flooded basements, foundation undermining",
        "keywords": "home damage, roof collapse, wall breach, basement flooded, house destroyed, foundation undermined"
    },
    "non_residential_building_damage": {
        "description": "Damage to non-residential structures such as warehouses, shops, factories, office buildings and community centers",
        "keywords": "shop damage, warehouse collapse, office flooded, non-residential building damaged"
    },
    "critical_facility_damage": {
        "description": "Damage to key public facilities including schools, hospitals, police/fire stations and places of worship",
        "keywords": "school damaged, hospital flooded, fire station, police station, church collapsed"
    },
    "agricultural_damage": {
        "description": "Losses to crops, farmland erosion, damaged irrigation systems or collapsed barns",
        "keywords": "crop loss, farmland flooded, irrigation destroyed, barn collapsed"
    },
    "animal_loss": {
        "description": "Mortality or injury of livestock, poultry or other farm animals",
        "keywords": "animal loss, livestock drowned, poultry killed, farm animal injury"
    },
    "business_interruption": {
        "description": "Disruption of commercial activities, such as store closures, halted production lines or canceled services",
        "keywords": "business interruption, store closed, operations halted, service canceled"
    },
    "insurance_claims_mentioned": {
        "description": "Mentions of filed or pending insurance claims, adjuster visits or claims processing steps",
        "keywords": "insurance claim, adjuster visit, filed claim, claims processing"
    }
}


# Render the tag catalogue as one Markdown bullet per tag, in dictionary order:
#   - **<tag>**: <description> (keywords: <keywords>)
tag_block = "\n".join(
    f"- **{name}**: {spec['description']} (keywords: {spec['keywords']})"
    for name, spec in tag_info.items()
)


In [11]:
def generate_tags(narrative, model):
    """Ask a chat model to tag one narrative and extract structured details.

    The prompt embeds the module-level ``tag_block`` (the allowed tag list),
    one worked example and the narrative itself, and asks for a single JSON
    object with the keys ``tags``, ``event_type``, ``flooded_closed_locations``,
    ``structural_damage`` and ``rainfall_information``.

    Args:
        narrative: Text of the incident report to annotate.
        model: Model name passed to ``client.chat.completions.create``.

    Returns:
        The model's reply as a stripped string (not parsed here). On any
        failure the function does not raise; it returns a string starting
        with ``"ERROR: "`` instead, which callers are expected to check for.
    """
    # A missing narrative (None, NaN from pandas, or blank text) would otherwise
    # be interpolated into the prompt as "None"/"nan"/"" and sent to the API,
    # producing made-up annotations. Report it through the error channel instead.
    is_nan = isinstance(narrative, float) and narrative != narrative
    if narrative is None or is_nan or not str(narrative).strip():
        return "ERROR: narrative is empty or missing"

    prompt = f"""
You are an expert annotator for flood-related incident reports.

---
**Task A ‚Äì Tag Selection**  
Select all relevant tags based on the narrative provided. Each tag has a name, description, and example keywords to guide your decision.

Here is the complete list of allowed tags as my tag_block (you must only choose from this list):

{tag_block}

‚ö†Ô∏è Important tag selection rules:
- Do **not** generate any tag that is not explicitly listed above.
- Do **not** include general concepts, category names, or field names such as `"event_type"`, `"flooded_closed_locations"`, `"structural_damage"`, or `"rainfall_information"`.
- The `"tags"` field must be a list of tag names from the list above ‚Äî no exceptions.

‚ùå Invalid examples:  
["flooded_closed_locations", "rainfall_information"]

‚úÖ Valid example:  
["power_outage", "rescue", "home_damage"]


---

**Task B ‚Äì Information Extraction**  
From the same narrative, extract the following structured details using the original wording where possible:
- "event_type"
- "flooded_closed_locations" (list of specific streets/locations)
- "structural_damage" (list of concise damage descriptions)
- "rainfall_information" (list of rainfall amounts and locations)

---

**Example for reference**  
Narrative:
\"\"\"
Hurricane Floyd made landfall just east of Cape Fear, North Carolina in the early morning hours of the 16th and moved north-northeast across extreme southeast Virginia to near Ocean City, Maryland by evening on the 16th... [truncated for brevity]
\"\"\"
Expected result:
json
{{
  "tags": ["power_outage", "home_damage", "rescue"],
  "event_type": "Hurricane Floyd",
  "flooded_closed_locations": [
    "Independence Avenue from 15th to 23rd Street",
    "Rock Creek Parkway from Virginia Avenue to Calvert Street"
  ],
  "structural_damage": [
    "Vacant building at 600 block of New York Avenue collapsed",
    "Downed trees damaged houses and cars"
  ],
  "rainfall_information": [
    "6.00 inches at Foggy Bottom",
    "5.57 inches in Upper Northwest",
    "5.39 inches at Children's Hospital",
    "4.57 inches at National Airport"
  ]
}}

Now apply this same format to the following Flood narrative:

\"\"\"{narrative}\"\"\"

Your response must be only the JSON object. Do not include any explanation or extra text. Make sure the JSON is complete and valid, and never leave any array or string open.

"""
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[
                      {"role": "system", "content": "You are a strict JSON formatter."},
                      {"role": "user", "content": prompt}
                    ],
            temperature=0
        )
        content = completion.choices[0].message.content
        # The content can be None; say so explicitly rather than relying on
        # the AttributeError that .strip() would raise.
        if content is None:
            return "ERROR: model returned no content"
        return content.strip()
    except Exception as e:
        # Deliberately best-effort: failures are reported as an "ERROR: ..."
        # string so a batch loop can skip the row and keep going.
        return f"ERROR: {e}"

In [23]:
import json, time
import pandas as pd

def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, with or without a ``json`` label."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Drop an optional language label such as "json" right after the fence.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        cleaned = cleaned.strip()
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3].strip()
    return cleaned


def _join_items(value, separator):
    """Flatten one JSON field (list, scalar or null) into a single delimited string."""
    if not value:
        return ""
    if isinstance(value, (list, tuple)):
        return separator.join(str(item) for item in value)
    return str(value)


def annotate_dataframe(df, model, save_path="annotated_output.csv", resume=True, save_every=50, pause_seconds=1.2):
    """Annotate every narrative in ``df`` via ``generate_tags`` and checkpoint to CSV.

    ``df`` is modified in place and also returned. It must already contain a
    ``narrative`` column (for example EPISODE_NARRATIVE and EVENT_NARRATIVE
    joined with a space). The columns ``tags``, ``event_type``,
    ``flooded_closed_locations``, ``structural_damage`` and
    ``rainfall_information`` are created when absent and filled from the JSON
    reply: ``tags`` joined with ", ", the other list fields joined with "; ".

    Args:
        df: DataFrame with a ``narrative`` column.
        model: Model name forwarded to ``generate_tags``.
        save_path: CSV file that receives periodic, on-failure and final saves.
        resume: When True, only rows whose ``tags`` value is missing or blank
            are processed; otherwise every row is processed.
        save_every: Write a checkpoint after this many successfully
            annotated rows.
        pause_seconds: Delay after each successfully annotated row.

    Returns:
        The same ``df`` object, with the annotation columns filled in.

    Raises:
        KeyError: If ``df`` has no ``narrative`` column.
        Exception: Any error raised while handling a row (for example invalid
            JSON in the reply) is re-raised after progress has been written
            to ``save_path``. Replies that are empty or start with "ERROR"
            are skipped instead of raising.
    """
    # Fail early with a clear message instead of failing inside the row loop.
    if "narrative" not in df.columns:
        raise KeyError("annotate_dataframe expects a 'narrative' column in df")

    # Initialise the target columns (existing content is not overwritten).
    for col in ("tags", "event_type",
                "flooded_closed_locations",
                "structural_damage",
                "rainfall_information"):
        if col not in df.columns:
            df[col] = ""
        # A column read back from CSV with only empty cells is float64 NaN;
        # use object dtype so text can be stored without dtype errors.
        df[col] = df[col].astype(object)

    # Work out which rows are still unprocessed (tags missing or blank).
    if resume:
        # astype(str) keeps the .str accessor usable whatever the cells hold.
        pending = df["tags"].isna() | (df["tags"].astype(str).str.strip() == "")
        to_process_indices = df.index[pending.to_numpy()]
    else:
        to_process_indices = df.index

    print(f"üü¢ Will process {len(to_process_indices)} rows out of {len(df)}")

    processed_count = 0

    for idx in to_process_indices:
        row = df.loc[idx]
        print(f"Processing row {idx+1}/{len(df)}...")

        # Reset per row so the failure report below never reads an unbound
        # name or a stale reply from a previous row.
        response_raw = None

        try:
            response_raw = generate_tags(row["narrative"], model=model)

            if not response_raw or response_raw.strip().upper().startswith("ERROR"):
                print(f"‚ö†Ô∏è  Skip row {idx} -- response is empty or starts with ERROR")
                continue

            response_cleaned = _strip_code_fence(response_raw)

            if not response_cleaned:
                print(f"‚ö†Ô∏è  Skip row {idx} -- response empty after stripping")
                continue

            result = json.loads(response_cleaned)

            # Write the parsed fields into the DataFrame.
            df.at[idx, "tags"]                     = _join_items(result.get("tags"), ", ")
            df.at[idx, "event_type"]               = result.get("event_type") or ""
            df.at[idx, "flooded_closed_locations"] = _join_items(result.get("flooded_closed_locations"), "; ")
            df.at[idx, "structural_damage"]        = _join_items(result.get("structural_damage"), "; ")
            df.at[idx, "rainfall_information"]     = _join_items(result.get("rainfall_information"), "; ")

            processed_count += 1

            if processed_count % save_every == 0:
                df.to_csv(save_path, index=False)
                print(f"‚úÖ Auto-saved after {processed_count} rows to {save_path}")

            time.sleep(pause_seconds)

        except Exception as e:
            print(f"\n‚ö†Ô∏è  Failed at row {idx}: {e}")
            print("=" * 60)
            print("Raw response:")
            print((response_raw or "<no response received>")[:500])
            print("=" * 60)
            df.to_csv(save_path, index=False)
            print(f"üíæ Progress saved to {save_path} before crash.")
            # Bare raise keeps the original traceback intact.
            raise

    df.to_csv(save_path, index=False)
    print(f"üéâ Finished. Final result saved to {save_path}")
    return df


In [27]:
# Load the batch to annotate; annotate_dataframe reads each row's "narrative" column.
df = pd.read_csv("narratives_batch_2.csv")

In [28]:
# Annotate the rows whose "tags" are still empty, checkpointing to output.csv.
# annotate_dataframe fills the columns on `df` itself and returns that same
# object, so `df0` and `df` refer to one DataFrame.
df0 = annotate_dataframe(df, model="gpt-4-turbo", save_path="output.csv")

üü¢ Will process 100 rows out of 100
Processing row 1/100...
Processing row 2/100...
Processing row 3/100...
Processing row 4/100...
Processing row 5/100...
Processing row 6/100...
Processing row 7/100...
Processing row 8/100...
Processing row 9/100...
Processing row 10/100...
Processing row 11/100...
Processing row 12/100...
Processing row 13/100...
Processing row 14/100...
Processing row 15/100...
Processing row 16/100...
Processing row 17/100...
Processing row 18/100...
Processing row 19/100...
Processing row 20/100...
Processing row 21/100...
Processing row 22/100...
Processing row 23/100...
Processing row 24/100...
Processing row 25/100...
Processing row 26/100...
Processing row 27/100...
Processing row 28/100...
Processing row 29/100...
Processing row 30/100...
Processing row 31/100...
Processing row 32/100...
Processing row 33/100...
Processing row 34/100...
Processing row 35/100...
Processing row 36/100...
Processing row 37/100...
Processing row 38/100...
Processing row 39/100

In [29]:
# Write the annotated frame to its own file, separate from the "output.csv"
# checkpoint path passed to annotate_dataframe.
df0.to_csv("output2.csv", index=False)