# Flood Risk Exploratory Data Analysis

This notebook explores NFIP claims, policies, and disaster declaration data
from the OpenFEMA API to understand patterns in flood risk and insurance coverage.

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from src.ingestion.openfema_client import OpenFEMAClient

## 1. Load Data

Fetch a sample of claims, policies, and disaster declarations from OpenFEMA.
Adjust `max_records` to control the sample size.

In [None]:
# One client instance serves all three OpenFEMA fetches below.
client = OpenFEMAClient()

# Pull a capped sample of each dataset (raise max_records for deeper analysis).
claims = client.fetch_claims(max_records=10_000)
policies = client.fetch_policies(max_records=10_000)
disasters = client.fetch_disasters(max_records=5_000)

# Report the shape of each sample, one dataset per line.
print(
    f"Claims:    {claims.shape}",
    f"Policies:  {policies.shape}",
    f"Disasters: {disasters.shape}",
    sep="\n",
)

## 2. Claims Analysis

In [None]:
# Preview the first rows of the claims sample to see which columns came back.
claims.head()

In [None]:
# Column names, dtypes, and non-null counts; later cells guard on column presence.
claims.info()

In [None]:
# Total claim payouts by year.
# NOTE: this cell adds a 'totalPaid' column to `claims` in place; later cells read it.
if 'yearOfLoss' in claims.columns and 'amountPaidOnBuildingClaim' in claims.columns:
    # Missing amounts count as 0 so one empty component does not null the total.
    building_paid = claims['amountPaidOnBuildingClaim'].fillna(0)
    # The contents column is not covered by the guard above, so treat it as
    # optional and fall back to 0 instead of raising KeyError when it is absent.
    if 'amountPaidOnContentsClaim' in claims.columns:
        contents_paid = claims['amountPaidOnContentsClaim'].fillna(0)
    else:
        contents_paid = 0
    claims['totalPaid'] = building_paid + contents_paid

    # One row per loss year: summed payout, number of claims, mean payout.
    yearly = claims.groupby('yearOfLoss').agg(
        total_paid=('totalPaid', 'sum'),
        claim_count=('totalPaid', 'count'),
        avg_claim=('totalPaid', 'mean'),
    ).reset_index()

    fig = px.bar(
        yearly, x='yearOfLoss', y='total_paid',
        title='Total NFIP Claim Payouts by Year',
        labels={'yearOfLoss': 'Year', 'total_paid': 'Total Paid ($)'},
    )
    fig.show()

In [None]:
# Claims by state: claim volume per state, plus total payout when available.
if 'state' in claims.columns:
    state_agg = {'claim_count': ('state', 'count')}
    # 'totalPaid' is only created when the yearly-payout cell's column guard
    # passed; include the payout total only if that column actually exists so
    # this cell does not fail with KeyError.
    if 'totalPaid' in claims.columns:
        state_agg['total_paid'] = ('totalPaid', 'sum')

    state_claims = (
        claims.groupby('state')
        .agg(**state_agg)
        .reset_index()
        .sort_values('claim_count', ascending=False)
    )

    # Chart only the 20 states with the most claims.
    fig = px.bar(
        state_claims.head(20), x='state', y='claim_count',
        title='Top 20 States by Claim Count',
        labels={'state': 'State', 'claim_count': 'Number of Claims'},
    )
    fig.show()

In [None]:
# Claims by flood zone: the 15 zone codes with the most claims.
if 'floodZone' in claims.columns:
    zone_claims = (
        claims
        .groupby('floodZone')
        .size()
        .reset_index(name='count')
        .sort_values('count', ascending=False)
        .head(15)
    )

    fig = px.bar(
        data_frame=zone_claims,
        x='floodZone',
        y='count',
        title='Claims by Flood Zone',
        labels={'floodZone': 'Flood Zone', 'count': 'Number of Claims'},
    )
    fig.show()

## 3. Policy Analysis

In [None]:
# Preview the first rows of the policies sample.
policies.head()

In [None]:
# Policy counts by state.
# `state_col` is also read by the coverage-gap cell further down.
state_col = 'propertyState' if 'propertyState' in policies.columns else 'state'
if state_col in policies.columns:
    if 'policyCount' in policies.columns:
        # Rows carry their own policy count, so sum it per state.
        state_policies = policies.groupby(state_col)['policyCount'].sum().reset_index()
    else:
        # No count column: count rows per state instead.
        state_policies = policies.groupby(state_col).size().reset_index(name='policyCount')
    state_policies = state_policies.sort_values('policyCount', ascending=False)

    # Chart only the 20 states with the most policies.
    fig = px.bar(
        state_policies.head(20), x=state_col, y='policyCount',
        title='Top 20 States by Policy Count',
        labels={state_col: 'State', 'policyCount': 'Policies'},
    )
    fig.show()

## 4. Disaster Declarations

In [None]:
# Preview the first rows of the disaster declarations sample.
disasters.head()

In [None]:
# Flood-related disaster declarations over time
if 'incidentType' in disasters.columns:
    # Keep rows whose incident type mentions "flood" (any case); missing types are excluded.
    flood_disasters = disasters.loc[
        disasters['incidentType'].str.contains('Flood', case=False, na=False)
    ]
    if 'declarationDate' in flood_disasters.columns:
        # assign() returns a new frame, so the filtered slice is never written to.
        flood_disasters = flood_disasters.assign(
            year=pd.to_datetime(flood_disasters['declarationDate']).dt.year
        )
        yearly_disasters = (
            flood_disasters
            .groupby('year')
            .size()
            .reset_index(name='count')
        )

        fig = px.line(
            data_frame=yearly_disasters,
            x='year',
            y='count',
            title='Flood-Related Federal Disaster Declarations per Year',
            labels={'year': 'Year', 'count': 'Declarations'},
        )
        fig.show()

## 5. Coverage Gap Preview

Quick look at the ratio of claims to policies by state — a simple proxy for coverage gaps.

In [None]:
# Merge claims and policy counts by state for a quick coverage gap view.
# Re-derive the policy state column here so this cell does not depend on the
# policy-analysis cell having been run first.
state_col = 'propertyState' if 'propertyState' in policies.columns else 'state'
if 'state' in claims.columns and state_col in policies.columns:
    claims_by_state = claims.groupby('state').size().reset_index(name='claim_count')
    if 'policyCount' in policies.columns:
        policies_by_state = policies.groupby(state_col)['policyCount'].sum().reset_index()
    else:
        policies_by_state = policies.groupby(state_col).size().reset_index(name='policy_count')
    # Align the column names with the claims frame so the merge key matches.
    policies_by_state.columns = ['state', 'policy_count']

    # Outer merge keeps states that appear in only one sample; their missing
    # count becomes 0.
    gap = claims_by_state.merge(policies_by_state, on='state', how='outer').fillna(0)
    # A zero policy count is replaced by 1 to avoid division by zero.
    # NOTE(review): for those states the "ratio" equals the raw claim count, so
    # they can dominate the ranking below; confirm this is the intended reading.
    gap['claims_per_policy'] = gap['claim_count'] / gap['policy_count'].replace(0, 1)
    gap = gap.sort_values('claims_per_policy', ascending=False)

    fig = px.bar(
        gap.head(20), x='state', y='claims_per_policy',
        title='Claims-to-Policy Ratio by State (Higher = Bigger Gap)',
        labels={'state': 'State', 'claims_per_policy': 'Claims per Policy'},
    )
    fig.show()

    # A bare expression inside an `if` block is not rendered by the notebook
    # (only a top-level last expression is), so display the table explicitly.
    display(gap.head(20))

## 6. Coverage Gap Deep-Dive

Use `compute_coverage_metrics` and `cluster_risk_regions` to identify
counties where claims outpace policy coverage — the true coverage gaps.

In [None]:
from src.analysis.coverage_gap import compute_coverage_metrics, cluster_risk_regions

# Aggregate by county instead of by state. No column is renamed here: the state
# columns differ between the frames (policies "propertyState", claims "state"),
# so "countyCode" is passed as geo_col.
# NOTE(review): assumes both claims and policies contain "countyCode" — confirm.
county_metrics = compute_coverage_metrics(claims, policies, geo_col="countyCode")
# NOTE(review): presumably assigns each county to one of 5 clusters — confirm in cluster_risk_regions.
county_clustered = cluster_risk_regions(county_metrics, n_clusters=5)

print(f"Counties analyzed: {len(county_clustered)}")
# Top-level last expression of the cell, so the first 10 rows are rendered.
county_clustered.head(10)

In [None]:
# Risk tier distribution: one summary row per risk_cluster value.
tier_summary = (
    county_clustered
    .groupby("risk_cluster")
    .agg(
        counties=pd.NamedAgg(column="countyCode", aggfunc="count"),
        avg_claims_per_policy=pd.NamedAgg(column="claims_per_policy", aggfunc="mean"),
        avg_claim_amount=pd.NamedAgg(column="avg_claim", aggfunc="mean"),
        total_paid=pd.NamedAgg(column="total_paid", aggfunc="sum"),
    )
    .reset_index()
)

# Bar height = number of counties in the tier; colour = mean claims/policy ratio.
fig = px.bar(
    data_frame=tier_summary,
    x="risk_cluster",
    y="counties",
    color="avg_claims_per_policy",
    color_continuous_scale="RdYlGn_r",
    title="Counties per Risk Tier (color = avg claims/policy ratio)",
    labels={
        "risk_cluster": "Risk Tier",
        "counties": "Number of Counties",
        "avg_claims_per_policy": "Claims/Policy",
    },
)
fig.show()

# Top-level last expression: renders the summary table under the chart.
tier_summary

In [None]:
# First 20 rows of the clustered frame, kept for the ranked bar chart in the next cell.
# NOTE(review): assumes county_clustered is already ordered by gap score — confirm.
top_gaps = county_clustered.head(20)

# Scatter of every county (not just the top 20): claims/policy ratio against
# average claim size, sized by claim volume and coloured by risk tier.
fig = px.scatter(
    data_frame=county_clustered,
    x="claims_per_policy",
    y="avg_claim",
    size="claim_count",
    color="risk_cluster",
    color_continuous_scale="RdYlGn_r",
    hover_data=["countyCode", "claim_count", "policy_count", "total_paid"],
    title="Coverage Gap Landscape: Claims/Policy vs Claim Severity by County",
    labels={
        "claims_per_policy": "Claims per Policy",
        "avg_claim": "Avg Claim ($)",
        "risk_cluster": "Risk Tier",
    },
)
fig.show()

In [None]:
# Ranked bar chart of the counties held in `top_gaps`.
# The x axis is forced to categorical so county codes are drawn as labels
# rather than positioned on a numeric scale.
fig = px.bar(
    data_frame=top_gaps,
    x="countyCode",
    y="gap_score",
    color="avg_claim",
    color_continuous_scale="Reds",
    hover_data=["claim_count", "policy_count", "claims_per_policy"],
    title="Top 20 Coverage Gap Counties (by composite gap score)",
    labels={
        "countyCode": "County FIPS",
        "gap_score": "Gap Score",
        "avg_claim": "Avg Claim ($)",
    },
).update_xaxes(type="category")
fig.show()

### 6a. Zone X "Residual Risk"

FEMA designates Zone X as moderate-to-low flood risk, yet published data shows
25-40% of NFIP claims originate outside Special Flood Hazard Areas. How does
our data compare?

In [None]:
if 'floodZone' in claims.columns:
    # Classify zones into SFHA (high-risk) vs non-SFHA.
    # SFHA designations all begin with 'A' or 'V' (A, AE, AH, AO, AR, A99,
    # A1-A30, AR/AE, V, VE, V1-V30, ...). Matching on the leading letter keeps
    # numbered and combined zones out of the non-SFHA bucket, where an
    # exact-match list would put them and overstate the non-SFHA share.
    sfha_prefixes = ['A', 'V']
    # Normalise case and whitespace; missing zones become the text 'NAN' and
    # therefore stay in the non-SFHA bucket.
    zone_codes = claims['floodZone'].astype(str).str.strip().str.upper()
    is_sfha = zone_codes.str[:1].isin(sfha_prefixes)
    # NOTE: adds 'zone_category' to `claims` in place; the trend cell reads it.
    claims['zone_category'] = is_sfha.map(
        {True: 'SFHA (High Risk)', False: 'Non-SFHA (Zone X/B/C/D)'}
    )

    # 'totalPaid' only exists when the yearly-payout cell's guard passed.
    if 'totalPaid' in claims.columns:
        zone_summary = claims.groupby('zone_category').agg(
            claim_count=('totalPaid', 'count'),
            avg_payout=('totalPaid', 'mean'),
            total_payout=('totalPaid', 'sum'),
        ).reset_index()
        zone_summary['pct_of_claims'] = (
            zone_summary['claim_count'] / zone_summary['claim_count'].sum() * 100
        )
        zone_summary['pct_of_payouts'] = (
            zone_summary['total_payout'] / zone_summary['total_payout'].sum() * 100
        )

        # Bind colours to category names (SFHA red, non-SFHA green) so the pie
        # matches the trend chart; a positional colour list would give red to
        # whichever category sorts first.
        fig = px.pie(
            zone_summary, names='zone_category', values='claim_count',
            title='Share of Claims: SFHA vs Non-SFHA Zones',
            color='zone_category',
            color_discrete_map={
                'SFHA (High Risk)': '#d62728',
                'Non-SFHA (Zone X/B/C/D)': '#2ca02c',
            },
        )
        fig.show()

        print("Zone category breakdown:")
        display(zone_summary)

In [None]:
# Per-zone severity comparison
if 'floodZone' in claims.columns:
    # One row per zone code, ordered by total payout (largest first).
    zone_detail = (
        claims
        .groupby('floodZone')
        .agg(
            claims=pd.NamedAgg(column='totalPaid', aggfunc='count'),
            avg_payout=pd.NamedAgg(column='totalPaid', aggfunc='mean'),
            total_payout=pd.NamedAgg(column='totalPaid', aggfunc='sum'),
        )
        .sort_values('total_payout', ascending=False)
        .reset_index()
    )

    # Bar height = mean payout; colour intensity = number of claims in the zone.
    fig = px.bar(
        data_frame=zone_detail,
        x='floodZone',
        y='avg_payout',
        color='claims',
        color_continuous_scale='Blues',
        title='Average Claim Payout by Flood Zone (color = claim volume)',
        labels={
            'floodZone': 'Flood Zone',
            'avg_payout': 'Avg Payout ($)',
            'claims': 'Claim Count',
        },
    )
    fig.show()

### 6b. Claims Frequency Trend

Are flood claims accelerating over time? Research suggests claim frequency has
been increasing while average severity remains relatively stable.

In [None]:
if 'yearOfLoss' in claims.columns:
    # Per loss year: number of claims, mean payout, and summed payout.
    yearly_trend = (
        claims
        .groupby('yearOfLoss')
        .agg(
            claim_count=pd.NamedAgg(column='totalPaid', aggfunc='count'),
            avg_payout=pd.NamedAgg(column='totalPaid', aggfunc='mean'),
            total_payout=pd.NamedAgg(column='totalPaid', aggfunc='sum'),
        )
        .reset_index()
    )

    # Bars (left axis) show claim counts; the red line (right axis) shows mean payout.
    fig = go.Figure(data=[
        go.Bar(
            x=yearly_trend['yearOfLoss'],
            y=yearly_trend['claim_count'],
            name='Claim Count',
            yaxis='y',
        ),
        go.Scatter(
            x=yearly_trend['yearOfLoss'],
            y=yearly_trend['avg_payout'],
            name='Avg Payout ($)',
            yaxis='y2',
            mode='lines+markers',
            line=dict(color='red', width=2),
        ),
    ])
    # The second y axis overlays the first and is drawn on the right.
    fig.update_layout(
        title='Claim Frequency vs Severity Over Time',
        xaxis_title='Year of Loss',
        yaxis=dict(title='Claim Count'),
        yaxis2=dict(title='Avg Payout ($)', overlaying='y', side='right'),
        legend=dict(x=0.01, y=0.99),
    )
    fig.show()

In [None]:
# SFHA vs Non-SFHA trend over time
if 'floodZone' in claims.columns and 'yearOfLoss' in claims.columns:
    zone_trend = claims.groupby(['yearOfLoss', 'zone_category']).agg(
        claim_count=('totalPaid', 'count'),
    ).reset_index()

    fig = px.line(
        zone_trend, x='yearOfLoss', y='claim_count', color='zone_category',
        title='Claims Over Time: SFHA vs Non-SFHA Zones',
        labels={'yearOfLoss': 'Year', 'claim_count': 'Claims',
                'zone_category': 'Zone Category'},
        color_discrete_map={
            'SFHA (High Risk)': '#d62728',
            'Non-SFHA (Zone X/B/C/D)': '#2ca02c',
        },
    )
    fig.show()

## 7. Published Research Context

The patterns in this analysis are well-documented in the academic and government
literature. Below is a summary of key studies organized by finding, to help
interpret the results above.

### Why do coverage gaps exist?

**Low take-up rates, even in mandatory-purchase zones.** Dixon et al. (2006,
RAND Corporation) estimated that only about half of single-family homes in
100-year floodplains carry flood insurance. Compliance with the mandatory
purchase requirement erodes over time: once a mortgage is paid off, the
requirement no longer applies and many homeowners let their policies lapse (CRS
Report R44593, Horn, updated 2024).

**Affordability.** Gourevitch, Snyder & Kousky (2025, *J. of Catastrophe Risk
and Resilience*) found that Risk Rating 2.0 premium increases led to 11-39%
declines in new NFIP policies, with the largest reductions in lower-income
areas. Dixon & Clancy (2017, RAND RR-1776) found flood insurance is already
unaffordable for 25% of households in NYC's flood-prone areas.

**Income and education.** Atreya, Ferreira & Michel-Kerjan (2015, *Ecological
Economics*) found positive relationships between flood insurance adoption and
both income (elasticity ~0.39) and education levels in Georgia. This helps
explain why states with lower median incomes (MS, AL, SC) may have worse
coverage gaps despite high flood risk.

**Risk perception.** Petrolia, Landry & Coble (2013, *Land Economics*) found
that subjective risk perceptions are a significant driver of purchase decisions
on the Gulf Coast. Individuals who underestimate flood risk — particularly
outside SFHAs — are far less likely to carry insurance.

**"Charity hazard."** Landry, Turner & Petrolia (2021, *Environmental and
Resource Economics*) estimated that expectations of disaster relief reduce flood
insurance uptake by 25-42% among coastal households in TX, LA, MS, AL, and FL,
potentially accounting for ~817,000 uninsured homes nationally. Kousky,
Michel-Kerjan & Raschky (2018, *J. of Environmental Economics and Management*)
provided the first causal estimates of this crowding-out effect.

### Why do Zone X areas generate so many claims?

**Residual risk is real and large.** FEMA's own data shows 25-40% of NFIP
claims come from outside designated Special Flood Hazard Areas. Amornsiripanitch
et al. (2025, *Nature Climate Change*) found that 70% ($17.1B/yr) of total
flood losses to single-family homes are uninsured, and underinsurance persists
both inside and outside SFHAs.

**Flood maps lag reality.** Wing et al. (2022, *Nature Climate Change*) showed
that climate change is shifting flood risk distribution in ways not captured by
static FEMA maps. Properties in Zone X face meaningful risk that the
designation obscures.

### Why is claim frequency accelerating?

**Hyperclustered catastrophic events.** Wagner et al. (2025, Columbia
University / Society for Risk Analysis) found that the eight largest
"hyperclustered storms" account for over 50% of all NFIP claims, and all
occurred in the last 21 years.

**Repetitive loss properties.** NRDC's "Losing Ground" report (2024) documents
~45,000 Severe Repetitive Loss properties that have flooded ~5 times each on
average. These comprise ~2.5% of policies but ~50% of claims by dollar value.
21% of these properties are now outside FEMA high-risk areas, up from 19% in
2018.

**Structural NFIP insolvency.** GAO-23-105977 (2023) reported that NFIP's
total borrowing from Treasury reached $36.5B, and that 95% of policies won't
reach full-risk rates until 2037 due to premium caps. GAO-17-425 (2017)
identified six areas requiring comprehensive reform: outstanding debt, premium
rates, affordability, consumer participation, private-sector barriers, and
flood resilience.

### Community Rating System

Highfield & Brody (2017, *Int. J. of Disaster Risk Reduction*) found CRS
participation is associated with a 41.6% average reduction in flood claims, with
cumulative damage reductions of ~$10.1B between 1998 and 2020. However, only 6.6%
(~1,752) of 22,000+ NFIP communities participate.

### References

1. Amornsiripanitch, N., Biswas, S., Orellana-Li, J. & Zink, D. (2025). Measuring Flood Underinsurance in the USA. *Nature Climate Change*, 15(9). doi:10.1038/s41558-025-02396-w
2. Atreya, A., Ferreira, S. & Michel-Kerjan, E. (2015). What Drives Households to Buy Flood Insurance? *Ecological Economics*, 117, 153-161.
3. Browne, M.J. & Hoyt, R.E. (2000). The Demand for Flood Insurance: Empirical Evidence. *Journal of Risk and Uncertainty*, 20(3), 291-306.
4. Choi, J., Diffenbaugh, N.S. & Burke, M. (2024). The Effect of Flood Exposure on Insurance Adoption Among US Households. *Earth's Future*, 12(7), e2023EF004110.
5. Dixon, L. & Clancy, N. (2017). *The Cost and Affordability of Flood Insurance in New York City*. RAND Corporation, RR-1776.
6. Dixon, L., Clancy, N., Seabury, S.A. & Overton, A. (2006). *The NFIP's Market Penetration Rate: Estimates and Policy Implications*. RAND Corporation.
7. GAO-17-425 (2017). Flood Insurance: Comprehensive Reform Could Improve Solvency and Enhance Resilience.
8. GAO-23-105977 (2023). Flood Insurance: FEMA's New Rate-Setting Methodology Improves Actuarial Soundness but Highlights Need for Broader Program Reform.
9. Gourevitch, J., Snyder, M. & Kousky, C. (2025). Effects of Risk-Based Pricing Reform on Flood Insurance Uptake. *J. of Catastrophe Risk and Resilience*, 3.
10. Highfield, W.E. & Brody, S.D. (2017). Determining the Effects of the FEMA CRS Program on Flood Losses. *Int. J. of Disaster Risk Reduction*, 21, 396-404.
11. Horn, D.P. (2024). Introduction to the National Flood Insurance Program (NFIP). CRS Report R44593.
12. Kousky, C. & Michel-Kerjan, E. (2017). Examining Flood Insurance Claims in the United States: Six Key Findings. *J. of Risk and Insurance*, 84(3), 819-850.
13. Kousky, C., Michel-Kerjan, E. & Raschky, P. (2018). Does Federal Disaster Assistance Crowd Out Flood Insurance? *J. of Environmental Economics and Management*, 87, 150-164.
14. Landry, C.E., Turner, D. & Petrolia, D.R. (2021). Flood Insurance Market Penetration and Expectations of Disaster Assistance. *Environmental and Resource Economics*, 79(2), 357-386.
15. Michel-Kerjan, E. & Kousky, C. (2010). Come Rain or Shine: Evidence on Flood Insurance Purchases in Florida. *J. of Risk and Insurance*, 77(2), 369-397.
16. NRDC (2024). Losing Ground: Severe Repetitive Flooding in the United States.
17. Petrolia, D.R., Landry, C.E. & Coble, K.H. (2013). Risk Preferences, Risk Perceptions, and Flood Insurance. *Land Economics*, 89(2), 227-245.
18. Wing, O.E.J. et al. (2022). Inequitable Patterns of US Flood Risk in the Anthropocene. *Nature Climate Change*, 12, 156-162.