# 01 - Ingest and Clean: Core Sources

This notebook pulls and lightly cleans initial datasets for a small Gulf Coast subset to enable fast iteration.


In [1]:
from pathlib import Path
import pandas as pd
import geopandas as gpd
import requests
from tqdm import tqdm

PROJECT_ROOT = Path('/Users/liamguest/LProjects/AURA/AURA')
DATA_RAW = PROJECT_ROOT / 'data' / 'raw'
DATA_INTERIM = PROJECT_ROOT / 'data' / 'interim'
DATA_RAW.mkdir(parents=True, exist_ok=True)
DATA_INTERIM.mkdir(parents=True, exist_ok=True)

print('Raw:', DATA_RAW)
print('Interim:', DATA_INTERIM)


Raw: /Users/liamguest/LProjects/AURA/AURA/data/raw
Interim: /Users/liamguest/LProjects/AURA/AURA/data/interim


## OpenFEMA Individual Assistance (IA) - Minimal Pull

Filters: Texas and Louisiana, last ~15 years. Adjust as needed.


In [3]:
import datetime as dt

OPENFEMA_URL = 'https://www.fema.gov/api/open/v2/IndividualAssistanceApplications'
start_year = dt.date.today().year - 15

# Build state list once to avoid nested f-string braces
states_str = "','".join(STATE_ABBR)
query = (
    f"$filter=state%20in%20('{states_str}')%20and%20incidentBeginDate%20ge%20'{start_year}-01-01'"
    "&$top=10000&$format=json"
)
url = f"{OPENFEMA_URL}?{query}"
print(url)

resp = requests.get(url, timeout=60)
resp.raise_for_status()
ia_json = resp.json()

ia = pd.json_normalize(ia_json.get('IndividualAssistanceApplications', []))
print(ia.shape)

ia_out = DATA_RAW / 'openfema_ia_tx_la_ms_al_fl.json'
ia.to_json(ia_out, orient='records', lines=False)
print('Wrote', ia_out)


NameError: name 'STATE_ABBR' is not defined

## NOAA HURDAT2 - Best Track Data

We will download the Atlantic basin text file and parse into a tabular format.


In [None]:
HURDAT_URL = 'https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2023-070924.txt'
hurdat_path = DATA_RAW / 'hurdat2_atlantic.txt'

r = requests.get(HURDAT_URL, timeout=60)
r.raise_for_status()
hurdat_path.write_bytes(r.content)
print('Wrote', hurdat_path)

# Quick parse sketch: collect header and record lines
lines = hurdat_path.read_text().splitlines()
records = []
current_storm = None
for line in lines:
    if line and line[0].isalpha():
        # Header: e.g., AL011851, UNNAMED, 14
        parts = [p.strip() for p in line.split(',')]
        current_storm = {'id': parts[0], 'name': parts[1], 'n': int(parts[2])}
    else:
        # Data line
        parts = [p.strip() for p in line.split(',')]
        if len(parts) >= 8 and current_storm:
            ymdh = parts[0]
            rec = {
                'storm_id': current_storm['id'],
                'storm_name': current_storm['name'],
                'date': ymdh[:8],
                'time': ymdh[8:],
                'record_id': parts[2],
                'status': parts[3],
                'lat': parts[4],
                'lon': parts[5],
                'max_wind_kt': parts[6],
                'min_pres_mb': parts[7]
            }
            records.append(rec)

hurdat_df = pd.DataFrame(records)
hurdat_csv = DATA_INTERIM / 'hurdat2_atlantic_parsed.csv'
hurdat_df.to_csv(hurdat_csv, index=False)
print('Parsed records:', len(hurdat_df), '->', hurdat_csv)


## ACS Demographics (Census API) - Tract Level

We will fetch a small set of variables for TX/LA tracts.


In [None]:
CENSUS_BASE = 'https://api.census.gov/data/2022/acs/acs5'
# Example variables: total population, median household income
vars_ = ['NAME', 'B01003_001E', 'B19013_001E']
get = ','.join(['GEO_ID'] + vars_)

params = {
    'get': get,
    'for': 'tract:*',
}

acs_frames = []
for state in STATE_FIPS:
    p = params | {'in': f'state:{state}'}
    if CENSUS_API_KEY:
        p['key'] = CENSUS_API_KEY
    r = requests.get(CENSUS_BASE, params=p, timeout=60)
    r.raise_for_status()
    df = pd.DataFrame(r.json()[1:], columns=r.json()[0])
    df['state_fips'] = state
    acs_frames.append(df)

acs = pd.concat(acs_frames, ignore_index=True)
acs_out = DATA_INTERIM / 'acs_2022_tx_la_ms_al_fl.csv'
acs.to_csv(acs_out, index=False)
print('Wrote', acs_out, 'rows:', len(acs))


## Configuration: five Gulf states and API key

Defines state lists (TX, LA, MS, AL, FL) and reads your Census API key from the environment.


In [None]:
import os

STATE_ABBR = ['TX', 'LA', 'MS', 'AL', 'FL']
STATE_FIPS = ['48', '22', '28', '01', '12']
CENSUS_API_KEY = os.getenv('CENSUS_API_KEY')
print('CENSUS_API_KEY set:', bool(CENSUS_API_KEY))


## CDC Social Vulnerability Index (SVI) - Placeholder

Download the latest SVI (tract-level) for TX, LA, MS, AL, FL from CDC/ATSDR. Place CSVs under `data/raw/svi/` and run the next cell to combine.


In [None]:
svi_dir = DATA_RAW / 'svi'
svi_dir.mkdir(parents=True, exist_ok=True)

# Expect one or multiple CSVs dropped here; we will read all CSVs and concat
svi_frames = []
for p in sorted(svi_dir.glob('*.csv')):
    try:
        df = pd.read_csv(p, dtype=str)
        df['source_file'] = p.name
        svi_frames.append(df)
    except Exception as e:
        print('Failed to read', p, e)

if svi_frames:
    svi = pd.concat(svi_frames, ignore_index=True)
    svi_out = DATA_INTERIM / 'svi_combined.csv'
    svi.to_csv(svi_out, index=False)
    print('Wrote', svi_out, 'rows:', len(svi))
else:
    print('No SVI CSVs found in', svi_dir)
