In [62]:
# 1) Imports & Paths
import pandas as pd
from pathlib import Path

# The project root is one level above the directory the notebook runs from;
# raw inputs and cleaned outputs live in sibling folders under data/.
ROOT = Path.cwd().resolve().parents[0]
RAW_DIR = ROOT.joinpath("data", "raw")
CLEAN_DIR = ROOT.joinpath("data", "clean")

In [63]:
# 2) Load raw data
# Every column is read as text (dtype=str) so pandas performs no numeric or
# date inference here; type conversion happens explicitly in later cells.
raw_path = RAW_DIR.joinpath("fire_and_ems_runs_2018_2025.csv")
df = pd.read_csv(raw_path, dtype=str)

In [64]:
# 3) Rename columns to snake_case
# Mapping from the export's verbose headers to the short names used by
# every later cell. Columns not listed here keep their original header.
snake_case_names = {
    "Agency FDID": "agency_fdid",
    "Basic Incident Number (FD1)": "incident_number",
    "Basic Incident Date Original (FD1.3)": "incident_date",
    "Basic Incident Year (FD1.3)": "incident_year",
    "Basic Incident Type Code (FD1.21)": "incident_type_code",
    "Basic Incident Type (FD1.21)": "incident_type",
    "Basic Aid Given Or Received Code (FD1.22)": "aid_code",
    "Basic Aid Given Or Received (FD1.22)": "aid",
    "Basic Property Use Category (FD1.46)": "property_use_category",
    "Basic Property Use Code (FD1.46)": "property_use_code",
    "Basic Property Use (FD1.46)": "property_use",
    "Basic Incident Full Street Address": "address",
    "Basic Incident City Name (FD1.16)": "city",
    "Basic Incident State (FD1.18)": "state",
    "Basic Incident Postal Code (FD1.19)": "postal_code",
}
df = df.rename(columns=snake_case_names)

In [65]:
# 4) Parse dates and drop unparseable rows
# Values that do not match MM/DD/YYYY are coerced to NaT instead of raising,
# and the dropna that follows removes exactly those rows.
parsed_dates = pd.to_datetime(df["incident_date"], format="%m/%d/%Y", errors="coerce")
df["incident_date"] = parsed_dates
df = df.dropna(subset=["incident_date"])

In [66]:
# 5) Fix short incident_number values by prefixing the year
# First make the year column plain, whitespace-free text so it can be used
# as a string prefix by fix_incident_number.
year_text = df["incident_year"].astype(str)
df["incident_year"] = year_text.str.strip()
def fix_incident_number(row):
    """Return the row's incident number, prefixed with its incident year.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"incident_number"`` and ``"incident_year"``
        (a DataFrame row when used with ``df.apply(..., axis=1)``).

    Returns
    -------
    The stripped incident number, with the year prepended unless the number
    already starts with it. A non-string incident number (e.g. the NaN that
    pandas uses for an empty cell) is returned unchanged, and no prefix is
    added when the year is missing.
    """
    raw_number = row["incident_number"]
    # Empty CSV cells come through as float NaN even with dtype=str; calling
    # .strip() on them would raise, so pass missing values through untouched.
    if not isinstance(raw_number, str):
        return raw_number
    number = raw_number.strip()

    raw_year = row["incident_year"]
    # A non-string year is treated as missing rather than guessed at.
    year = raw_year.strip() if isinstance(raw_year, str) else ""
    # "nan" / "none" / "<na>" are the text forms a missing value takes after
    # being cast with astype(str); prefixing with them would corrupt the key.
    if year.lower() in {"", "nan", "none", "<na>"}:
        return number

    if number.startswith(year):
        return number
    return year + number

# Run the fixer once per row (axis=1) and overwrite the column with the result.
fixed_numbers = df.apply(fix_incident_number, axis=1)
df["incident_number"] = fixed_numbers


In [67]:
# 6) Restrict to our analysis window
# Keep incidents dated 2018-01-01 through 2025-07-31; both end dates are
# included (Series.between is inclusive on both sides by default).
in_window = df["incident_date"].between("2018-01-01", "2025-07-31")
df = df[in_window]

In [68]:
# 7) Standardize city names
# Trim surrounding whitespace, then lowercase. The result goes into a new
# column, so the original "city" values are left as they were.
trimmed_city = df["city"].str.strip()
df["city_clean"] = trimmed_city.str.lower()

In [69]:
# 8) Drop exact duplicates by composite key
# Two rows count as the same incident when both the incident number and the
# incident date match; the first such row is the one that is kept.
dedup_key = ["incident_number", "incident_date"]
df = df.drop_duplicates(subset=dedup_key)

In [70]:
# 9) Create a month-stamp for later aggregation
# Convert each date to its monthly period, then back to a timestamp, so
# every incident in the same month shares one datetime value.
month_period = df["incident_date"].dt.to_period("M")
df["incident_month"] = month_period.dt.to_timestamp()

In [71]:
# 10) Save cleaned dataset
output_path = CLEAN_DIR / "fire_and_ems_runs_clean.csv"
# to_csv does not create missing folders, so make sure the target directory
# exists first (a no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

# Report the final row count, then print the column/dtype/non-null summary.
print(f"Cleaned rows: {len(df)}")
df.info()

Cleaned rows: 15431
<class 'pandas.core.frame.DataFrame'>
Index: 15431 entries, 0 to 15436
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   agency_fdid            15431 non-null  object        
 1   incident_number        15431 non-null  object        
 2   incident_date          15431 non-null  datetime64[ns]
 3   incident_year          15431 non-null  object        
 4   incident_type_code     15431 non-null  object        
 5   incident_type          15431 non-null  object        
 6   aid_code               7724 non-null   object        
 7   aid                    7724 non-null   object        
 8   property_use_category  11548 non-null  object        
 9   property_use_code      11551 non-null  object        
 10  property_use           11549 non-null  object        
 11  address                15431 non-null  object        
 12  city                   15431 non-null  object