In [46]:
# ── 1. Project paths ──────────────────────────────────────────────────────
# Walk from the current working directory up through its ancestors and stop at
# the first folder holding a ".gitignore".  If the loop runs to completion
# without a hit, ROOT is left on the last ancestor visited (the filesystem
# root), exactly as a plain upward walk would end.
ROOT = Path.cwd()
for ROOT in (ROOT, *ROOT.parents):
    if (ROOT / ".gitignore").exists():
        break

# Input: the raw incident export.
RAW_DIR  = ROOT.joinpath("data", "raw")
FIRE_CSV = RAW_DIR.joinpath("Fire-Incidents-Report_2025-07-24_132502.csv")

# Output: the cleaned table; its folder is created on demand.
CLEAN_DIR = ROOT.joinpath("data", "clean")
CLEAN_DIR.mkdir(parents=True, exist_ok=True)
OUT_CSV   = CLEAN_DIR.joinpath("runs_cleaned.csv")

In [47]:
# ── 2. Load raw data & normalize column names ────────────────────────────
# Every column is read as text (dtype=str); typed columns are derived later.
df = pd.read_csv(FIRE_CSV, dtype=str)

# Header clean-up, step by step:
#   trim → lower-case → each run of non-word characters becomes "_" →
#   leading/trailing "_" removed.
_headers = df.columns.str.strip()
_headers = _headers.str.lower()
_headers = _headers.str.replace(r"[^\w]+", "_", regex=True)
df.columns = _headers.str.strip("_")

In [48]:
# ── 3. Helper functions ───────────────────────────────────────────────────
def coalesce(df, newcol, cols, dtype=None):
    """Fill ``df[newcol]`` with the first non-missing value found across ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from and write to; it is modified in place.
    newcol : str
        Name of the column that receives the coalesced values.
    cols : list of str
        Candidate source columns, highest priority first.  Names that are not
        present in ``df`` are ignored.
    dtype : {"datetime", "Int64", "category", None}
        Optional conversion applied to the coalesced column.  "datetime" and
        "Int64" coerce unparseable values to missing.  Any other value leaves
        the column as it is.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.  If none of ``cols`` exists, ``df`` is
        returned untouched and ``newcol`` is not created.
    """
    present = [name for name in cols if name in df.columns]
    if not present:
        return df

    converters = {
        "datetime": lambda col: pd.to_datetime(col, errors="coerce"),
        "Int64": lambda col: pd.to_numeric(col, errors="coerce").astype("Int64"),
        "category": lambda col: col.astype("category"),
    }

    # Back-filling along the row pulls the first non-missing value into the
    # left-most candidate column, which is then taken as the result.
    df[newcol] = df[present].bfill(axis=1).iloc[:, 0]

    convert = converters.get(dtype)
    if convert is not None:
        df[newcol] = convert(df[newcol])
    return df

def normalize_addr(s: pd.Series) -> pd.Series:
    """Return a simplified, upper-cased form of each address for matching.

    Steps, applied in this order:

    1. Upper-case the text; missing values become "".
    2. Remove stand-alone directional tokens (NORTH/SOUTH/EAST/WEST and
       N/S/E/W).  Any isolated N, S, E or W token is removed, including one
       that is part of a street name.
    3. Remove a trailing ZIP code: five digits, optionally followed by
       "-NNNN", and optionally followed by whitespace, commas or periods.
       The ZIP must be preceded by whitespace, so a unit such as "#12345"
       is kept.
    4. Replace commas, periods and "#" with spaces.
    5. Expand common street-suffix abbreviations (ST → STREET, RD → ROAD, ...).
    6. Collapse whitespace runs to one space and trim both ends.

    Parameters
    ----------
    s : pandas.Series
        Address strings; missing entries are allowed.

    Returns
    -------
    pandas.Series
        Normalized addresses with the same index as ``s``.
    """
    s = s.str.upper().fillna("")
    # strip directionals
    s = s.str.replace(r"\b(?:NORTH|SOUTH|EAST|WEST|N|S|E|W)\b", "", regex=True)
    # drop trailing ZIP codes.  Punctuation removal and trimming only happen
    # further down, so the pattern itself has to tolerate trailing whitespace
    # or punctuation, as well as the ZIP+4 form.
    s = s.str.replace(r"\s+\d{5}(?:-\d{4})?[\s,\.]*$", "", regex=True)
    # remove punctuation
    s = s.str.replace(r"[,\.\#]", " ", regex=True)
    # expand suffix abbreviations
    suffix_map = {
        r"\bST\b": "STREET",
        r"\bRD\b": "ROAD",
        r"\bAVE\b": "AVENUE",
        r"\bBLVD\b": "BOULEVARD",
        r"\bDR\b": "DRIVE",
        r"\bLN\b": "LANE",
        r"\bCT\b": "COURT",
        r"\bPKWY\b": "PARKWAY"
    }
    for abbr, full in suffix_map.items():
        s = s.str.replace(abbr, full, regex=True)
    return s.str.replace(r"\s+", " ", regex=True).str.strip()

In [49]:
# ── 4. Coalesce & parse the best timestamp ─────────────────────────────────
# Candidate timestamp columns, most preferred first; for each row the first
# one holding a value is parsed into the "date" column.
timestamp_sources = [
    "basic_incident_date_time",
    "cad_entry_date_time",
    "cad_dispatch_date_time",
]
df = coalesce(df, "date", timestamp_sources, dtype="datetime")

In [50]:
# ── 5. Extract time features & filter years ───────────────────────────────
# Calendar parts of the parsed "date" column.  The nullable Int64 dtype keeps
# the columns integer-typed even where the date is missing.
df["year"]        = df["date"].dt.year.astype("Int64")
df["month"]       = df["date"].dt.month.astype("Int64")
df["day"]         = df["date"].dt.day.astype("Int64")
df["hour_of_day"] = df["date"].dt.hour.astype("Int64")
# weekday 5 and 6 are Saturday and Sunday
df["is_weekend"]  = df["date"].dt.weekday.isin([5,6])

# keep only 2018–2024 (both ends included).  The explicit .copy() makes the
# filtered frame independent of the unfiltered one, so the column assignments
# in later cells do not raise SettingWithCopyWarning.
df = df[df["year"].between(2018, 2024)].copy()

In [51]:
# ── 6. Build full address & normalize ────────────────────────────────────
# Address components in the order they are concatenated.
addr_parts = [
    "basic_incident_street_number_fd1_10",
    "basic_incident_street_prefix_fd1_11",
    "basic_incident_street_name_fd1_12",
    "basic_incident_street_type_fd1_13",
    "basic_incident_postal_code_fd1_19",
]

# Missing components become empty strings so the row-wise join never sees NaN;
# the empty slots leave extra spaces, which are collapsed and trimmed next.
joined_address = df[addr_parts].fillna("").agg(" ".join, axis=1)
df["address"] = joined_address.str.replace(r"\s+", " ", regex=True).str.strip()

# Matching-friendly variant of the same address.
df["addr_norm"] = normalize_addr(df["address"])

In [52]:
# ── 7. Coalesce codes & descriptions ──────────────────────────────────────
# (output column, source column, conversion) — processed top to bottom.
derived_columns = [
    # incident identity and type
    ("incident_id",   "basic_incident_number_fd1",           None),
    ("inc_type_code", "basic_incident_type_code_fd1_21",     "Int64"),
    ("inc_type_cat",  "basic_incident_type_category_fd1_21", "category"),
    ("inc_type_desc", "basic_incident_type_fd1_21",          None),
    # property use
    ("prop_use_code", "basic_property_use_code_fd1_46",      "Int64"),
    ("prop_use_cat",  "basic_property_use_category_fd1_46",  "category"),
    ("prop_use_desc", "basic_property_use_fd1_46",           None),
    # aid given / received
    ("aid_code",      "basic_aid_given_or_received_code_fd1_22", "Int64"),
    ("aid_desc",      "basic_aid_given_or_received_code_and_description_fd1_22", None),
]
for target, source, kind in derived_columns:
    df = coalesce(df, target, [source], dtype=kind)

In [42]:
# ── 8. Address & agency (unchanged) ──────────────────────────────────────
# NOTE(review): this cell operates on `runs`, which no neighbouring cell
# defines, and its execution count (42) is lower than the cells around it.
# It looks like a leftover from an earlier session — confirm whether `runs`
# is defined elsewhere before relying on it in a fresh Run All.
addr_parts = [
    "basic_incident_street_number_fd1_10",
    "basic_incident_street_prefix_fd1_11",
    "basic_incident_street_name_fd1_12",
    "basic_incident_street_type_fd1_13",
    "basic_incident_postal_code_fd1_19",
]

# Join the components with single spaces (missing ones contribute nothing),
# then squeeze repeated whitespace and trim.
runs_joined = runs[addr_parts].fillna("").agg(" ".join, axis=1)
runs_joined = runs_joined.str.replace(r"\s+", " ", regex=True)
runs["address"] = runs_joined.str.strip()

In [53]:
# ── 8. Subset, dedupe & save ──────────────────────────────────────────────
# Columns written to the cleaned file, in output order.
keep_cols = [
    "incident_id", "date", "year", "month", "day", "hour_of_day", "is_weekend",
    "inc_type_code", "inc_type_cat", "inc_type_desc",
    "prop_use_code", "prop_use_cat", "prop_use_desc",
    "aid_code", "aid_desc",
    "address", "addr_norm", "agency_name"
]

# Columns that were never created are skipped, but say so instead of dropping
# them silently — otherwise a missing field is easy to overlook in the output.
present_cols = [c for c in keep_cols if c in df.columns]
missing_cols = [c for c in keep_cols if c not in df.columns]
if missing_cols:
    print("⚠ Columns not found and skipped:", ", ".join(missing_cols))

out = df.loc[:, present_cols]

# Keep the first row per incident_id.  Missing ids compare equal to each other
# when looking for duplicates, so rows without an id are explicitly excluded
# from the check rather than being collapsed into a single row.  The check is
# skipped entirely when the id column does not exist.
if "incident_id" in out.columns:
    repeated_id = out["incident_id"].notna() & out.duplicated(subset="incident_id")
    out = out.loc[~repeated_id]

out.to_csv(OUT_CSV, index=False)
print("✓ Cleaned fire data saved to", OUT_CSV)

✓ Cleaned fire data saved to C:\Users\JosephWhite\Documents\GitHub\jefferson-township-run-forecasting\data\clean\runs_cleaned.csv
