In [1]:
import os
import glob
import numpy as np
import pandas as pd


In [2]:
def parse_issue_date(series: pd.Series) -> pd.Series:
    s = series.copy()
    s_str = s.astype(str)
    numeric_mask = s_str.str.fullmatch(r"-?\d+")
    out = pd.Series(pd.NaT, index=s.index, dtype="datetime64[ns, UTC]")

    # Numeric epochs
    if numeric_mask.any():
        nums = pd.to_numeric(s[numeric_mask], errors="coerce")
        ms_mask = nums > 1_000_000_000_000  # > 1e12 -> likely ms
        sec_mask = ~ms_mask
        if ms_mask.any():
            out.loc[numeric_mask & ms_mask] = pd.to_datetime(nums[ms_mask], unit="ms", utc=True, errors="coerce")
        if sec_mask.any():
            out.loc[numeric_mask & sec_mask] = pd.to_datetime(nums[sec_mask], unit="s", utc=True, errors="coerce")

    # Non-numeric strings
    str_mask = (~numeric_mask) & s.notna()
    if str_mask.any():
        out.loc[str_mask] = pd.to_datetime(s[str_mask], utc=True, errors="coerce")

    # Normalize to midnight
    out = out.dt.normalize()
    return out


def parse_issue_time_to_timedelta(series: pd.Series) -> pd.Series:
    s = series.astype("string").str.strip()
    out = pd.Series(pd.NaT, index=s.index, dtype="timedelta64[ns]")

    if s.isna().all():
        return out

    is_na = s.isna() | (s == "") | (s.str.upper() == "N/A")

    # AM/PM like '04:33 PM'
    ampm_mask = s.str.contains(r"(?i)\bAM\b|\bPM\b", na=False)
    if ampm_mask.any():
        parsed = pd.to_datetime(s[ampm_mask], format="%I:%M %p", errors="coerce")
        td = (pd.to_timedelta(parsed.dt.hour, unit="h")
              + pd.to_timedelta(parsed.dt.minute, unit="m")
              + pd.to_timedelta(parsed.dt.second, unit="s"))
        out.loc[ampm_mask] = td

    # HH:MM(:SS)? 24-hour
    hhmm_mask = s.str.fullmatch(r"\d{1,2}:\d{2}(:\d{2})?", na=False) & ~ampm_mask
    if hhmm_mask.any():
        parsed = pd.to_datetime(s[hhmm_mask], errors="coerce")
        td = (pd.to_timedelta(parsed.dt.hour, unit="h")
              + pd.to_timedelta(parsed.dt.minute, unit="m")
              + pd.to_timedelta(parsed.dt.second, unit="s"))
        out.loc[hhmm_mask] = td

    # Bare digits like '2102', '748', '59' -> HHMM (left-pad to 4)
    digits_mask = s.str.fullmatch(r"\d{1,4}", na=False) & ~(ampm_mask | hhmm_mask | is_na)
    if digits_mask.any():
        padded = s[digits_mask].str.zfill(4)
        hh = pd.to_numeric(padded.str[:2], errors="coerce")
        mm = pd.to_numeric(padded.str[2:], errors="coerce")
        # Mark invalid minutes as NaN
        mm = mm.where(mm.between(0, 59), np.nan)
        td = pd.to_timedelta(hh, unit="h") + pd.to_timedelta(mm, unit="m")
        out.loc[digits_mask] = td

    # Fallback free-form
    fallback_mask = ~(ampm_mask | hhmm_mask | digits_mask | is_na)
    if fallback_mask.any():
        parsed = pd.to_datetime(s[fallback_mask], errors="coerce")
        td = (pd.to_timedelta(parsed.dt.hour, unit="h")
              + pd.to_timedelta(parsed.dt.minute, unit="m")
              + pd.to_timedelta(parsed.dt.second, unit="s"))
        out.loc[fallback_mask] = td

    return out


def load_and_normalize_one(path: str) -> pd.DataFrame:
    df = pd.read_csv(path, low_memory=False)

    # Ensure columns exist
    if "ISSUE_DATE" not in df.columns:
        df["ISSUE_DATE"] = pd.NA
    if "ISSUE_TIME" not in df.columns:
        df["ISSUE_TIME"] = pd.NA

    date_parsed = parse_issue_date(df["ISSUE_DATE"])
    time_td = parse_issue_time_to_timedelta(df["ISSUE_TIME"])

    # Combine as UTC
    issue_dt = date_parsed + time_td

    df["ISSUE_DATE_NORM_UTC"] = date_parsed
    df["ISSUE_TIME_NORM"] = time_td
    df["ISSUE_DATETIME_UTC"] = issue_dt

    # Convenience features
    df["ISSUE_DATE_ONLY"] = df["ISSUE_DATETIME_UTC"].dt.date
    df["ISSUE_HOUR_24"] = df["ISSUE_DATETIME_UTC"].dt.hour

    df["source_file"] = os.path.basename(path)
    return df


def load_all_normalized(
    input_glob: str,
    save_csv_path: str = None,
    preview_rows: int = 5,
) -> pd.DataFrame:
    
    files = glob.glob(input_glob)
    print(f"Found {len(files)} CSVs")
    frames = []
    for f in files:
        try:
            print(f"Reading {f}...")
            frames.append(load_and_normalize_one(f))
        except Exception as e:
            print(f"Skipping {f}: {e}")

    if not frames:
        raise RuntimeError("No CSVs were loaded successfully.")

    combined = pd.concat(frames, ignore_index=True)

    # Optional saves
    if save_csv_path:
        combined.to_csv(save_csv_path, index=False)
        print(f"Saved cleaned CSV to: {save_csv_path}")


    # Quick preview
    cols = ["source_file", "TICKET_NUMBER", "ISSUE_DATE", "ISSUE_TIME",
            "ISSUE_DATE_NORM_UTC", "ISSUE_TIME_NORM", "ISSUE_DATETIME_UTC"]
    print("\nPreview:")
    print(combined[cols].head(preview_rows))

    # Coverage stats
    coverage = pd.DataFrame({
        "source_file": combined["source_file"],
        "date_parsed_ok": combined["ISSUE_DATE_NORM_UTC"].notna(),
        "time_parsed_ok": combined["ISSUE_TIME_NORM"].notna(),
        "datetime_built": combined["ISSUE_DATETIME_UTC"].notna(),
    }).groupby("source_file").mean().round(3)
    print("\nParse coverage by file (share non-null):")
    print(coverage)

    return combined




In [5]:
if __name__ == "__main__":
    # Point this to your folder of CSVs
    # e.g., "../Data/*.csv" to match your original pattern
    INPUT_GLOB = "../Data/*.csv"

    # Where to save the combined outputs (optional)
    SAVE_CSV = "../CleanData/parking_violations_cleaned_datetime.csv"

    _ = load_all_normalized(
        input_glob=INPUT_GLOB,
        save_csv_path=SAVE_CSV,
        preview_rows=10,
    )

Found 31 CSVs
Reading ../Data/Parking_Violations_Issued_in_June_2025.csv...
Reading ../Data/Parking_Violations_Issued_in_November_2024.csv...
Reading ../Data/Parking_Violations_Issued_in_July_2024.csv...
Reading ../Data/Parking_Violations_Issued_in_June_2024.csv...
Reading ../Data/Parking_Violations_Issued_in_June_2023.csv...
Reading ../Data/Parking_Violations_Issued_in_July_2023.csv...
Reading ../Data/Parking_Violations_Issued_in_November_2023.csv...
Reading ../Data/Parking_Violations_Issued_in_January_2023.csv...
Reading ../Data/Parking_Violations_Issued_in_October_2023.csv...
Reading ../Data/Parking_Violations_Issued_in_January_2025.csv...
Reading ../Data/Parking_Violations_Issued_in_October_2024.csv...
Reading ../Data/Parking_Violations_Issued_in_January_2024.csv...
Reading ../Data/september_2023_parking_violations.csv...
Reading ../Data/Parking_Violations_Issued_in_February_2023.csv...
Reading ../Data/Parking_Violations_Issued_in_March_2025.csv...
Reading ../Data/Parking_Violation