In [None]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, atan2

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter


# Load Data

In [None]:
# Every school year contributes two ZIP columns: "Zip_<yy>" and "Zip_<yy>.1"
# (the ".1" suffix is presumably pandas' rename of a duplicated header - verify
# against the raw CSV). They are read with the "string" dtype so ZIP codes are
# kept as text instead of being parsed as numbers.
zip_cols = [
    f"Zip_{school_year}{suffix}"
    for school_year in ("1718", "1819", "1920", "2021", "2122", "2223", "2324")
    for suffix in ("", ".1")
]

dtype_map = dict.fromkeys(zip_cols, "string")

df = pd.read_csv("Test1.csv", dtype=dtype_map, low_memory=False)


In [None]:
# Debug snapshot: write the frame back to disk (without the index) so the
# parsed values can be inspected outside the notebook.
df.to_csv("debug_full_df.csv", index=False)


In [None]:
# Show the dtype of every column; to_string() renders the full listing rather
# than the shortened default repr.
print(df.dtypes.to_string())

# Build full address strings (wide format)

In [None]:
import re

def build_full_address(addr, city, zipc, state=None):
    """
    Build a full address string from address + city + cleaned ZIP (+ optional state).

    Parameters
    ----------
    addr, city :
        Free-text components. ``None``, NaN/NA, blank strings and the literal
        text "nan" (any case) are skipped.
    zipc :
        ZIP code as a string or number. It is treated as a code, not text:
        a float-style all-zero fraction (".0", ".00", ...) is dropped, then
        everything except digits and "-" is removed, then stray leading or
        trailing dashes are trimmed. A ZIP that is empty after cleaning is
        skipped.
    state :
        Optional state; when truthy it is appended after the other parts.

    Returns
    -------
    str or None
        The parts joined with ", ", or ``None`` when address, city and ZIP
        are all missing (in which case ``state`` alone is not returned).
    """
    parts = []

    # Address and city: treat as free text
    for p in (addr, city):
        if p is None or pd.isna(p):
            continue
        s = str(p).strip()
        if s and s.lower() != "nan":
            parts.append(s)

    # ZIP: treat as a code, not free text
    if zipc is not None and not pd.isna(zipc):
        z = str(zipc).strip()
        if z and z.lower() != "nan":
            # Drop a fractional part made only of zeros ("94609.0", "94609.00").
            # The lookahead keeps real decimals such as ".05" untouched and
            # still handles a ZIP+4 suffix ("94609.0-1234").
            z = re.sub(r"\.0+(?![0-9])", "", z)
            # Keep only digits and dash (ZIP or ZIP+4)
            z = re.sub(r"[^0-9\-]", "", z)
            # A dash left over from junk such as "N-A" is not a ZIP.
            z = z.strip("-")
            if z:
                parts.append(z)

    if not parts:
        return None

    full = ", ".join(parts)
    if state:
        full = f"{full}, {state}"

    return full


In [None]:
# Smoke test for build_full_address: each ZIP spelling below must normalise to
# the expected address string. The results are still printed for eyeballing,
# but they are also asserted so a regression fails the cell instead of
# passing silently.

tests = [
    # normal cases
    ("4521 Webster St", "Oakland", "94609"),
    ("4521 Webster St", "Oakland", 94609),
    ("4521 Webster St", "Oakland", 94609.0),
    ("4521 Webster St", "Oakland", "94609.0"),

    # missing zip
    ("4521 Webster St", "Oakland", None),
    ("4521 Webster St", "Oakland", float("nan")),

    # zip+4
    ("4521 Webster St", "Oakland", "94609-1234"),

    # messy zip
    ("4521 Webster St", "Oakland", " 94609.0 "),
]

# Expected output for each entry of `tests`, in the same order.
expected = [
    # normal cases
    "4521 Webster St, Oakland, 94609",
    "4521 Webster St, Oakland, 94609",
    "4521 Webster St, Oakland, 94609",
    "4521 Webster St, Oakland, 94609",

    # missing zip
    "4521 Webster St, Oakland",
    "4521 Webster St, Oakland",

    # zip+4
    "4521 Webster St, Oakland, 94609-1234",

    # messy zip
    "4521 Webster St, Oakland, 94609",
]

assert len(tests) == len(expected), "tests and expected must stay aligned"

for t, want in zip(tests, expected):
    got = build_full_address(*t)
    print(t, "->", got)
    assert got == want, f"{t!r}: expected {want!r}, got {got!r}"


In [None]:
def add_full_address_columns_wide(
    df: pd.DataFrame,
    years: list[str],
    school_cols: dict,  # {"addr":"School Address_{y}", "city":"City_{y}", "zip":"Zip_{y}"}
    home_cols: dict,    # {"addr":"Address_{y}", "city":"City_{y}.1", "zip":"Zip_{y}.1"}
    state: str | None = None,
    school_prefix: str = "school_full_",
    home_prefix: str = "home_full_",
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with two extra columns per year:
    ``{school_prefix}{year}`` and ``{home_prefix}{year}``.

    ``school_cols`` and ``home_cols`` map the keys "addr", "city" and "zip"
    to column-name templates containing ``{y}``, which is filled with each
    entry of ``years``. Each new cell is the result of ``build_full_address``
    on that row's address, city and ZIP. A column that is absent from the
    frame is read as ``None`` (via ``Series.get``) rather than raising.
    The input frame is not modified.
    """
    result = df.copy()

    def _compose(row, templates, year):
        # Resolve the three source columns for this year, in the order
        # build_full_address expects, and look each one up on the row.
        addr_value, city_value, zip_value = (
            row.get(templates[key].format(y=year))
            for key in ("addr", "city", "zip")
        )
        return build_full_address(addr_value, city_value, zip_value, state=state)

    for year in years:
        # School column first, then home, so column order matches per year.
        for prefix, templates in ((school_prefix, school_cols), (home_prefix, home_cols)):
            result[f"{prefix}{year}"] = result.apply(
                lambda row: _compose(row, templates, year),
                axis=1,
            )

    return result

# Geocode unique addresses with caching

In [None]:
def load_cache(cache_path: str) -> pd.DataFrame:
    """
    Load geocoding cache from disk.
    Expected columns: address, lat, lon

    Returns the cached rows with duplicate addresses removed (first
    occurrence kept). When the file does not exist, or exists but is
    completely empty (for example after an interrupted write), an empty
    frame with the expected columns is returned instead of raising.
    """
    try:
        cache = pd.read_csv(cache_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No usable cache yet: start from scratch.
        return pd.DataFrame(columns=["address", "lat", "lon"])

    return cache.drop_duplicates(subset=["address"])


def save_cache(cache: pd.DataFrame, cache_path: str) -> None:
    """
    Write the geocoding cache to ``cache_path`` as CSV (no index column),
    keeping only the first row for each address.
    """
    one_row_per_address = cache.drop_duplicates(subset=["address"])
    one_row_per_address.to_csv(cache_path, index=False)


def geocode_unique_addresses(
    df: pd.DataFrame,
    address_cols: list[str],
    cache_path: str = "geocode_cache.csv",
    min_delay_seconds: float = 1.0,
) -> pd.DataFrame:
    """
    Geocode only unique addresses across all specified columns.
    Results are cached to avoid repeated API calls.

    Parameters
    ----------
    df : frame holding the address strings.
    address_cols : columns of ``df`` whose values should be geocoded.
    cache_path : CSV file used as the persistent cache (address, lat, lon).
    min_delay_seconds : minimum pause between geocoder requests.

    Returns
    -------
    pd.DataFrame
        The cache (previous entries plus new lookups), one row per address.

    Notes
    -----
    * An address the geocoder answers with "no result" is stored with NaN
      coordinates, so it is not requested again on later runs.
    * A lookup that fails with an error (after the rate limiter's own
      retries) is NOT stored, so it is retried on the next run instead of
      being recorded permanently as "not found".
    * Everything gathered so far is written to ``cache_path`` even when the
      loop is interrupted or raises, so a long run does not lose its progress.
    """
    cache = load_cache(cache_path)
    known = set(cache["address"])

    # Collect unique non-missing addresses that are not cached yet
    candidates = pd.Series(pd.unique(df[address_cols].values.ravel("K"))).dropna()
    pending = [
        a for a in candidates
        if str(a).strip() and str(a).lower() != "nan" and a not in known
    ]

    geolocator = Nominatim(user_agent="wide_table_distance")
    # Sentinel returned by the rate limiter when a request errors out; it lets
    # us tell "request failed" apart from "geocoder found nothing" (None).
    lookup_failed = object()
    geocode = RateLimiter(
        geolocator.geocode,
        min_delay_seconds=min_delay_seconds,
        return_value_on_exception=lookup_failed,
    )

    new_rows = []
    try:
        for addr in pending:
            loc = geocode(addr)
            if loc is lookup_failed:
                # Transient failure: leave uncached so a later run retries it.
                continue
            if loc:
                new_rows.append((addr, loc.latitude, loc.longitude))
            else:
                new_rows.append((addr, np.nan, np.nan))
    finally:
        # Persist partial progress even on KeyboardInterrupt / unexpected errors.
        if new_rows:
            fresh = pd.DataFrame(new_rows, columns=["address", "lat", "lon"])
            if cache.empty:
                cache = fresh
            else:
                cache = pd.concat([cache, fresh], ignore_index=True)
            save_cache(cache, cache_path)

    return cache.drop_duplicates(subset=["address"])

In [None]:
def attach_latlon_wide(
    df: pd.DataFrame,
    years: list[str],
    cache: pd.DataFrame,
    school_full_prefix: str = "school_full_",
    home_full_prefix: str = "home_full_",
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with coordinates looked up from ``cache``.

    For every year, the full-address columns ``{school_full_prefix}{year}``
    and ``{home_full_prefix}{year}`` are mapped through the cache's
    address -> lat and address -> lon tables, producing
    ``school_lat_{year}``, ``school_lon_{year}``, ``home_lat_{year}`` and
    ``home_lon_{year}``. Addresses missing from the cache map to NaN.
    """
    result = df.copy()

    lat_by_address = dict(zip(cache["address"], cache["lat"]))
    lon_by_address = dict(zip(cache["address"], cache["lon"]))

    sources = (("school", school_full_prefix), ("home", home_full_prefix))

    for year in years:
        for kind, prefix in sources:
            source_col = f"{prefix}{year}"
            result[f"{kind}_lat_{year}"] = result[source_col].map(lat_by_address)
            result[f"{kind}_lon_{year}"] = result[source_col].map(lon_by_address)

    return result

In [None]:
def haversine_km(lat1, lon1, lat2, lon2) -> float:
    """
    Compute Haversine (great-circle) distance in kilometers.

    Coordinates are given in decimal degrees and may be anything ``float()``
    accepts. If any of the four values is missing (None / NaN / NA) the
    result is NaN. The Earth is modelled as a sphere of radius 6371 km.
    """
    if any(pd.isna(v) for v in [lat1, lon1, lat2, lon2]):
        return np.nan

    R = 6371.0  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(
        radians, [float(lat1), float(lon1), float(lat2), float(lon2)]
    )

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal points,
    # which would make sqrt(1 - a) raise a math domain error. Clamp it.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c


def compute_distances_wide(
    df: pd.DataFrame,
    years: list[str],
    out_prefix: str = "dist_km_",
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with one ``{out_prefix}{year}`` column per year
    holding the home-to-school Haversine distance in kilometers.

    Each row's distance is computed from ``home_lat_{year}``,
    ``home_lon_{year}``, ``school_lat_{year}`` and ``school_lon_{year}``.
    A coordinate column that is absent is read as ``None`` (via
    ``Series.get``) and handed to ``haversine_km`` unchanged.
    """
    result = df.copy()

    def _home_to_school(row, year):
        # Home coordinates first, school second: the argument order used
        # for the haversine_km call.
        home_lat = row.get(f"home_lat_{year}")
        home_lon = row.get(f"home_lon_{year}")
        school_lat = row.get(f"school_lat_{year}")
        school_lon = row.get(f"school_lon_{year}")
        return haversine_km(home_lat, home_lon, school_lat, school_lon)

    for year in years:
        result[f"{out_prefix}{year}"] = result.apply(
            lambda row: _home_to_school(row, year),
            axis=1,
        )

    return result


In [None]:
def distance_qc(
    df: pd.DataFrame,
    years: list[str],
    dist_prefix: str = "dist_km_",
) -> pd.DataFrame:
    """
    Summarise the ``{dist_prefix}{year}`` columns, one output row per year.

    Columns of the result: ``year``, ``missing_rate`` (share of NaN values),
    the 50th/90th/99th percentiles (``p50_km``, ``p90_km``, ``p99_km``) and
    ``max_km``. For a year with no non-missing distance, every statistic
    except ``missing_rate`` is NaN.
    """
    percentile_columns = (("p50_km", 0.50), ("p90_km", 0.90), ("p99_km", 0.99))

    def _summarise(year):
        distances = df[f"{dist_prefix}{year}"]
        has_values = distances.notna().any()

        record = {
            "year": year,
            "missing_rate": float(distances.isna().mean()),
        }
        for label, q in percentile_columns:
            record[label] = float(distances.quantile(q)) if has_values else np.nan
        record["max_km"] = float(distances.max()) if has_values else np.nan
        return record

    return pd.DataFrame([_summarise(year) for year in years])


In [None]:
# School years as two-digit pairs: "1718", "1819", ..., "2324".
years = [f"{start}{start + 1}" for start in range(17, 24)]

# Column-name templates; "{y}" is replaced by each entry of `years`.
school_cols = dict(
    addr="School Address_{y}",
    city="City_{y}",
    zip="Zip_{y}",
)

home_cols = dict(
    addr="Address_{y}",
    city="City_{y}.1",
    zip="Zip_{y}.1",
)

# Adds school_full_<year> / home_full_<year> columns (no state appended).
df = add_full_address_columns_wide(
    df,
    years=years,
    school_cols=school_cols,
    home_cols=home_cols,
    state=None,
)



In [None]:
# Eyeball the generated school address strings for 2017-18.
# NOTE(review): the rendered output contains address data - consider clearing
# it before sharing the notebook.
df.school_full_1718

In [None]:
# Export the identifier plus every full-address column (all school years
# first, then all home years) for inspection outside the notebook.
id_col = "ANON_ID"

full_cols = [
    id_col,
    *(f"{kind}_full_{y}" for kind in ("school", "home") for y in years),
]
addr_df = df[full_cols].copy()

addr_df.to_csv("addresses_full_wide.csv", index=False)


In [None]:
# Geocode every distinct school/home address (school columns first, then home),
# reusing the on-disk cache for addresses that were already looked up.
address_cols = [f"{kind}_full_{y}" for kind in ("school", "home") for y in years]
cache = geocode_unique_addresses(
    df,
    address_cols=address_cols,
    cache_path="geocode_cache.csv",
)

# Attach coordinates, then derive the per-year home-to-school distance.
df = attach_latlon_wide(df, years=years, cache=cache)
df = compute_distances_wide(df, years=years)

# Per-year missing rate and distance quantiles; displayed as the cell output.
qc = distance_qc(df, years=years)
qc