In [None]:
import re
from math import radians, sin, cos, sqrt, atan2

import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# -- 1. LOAD ------------------------------------------------------------------
# Each school year has two ZIP columns: the bare name and a ".1" twin.
zip_cols = [
    f"Zip_{year}{suffix}"
    for year in ("1718", "1819", "1920", "2021", "2122", "2223", "2324")
    for suffix in ("", ".1")
]
# Read every ZIP column as text rather than letting pandas infer a numeric type.
dtype_map = dict.fromkeys(zip_cols, "string")
df = pd.read_csv("ONGB_EvalData_CLEANED.csv", low_memory=False, dtype=dtype_map)
print(f"Loaded: {df.shape}")

# -- 2. BUILD ADDRESS STRINGS -------------------------------------------------
def build_full_address(addr, city, zipc, state=None):
    """Join street, city and ZIP into one comma-separated address string.

    Missing pieces (None, NaN, empty, or the literal text "nan") are left out.
    The ZIP has any ".0" float suffix removed and is reduced to digits and
    hyphens before being appended.

    Returns None when no piece survives; otherwise the joined string, with
    ``state`` appended when it is truthy.
    """
    def _text(value):
        # Normalise one raw cell to stripped text; "" means "nothing usable".
        if value is None or pd.isna(value):
            return ""
        cleaned = str(value).strip()
        return "" if cleaned.lower() == "nan" else cleaned

    pieces = [text for text in (_text(addr), _text(city)) if text]

    zip_text = _text(zipc)
    if zip_text:
        zip_text = re.sub(r"[^0-9\-]", "", re.sub(r"\.0\b", "", zip_text))
        if zip_text:
            pieces.append(zip_text)

    if not pieces:
        return None
    joined = ", ".join(pieces)
    return f"{joined}, {state}" if state else joined

years = ["1718", "1819", "1920", "2021", "2122", "2223", "2324"]

# Column-name templates; "{y}" is filled in with each school-year code.
school_cols = {"addr": "School Address_{y}", "city": "City_{y}", "zip": "Zip_{y}"}
home_cols   = {"addr": "Address_{y}", "city": "City_{y}.1", "zip": "Zip_{y}.1"}

for y in years:
    # School first, then home, so the new columns are added in that order.
    for prefix, templates in (("school", school_cols), ("home", home_cols)):
        addr_col, city_col, zip_col = (
            templates[part].format(y=y) for part in ("addr", "city", "zip")
        )
        # row.get() yields None for a column that is absent from the frame.
        df[f"{prefix}_full_{y}"] = df.apply(
            lambda row, a=addr_col, c=city_col, z=zip_col: build_full_address(
                row.get(a), row.get(c), row.get(z), state="CA"
            ),
            axis=1,
        )

print("Address columns built.")

# -- 3. GEOCODE (with caching) ------------------------------------------------
def load_cache(cache_path):
    """Read the geocode cache CSV, keeping the first row for each address.

    Returns an empty frame with address/lat/lon columns when the file does
    not exist.
    """
    try:
        cached = pd.read_csv(cache_path)
    except FileNotFoundError:
        return pd.DataFrame(columns=["address", "lat", "lon"])
    return cached.drop_duplicates(subset=["address"])

def save_cache(cache, cache_path):
    """Write the cache to CSV without the index, one row per address."""
    deduped = cache.drop_duplicates(subset=["address"])
    deduped.to_csv(cache_path, index=False)

def geocode_unique_addresses(df, address_cols, cache_path="geocode_cache.csv", min_delay_seconds=1.0, save_every=100):
    """Geocode every distinct address in ``address_cols`` and return the lookup table.

    Addresses already present in the CSV cache at ``cache_path`` are not looked
    up again. New results are written back to the cache every ``save_every``
    lookups and once more when the loop ends -- including when it ends because
    of an exception or ``KeyboardInterrupt`` -- so an interrupted run resumes
    from the cache instead of starting over.

    Parameters
    ----------
    df : DataFrame holding the address columns.
    address_cols : list of column names whose values are address strings.
    cache_path : CSV file with ``address``, ``lat`` and ``lon`` columns.
    min_delay_seconds : minimum pause between geocoder calls (passed to RateLimiter).
    save_every : checkpoint interval in lookups; ``0`` or ``None`` disables the
        periodic saves (the final save still happens).

    Returns
    -------
    DataFrame with one row per address (columns address/lat/lon). Addresses
    for which the geocoder returned nothing carry NaN coordinates.
    """
    columns = ["address", "lat", "lon"]
    cache = load_cache(cache_path)
    cache_map = dict(zip(cache["address"], zip(cache["lat"], cache["lon"])))
    unique_addrs = pd.Series(pd.unique(df[address_cols].values.ravel("K"))).dropna()

    # Only addresses that are non-blank and not yet cached need a lookup.
    pending = [
        a for a in unique_addrs
        if str(a).strip() and str(a).lower() != "nan" and a not in cache_map
    ]
    total = len(pending)
    print(f"Geocoding {total} new addresses (cached: {len(cache_map)})...")

    geolocator = Nominatim(user_agent="ongb_distance")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=min_delay_seconds)

    new_rows = []

    def _merged_with_new_rows():
        # Previously cached rows plus everything looked up so far in this run.
        return pd.concat([cache, pd.DataFrame(new_rows, columns=columns)], ignore_index=True)

    merged = cache
    try:
        # Count only the addresses actually looked up, so progress matches `total`.
        for done, addr in enumerate(pending, start=1):
            loc = geocode(addr)
            if loc:
                new_rows.append((addr, loc.latitude, loc.longitude))
            else:
                new_rows.append((addr, np.nan, np.nan))
            if save_every and done % save_every == 0:
                save_cache(_merged_with_new_rows(), cache_path)
            if done % 100 == 0:
                print(f"  {done}/{total} done...")
    finally:
        # Runs on normal completion and on interruption alike, so finished
        # lookups are never thrown away.
        if new_rows:
            merged = _merged_with_new_rows()
            save_cache(merged, cache_path)

    return merged.drop_duplicates(subset=["address"])

# Every composed-address column: all school years first, then all home years.
address_cols = [f"{kind}_full_{y}" for kind in ("school", "home") for y in years]
cache = geocode_unique_addresses(df, address_cols)

# -- 4. ATTACH LAT/LON --------------------------------------------------------
# Address -> coordinate lookups built from the geocode table.
addr2lat = {address: lat for address, lat in zip(cache["address"], cache["lat"])}
addr2lon = {address: lon for address, lon in zip(cache["address"], cache["lon"])}

for y in years:
    for kind in ("school", "home"):
        full_address = df[f"{kind}_full_{y}"]
        df[f"{kind}_lat_{y}"] = full_address.map(addr2lat)
        df[f"{kind}_lon_{y}"] = full_address.map(addr2lon)

# -- 5. COMPUTE DISTANCES -----------------------------------------------------
def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in kilometres between two points.

    Coordinates are given in degrees. Returns NaN when any of the four
    coordinates is missing.
    """
    if any(pd.isna(v) for v in [lat1, lon1, lat2, lon2]):
        return np.nan
    R = 6371.0  # sphere radius in km used for the distance
    lat1, lon1, lat2, lon2 = map(radians, [float(lat1), float(lon1), float(lat2), float(lon2)])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError; clamp it.
    a = min(1.0, max(0.0, a))
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))

for y in years:
    # Argument order expected by haversine_km: home point, then school point.
    coord_cols = [f"home_lat_{y}", f"home_lon_{y}", f"school_lat_{y}", f"school_lon_{y}"]
    df[f"dist_km_{y}"] = df.apply(
        lambda row, cols=coord_cols: haversine_km(*(row.get(c) for c in cols)),
        axis=1,
    )

# -- 6. QC + SAVE -------------------------------------------------------------
print("\n=== Distance QC by Year ===")
for y in years:
    distances = df[f"dist_km_{y}"]
    missing_share = distances.isna().mean()
    print(f"{y}  missing: {missing_share:.1%}  median: {distances.median():.2f}km  max: {distances.max():.1f}km")

df.to_csv("ONGB_with_distances.csv", index=False)
print("\nSaved to ONGB_with_distances.csv")

Loaded: (79460, 143)
Address columns built.
Geocoding 67990 new addresses (cached: 0)...
  0/67990 done...
  100/67990 done...
  200/67990 done...
  300/67990 done...


KeyboardInterrupt: 

In [7]:
import pandas as pd
import numpy as np
import requests
import re
from math import radians, sin, cos, sqrt, atan2

def clean_zip(zipc):
    """Return the ZIP as text containing only digits and hyphens.

    None and NaN give "". A ".0" float suffix is removed before every other
    character that is not a digit or hyphen is dropped.
    """
    if zipc is None or pd.isna(zipc):
        return ""
    without_float_suffix = re.sub(r"\.0\b", "", str(zipc).strip())
    return re.sub(r"[^0-9\-]", "", without_float_suffix)

# -- LOAD DATA ----------------------------------------------------------------
# Two ZIP columns per school year: the bare name and its ".1" twin.
zip_cols = []
for year_code in ("1718", "1819", "1920", "2021", "2122", "2223", "2324"):
    zip_cols.extend([f"Zip_{year_code}", f"Zip_{year_code}.1"])

# Read every ZIP column as text rather than letting pandas infer a numeric type.
dtype_map = {column: "string" for column in zip_cols}
df = pd.read_csv("ONGB_EvalData_CLEANED.csv", low_memory=False, dtype=dtype_map)
print(f"Loaded: {df.shape}")

# -- COLLECT UNIQUE ADDRESSES -------------------------------------------------
years = ["1718", "1819", "1920", "2021", "2122", "2223", "2324"]
all_addresses = {}  # "addr|city|zip" key -> (addr, city, zip) tuple

for y in years:
    column_triples = (
        (f"School Address_{y}", f"City_{y}", f"Zip_{y}"),
        (f"Address_{y}", f"City_{y}.1", f"Zip_{y}.1"),
    )
    for addr_col, city_col, zip_col in column_triples:
        if addr_col not in df.columns:
            continue
        # De-duplicate first so each distinct triple is normalised only once.
        distinct_rows = df[[addr_col, city_col, zip_col]].drop_duplicates()
        for _, row in distinct_rows.iterrows():
            raw_addr, raw_city = row[addr_col], row[city_col]
            addr = "" if pd.isna(raw_addr) else str(raw_addr).strip()
            city = "" if pd.isna(raw_city) else str(raw_city).strip()
            zipc = clean_zip(row[zip_col])
            # Rows without a usable street address are not geocoded.
            if not addr or addr.lower() == "nan":
                continue
            all_addresses[f"{addr}|{city}|{zipc}"] = (addr, city, zipc)

print(f"Unique addresses: {len(all_addresses)}")

# -- GEOCODE WITH FIXED PARSER ------------------------------------------------
def parse_census_response(response_text, batch_keys):
    """Parse a batch-geocoder CSV response into ``{key: (lat, lon)}``.

    Each response row is read as: field 0 = the integer row id that was
    submitted, field 2 = match status, field 5 = coordinates as "lon,lat".
    The row id indexes into ``batch_keys`` to recover the caller's key.

    Rows whose status is "Match" or "Tie" and that carry a coordinates field
    map to ``(lat, lon)`` floats; every other recognisable row maps to
    ``(nan, nan)``. Rows with a non-integer or out-of-range id, or with
    malformed coordinates, are skipped.

    Parameters
    ----------
    response_text : raw text body of the geocoder response.
    batch_keys : list of keys, in the order the rows were submitted.

    Returns
    -------
    dict mapping key -> (lat, lon).
    """
    # Local import keeps this function self-contained; the csv module handles
    # quoted fields that themselves contain commas (addresses, "lon,lat").
    import csv

    results = {}
    for fields in csv.reader(response_text.strip().splitlines()):
        fields = [field.strip() for field in fields]
        try:
            idx = int(fields[0])
            # Reject ids outside the batch; a negative id would otherwise
            # silently index from the end of batch_keys.
            if not 0 <= idx < len(batch_keys):
                continue
            key = batch_keys[idx]
            match_status = fields[2] if len(fields) > 2 else ""
            if match_status in ("Match", "Tie") and len(fields) >= 6:
                # coordinates field is "lon,lat"
                lon_str, lat_str = fields[5].split(",")
                results[key] = (float(lat_str), float(lon_str))
            else:
                results[key] = (np.nan, np.nan)
        except (ValueError, IndexError):
            continue
    return results

def geocode_all(addresses, cache_path="geocode_cache2.csv"):
    """Geocode all addresses through the Census batch endpoint.

    Addresses are submitted in batches of ``BATCH_SIZE`` rows, each row being
    ``id, street, city, "CA", zip``. The response is read as CSV: field 0 is
    the submitted row id, field 2 the match status, field 5 the coordinates
    as "lon,lat".

    The result dict always starts empty; ``cache_path`` is only written (after
    every batch), never read. A batch that fails -- non-200 status or a
    ``requests`` exception such as a timeout -- is reported and skipped, so
    the remaining batches still run.

    Parameters
    ----------
    addresses : dict mapping key -> (street, city, zip).
    cache_path : CSV file receiving the accumulated key/lat/lon rows.

    Returns
    -------
    dict mapping key -> (lat, lon); ``(nan, nan)`` for rows returned without
    usable coordinates. Keys from failed batches are absent.
    """
    # Local imports keep the function self-contained.
    import csv
    import io

    # Fresh cache with correct parsing
    cache = {}
    keys = list(addresses.keys())
    BATCH_SIZE = 9999

    for batch_start in range(0, len(keys), BATCH_SIZE):
        batch_keys = keys[batch_start:batch_start + BATCH_SIZE]
        batch_num = batch_start // BATCH_SIZE + 1
        print(f"Submitting batch {batch_num} ({len(batch_keys)} addresses)...")

        # Build the upload with csv.writer so quotes, commas or newlines inside
        # an address are escaped instead of corrupting the row. QUOTE_NONNUMERIC
        # leaves the integer id bare and quotes every text field.
        buffer = io.StringIO()
        writer = csv.writer(buffer, quoting=csv.QUOTE_NONNUMERIC, lineterminator="\n")
        for i, k in enumerate(batch_keys):
            street, city, zipc = addresses[k]
            writer.writerow([i, str(street), str(city), "CA", str(zipc)])
        csv_content = buffer.getvalue().rstrip("\n")

        try:
            response = requests.post(
                "https://geocoding.geo.census.gov/geocoder/locations/addressbatch",
                files={"addressFile": ("addresses.csv", csv_content, "text/csv")},
                data={"benchmark": "Public_AR_Current"},
                timeout=300
            )
        except requests.RequestException as exc:
            # Same best-effort policy as a non-200 response: report and move on.
            print(f"  Batch {batch_num} failed: {exc}")
            continue

        if response.status_code != 200:
            print(f"  Batch {batch_num} failed: {response.status_code}")
            continue

        # Parse each row as CSV so the quoted "lon,lat" field stays intact.
        for fields in csv.reader(response.text.strip().splitlines()):
            try:
                idx = int(fields[0])
                if not 0 <= idx < len(batch_keys):
                    continue
                match_status = fields[2].strip() if len(fields) > 2 else ""
                # A matched status without a coordinates field is recorded as
                # unresolved rather than dropped.
                if match_status in ("Match", "Tie") and len(fields) >= 6:
                    lon_str, lat_str = fields[5].split(",")
                    cache[batch_keys[idx]] = (float(lat_str.strip()), float(lon_str.strip()))
                else:
                    cache[batch_keys[idx]] = (np.nan, np.nan)
            except (ValueError, IndexError):
                continue

        matched = sum(1 for v in cache.values() if not np.isnan(v[0]))
        print(f"  Batch {batch_num} done. Matched so far: {matched}")

        # Save after each batch
        pd.DataFrame(
            [{"key": k, "lat": v[0], "lon": v[1]} for k, v in cache.items()]
        ).to_csv(cache_path, index=False)

    return cache

cache = geocode_all(all_addresses)
matched = sum(1 for v in cache.values() if not np.isnan(v[0]))
# `cache` is empty when every batch failed; avoid dividing by zero.
match_rate = matched / len(cache) if cache else 0.0
print(f"\nTotal geocoded: {len(cache)} | Matched: {matched} | Match rate: {match_rate:.1%}")

# -- ATTACH LAT/LON -----------------------------------------------------------
def _cache_key(addr, city, zipc):
    """Return the "addr|city|zip" lookup key, or None without a usable street.

    A missing city becomes "" and the ZIP goes through clean_zip. This must
    match how the keys of ``all_addresses`` were built, otherwise a row with
    an address but no city can never be found in ``cache``.
    """
    street = str(addr).strip() if pd.notna(addr) else ""
    if not street or street.lower() == "nan":
        return None
    town = str(city).strip() if pd.notna(city) else ""
    return f"{street}|{town}|{clean_zip(zipc)}"


def _column_values(frame, name):
    """Values of column ``name``, or all-None when the column is absent."""
    return frame[name] if name in frame.columns else [None] * len(frame)


for y in years:
    print(f"Attaching {y}...")
    for prefix, addr_col, city_col, zip_col in [
        ("school", f"School Address_{y}", f"City_{y}", f"Zip_{y}"),
        ("home",   f"Address_{y}",        f"City_{y}.1", f"Zip_{y}.1"),
    ]:
        row_keys = [
            _cache_key(a, c, z)
            for a, c, z in zip(
                _column_values(df, addr_col),
                _column_values(df, city_col),
                _column_values(df, zip_col),
            )
        ]
        # A None key or a key absent from the cache yields NaN coordinates.
        coords = [cache.get(k, (np.nan, np.nan)) for k in row_keys]
        df[f"{prefix}_lat_{y}"] = [lat for lat, _ in coords]
        df[f"{prefix}_lon_{y}"] = [lon for _, lon in coords]

# -- COMPUTE DISTANCES --------------------------------------------------------
def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in kilometres between two points.

    Coordinates are given in degrees. Returns NaN when any of the four
    coordinates is missing.
    """
    if any(pd.isna(v) for v in [lat1, lon1, lat2, lon2]):
        return np.nan
    R = 6371.0  # sphere radius in km used for the distance
    lat1, lon1, lat2, lon2 = map(radians, [float(lat1), float(lon1), float(lat2), float(lon2)])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1)*cos(lat2)*sin(dlon/2)**2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError; clamp it.
    a = min(1.0, max(0.0, a))
    return R * 2 * atan2(sqrt(a), sqrt(1-a))

print("\nComputing distances...")
for y in years:
    # Argument order expected by haversine_km: home point, then school point.
    home_point = (f"home_lat_{y}", f"home_lon_{y}")
    school_point = (f"school_lat_{y}", f"school_lon_{y}")
    df[f"dist_km_{y}"] = df.apply(
        lambda row, cols=home_point + school_point: haversine_km(
            *[row.get(col) for col in cols]
        ),
        axis=1,
    )

# -- QC + SAVE ----------------------------------------------------------------
print("\n=== Distance QC by Year ===")
for y in years:
    year_distances = df[f"dist_km_{y}"]
    share_missing = year_distances.isna().mean()
    print(f"{y}  missing: {share_missing:.1%}  median: {year_distances.median():.2f}km  max: {year_distances.max():.1f}km")

df.to_csv("ONGB_with_distances.csv", index=False)
print("\nSaved to ONGB_with_distances.csv")

Loaded: (79460, 143)
Unique addresses: 67980
Submitting batch 1 (9999 addresses)...
  Batch 1 done. Matched so far: 9836
Submitting batch 2 (9999 addresses)...
  Batch 2 done. Matched so far: 19688
Submitting batch 3 (9999 addresses)...
  Batch 3 done. Matched so far: 29555
Submitting batch 4 (9999 addresses)...
  Batch 4 done. Matched so far: 39304
Submitting batch 5 (9999 addresses)...
  Batch 5 done. Matched so far: 49046
Submitting batch 6 (9999 addresses)...
  Batch 6 done. Matched so far: 58799
Submitting batch 7 (7986 addresses)...
  Batch 7 done. Matched so far: 66613

Total geocoded: 67943 | Matched: 66613 | Match rate: 98.0%
Attaching 1718...
Attaching 1819...
Attaching 1920...
Attaching 2021...
Attaching 2122...
Attaching 2223...
Attaching 2324...

Computing distances...

=== Distance QC by Year ===
1718  missing: 52.2%  median: 1.68km  max: 4312.1km
1819  missing: 52.6%  median: 1.68km  max: 1120.1km
1920  missing: 53.5%  median: 1.68km  max: 4023.7km
2021  missing: 55.2%  

In [8]:
import os
# Show the working directory, then whether the distance export exists
# (the relative path is resolved against that directory).
for report in (os.getcwd(), os.path.exists("ONGB_with_distances.csv")):
    print(report)

/Users/quianacan/Desktop/Anderson, Quiana ONGB|Capstone
True
