# Data Collection Notebook

# Imports

In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import pandas as pd
from IPython.display import display
import time
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm 
import usaddress  
from functools import partial
import tempfile
import shutil


# The notebook is expected to run from the notebooks folder; every data
# directory is resolved relative to it.
notebooks_folder = os.getcwd()

# Raw downloads, intermediate (cleaned) files and final outputs each live in
# their own "New York City Sales Data" folder under ../data/<stage>/.
raw_root_folder, interim_root_folder, processed_root_folder = (
    os.path.abspath(
        os.path.join(notebooks_folder, "..", "data", stage, "New York City Sales Data")
    )
    for stage in ("raw", "interim", "processed")
)

# Create the folders up front so later cells can write without checking.
for _data_dir in (raw_root_folder, interim_root_folder, processed_root_folder):
    os.makedirs(_data_dir, exist_ok=True)


# NYC Rolling Sales Data Collection

## Initial Parsing of the Webpage 

Looking at the HTML layout of the website, we can see that there are `<table>` elements. These contain the rolling sales data. We can parse all the table rows `<tr>` from the webpage, and begin filtering from there.  

In [None]:
# Landing page that lists the annualized sales workbooks for every year and borough.
url = "https://www.nyc.gov/site/finance/property/property-annualized-sales-update.page"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the real listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Year headings and borough download links all live inside table rows.
rows = soup.find_all("tr")

# Dump the raw rows so the table layout can be inspected by eye.
for r in rows:
    print(r)

## Web Scraping Script

From the html we parsed previously, there are a few things to note 

1) The heading text in the `<td>` changes for 2014–2016: these rows say `<yyyy> New York City` instead of `<yyyy> New York City Sales Data`.
2) From 2003–2017 the files use the legacy Excel extension `.xls`; from 2018–2024 they use `.xlsx`.

The Data Saving Structure is as follows: 
```text
New York City Sales Data/
├── 2003/
│   ├── Manhattan.xls
│   ├── Bronx.xls
│   ├── Brooklyn.xls
│   ├── Queens.xls
│   └── Staten Island.xls
├── 2004/
│   ├── Manhattan.xls
│   ├── Bronx.xls
│   ├── Brooklyn.xls
│   ├── Queens.xls
│   └── Staten Island.xls
...
└── 2018/
    ├── Manhattan.xlsx
    ├── Bronx.xlsx
    ├── Brooklyn.xlsx
    ├── Queens.xlsx
    └── Staten Island.xlsx
```


In [None]:
# Walk the table rows in document order. A row whose first cell looks like
# "<yyyy> New York City ..." starts a new year section; the rows that follow
# it (until the next heading) are the per-borough download links for that year.
rows = soup.find_all("tr")

current_year = None

for tr in rows:
    cells = tr.find_all("td")
    if not cells:
        continue

    # The first cell holds either a year heading or a borough name.
    text = cells[0].get_text(strip=True)

    # The heading wording changed over the years, so match only the stable
    # "<yyyy> New York City" prefix.
    match = re.search(r"(\d{4})\s+New\s+York\s+City", text)
    if match:
        current_year = match.group(1)
        year_folder = os.path.join(raw_root_folder, current_year)
        os.makedirs(year_folder, exist_ok=True)
        print(f"\nSaving files for {current_year}")
        continue

    # Rows that appear before the first year heading belong to no section.
    if current_year is None:
        continue

    borough = text
    # The Excel link sits in the third cell; skip rows that do not have one.
    link_tag = cells[2].find("a") if len(cells) > 2 else None
    if not link_tag or not link_tag.has_attr("href"):
        continue

    excel_href = link_tag["href"]
    excel_url = urljoin(url, excel_href)

    # Keep the source format's extension (case-insensitively) so the reader
    # in the filtering step picks a suitable engine.
    filename = f"{borough}.xlsx" if ".xlsx" in excel_url.lower() else f"{borough}.xls"
    filepath = os.path.join(year_folder, filename)

    print(f"Downloading {borough} {current_year} data")
    # Stream to disk in chunks; the timeout prevents one stalled download
    # from blocking the whole run indefinitely.
    with requests.get(excel_url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(filepath, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)

print("All files downloaded")

# NYC Rolling Sales Data Filtering & Unique Address Collection

## Filtering Pipeline Summary

This cell performs a full cleaning, filtering, and address-preparation pipeline for NYC property sales files (reads from `raw_root_folder`, writes cleaned `.xlsx` files into `interim_root_folder`).

### Data Filtering Main steps performed (in order)

1. **File discovery & read**
   - Iterates year subfolders in `raw_root_folder`.  
   - Skips temporary files and non-Excel files (`.xls` / `.xlsx`).
   - Reads the file once with `header=None` to detect the header row. The read uses `dtype=object` so columns are not auto-coerced into numeric types.

2. **Early illegal-character cleaning**
   - Applies `clean_column_name` / `clean_illegal_chars` to object/string cells to remove control characters that will break Excel (`ASCII 0-31`, etc.).
   - This prevents `IllegalCharacterError` when saving.

3. **Header detection and realignment**
   - Normalizes header candidates and matches them against `expected_columns`.
   - Re-reads the file once using the detected header row (`header=header_row_idx`) and `dtype=object`, then normalizes column names.

4. **Type conversions**
   - Converts integer columns (`BLOCK`, `LOT`, `RESIDENTIAL UNITS`, etc.) to pandas `Int64` (nullable integer).
   - Converts numeric columns (`LAND SQUARE FEET`, `GROSS SQUARE FEET`, `SALE PRICE`) to numeric (`float`).
   - Normalizes string columns (casts to `str`, replaces `"nan"` placeholders).
   - Parses `SALE DATE` with `pd.to_datetime(..., errors="coerce")`.

5. **Residential filtering**
   - Normalizes `BUILDING CLASS CATEGORY` strings.
   - Keeps only rows that match residential keywords (`FAMILY`, `RENTAL`, `COOP`, `CONDO`, `CONDOP`, `TAX CLASS 1`).

6. **Address parsing & cleaning**
   - Uses `usaddress` to parse street components and only retains addresses that contain both a house number and a street name.
   - Builds `ADDRESS_CLEAN` from parsed components (number + street types/names).

7. **Remove non-market transactions**
   - Drops rows where `SALE PRICE` ≤ 0 (these are typically non-arms-length or administrative transfers, not market sales).

8. **Build `FULL ADDRESS` for geocoding**
   - Maps the `BOROUGH` code (`1`–`5`) → appropriate city label for geocoders (1/Manhattan → `New York`, 2 → `Bronx`, 3 → `Brooklyn`, 4 → `Queens`, 5 → `Staten Island`).
   - Assembles `FULL ADDRESS` (title-cased, empty parts skipped) as:
     ```
     [ADDRESS_CLEAN], [NEIGHBORHOOD], [CITY], NY, [ZIP CODE]
     ```
     Example: `123 Main St, Park Slope, Brooklyn, NY, 11215`.

9. **Column selection**
   - Keeps only the minimal columns needed for mapping/visualization:
     - `FULL_ADDRESS`  
     - `SALE PRICE`  
     - `SALE DATE`

10. **Save & convert**
    - Ensures `interim_root_folder/<year>/` exists and saves cleaned data as `.xlsx` into that folder.
    - Files that were originally `.xls` are saved with an `.xlsx` extension in the interim folder; the raw originals in `raw_root_folder` are left untouched.

### Final output
- For each processed raw file, you get a cleaned `.xlsx` in `interim_root_folder/<year>/` containing only:
     - `FULL_ADDRESS`  
     - `SALE PRICE`  
     - `SALE DATE`

## Unique Address Collection 

After processing and cleaning all raw property sales files, we collect the **unique addresses** for geocoding and mapping purposes.

1. **Track unique addresses while processing**
   - Each cleaned `FULL_ADDRESS` from the current file is added to a Python `set()` to automatically ensure uniqueness:
     ```python
     unique_addresses.update(df["FULL ADDRESS"])
     ```

2. **Convert to DataFrame**
   - Convert the set of unique addresses into a sorted pandas DataFrame:
     ```python
     unique_df = pd.DataFrame(sorted(unique_addresses), columns=["FULL ADDRESS"])
     ```

3. **Assign round-robin groups**
   - Assign each address to a deterministic group (useful for batch processing or parallel geocoding):
     ```python
     n_groups = 6
     unique_df["GROUP"] = (unique_df.index % n_groups) + 1
     ```

4. **Add blank latitude/longitude columns for future geocoding**
   - Prepare columns to store geocoded coordinates later:
     ```python
     unique_df["LAT"] = None
     unique_df["LON"] = None
     ```

5. **Reorder columns**
   - Ensure the final column order is consistent:
     ```python
     unique_df = unique_df[["GROUP", "FULL ADDRESS", "LAT", "LON"]]
     ```

6. **Save to Excel**
   - Save the unique addresses and groups to an Excel file in the `interim_root_folder`:
     ```python
     geocode_cache_path = os.path.join(interim_root_folder, "unique_addresses.xlsx")
     unique_df.to_excel(geocode_cache_path, index=False, engine="openpyxl")
     ```

### Final Output
- `unique_addresses.xlsx` containing:
  - `GROUP` → deterministic assignment for batch processing
  - `FULL ADDRESS` → cleaned, geocodable addresses
  - `LAT` / `LON` → blank for now, to be filled later by geocoding
- Ensures **all addresses are unique** across years and files.


In [None]:
# ==============================
# config
# ==============================
# Column headers, in sheet order, that a rolling-sales workbook is expected to
# contain. Used to locate the header row inside each raw file.
expected_columns = [
    "BOROUGH",
    "NEIGHBORHOOD",
    "BUILDING CLASS CATEGORY",
    "TAX CLASS AT PRESENT",
    "BLOCK",
    "LOT",
    "EASE-MENT",
    "BUILDING CLASS AT PRESENT",
    "ADDRESS",
    "APARTMENT NUMBER",
    "ZIP CODE",
    "RESIDENTIAL UNITS",
    "COMMERCIAL UNITS",
    "TOTAL UNITS",
    "LAND SQUARE FEET",
    "GROSS SQUARE FEET",
    "YEAR BUILT",
    "TAX CLASS AT TIME OF SALE",
    "BUILDING CLASS AT TIME OF SALE",
    "SALE PRICE",
    "SALE DATE",
]

# Target type for each column when the sheet is loaded.
int_cols = [
    "BLOCK",
    "LOT",
    "RESIDENTIAL UNITS",
    "COMMERCIAL UNITS",
    "TOTAL UNITS",
    "YEAR BUILT",
]
float_cols = [
    "LAND SQUARE FEET",
    "GROSS SQUARE FEET",
    "SALE PRICE",
]
str_cols = [
    "BOROUGH",
    "NEIGHBORHOOD",
    "BUILDING CLASS CATEGORY",
    "TAX CLASS AT PRESENT",
    "EASE-MENT",
    "BUILDING CLASS AT PRESENT",
    "ADDRESS",
    "APARTMENT NUMBER",
    "ZIP CODE",
    "TAX CLASS AT TIME OF SALE",
    "BUILDING CLASS AT TIME OF SALE",
]
datetime_cols = ["SALE DATE"]

# A row is kept when its building class category contains any of these.
residential_keywords = [
    "FAMILY",
    "RENTAL",
    "COOP",
    "CONDO",
    "CONDOP",
    "TAX CLASS 1",
]

# Borough code (as a string) -> city name used when building the address.
borough_encoding__to_city_map = dict(
    zip(
        ("1", "2", "3", "4", "5"),
        ("New York", "Bronx", "Brooklyn", "Queens", "Staten Island"),
    )
)

# Every FULL ADDRESS seen across all processed files.
unique_addresses = set()

# ==============================
# REGEX CLEANING HELPER FUNCTIONS
# ==============================
def normalize(col):
    """Return a canonical upper-case form of a header cell for comparison.

    The value is converted to ``str``, double quotes are removed, every run
    of whitespace (spaces, tabs, line breaks) is collapsed to a single space,
    and the result is stripped and upper-cased. Collapsing *all* whitespace
    runs matters because header cells are often wrapped over several lines
    or padded with more than two spaces.
    """
    text = str(col).replace('"', '')
    # split()/join collapses any amount of whitespace and trims both ends.
    return " ".join(text.split()).upper()

def clean_column_name(s):
    """Strip control characters from a string, leaving tab, LF and CR alone.

    Non-string values are returned unchanged. Despite the name, the main
    loop applies this to every cell of the object columns, not just headers.
    """
    if not isinstance(s, str):
        return s
    # ASCII 0-31 minus \t (\x09), \n (\x0A) and \r (\x0D).
    return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', s)

def clean_illegal_chars(val, replace_with=""):
    """Scrub control characters from a string and trim the result.

    Each run of characters in the ranges 0-31 and 127-159 is replaced by
    ``replace_with`` ('' removes the run, ' ' leaves a single space), then
    leading/trailing whitespace is stripped. Non-string values pass through
    untouched.
    """
    if not isinstance(val, str):
        return val
    scrubbed = re.sub(r'[\x00-\x1F\x7F-\x9F]+', replace_with, val)
    return scrubbed.strip()

# ==============================
# PROPERTY TYPE CLEANING
# ==============================
def is_residential(category: str) -> bool:
    """Tell whether a building class category names a residential property.

    The check is a case-insensitive substring match against
    ``residential_keywords``; anything that is not a string is treated as
    non-residential.
    """
    if isinstance(category, str):
        upper_category = category.upper()
        for keyword in residential_keywords:
            if keyword in upper_category:
                return True
    return False


# ==============================
# ADDRESS CLEANING
# ==============================
def clean_address(addr: str) -> str | None:
    """Parse a raw street address and rebuild it in a normalized form.

    The address is trimmed, title-cased and whitespace-collapsed, then tagged
    with ``usaddress``. Only addresses that contain both a house number and a
    street name are accepted; apartment/unit details and anything else the
    parser finds are dropped.

    Directionals and modifiers ("West", "Old", ...) are kept because they are
    part of the street's identity: "123 West 45th Street" and
    "123 East 45th Street" are different places.

    Returns:
        The rebuilt "<number> <street ...>" string, or ``None`` when the input
        is not a non-empty string, cannot be tagged unambiguously, or lacks a
        house number or street name.
    """
    if not isinstance(addr, str) or not addr.strip():
        return None

    addr = re.sub(r"\s+", " ", addr.strip().title())

    try:
        parsed, _ = usaddress.tag(addr)
    except usaddress.RepeatedLabelError:
        # The parser saw the same component twice; treat it as unparseable.
        return None

    # Keep only addresses that have street + house number.
    if "AddressNumber" not in parsed or "StreetName" not in parsed:
        return None

    # Street-level components, in the order they appear in a written address.
    street_labels = (
        "AddressNumber",
        "StreetNamePreModifier",
        "StreetNamePreDirectional",
        "StreetNamePreType",
        "StreetName",
        "StreetNamePostType",
        "StreetNamePostDirectional",
    )
    return " ".join(parsed[label] for label in street_labels if parsed.get(label)).strip()
    
# ==============================
# FULL ADDRESS ASSEMBLY
# ==============================
def build_full_address(row):
    """Assemble the geocodable address string for one sales row.

    Reads ``ADDRESS_CLEAN``, ``BOROUGH``, ``NEIGHBORHOOD`` and ``ZIP CODE``
    from ``row`` and returns
    "<street>, <neighborhood>, <city>, NY, <zip>" with each piece trimmed and
    title-cased. The city comes from ``borough_encoding__to_city_map`` (an
    unknown borough code raises ``KeyError``). Empty pieces are left out, so
    the result can have fewer than five comma-separated parts.
    """
    street = row["ADDRESS_CLEAN"].strip().title()
    city = borough_encoding__to_city_map[row["BOROUGH"]].strip().title()
    neighborhood = row["NEIGHBORHOOD"].strip().title()
    zip_code = str(row["ZIP CODE"]).strip().title()

    ordered_pieces = (street, neighborhood, city, "NY", zip_code)
    return ", ".join(piece for piece in ordered_pieces if piece)

# ==============================
# MAIN PROCESSING LOOP
# ==============================
# The expected header names never change between files, so normalize them once.
normalized_expected = [normalize(c) for c in expected_columns]

# Columns the filtering and address-building steps below cannot work without.
required_columns = [
    "BOROUGH", "NEIGHBORHOOD", "BUILDING CLASS CATEGORY",
    "ADDRESS", "ZIP CODE", "SALE PRICE",
]

for year in os.listdir(raw_root_folder):
    year_folder = os.path.join(raw_root_folder, year)
    if not os.path.isdir(year_folder):
        continue
    print(f"\n=== Entering year: {year} ===")

    for file in os.listdir(year_folder):

        # skip non-Excel files or temp excel files
        if not file.lower().endswith((".xls", ".xlsx")) or file.startswith("~"):
            continue

        # Read in file (.xls or .xlsx). openpyxl only handles .xlsx, so any
        # failure falls back to xlrd for the legacy format.
        file_path = os.path.join(year_folder, file)
        try:
            df = pd.read_excel(file_path, header=None, engine="openpyxl", dtype=object)
        except Exception:
            df = pd.read_excel(file_path, header=None, engine="xlrd", dtype=object)

        # Strip control characters so they cannot hide a header match.
        # (This frame is only used for header detection; the data is re-read below.)
        for col in df.select_dtypes(include="object"):
            df[col] = df[col].apply(clean_column_name)

        # Detect header row: the first row in which at least 70% of the
        # expected column names appear.
        header_row_idx = None
        for i, row in df.iterrows():
            normalized_row = [normalize(c) for c in row.values]
            matches = sum(col in normalized_expected for col in normalized_row)
            if matches >= len(normalized_expected) * 0.7:
                header_row_idx = i
                break

        # Without a header row the columns below would not exist and the
        # whole run would die with a KeyError; skip just this file instead.
        if header_row_idx is None:
            print(f"{file}: skipped, could not locate the header row")
            continue

        # read in file, now with proper header alignment
        try:
            df = pd.read_excel(file_path, header=header_row_idx, engine="openpyxl", dtype=object)
        except Exception:
            df = pd.read_excel(file_path, header=header_row_idx, engine="xlrd", dtype=object)
        df.columns = [normalize(c) for c in df.columns]
        df.reset_index(drop=True, inplace=True)

        missing_columns = [c for c in required_columns if c not in df.columns]
        if missing_columns:
            print(f"{file}: skipped, missing expected columns {missing_columns}")
            continue

        # Convert cols to proper data types
        for col in int_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

        for col in float_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        for col in str_cols:
            if col in df.columns:
                df[col] = df[col].astype(str).replace("nan", "")

        for col in datetime_cols:
            if col in df.columns:
                df[col] = pd.to_datetime(df[col], errors="coerce")

        # tracking how much data we are removing
        before_count = len(df)

        # Keep residential rows only (category whitespace collapsed first).
        df["BUILDING CLASS CATEGORY"] = (
            df["BUILDING CLASS CATEGORY"]
            .astype(str)
            .apply(lambda x: re.sub(r"\s+", " ", x.strip()))
        )
        # .copy() so the column assignments below act on an independent
        # frame rather than on a slice of the previous one.
        df = df[df["BUILDING CLASS CATEGORY"].apply(is_residential)].copy()

        # Keep rows whose address parses into house number + street.
        df["ADDRESS_CLEAN"] = df["ADDRESS"].apply(clean_address)
        df = df[df["ADDRESS_CLEAN"].notna()]

        # Removing $0 sales
        df = df[df['SALE PRICE'] > 0]

        # Removing 0 ZIP CODE
        df = df[df['ZIP CODE'] != "0"].copy()

        # Borough → city conversion and full address building. Built row by
        # row into a list so a file with no surviving rows yields an empty
        # column instead of an assignment error.
        df["FULL ADDRESS"] = [build_full_address(r) for _, r in df.iterrows()]

        # Keep only relevant columns for mapping/analysis
        keep_cols = [
            "FULL ADDRESS",
            "SALE PRICE",
            "SALE DATE",
        ]
        df = df[[c for c in keep_cols if c in df.columns]]

        # Print out % of removed data (guarding against an empty sheet)
        after_count = len(df)
        filtered_out = before_count - after_count
        filtered_share = filtered_out / before_count if before_count else 0.0
        print(f"{file}: Kept {after_count:,} rows, filtered out {filtered_out:,} ({filtered_share:.1%})")

        # Create new file path with .xlsx extension for consistent file types
        interim_year_folder = os.path.join(interim_root_folder, year)
        os.makedirs(interim_year_folder, exist_ok=True)
        new_file_path = os.path.join(interim_year_folder, os.path.splitext(file)[0] + ".xlsx")

        # Clean all object/string columns before saving to Excel
        for col in df.select_dtypes(include="object"):
            df[col] = df[col].apply(clean_illegal_chars)

        # Tracks unique addresses seen
        # Will be used later to get lat and longs
        unique_addresses.update(df["FULL ADDRESS"])

        # Save the cleaned sheet into the interim folder (raw file is untouched)
        df.to_excel(new_file_path, index=False, engine="openpyxl")

print("Done filtering!")


print("Creating address geocache data frame")

# One row per distinct address, sorted so the output order is reproducible.
addresses_df = pd.DataFrame(sorted(unique_addresses), columns=["FULL ADDRESS"])

# Coordinates start out empty; the geocoding step fills them in later.
for coord_col in ("LAT", "LON"):
    addresses_df[coord_col] = None

# Fix the column order of the saved sheet.
addresses_df = addresses_df[["FULL ADDRESS", "LAT", "LON"]]

# Write the address list as an Excel workbook in the interim folder.
geocode_cache_path = os.path.join(interim_root_folder, "addresses.xlsx")
addresses_df.to_excel(geocode_cache_path, index=False, engine="openpyxl")
print(f"Saved {len(addresses_df)} unique addresses to {geocode_cache_path}")




## Advanced Address Cleaning & Normalization for Geocoding

This step performs a **comprehensive cleaning, normalization, and deduplication** of New York City property addresses. The primary purpose is to **reduce the number of geocoding lookups** by condensing multiple variations of the same address into a single normalized form. Once we have the normalized addresses, we can use them as a **key to append latitude and longitude** to our NYC sales data.

### Key Features of the Workflow

1. **Initial Cleaning**
   - Removes illegal/control characters that can break Excel (`ASCII 0-31`, `\x7F`).
   - Strips stray leading punctuation or brackets (e.g., `(56 Street, Bensonhurst, Brooklyn, NY, 11010` → `56 Street, Bensonhurst, Brooklyn, NY, 11010`).
   - Removes excessive leading zeros in street numbers (e.g., `0000 100th Street` → `100th Street`).
   - Normalizes spacing and capitalization.

2. **Parsing Components**
   - Splits `FULL ADDRESS` into:
     - `STREET`
     - `NEIGHBORHOOD`
     - `CITY`
     - `STATE`
     - `ZIP`
   - This allows fine-grained normalization of street names.

3. **Street Name Normalization**
   - Maps abbreviations to full street type names:
     - `AVE`, `AV`, `AVE.` → `Avenue`
     - `ST`, `ST.` → `Street`
     - `RD` → `Road`
     - `PL` → `Place`
     - etc.
   - Normalizes street ordinals:
     - Numeric-only street numbers like `107` → `107th`
     - Correctly handles `1 → 1st`, `2 → 2nd`, `3 → 3rd`, and special cases `11-13 → th`.

4. **Normalized Full Address**
   - Reconstructs `FULL ADDRESS NORM` as:
     ```
     [STREET_NORM], [CITY], [STATE], [ZIP]
     ```
   - Ensures all variations of the same address map to the same normalized form, e.g.:
     ```
     1 Ascan Ave, Forest Hills, Queens, NY, 11375
     1 Ascan Avenue, Forest Hills, Queens, NY, 11375
     → 1 Ascan Avenue, Forest Hills, Queens, NY, 11375
     ```

5. **Detecting Merged Addresses**
   - Groups addresses by `FULL ADDRESS NORM`.
   - Prints only those groups where **two or more original addresses were mapped to the same normalized address**, allowing us to verify the deduplication process.

6. **Deduplication**
   - Keeps only **unique normalized addresses**.
   - Adds a `GROUP` assignment for round-robin processing (useful for batch geocoding to respect API rate limits).
   - Final output contains only the necessary columns:
     - `GROUP`
     - `FULL ADDRESS` (original, unmodified)
     - `FULL ADDRESS NORM`
     - `LAT`
     - `LON`

### Workflow Benefits

- **Reduces Geocoding Lookups:** By condensing address variants, we minimize unnecessary queries to the geocoding service.
- **Non-Destructive:** The original `FULL ADDRESS` column is preserved, enabling a **reliable join** back to the NYC sales dataset.
- **Consistent Normalization:** Handles street type abbreviations, ordinals, leading zeros, stray punctuation, and capitalization variations.
- **Traceable Merges:** Any address variants combined into a single normalized form are logged for review.

### Next Steps

Once this script is run, we can use the normalized addresses with **geopy's Nominatim** (with `RateLimiter`) to obtain latitude and longitude for each unique normalized address. These coordinates can then be joined back to the full NYC sales data using the original `FULL ADDRESS` as the key.


In [None]:
# =====================================================
# CONFIG
# =====================================================
# Regex pattern -> full street-type name. Each pattern matches the
# abbreviation as a whole word with an optional trailing period; the order of
# entries is the order in which the substitutions are applied.
street_map = {
    rf"\b{abbreviation}\b\.?": full_name
    for abbreviation, full_name in (
        ("AVE", "Avenue"),
        ("AV", "Avenue"),
        ("ST", "Street"),
        ("RD", "Road"),
        ("PL", "Place"),
        ("TER", "Terrace"),
        ("BLVD", "Boulevard"),
        ("LN", "Lane"),
        ("DR", "Drive"),
        ("CT", "Court"),
        ("HWY", "Highway"),
        ("PKWY", "Parkway"),
        ("SQ", "Square"),
        ("CTR", "Center"),
    )
}

# Number of round-robin groups the unique addresses are split into.
n_groups = 6

# =====================================================
# HELPER FUNCTIONS
# =====================================================
def clean_illegal_chars(val):
    """Drop control characters (0-31 and DEL) from a string cell.

    Values that are not strings are handed back unchanged. Surrounding
    whitespace is not trimmed here.
    """
    if not isinstance(val, str):
        return val
    return re.sub(r'[\x00-\x1F\x7F]', '', val)

def clean_address_text(addr: str) -> str:
    """
    Tidy the formatting of a full address string.

    Steps, in order:
    - remove illegal/control characters (via ``clean_illegal_chars``);
    - strip any leading run of non-word characters (brackets, quotes, ...);
    - drop leading zeros that sit in front of the first digit
      (e.g., "0000 100TH STREET" -> "100TH STREET");
    - collapse runs of two or more whitespace characters to one space;
    - trim, then apply ``str.title()`` for capitalization. Note that
      ``str.title()`` also capitalizes a letter that follows a digit
      ("100th" -> "100Th") and lower-cases the rest of each word ("NY" -> "Ny").

    Non-string input is returned unchanged.
    """
    if not isinstance(addr, str):
        return addr

    cleaned = clean_illegal_chars(addr)

    # (pattern, replacement) pairs applied one after another.
    substitutions = (
        (r'^[^\w\d]+', ''),          # stray leading punctuation / symbols
        (r'^\s*0+\s*(?=\d)', ''),    # excessive leading zeros
        (r'\s{2,}', ' '),            # repeated whitespace
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip().title()

def normalize_street_ordinal(street_name):
    """Append an English ordinal suffix to every purely numeric token.

    Examples: 107 → 107th, 1 → 1st, 2 → 2nd, 3 → 3rd, 11/12/13 → 11th/12th/13th.
    Tokens are split on whitespace and re-joined with single spaces; tokens
    that contain anything besides digits are left as they are. Every numeric
    token is converted, wherever it sits in the string.
    """
    def _with_suffix(token):
        if not re.fullmatch(r"\d+", token):
            return token
        number = int(token)
        # 10-20 (and 110-120, ...) always take "th", covering the 11/12/13 exceptions.
        if 10 <= number % 100 <= 20:
            return f"{token}th"
        return token + {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")

    return " ".join(_with_suffix(token) for token in street_name.split())

def normalize_street_name(street):
    """Normalize capitalization, abbreviations, and ordinals in a street line.

    The input is the street part of an address, which normally starts with
    the house number (e.g. "1 Ascan Ave"). Street-type abbreviations are
    expanded using ``street_map`` and bare numeric street names get an
    ordinal suffix ("123 45 Street" -> "123 45th Street").

    A purely numeric first token that is followed by more text is treated as
    the house number and left untouched; giving it an ordinal suffix would
    turn "1 Ascan Avenue" into "1st Ascan Avenue" and break geocoding.
    """
    s = str(street).strip().title()
    for pattern, repl in street_map.items():
        s = re.sub(pattern, repl, s, flags=re.IGNORECASE)
    s = re.sub(r"\s+", " ", s).strip()

    house_number, _, remainder = s.partition(" ")
    if remainder and re.fullmatch(r"\d+", house_number):
        # Only the street name after the house number is ordinalized.
        return f"{house_number} {normalize_street_ordinal(remainder)}".strip()
    return normalize_street_ordinal(s).strip()

def parse_full_address(addr):
    """Parse 'FULL ADDRESS' into [street, neighborhood, city, state, zip].

    The address is split on commas and each piece is trimmed. Anything that
    does not split into exactly five pieces is considered unparseable and
    yields five ``None`` values, so the result always has the same length as
    the five target columns.
    """
    parts = [p.strip() for p in str(addr).split(",")]
    if len(parts) != 5:
        return [None] * 5
    return parts

# =====================================================
# MAIN PROCESSING
# =====================================================
address_cache_path = os.path.join(interim_root_folder, "addresses.xlsx")
address_cache_path_remapped = os.path.join(interim_root_folder, "addresses_condensed.xlsx")

address_df = pd.read_excel(address_cache_path, engine="openpyxl")

# Clean Address
address_df["CLEANED FULL ADDRESS"] = address_df["FULL ADDRESS"].apply(clean_address_text)

# Split into components. Each parsed row is padded/truncated to exactly five
# values so the assignment cannot fail on a length mismatch.
component_cols = ["STREET", "NEIGHBORHOOD", "CITY", "STATE", "ZIP"]
parsed_components = pd.DataFrame(
    [
        (list(parse_full_address(x)) + [None] * 5)[:5]
        for x in address_df["CLEANED FULL ADDRESS"]
    ],
    columns=component_cols,
    index=address_df.index,
)
address_df[component_cols] = parsed_components

# Addresses that did not split into five parts have no STREET. Left in, they
# would all normalize to the same "None, ..." string and be merged into one
# bogus entry, so report and drop them explicitly instead.
unparsed_mask = address_df["STREET"].isna()
if unparsed_mask.any():
    print(f"Warning: skipping {int(unparsed_mask.sum()):,} addresses that could not be split into 5 components")
    address_df = address_df[~unparsed_mask].copy()

# clean_address_text title-cases everything, which turns "NY" into "Ny".
address_df["STATE"] = address_df["STATE"].str.upper()

# Normalize street names
address_df["STREET_NORM"] = address_df["STREET"].apply(normalize_street_name)

# Build normalized full address
address_df["FULL ADDRESS NORM"] = (
    address_df["STREET_NORM"].astype(str)
    + ", "
    + address_df["NEIGHBORHOOD"].astype(str)
    + ", "
    + address_df["CITY"].astype(str)
    + ", "
    + address_df["STATE"].astype(str)
    + ", "
    + address_df["ZIP"].astype(str)
)

# =====================================================
# SHOW ONLY MERGED ADDRESSES (duplicates after cleaning)
# =====================================================
grouped = (
    address_df.groupby("FULL ADDRESS NORM")["FULL ADDRESS"]
    .apply(list)
    .reset_index()
)

merged = grouped[grouped["FULL ADDRESS"].apply(lambda lst: len(lst) > 1)]


print("=== ADDRESSES THAT GOT MERGED TO THE SAME NORMALIZED FORM ===")
for _, row in merged.iterrows():
    print(f"\n→ Normalized: {row['FULL ADDRESS NORM']}")
    for orig in row["FULL ADDRESS"]:
        print(f"   - {orig}")


# =====================================================
# KEEP ONLY UNIQUE NORMALIZED ADDRESSES
# =====================================================
# The first original address of each normalized form is the one that is kept.
unique_address_df = address_df.drop_duplicates(subset=["FULL ADDRESS NORM"]).reset_index(drop=True)

# Assign groups round-robin
unique_address_df["GROUP"] = (unique_address_df.index % n_groups) + 1

# Keeping only necessary columns
unique_address_df = unique_address_df[["GROUP", "FULL ADDRESS", "FULL ADDRESS NORM", "LAT", "LON"]]

# Printing out how much data we re-mapped (guarding against an empty input)
before_count = len(address_df)
after_count = len(unique_address_df)
filtered_out = before_count - after_count
condensed_share = filtered_out / before_count if before_count else 0.0
print(f"Kept {after_count:,} unique and normalized addresses, condensed down {filtered_out:,} addresses ({condensed_share:.1%})")

# Save to Excel (includes both columns for traceability)
unique_address_df.to_excel(address_cache_path_remapped, index=False, engine="openpyxl")

print(f"\n Saved cleaned, deduplicated addresses to: {address_cache_path_remapped}")
display(unique_address_df.head(10))


### Creating Coordinate Columns from Addresses

`GeoJSON` files typically use latitude/longitude coordinates in order to be able to render maps and other layouts. For compatibility, each `ADDRESS` listed in our data needs an associated coordinate location.

In [None]:
# =====================================================
# config
# =====================================================
# Settings for one geocoding run.
config = dict(
    input_file="addresses_condensed.xlsx",
    group_number=1,  # <-- each team member sets their group
    user_agent="nyc_affordability_transit_access",
    min_delay_seconds=1.0,
    flush_every=500,  # how many rows before writing to disk
    max_retries=3,
)

# NYC bounding box (roughly 5 boroughs), as four numbers:
# two negative longitudes interleaved with two latitudes.
nyc_viewbox = (
    -74.25909,
    40.477399,
    -73.700272,
    40.917577,
)

# =====================================================
# HELPER FUNCTIONS
# =====================================================

def safe_write_excel(df, final_path):
    """Write ``df`` to ``final_path`` without leaving a half-written workbook.

    The sheet is first written to a temporary file placed in the same
    directory as ``final_path`` and then swapped in with ``os.replace``.
    Keeping both files in one directory (hence one filesystem) is what makes
    the swap a single rename rather than a copy; a temp file in the system
    temp directory may live on a different filesystem.

    This is best-effort: any failure is reported with a warning, the
    temporary file is removed, and the previous ``final_path`` (if any) is
    left as it was. No exception is propagated to the caller.
    """
    target_dir = os.path.dirname(os.path.abspath(final_path))
    # Keep the original file name (and extension) in the temp name so the
    # Excel writer accepts it.
    tmp_path = os.path.join(target_dir, f"tmp_{os.path.basename(final_path)}")
    try:
        # Write to a temporary file first
        df.to_excel(tmp_path, index=False, engine="openpyxl")
        # Atomically replace the old file
        os.replace(tmp_path, final_path)
    except Exception as e:
        print(f"⚠️ Warning: Failed to safely write {final_path}: {e}")
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def geocode_address(geolocator, address_json):
    """Call Nominatim geocode with NYC-focused parameters.

    ``geolocator`` is the geopy geocoder to query and ``address_json`` the
    query passed to its ``geocode`` method. Results are limited to the US and
    biased towards the ``nyc_viewbox`` bounding box. The box is passed as a
    preference only (``bounded`` is left at its default), so an address that
    resolves outside the box can still be returned.
    """
    # nyc_viewbox is stored as (west, south, east, north); geopy wants two
    # (latitude, longitude) corner points.
    west, south, east, north = nyc_viewbox
    return geolocator.geocode(
        address_json,
        country_codes="us",         # restrict to USA
        addressdetails=True,        # return structured address info
        language="en",              # English results
        viewbox=[(south, west), (north, east)],  # prefer matches inside NYC
    )

def geocode_with_retries(geocode_func, address, max_retries=3):
    """Geocode one comma-separated address string, retrying on errors.

    When the address has at least five comma-separated parts, the query sent
    to ``geocode_func`` is "<part 1>, <part 4>, <part 5>" (street, state and
    ZIP for the normalized addresses built earlier). Shorter addresses cannot
    be reduced that way and are sent unchanged.

    ``geocode_func`` is called once and then up to ``max_retries`` more times
    if it raises, waiting 2, 4, 8, ... seconds between attempts. There is no
    wait after the final failure.

    Returns:
        Whatever ``geocode_func`` returns, or ``None`` if every attempt raised.
    """
    parts = [p.strip() for p in address.split(",")]
    if len(parts) >= 5:
        query = f"{parts[0]}, {parts[3]}, {parts[4]}"
    else:
        # Not in the expected 5-part layout; use the address as given.
        query = address

    for attempt in range(1, max_retries + 2):
        try:
            return geocode_func(query)
        except Exception as e:
            if attempt > max_retries:
                print(f"Warn: geocode error for '{query}': {e}. Giving up after {attempt} attempt(s).")
                break
            wait = 2 ** attempt
            print(f"Warn: geocode error ({attempt}/{max_retries}) for '{query}': {e}. Backing off {wait}s.")
            time.sleep(wait)
    return None

# =====================================================
# MAIN SCRIPT
# =====================================================
input_path = os.path.join(interim_root_folder, config["input_file"])
df = pd.read_excel(input_path, engine="openpyxl")

# Work list: rows of the configured group that still lack a coordinate.
in_group = df["GROUP"] == config["group_number"]
missing_coords = df["LAT"].isna() | df["LON"].isna()
to_geocode_df = df[in_group & missing_coords].reset_index(drop=True)

print(f"Group {config['group_number']} has {len(to_geocode_df)} addresses to geocode.")

# One shared Nominatim client for the whole run.
geolocator = Nominatim(user_agent=config["user_agent"], timeout=10)

# Bind the client into geocode_address and wrap the result in a rate limiter,
# so callers only pass the query string and requests are spaced out.
geocode = RateLimiter(
    partial(geocode_address, geolocator),
    min_delay_seconds=config["min_delay_seconds"],
    error_wait_seconds=10
)

# Results for each group are kept in their own workbook.
group_cache_file = os.path.join(
    interim_root_folder, f"geocode_group_{config['group_number']}.xlsx"
)

# Resume from an earlier run when its workbook exists, otherwise start empty.
if not os.path.exists(group_cache_file):
    cache_df = pd.DataFrame(columns=["GROUP","FULL ADDRESS","FULL ADDRESS NORM","LAT","LON"])
else:
    cache_df = pd.read_excel(group_cache_file, engine="openpyxl")

# Normalized addresses already present in the workbook (with or without
# coordinates) are skipped below.
cache_key = set(cache_df["FULL ADDRESS NORM"].astype(str).tolist())

new_rows = []

for _, row in tqdm(to_geocode_df.iterrows(), total=len(to_geocode_df), desc="Geocoding"):
    norm_addr = str(row["FULL ADDRESS NORM"])

    if norm_addr in cache_key:
        continue  # already geocoded

    try:
        loc = geocode_with_retries(geocode, norm_addr, max_retries=config["max_retries"])
        lat, lon = (loc.latitude, loc.longitude) if loc else (None, None)
    except Exception as e:
        print(f"Error geocoding '{norm_addr}': {e}")
        lat, lon = None, None

    # A row is recorded even when no location was found (LAT/LON stay empty).
    new_rows.append({
        "GROUP": row["GROUP"],
        "FULL ADDRESS": row["FULL ADDRESS"],
        "FULL ADDRESS NORM": row["FULL ADDRESS NORM"],
        "LAT": lat,
        "LON": lon,
    })

    # Keep collecting until a full batch is pending.
    if len(new_rows) < config["flush_every"]:
        continue

    # Periodic flush: append the batch to the cache and rewrite the workbook.
    batch_df = pd.DataFrame(new_rows)
    cache_df = pd.concat([cache_df, batch_df], ignore_index=True)
    safe_write_excel(cache_df, group_cache_file)
    print(f"Flushed {len(new_rows)} rows to {group_cache_file} (total cached: {len(cache_df)})")
    new_rows = []

# Final flush for whatever is left after the loop.
if new_rows:
    batch_df = pd.DataFrame(new_rows)
    cache_df = pd.concat([cache_df, batch_df], ignore_index=True)
    safe_write_excel(cache_df, group_cache_file)
    print(f"Final flush: {len(new_rows)} rows written to {group_cache_file} (total cached: {len(cache_df)})")

print(f"\nGeocoding complete for group {config['group_number']}. Saved results to: {group_cache_file}")