# In [1]:
import pandas as pd
import usaddress
import re
import logging

# Configure the "address_parser" logger once.
# logging.getLogger returns the same object on every call, so the handler is
# attached only when none is present; re-running this cell would otherwise
# stack another FileHandler and write every failure to the file repeatedly.
logger = logging.getLogger("address_parser")
logger.setLevel(logging.INFO)
if not logger.handlers:
    handler = logging.FileHandler("parse_failures.log")
    # Same record layout as the guarded setup further down in this file, so
    # whichever setup runs first produces timestamped, levelled lines.
    handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)s %(message)s")
    )
    logger.addHandler(handler)


# In [None]:
import pandas as pd
import usaddress
import re
import logging

# --- Logging setup with handler guard ---
# Parse failures are written to parse_failures.log. The file handler is only
# attached when the logger has no handlers yet, so repeated execution does
# not add a second one.
logger = logging.getLogger("address_parser")
if not logger.handlers:
    formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
    handler = logging.FileHandler("parse_failures.log")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
logger.setLevel(logging.INFO)

# --- Abbreviation normalization ---
# Lower-case abbreviation -> expansion, applied to whole tokens only.
ABBREVIATIONS = {
    "st": "street",
    "ave": "avenue",
    "rd": "road",
    "blvd": "boulevard",
    "dr": "drive",
    "n": "north",
    "s": "south",
    "e": "east",
    "w": "west",
}


def normalize_string(s: str) -> str:
    """Return a lower-cased, punctuation-free, abbreviation-expanded copy of *s*.

    The characters ".", "," and "#" are deleted, the text is split on
    whitespace, every token that is a key of ``ABBREVIATIONS`` is replaced
    by its expansion, and the tokens are re-joined with single spaces.
    """
    cleaned = re.sub(r"[.,#]", "", s.lower())
    expanded = (ABBREVIATIONS.get(word, word) for word in cleaned.split())
    return " ".join(expanded).strip()

# --- Unit splitting before parsing ---
def split_unit(addr: str):
    """Split *addr* into ``(street, unit)`` at the first unit separator.

    A separator is either a hyphen (with any surrounding whitespace) or a
    "#" preceded by whitespace; only the leftmost one is used. Note that
    every hyphen counts, including one inside a house number or a name.
    When no separator is present the unit is "". Both parts are stripped.
    """
    pieces = re.split(r'\s*-\s*|\s+#\s*', addr, maxsplit=1)
    if len(pieces) == 1:
        return pieces[0].strip(), ''
    head, tail = pieces
    return head.strip(), tail.strip()

def enhanced_parse_full(addr: str):
    """Tag the street portion of *addr* and return ``(tags, unit_part)``.

    The address is first divided by ``split_unit``; only the street part is
    handed to ``usaddress.tag``, whose second return value is discarded.
    Any exception raised by the tagger is logged to the module logger at
    INFO level together with the original address, and an empty dict is
    returned in place of the tags. The unit part is returned in every case.
    """
    raw_street, unit_part = split_unit(addr)
    tags = {}  # stays empty when tagging fails
    try:
        tags, _ = usaddress.tag(raw_street)
    except usaddress.RepeatedLabelError as e:
        logger.info(f"RepeatedLabelError for '{addr}': {e}")
    except Exception as e:
        logger.info(f"Parse failure for '{addr}': {e}")
    return tags, unit_part

# --- Load data ---
# zipcode is read as text: with type inference pandas would store it as a
# number, which drops leading zeros ("02134" -> 2134) and, when some rows are
# blank, turns the column into floats ("12345" -> 12345.0).
prop = pd.read_csv("properties-out.csv", dtype={"zipcode": str})
listing = pd.read_csv("listings-out.csv")

# Normalize property types
# Both apartment variants collapse to "Apartment"; any type that is not a key
# of TYPE_MAP is carried over to type_norm unchanged.
TYPE_MAP = {
    "Apartment Building": "Apartment",
    "Apartment Floor Plan": "Apartment",
}
prop["type_norm"] = prop["type"].map(lambda kind: TYPE_MAP.get(kind, kind))

# --- Process properties ---
# Tags from the parsed address that make up the street line, in output order.
_STREET_KEYS = ("AddressNumber", "StreetName", "StreetNamePostType")


def _single_spaced(parts) -> str:
    """Join *parts* with spaces and collapse every whitespace run to one space.

    Empty components therefore leave no doubled or trailing spaces behind.
    """
    return " ".join(" ".join(parts).split())


def _zip_text(value) -> str:
    """Render one zipcode cell as text for address composition.

    Missing values become "" rather than the literal "nan", and integral
    floats lose the ".0" suffix. Everything else goes through ``str``.
    """
    if pd.isna(value):
        return ""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


parsed_results = prop["street_address"].fillna("").apply(enhanced_parse_full)
# Each result is a (tags, unit_part) pair. Building the two columns from
# lists avoids the tuple unpacking of ``zip(*rows)``, which raises when the
# frame has no rows.
prop["parsed"] = [tags for tags, _ in parsed_results]
prop["unit_part"] = [unit for _, unit in parsed_results]

prop["street_part"] = prop["parsed"].apply(
    lambda tags: _single_spaced(tags.get(key, "") for key in _STREET_KEYS)
)

# street + city + state + zipcode, single-spaced.
prop["full_address"] = [
    _single_spaced(row)
    for row in zip(
        prop["street_part"],
        prop["city"].fillna(""),
        prop["state"].fillna(""),
        prop["zipcode"].map(_zip_text),
    )
]

# Order-independent key: the distinct lower-cased tokens, sorted and joined.
prop["token_set"] = [
    " ".join(sorted(set(address.lower().split())))
    for address in prop["full_address"]
]

# --- Final property frame for DB ---
prop_clean = prop[[
    "property_id", "team_id", "street_part", "unit_part",
    "city", "state", "zipcode", "full_address",
    "token_set", "type_norm"
]]
prop_clean.to_csv("properties_cleaned.csv", index=False)

# --- Process listings ---
def _single_spaced(parts) -> str:
    """Join *parts* with spaces and collapse every whitespace run to one space.

    Empty components therefore leave no doubled or trailing spaces behind.
    """
    return " ".join(" ".join(parts).split())


parsed_results = listing["name"].fillna("").apply(enhanced_parse_full)
# Each result is a (tags, unit_part) pair. Building the two columns from
# lists avoids the tuple unpacking of ``zip(*rows)``, which raises when the
# frame has no rows.
listing["parsed"] = [tags for tags, _ in parsed_results]
listing["unit_part"] = [unit for _, unit in parsed_results]

listing["street_part"] = listing["parsed"].apply(
    lambda tags: _single_spaced(
        tags.get(key, "")
        for key in ("AddressNumber", "StreetName", "StreetNamePostType")
    )
)

# City, state and zipcode come from the parsed tags; a missing tag gives "".
listing["city"] = listing["parsed"].apply(lambda tags: tags.get("PlaceName", ""))
listing["state"] = listing["parsed"].apply(lambda tags: tags.get("StateName", ""))
listing["zipcode"] = listing["parsed"].apply(lambda tags: tags.get("ZipCode", ""))

# street + city + state + unit_part, single-spaced.
# NOTE(review): the parsed zipcode is not part of this string while
# unit_part is - confirm this composition is intended.
listing["full_address"] = [
    _single_spaced(row)
    for row in zip(
        listing["street_part"],
        listing["city"],
        listing["state"],
        listing["unit_part"],
    )
]

# Order-independent key: the distinct lower-cased tokens, sorted and joined.
listing["token_set"] = [
    " ".join(sorted(set(address.lower().split())))
    for address in listing["full_address"]
]

# --- Final listing frame for DB ---
clean_listing = listing[[
    "listing_id", "property_id", "team_id", "street_part", "unit_part",
    "city", "state", "zipcode", "full_address", "token_set"
]]
clean_listing.to_csv("listings_clean.csv", index=False)