# In [1]:
import pandas as pd
import json
import re

# In [2]:
# === Normalize Address ===
def normalize_address(addr):
    """Return a canonical upper-case form of a street address for matching.

    The address is upper-cased, a trailing unit designator ("APT 4B",
    "UNIT 12", "#3", "APT #4B") is removed, the standalone abbreviations
    "ST" and "AVE" are expanded, and runs of whitespace are collapsed.

    Args:
        addr: Raw address value. Anything that is not a ``str`` (``None``,
            NaN, numbers) is treated as missing.

    Returns:
        The normalized address, or ``""`` when ``addr`` is not a string.
    """
    if not isinstance(addr, str):
        return ""
    addr = addr.upper()
    # Remove unit/apartment info. The lookahead requires APT/UNIT to be
    # followed by whitespace, a digit or '#', so street names that merely
    # start with those letters ("UNITED NATIONS PLAZA", "APTOS") are kept.
    # The optional '#' lets "APT #4B" be removed as a whole instead of
    # leaving a dangling "APT" behind.
    addr = re.sub(
        r'\s+(?:(?:APT|UNIT)(?=[\s\d#])\s*#?\s*|#\s*)\w+',
        '',
        addr,
    )
    # NOTE: every standalone "ST" is expanded, including "ST" used for
    # "Saint"; the result is only meant to be compared against other
    # addresses normalized by this same function.
    addr = re.sub(r'\bST\b', 'STREET', addr)
    addr = re.sub(r'\bAVE\b', 'AVENUE', addr)
    addr = re.sub(r'\s+', ' ', addr).strip()
    return addr

# === Load Complaints Data ===
# low_memory=False makes pandas parse the file in one pass rather than in
# internal chunks, which avoids mixed-dtype inference on large columns.
complaints_df = pd.read_csv(
    "311_Service_Requests_from_2010_to_Present_20250412.csv",
    low_memory=False,
)

# Keep only rows that carry an incident address, then derive the join key.
has_address = complaints_df['Incident Address'].notna()
complaints_df = complaints_df.loc[has_address].copy()
complaints_df['normalized_address'] = (
    complaints_df['Incident Address'].map(normalize_address)
)

# === Aggregate Complaint Counts ===
# One row per (normalized address, complaint type) pair with its frequency.
pair_counts = complaints_df.groupby(['normalized_address', 'Complaint Type']).size()
complaint_summary = pair_counts.reset_index(name='count')

# === Load StreetEasy Listings ===
# JSON text is UTF-8 by specification; opening with an explicit encoding
# keeps non-ASCII characters intact on platforms whose default locale
# encoding is something else.
with open("filtered_listings.json", "r", encoding="utf-8") as f:
    listings = json.load(f)

# === Match Listings to Complaints ===
# Build an address -> {complaint type: count} lookup once, so each listing
# costs a single dict access instead of a full scan of complaint_summary.
complaints_by_address = {}
for summary_addr, complaint_type, complaint_count in zip(
    complaint_summary['normalized_address'],
    complaint_summary['Complaint Type'],
    complaint_summary['count'],
):
    # int() converts the numpy integer into a JSON-serializable Python int.
    complaints_by_address.setdefault(summary_addr, {})[complaint_type] = int(complaint_count)

for listing in listings:
    normalized_listing_addr = normalize_address(listing.get("addr_street", ""))

    if normalized_listing_addr:
        # Copy so listings at the same address do not share one dict object.
        complaints_dict = dict(complaints_by_address.get(normalized_listing_addr, {}))
    else:
        # A missing or blank listing address normalizes to "". Never match it
        # against complaints whose own address also normalized to "".
        complaints_dict = {}

    listing['building_complaints'] = complaints_dict

# === Save Enriched Listings ===
# Write the listings, now carrying 'building_complaints', as indented JSON.
with open("enriched_listings_with_complaints.json", "w") as out_file:
    json.dump(listings, fp=out_file, indent=2)
