In [2]:
#Importing needed libraries
import requests
import pandas as pd
from datetime import datetime
import geopandas as gpd
import fiona
import time
from json import JSONDecodeError
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.cm as cm
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from statsmodels.discrete.count_model import ZeroInflatedPoisson
from personal_lib import general_functions as gf


from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="capstone_test")



In [None]:
### NYC ENERGY DATA BUILDINGS

In [None]:
### Pulling in the Data Via the APi online 

##### Skip here until reading in files from CSV, because this takes very long to run.

In [None]:
PAGE = 1000               # <-- enforce 1,000 rows per page
TIMEOUT = 30
MAX_RETRIES = 5
BACKOFF_BASE = 1.5

session = requests.Session()
headers = {}

def fetch_all_rows_1k(api_url: str, source_years: str, source_info_url: str) -> pd.DataFrame:
    offset = 0
    frames = []

    while True:
        params = {"$limit": PAGE, "$offset": offset}

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                resp = session.get(api_url, params=params, headers=headers, timeout=TIMEOUT)
                if resp.status_code in (429, 502, 503, 504):
                    time.sleep(BACKOFF_BASE ** attempt * (0.1 * attempt))
                    continue
                resp.raise_for_status()

                if "json" not in resp.headers.get("Content-Type", "").lower():
                    preview = resp.text[:200]
                    raise ValueError(f"Non-JSON response (status {resp.status_code}): {preview}")

                data_chunk = resp.json()
                if not data_chunk:
                    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

                # Create DataFrame
                df = pd.DataFrame(data_chunk)

                # 🔑 Add your metadata columns here
                df["source_years"] = source_years
                df["source_api_url"] = api_url
                df["source_info_url"] = source_info_url

                frames.append(df)

                # If less than PAGE, stop; otherwise keep paginating
                if len(data_chunk) < PAGE:
                    return pd.concat(frames, ignore_index=True)

                offset += PAGE
                break  # Success, go to next page

            except (requests.RequestException, JSONDecodeError, ValueError) as e:
                if attempt == MAX_RETRIES:
                    print(f"⚠️ Failed fetching {api_url} at offset {offset}: {e}")
                    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
                time.sleep(BACKOFF_BASE ** attempt * (0.1 * attempt))

# --- your existing mapping dict (unchanged) ---
# building_energy_LL84_sources = { ... }  # keep your dict as-is

In [None]:
agg_running_list = []
for k, v in building_energy_LL84_sources.items():
    print(f"Fetching {k} -> {v['api']}")
    df = fetch_all_rows_1k(v["api"], k, v["info"])
    if not df.empty:
        agg_running_list.append(df)
    else:
        print(f"Warning: no rows returned for {k} ({v['api']}).")

nyc_building_energy = pd.concat(agg_running_list, ignore_index=True) if agg_running_list else pd.DataFrame()
print(f"Total rows: {len(nyc_building_energy)}")

In [None]:
## There are mulitple sources of yearly data. Compiling the source URLs here as well as the API urls to grab
## privately owned buildings over 25,000 ft2 and in City-owned buildings over 10,000 ft2
building_energy_LL84_sources={"2022+":{
    "api":"https://data.cityofnewyork.us/resource/5zyy-y8am.json",
    "info":"https://data.cityofnewyork.us/Environment/NYC-Building-Energy-and-Water-Data-Disclosure-for-/5zyy-y8am/about_data"
                              },
                              "2021":{
    "api":"https://data.cityofnewyork.us/resource/7x5e-2fxh.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/7x5e-2fxh/about_data"
                              },
                              "2020":{
    "api":"https://data.cityofnewyork.us/resource/usc3-8zwd.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/usc3-8zwd/about_data"
                                  },
                              "2019":{
    "api":"https://data.cityofnewyork.us/resource/wcm8-aq5w.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/wcm8-aq5w/about_data"
                                  },
                              "2018":{
    "api":"https://data.cityofnewyork.us/resource/4tys-3tzj.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/4tys-3tzj/about_data"
                                  },
                              "2017":{
    "api":"https://data.cityofnewyork.us/resource/4t62-jm4m.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/4t62-jm4m/about_data"
                                  },
                              "2016":{
    "api":"https://data.cityofnewyork.us/resource/utpj-74fz.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/utpj-74fz/about_data"
                                  },
                              "2015":{
    "api":"https://data.cityofnewyork.us/resource/77q4-nkfh.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/77q4-nkfh/about_data"
                                  },
                              "2014":{
    "api":"https://data.cityofnewyork.us/resource/nbun-wekj.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/nbun-wekj/about_data"
                                  },
                              "2013":{
    "api":"https://data.cityofnewyork.us/resource/yr5p-wjer.json",
    "info":"http://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/yr5p-wjer"
                                  },
                              "2012":{
    "api":"https://data.cityofnewyork.us/resource/r6ub-zhff.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/r6ub-zhff/about_data"
                                  },
                              "2011":{
    "api":"https://data.cityofnewyork.us/resource/k7nh-aufb.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/k7nh-aufb/about_data"
                                  },
                              "2010":{
    "api":"https://data.cityofnewyork.us/resource/kswi-37bp.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/kswi-37bp/about_data"
                                  },
#Monthly Data? 2018-2023
                              "Monthly":{
    "api":"https://data.cityofnewyork.us/resource/fvp3-gcb2.json",
    "info":"https://data.cityofnewyork.us/Environment/Local-Law-84-Monthly-Data/fvp3-gcb2/about_data"
    }}


#### End Skip here. Pick up and read in data

In [1]:
### If just want residential limist continue to skip in untill reading in residential 3
# nyc_building_energy=  pd.read_csv("nyc_dob_energy_2010_2024.csv")

In [None]:
## Formatting into proper df
# print(nyc_building_energy.shape)
## Saving to File
# nyc_building_energy.to_csv("nyc_dob_energy_2010_2024.csv",index=False)

### Starting Cleaning / Skip if you want ot just read in residential3

In [None]:
### Columns to Drop because potentially irrelevant or redundant for 
nyc_building_energy_processing = nyc_building_energy.copy()
# nyc_building_energy_processing = nyc_building_energy_processing.drop(columns=['data_center_ups_output_meter',
#  'data_center_pdu_input_meter',
#  'data_center_pdu_output_meter',
#  'data_center_it_equipment',
#  'data_center_it_site_energy',
#  'data_center_it_source_energy',
#  'data_center_pue',
#  'data_center_national_median',
#  'data_center_gross_floor_area',
#  'data_center_ups_system',
#  'data_center_it_energy',
#  'data_center_cooling_equipment',
# 'supermarkets_grocery_gross',
#  'supermarkets_grocery_number',
#  'supermarkets_grocery_number_1',
#  'supermarkets_grocery_number_2',
#  'supermarkets_grocery_percent',
#  'supermarkets_grocery_presence',
#  'supermarkets_grocery_walk',
#  'supermarkets_grocery_weekly',
#  'supermarkets_grocery_workers',
# 'house_of_worship_gross_floor',
#  'house_of_worship_pc_density',
#  'house_of_worship_weekly',
#  'house_of_worship_presence',
# 'residence_halls_dormitories',
# 'medical_office_percent_cooled',
#  'residence_halls_dormitories_1',
#  'residence_halls_dormitories_2',
#  'residence_halls_dormitories_3',
#  'residence_halls_dormitories_4',
# 'hotel_onsite_laundry_short',
#  'warehouse_unrefrigerated',
#  'warehouse_unrefrigerated_1',
#  'warehouse_unrefrigerated_2',
#  'warehouse_unrefrigerated_3',
#  'warehouse_unrefrigerated_4',
#  'warehouse_unrefrigerated_5',
#  'warehouse_unrefrigerated_6',
#  'hospital_gross_floor_area',
#  'hospital_laboratory_y_1_n',
#  'hospital_laundry_facility',
#  'hospital_maximum_number_of',
#  'hospital_number_of_buildings',
#  'warehouse_refrigerated_gross',
#  'warehouse_refrigerated_weekly',
#  'warehouse_refrigerated_workers',
#  'hospital_number_of_licensed',
#  'multifamily_home_dishwashers'])

## Fruther Limiting to those that are under 10 Stories. 
nyc_building_energy_processing = nyc_building_energy_processing.replace("Not Available",np.nan).dropna(how='all',axis=1)
nyc_building_energy_processing = nyc_building_energy_processing.dropna(how='all',axis=1)

nyc_building_energy_processing["primary_property_type"] = nyc_building_energy_processing["primary_property_type"].combine_first(nyc_building_energy_processing["primary_property_type_epa"])
nyc_building_energy_processing = nyc_building_energy_processing.drop(columns=["primary_property_type_epa"])
nyc_building_energy_processing["primary_property_type_self"] = nyc_building_energy_processing["primary_property_type_self"].combine_first(nyc_building_energy_processing["primary_property_type_self_selected"])
nyc_building_energy_processing = nyc_building_energy_processing.drop(columns=["primary_property_type_self_selected"])



## Beginning of Limiting to Multifamily Homes (Step 1)
residential = nyc_building_energy_processing[(
    (nyc_building_energy_processing["list_of_all_property_use"].isin([i for i in nyc_building_energy_processing["list_of_all_property_use"].unique() if 'Multifamily Housing' in str(i)]))
    |(nyc_building_energy_processing["list_of_all_property_use"].isnull() 
      & (nyc_building_energy_processing["primary_property_type_self"]=="Multifamily Housing")
      & ((nyc_building_energy_processing["primary_property_type"]=="Multifamily Housing")|
         (nyc_building_energy_processing["primary_property_type"].isnull()))))]

In [None]:
## Beginning of Limiting to Multifamily Homes (Step 1)
residential = nyc_building_energy_processing[(
    (nyc_building_energy_processing["list_of_all_property_use"].isin([i for i in nyc_building_energy_processing["list_of_all_property_use"].unique() if 'Multifamily Housing' in str(i)]))
    |(nyc_building_energy_processing["list_of_all_property_use"].isnull() 
      & (nyc_building_energy_processing["primary_property_type_self"]=="Multifamily Housing")
      & ((nyc_building_energy_processing["primary_property_type"]=="Multifamily Housing")|
         (nyc_building_energy_processing["primary_property_type"].isnull()))))]

In [None]:
print(residential.shape)

In [None]:
## Second Step is limiting to NON MIXED USE, so ONLY MultiFamily Residences
residential = residential[
    residential['list_of_all_property_use'].isnull() |
    residential['list_of_all_property_use'].apply(
        lambda x: len(x.split(",")) == 1 if isinstance(x, str) else False
    )
]
## Fruther Limiting to those that are under 10 Stories. 
residential = residential.replace("Not Available",np.nan).dropna(how='all',axis=1)
residential = residential.dropna(how='all',axis=1)

In [None]:
## Dropping Other Property Type Columns 
to_drop =[
## Banking oriented COlumns
'bank_branch_computer_density',
 'bank_branch_gross_floor_area',
 'bank_branch_gross_floor_area_ft',
 'bank_branch_number_of',
 'bank_branch_number_of_workers',
 'bank_branch_percent_that',
 'bank_branch_weekly_operating',
 'bank_branch_worker_density',
## College / Uni / School
'college_university_gross',
 'college_university_gross_floor_area_ft',
 'college_university_number',
    'k_12_school_computer_density',
 'k_12_school_cooking_facilities',
 'k_12_school_gross_floor_area',
 'k_12_school_gross_floor_area_ft',
 'k_12_school_high_school',
 'k_12_school_percent_that',
 'k_12_school_refrigeration',
 'k_12_school_weekend_operation',
 'laboratory_gross_floor_area_ft',
    'library_gross_floor_area',
# OTher
"automobile_dealership_gross",
'convenience_store_without',
'data_center_energy_estimates_applied',
 'data_center_gross_floor_area',
 'data_center_gross_floor_area_ft',
 'data_center_it_energy',
 'data_center_it_energy_configuration',
 'data_center_it_equipment_input_meter_kwh',
 'data_center_it_site_energy',
 'data_center_it_site_energy_kwh',
 'data_center_it_source_energy_kbtu',
 'data_center_national_median',
 'data_center_pdu_input_meter_kwh',
 'data_center_pdu_output_meter_kwh',
 'data_center_ups_output_meter_kwh',
 'enclosed_mall_gross_floor',
 'enclosed_mall_gross_floor_area_ft',
 'fast_food_restaurant_gross',
 'financial_office_gross_floor',
 'financial_office_gross_floor_area_ft',
 'financial_office_number_of',
 'financial_office_number_of_1',
 'financial_office_number_of_computers',
 'financial_office_number_of_workers_on_main_shift',
 'financial_office_weekly',
 'financial_office_weekly_operating_hours',
 'fitness_center_health_club',
 'fitness_center_health_club_gym_gross_floor_area_ft',
 'food_sales_gross_floor_area',
 'food_sales_gross_floor_area_ft',
 'food_service_gross_floor',
 'food_service_gross_floor_area_ft',
     'worship_facility_computer',
 'worship_facility_cooking',
 'worship_facility_gross_floor',
 'worship_facility_gross_floor_area_ft',
 'worship_facility_weekly',
    'supermarket_grocery_cooking',
 'supermarket_grocery_gross',
 'supermarket_grocery_gross_floor_area_ft',
 'supermarket_grocery_number',
 'supermarket_grocery_number_1',
 'supermarket_grocery_number_2',
 'supermarket_grocery_number_of_open_or_closed_refrigeration_freezer_units',
 'supermarket_grocery_number_of_walk_in_refrigeration_freezer_units',
 'supermarket_grocery_percent',
 'supermarket_grocery_walk',
 'supermarket_grocery_weekly',
 'supermarket_grocery_worker',
 'swimming_pool_approximate',
 'swimming_pool_location_of',
 'swimming_pool_months_in_use',
     'social_meeting_hall_gross',
 'social_meeting_hall_gross_floor_area_ft',
     'residence_hall_dormitory',
 'residence_hall_dormitory_1',
 'residence_hall_dormitory_2',
 'residence_hall_dormitory_3',
 'residence_hall_dormitory_4',
 'residence_hall_dormitory_gross_floor_area_ft',
 'restaurant_gross_floor_area',
 'restaurant_gross_floor_area_ft',
 'restaurant_weekly_operating',
 'restaurant_weekly_operating_hours',
 'restaurant_worker_density',
 'restaurant_worker_density_number_per_1_000_sq_ft',
 'retail_store_cash_register',
 'retail_store_computer_density',
 'retail_store_exterior_entrance',
 'retail_store_gross_floor',
 'retail_store_gross_floor_area_ft',
 'retail_store_number_of_open',
 'retail_store_number_of_open_or_closed_refrigeration_freezer_units',
 'retail_store_number_of_walk',
 'retail_store_number_of_walk_in_refrigeration_freezer_units',
 'retail_store_open_or_closed',
 'retail_store_percent_that',
 'retail_store_walk_in',
 'retail_store_weekly_operating',
 'retail_store_worker_density',
 'self_storage_facility_gross',
 'self_storage_facility_gross_floor_area_ft',
 'senior_care_community_average',
 'senior_care_community_gross',
 'senior_care_community_maximum',
 'senior_care_community_number',
 'senior_care_community_number_1',
 'senior_care_community_number_2',
 'senior_care_community_number_3',
 'senior_care_community_number_4',
 'senior_care_community_number_5',
 'senior_care_community_number_6',
 'senior_care_community_percent',
 'senior_living_community_gross_floor_area_ft',
 'senior_living_community_living_unit_density_number_per_1_000_sq_ft',
    'movie_theater_gross_floor_area_ft',
    'non_refrigerated_warehouse',
 'non_refrigerated_warehouse_1',
 'non_refrigerated_warehouse_2',
 'non_refrigerated_warehouse_3',
 'non_refrigerated_warehouse_4',
 'non_refrigerated_warehouse_5',
 'non_refrigerated_warehouse_gross_floor_area_ft',
     'parking_completely_enclosed',
 'parking_completely_enclosed_parking_garage_size_ft',
 'parking_gross_floor_area',
 'parking_gross_floor_area_ft',
 'parking_open_parking_lot',
 'parking_open_parking_lot_size_ft',
 'parking_partially_enclosed',
 'parking_partially_enclosed_parking_garage_size_ft',
     'strip_mall_gross_floor_area',
## Hostpial / Hotel
    'hospital_general_medical',
 'hospital_general_medical_1',
 'hospital_general_medical_10',
 'hospital_general_medical_11',
 'hospital_general_medical_12',
 'hospital_general_medical_13',
 'hospital_general_medical_14',
 'hospital_general_medical_15',
 'hospital_general_medical_16',
 'hospital_general_medical_17',
 'hospital_general_medical_2',
 'hospital_general_medical_3',
 'hospital_general_medical_4',
 'hospital_general_medical_5',
 'hospital_general_medical_6',
 'hospital_general_medical_7',
 'hospital_general_medical_8',
 'hospital_general_medical_9',
 'hotel_amount_of_laundry',
 'hotel_cooking_facilities',
 'hotel_full_service_spa_floor',
 'hotel_gross_floor_area_ft',
 'hotel_gym_fitness_center',
 'hotel_gym_fitness_center_floor_area_ft',
 'hotel_number_of_rooms',
 'hotel_percent_that_can_be',
 'hotel_room_density_number',
 'hotel_type_of_laundry_facility',
 'hotel_worker_density_number',
 'urgent_care_clinic_other',
 'urgent_care_clinic_other_outpatient_gross_floor_area_ft',
    'mailing_center_post_office_gross_floor_area_ft',
 'manufacturing_industrial_plant_gross_floor_area_ft',
 'medical_office_gross_floor',
 'medical_office_gross_floor_area_ft',
 'medical_office_mri_machine',
 'medical_office_number_of',
 'medical_office_number_of_1',
 'medical_office_number_of_computers',
 'medical_office_number_of_mri_machines',
 'medical_office_number_of_workers_on_main_shift',
 'medical_office_percent_that',
 'medical_office_percent_that_1',
 'medical_office_percent_that_can_be_cooled',
 'medical_office_percent_that_can_be_heated',
 'medical_office_weekly',
 'medical_office_weekly_operating_hours',
    'museum_gross_floor_area_ft',
    'office_computer_density_number',
 'office_gross_floor_area_ft',
 'office_number_of_computers',
 'office_number_of_workers',
 'office_number_of_workers_on_main_shift',
 'office_percent_that_can_be',
 'office_percent_that_can_be_1',
 'office_percent_that_can_be_cooled',
 'office_percent_that_can_be_heated',
 'office_weekly_operating_hours',
 'office_worker_density_number',
 'office_worker_density_number_per_1_000_sq_ft',
     'adult_education_gross_floor',
 'adult_education_gross_floor_area_ft',
]
residential2 = residential.drop(columns=to_drop)

In [None]:
# Convert year_ending to datetime
residential2["year_ending"] = pd.to_datetime(residential2["year_ending"], errors="coerce")
# Optional: extract just the year if that’s all you need
residential2["year_ending_year"] = residential2["year_ending"].dt.year
residential2[["report_year","year_ending_year","source_years","source_api_url","source_info_url"]][residential['report_year'].isnull()]
## Limtiing to the Metered Areas for whole property or whole building
residential3 = residential2[residential2["metered_areas_energy"].isin(['Whole Building', "Whole Property"])]
residential3 = residential3[residential3["primary_property_type_self"]=='Multifamily Housing'] ## confirming onlyl multifam. No nulls here

residential3 = residential3[(residential3["construction_status"]=='Existing')|(residential3["construction_status"].isnull())]
## Stand Alone Properties
residential3 = residential3[(residential3["parent_property_id"].isnull())|(residential3["parent_property_id"]=='Not Applicable: Standalone Property')]
residential3 = residential3.drop(columns=["primary_property_type_self",
                                          "primary_property_type",
                                          "national_median_reference",
                                          "list_of_all_property_use","largest_property_use_type","construction_status",
                                         "parent_property_id","parent_property_name"])


##### End second Skip

In [97]:
residential3 = pd.read_csv("residential3.csv") 

  residential3 = pd.read_csv("residential3.csv")


#### Limint by Years and for those Buildigns that have been i nthe Data for the years we want

In [216]:
buildings_2012_2013_ids = residential3[residential3["year_ending_year"].isin([2012,2013])]["property_id"].unique()
print("CheckingGrouping")
print(residential3[residential3['property_id'].isin(buildings_2012_2013_ids)].groupby(['year_ending_year']).agg({"year_ending":"count"}).reset_index())
### Limtiing to 2017 and before 
residential3_2012_2017 = residential3[residential3["year_ending_year"]<2018]
print("YEARS")
print(residential3_2012_2017['year_ending_year'].unique())
residential3_2012_2017 = residential3_2012_2017[residential3_2012_2017['property_id'].isin(buildings_2012_2013_ids)]

CheckingGrouping
    year_ending_year  year_ending
0               2012           32
1               2013         9275
2               2014         7475
3               2015         4429
4               2016         3908
5               2017         5104
6               2018         3365
7               2019         3199
8               2020         4399
9               2021         2547
10              2022         2644
11              2023         2773
YEARS
[2017 2016 2015 2014 2013 2012]


In [217]:
### Building Info Only 
building_info =residential3_2012_2017[["property_id","address_1","address_2","city","postal_code",'county','borough',"latitude","longitude"]].drop_duplicates()
building_info.to_csv("building_info.csv", index=False)

In [218]:
# --- PATCHED HELPERS (extend yours) ---
import re
import pandas as pd
from typing import Optional, List, Tuple

STATE_ABBR = "NY"

# NYC biasing bounds (rough NYC box)
NYC_BOUNDS = {
    "southwest": {"lat": 40.477399, "lng": -74.259090},
    "northeast": {"lat": 40.917577, "lng": -73.700272},
}

# Common misspellings / normalizations seen in your data
COMMON_FIXES = {
    r"\bWASHIGNTON\b": "WASHINGTON",
    r"\bWASHNGTN\b": "WASHINGTON",
    r"\bALBERMALE\b": "ALBEMARLE",
    r"\bALBERMALE\b": "ALBEMARLE",
    r"\bAMSETRDAM\b": "AMSTERDAM",
    r"\bKINGBRIDGE\b": "KINGSBRIDGE",
    r"\bOVINGTON\b": "OVINGTON",   # keep as is (looks fine)
    r"\bNAGLE HOUSE\b": "NAGLE AVE",  # likely street not building name
    r"\bNAGLE\b": "NAGLE",  # pass-through
    r"\bST\s*NICOLAS\b": "ST NICHOLAS",
    r"\bOILVER\b": "OLIVER",
    r"\bRIVER SIDE\b": "RIVERSIDE",
    r"\bSTEET\b": "STREET",
    r"\bSREET\b": "STREET",
    r"\bFOR WASHINGTON\b": "FORT WASHINGTON",
    r"\bFT\.?\b": "FORT",
    r"\bBIVONA\b": "BIVONA",  # leave
    r"\bCLARKE\b": "CLARKE",  # leave
    r"\bCAROLL\b": "CARROLL",
    r"\bCLAFFIN\b": "CLAFLIN",
    r"\bDEKALB\b": "DEKALB",
    r"\bLAFAYETTE\b": "LAFAYETTE",
    r"\bW\s*MOSHOLU\b": "W MOSHOLU",
    r"\bE\s*MOSHOLU\b": "E MOSHOLU",
    r"\bMOSHOLU\b": "MOSHOLU",
    r"\bPARSON\b": "PARSONS",
    r"\bPARSONS BLVD?\b": "PARSONS BLVD",
}

# Street type vocabulary to help "best-guess" if clearly missing
STREET_TYPES = ["ST", "AVE", "RD", "BLVD", "PL", "LN", "DR", "CT", "PKWY", "TER", "PLZ"]

def _norm_ws(s: Optional[str]) -> Optional[str]:
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return None
    s = str(s)
    # HTML escapes commonly seen
    s = s.replace("&amp;", "&")
    s = s.replace("\u2013", "-").replace("\u2014", "-")
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    return s if s else None

def _apply_common_fixes(u: str) -> str:
    for pat, repl in COMMON_FIXES.items():
        u = re.sub(pat, repl, u, flags=re.IGNORECASE)
    return u

def _std_unit(s: Optional[str]) -> Optional[str]:
    u = _norm_ws(s)
    if not u:
        return None
    u = u.upper().replace("#", "").strip()
    if not u:
        return None
    u = re.sub(r"^(APARTMENT|APT\.?)\s*", "APT ", u)
    u = re.sub(r"^(SUITE|STE\.?)\s*", "STE ", u)
    u = re.sub(r"^(FLOOR|FL\.?)\s*", "FL ", u)
    if not re.match(r"^(APT|STE|FL)\b", u):  # bare "5B" → "APT 5B"
        u = f"APT {u}"
    return u

def _std_street(u: Optional[str]) -> Optional[str]:
    u = _norm_ws(u)
    if not u:
        return None
    u = _apply_common_fixes(u.upper())

    repl = {
        " STREET": " ST",
        " AVENUE": " AVE",
        " ROAD": " RD",
        " BOULEVARD": " BLVD",
        " PLACE": " PL",
        " LANE": " LN",
        " DRIVE": " DR",
        " COURT": " CT",
        " PARKWAY": " PKWY",
        " TERRACE": " TER",
        " PLAZA": " PLZ",
    }
    for k, v in repl.items():
        u = re.sub(k + r"\b", v, u)

    # Normalize ordinals and add missing "TH/ST/ND/RD" on bare numbers like "E 48" or "West104"
    u = re.sub(r"\b(\d+)\s*(ST|ND|RD|TH)\b", lambda m: f"{int(m.group(1))}{m.group(2)}", u)
    u = re.sub(r"\b(WEST|W|EAST|E)\s*(\d{1,3})(?=\b)", r"\1 \2", u)  # ensure space
    def add_ordinal(m):
        n = int(m.group(2))
        suf = "TH"
        if n % 10 == 1 and n % 100 != 11: suf = "ST"
        elif n % 10 == 2 and n % 100 != 12: suf = "ND"
        elif n % 10 == 3 and n % 100 != 13: suf = "RD"
        return f"{m.group(1)} {n}{suf}"
    u = re.sub(r"\b(WEST|W|EAST|E)\s+(\d{1,3})\b(?!\s*(ST|AVE|RD|BLVD|PL|LN|DR|CT|PKWY|TER|PLZ))", add_ordinal, u)

    # Collapse multiple addresses to last one here (we also split earlier in the pipeline)
    if "/" in u:
        u = u.split("/")[-1].strip()

    # Queens: hyphenated house numbers like "41-07 42ND ST" are valid; leave them as-is
    # Best-guess add a street type if line is like "308 WEST 104TH" (no type) or "160 E 48TH"
    if re.search(r"\b(WEST|W|EAST|E|NORTH|N|SOUTH|S)\b", u) and re.search(r"\b\d{1,3}(ST|ND|RD|TH)\b", u) and not re.search(r"\b(ST|AVE|RD|BLVD|PL|LN|DR|CT|PKWY|TER|PLZ)\b", u):
        u = u + " ST"

    # Convert key phrases
    u = re.sub(r"\bFT\b", "FORT", u)

    return u.title()

def _borough_from_fields(borough, county, city) -> Optional[str]:
    vals = " ".join([str(x) for x in [borough, county, city] if pd.notna(x)]).upper()
    if any(x in vals for x in ["MANHATTAN", "NEW YORK", "NY COUNTY"]): return "MANHATTAN"
    if "BRONX" in vals: return "BRONX"
    if any(x in vals for x in ["BROOKLYN", "KINGS"]): return "BROOKLYN"
    if "QUEENS" in vals: return "QUEENS"
    if any(x in vals for x in ["STATEN", "RICHMOND"]): return "STATEN ISLAND"
    return None

def _usps_city_from_borough(city, borough_norm) -> Optional[str]:
    c = (_norm_ws(city) or "").upper()
    if borough_norm == "MANHATTAN": return "New York"
    if borough_norm == "BRONX": return "Bronx"
    if borough_norm == "BROOKLYN": return "Brooklyn"
    if borough_norm == "QUEENS":
        # USPS accepts neighborhoods, but "Queens" is safest default
        return "Queens" if c in ["", "QUEENS", "NAN"] else c.title()
    if borough_norm == "STATEN ISLAND": return "Staten Island"
    return c.title() if c else None

def _std_zip5(z):
    if z is None or (isinstance(z, float) and pd.isna(z)):
        return None
    s = re.sub(r"\D", "", str(z))
    return s[:5] if len(s) >= 5 else None

# ---------- NEW: address candidate generator ----------

SEP_PATTERN = re.compile(r"\s*(?:;|,|/|&{1,2}| and | AND |\bETAL\b|\bet al\b|\(|\)|\bAKA\b|\bA\/K\/A\b)\s*", flags=re.IGNORECASE)

def _is_pobox(s: str) -> bool:
    return bool(re.search(r"\bP\.?\s*O\.?\s*BOX\b", s, flags=re.IGNORECASE))

def _has_street_name(u: str) -> bool:
    # needs something besides just a house number or zeros
    return bool(re.search(r"\b[A-Z][A-Z]+\b", u)) and not re.fullmatch(r"\d+(-\d+)?", re.sub(r"[^0-9\-]", "", u or ""))

def _expand_number_ranges(u: str) -> List[str]:
    """
    '2078-84-90 Morris Ave' -> ['2078 Morris Ave','2084 Morris Ave','2090 Morris Ave']
    '1051-1057-1061-1065 Boston Rd' -> same expansion
    """
    m = re.search(r"\b(\d{1,6}(?:-\d{1,6})+)\s+([A-Z].+)$", u)
    if not m:
        return [u]
    block = m.group(1)
    tail = m.group(2)
    parts = [p for p in re.split(r"-", block) if p]
    base = parts[0]
    expanded = []
    for p in parts:
        if len(p) < len(base):  # "2078-84" -> fill high-order digits from base
            p = base[:len(base)-len(p)] + p
        expanded.append(f"{p} {tail}")
    return expanded

def _split_multi(s: str) -> List[str]:
    s = SEP_PATTERN.sub(" | ", s)  # unify to pipe, then split
    tokens = [t.strip(" .") for t in s.split("|") if _norm_ws(t)]
    # Remove address-like fragments that are obviously units/notes only
    return [t for t in tokens if t and not re.match(r"^(APT|STE|FL|B#\d+)\b", t, flags=re.IGNORECASE)]

def build_candidates(row) -> List[Tuple[str, dict, str]]:
    """
    Returns list of (query, components, reason_tag)
    """
    a1 = _std_street(row.get("address_line1_clean"))
    a2 = _std_unit(row.get("address_line2_clean"))
    borough = _borough_from_fields(row.get("borough_clean"), row.get("county"), row.get("city_clean"))
    city = _usps_city_from_borough(row.get("city_clean"), borough)
    zip5 = _std_zip5(row.get("zip"))
    state = STATE_ABBR

    raw = _norm_ws(row.get("geocode_key") or row.get("address_1") or "")
    raw = _apply_common_fixes(raw.upper()) if raw else ""
    raw_parts = _split_multi(raw) if raw else []

    # Build an address seed from clean fields
    seeds = []
    if a1:
        seeds.append(a1)
    seeds.extend(_expand_number_ranges(x) for x in raw_parts)
    seeds = [y for x in seeds for y in (x if isinstance(x, list) else [x])]
    seeds = [s for s in seeds if s] or ([raw.title()] if raw else [])

    candidates = []
    for s in seeds:
        s_norm = _std_street(s)
        if not s_norm:
            continue
        if _is_pobox(s_norm):
            candidates.append((None, {}, "POBOX"))
            continue
        if not _has_street_name(s_norm):
            candidates.append((None, {}, "MISSING_STREET"))
            continue

        line = s_norm
        parts = [line]
        # Append unit if it looks like an apartment (rarely helpful for geocode but OK)
        # if a2: parts.append(a2)

        locality = city or (borough.title() if borough else None)
        tail = ", ".join([p for p in [locality, state, zip5] if p])
        query = f"{' '.join(parts)}, {tail}" if tail else " ".join(parts)

        comps = {"administrative_area": state}
        if locality: comps["locality"] = locality

        candidates.append((query, comps, "OK"))
    # Deduplicate, keep order
    seen = set()
    out = []
    for q, c, r in candidates:
        key = (q or r, tuple(sorted(c.items())))
        if key not in seen:
            seen.add(key)
            out.append((q, c, r))
    return out


In [219]:
# Cleaned drafts
building_info["address_line1_clean"] = building_info["address_1"].apply(_std_street)
building_info["address_line2_clean"] = building_info["address_2"].apply(_std_unit)

building_info["borough_clean"] = [
    _borough_from_fields(b, c, ci)
    for b, c, ci in zip(building_info.get("borough"), building_info.get("county"), building_info.get("city"))
]
building_info["city_clean"] = [
    _usps_city_from_borough(ci, bn)
    for ci, bn in zip(building_info.get("city"), building_info["borough_clean"])
]
building_info["state_clean"] = STATE_ABBR
building_info["postal_code_5_clean"] = building_info.get("postal_code").apply(_std_zip5) if "postal_code" in building_info.columns else None

# Mark rows missing coords
building_info["needs_geocoding"] = building_info["latitude"].isna() | building_info["longitude"].isna()

# a stable address key to dedupe geocoding calls
def addr_key(r):
    parts = [
        _norm_ws(r["address_line1_clean"]) or "",
        _norm_ws(r["address_line2_clean"]) or "",
        _norm_ws(r["city_clean"]) or "",
        STATE_ABBR,
        _norm_ws(r["postal_code_5_clean"]) or "",
    ]
    return "|".join(parts).upper()

building_info["geocode_key"] = building_info.apply(addr_key, axis=1)

building_info.head(3)[[
    "property_id","address_1","address_2","city","borough","county",
    "address_line1_clean","address_line2_clean","city_clean","state_clean","postal_code_5_clean",
    "needs_geocoding","geocode_key"
]]


Unnamed: 0,property_id,address_1,address_2,city,borough,county,address_line1_clean,address_line2_clean,city_clean,state_clean,postal_code_5_clean,needs_geocoding,geocode_key
85955,2638790.0,43-22,,Sunnyside,,,43-22,,Sunnyside,NY,,True,43-22||SUNNYSIDE|NY|
85957,2707907.0,2626 Homecrest Avenue,,Brooklyn,BROOKLYN,,2626 Homecrest Ave,,Brooklyn,NY,,False,2626 HOMECREST AVE||BROOKLYN|NY|
85974,3521602.0,3240 Henry Hudson parkway,,Bronx,BRONX,,3240 Henry Hudson Pkwy,,Bronx,NY,,False,3240 HENRY HUDSON PKWY||BRONX|NY|


In [220]:
print(building_info.shape)
building_info = building_info.dropna(subset=["address_1","address_2","city","postal_code","county","borough",
                                             "latitude","longitude","address_line1_clean"],how='all')
print(building_info.shape)

(9739, 17)
(9720, 17)


In [221]:
### Geocode The Null lat long 
with_coord = building_info[building_info["needs_geocoding"]==False]
print(with_coord.shape)
without_coord = building_info[building_info["needs_geocoding"]==True]
print(without_coord.shape)

(9121, 17)
(599, 17)


In [222]:
without_coord['city'][without_coord['city']=='Flushinig']='Flushing'
without_coord['borough_clean'][(without_coord['city'].isin(['Sunnyside', 'Astoria', 'Jackson Heights', 'Flushing', 'Rego Park',
                                                   'Forest Hills', 'Oakland Gardens', 'Bayside', 'Kew Gardens','flushing',
                                                   'Woodside', 'Flushinig', 'Larchmont', 'Ridgewood','Jamaica', 'Albertson'])
                                &(without_coord['borough_clean'].isnull()))]='QUEENS'
### Manual Fixes where possible 
without_coord["borough_clean"][((without_coord['borough_clean'].isnull())&
               (without_coord["address_1"]=="3115 brighton 6th"))]="BROOKLYN"
without_coord["borough_clean"][((without_coord['borough_clean'].isnull())&
               (without_coord["address_1"]=="144-35/39 Sanford Avenue"))]="QUEENS"
without_coord["borough_clean"][((without_coord['borough_clean'].isnull())&
               (without_coord["address_1"]=="183-11 Hillside Ave"))]="QUEENS"
without_coord["borough_clean"][((without_coord['borough_clean'].isnull())&
               (without_coord["address_1"]=="103-30/26 68th Ave"))]="QUEENS"
without_coord["borough_clean"][((without_coord['borough_clean'].isnull())&
               (without_coord["address_1"]=="71-11 -71-23 162ND STREET"))]="QUEENS"
without_coord = without_coord[~without_coord['borough_clean'].isnull()]
print(without_coord.shape)

(586, 17)


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  without_coord['city'][without_coord['city']=='Flushinig']='Flushing'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

In [223]:
print(without_coord.shape)

(586, 17)


### Start of Geocoding 

In [224]:
creds = gf.get_creds()
google_api_key = creds["Google"]["geocoding"]
open_ai = creds["openai"]["FirstTestKey"]

In [None]:
# reading in past resu;ts
# results_df = pd.concat(results)
# results_df.to_csv("results_geo.csv",index=False)
# results_df = pd.read_csv("results_geo.csv")
# failed_df = pd.concat(failed)
# failed_df.to_csv("failed_geo.csv",index=False)
# failed_df = pd.read_csv("failed_geo.csv")

In [207]:
results = []
failed = []

In [228]:
# ---- GEOCODING LOOP (with retries/candidates/NYC bias) ----
import googlemaps
import os
import math
gmaps = googlemaps.Client(key=google_api_key)

results, failed = [], []

def geocode_row(row):
    cands = build_candidates(row)
    # Always add *raw borough + state* fallback for named complexes (e.g., NYCHA) to try to at least anchor
    borough = _borough_from_fields(row.get("borough_clean"), row.get("county"), row.get("city_clean"))
    if borough:
        cands.append((f"{borough.title()}, NY", {"administrative_area": "NY", "locality": borough.title()}, "BOROUGH_ONLY"))

    for query, comps, tag in cands:
        if not query:
            continue
        try:
            resp = gmaps.geocode(
                query,
                region="us",
                components=comps,
                bounds=NYC_BOUNDS
            )
        except Exception as e:
            print("Geocode error:", e)
            resp = []

        if resp:
            # prefer rooftop or range_interpolated results
            best = sorted(resp, key=lambda r: {"ROOFTOP":0, "RANGE_INTERPOLATED":1}.get(r.get("geometry",{}).get("location_type",""), 2))[0]
            loc = best["geometry"]["location"]
            return loc["lat"], loc["lng"], tag, best.get("formatted_address"), best.get("place_id")

    return None, None, "NO_HIT", None, None

# assumes without_coord is a DataFrame with the relevant columns
for i, r in without_coord[["property_id","geocode_key","address_line1_clean","address_line2_clean","borough_clean","city_clean","county"]].drop_duplicates().iterrows():
    # keep your i >= guard if needed
    temp_df = pd.DataFrame([r])
    lat, lng, tag, faddr, pid = geocode_row(r)

    if lat is not None and lng is not None:
        temp_df["latitude"]  = lat
        temp_df["longitude"] = lng
        temp_df["geocode_tag"] = tag
        temp_df["formatted_address"] = faddr
        temp_df["place_id"] = pid
        results.append(temp_df)
    else:
        temp_df["geocode_tag"] = tag  # e.g., POBOX, MISSING_STREET, NO_HIT
        failed.append(temp_df)


In [208]:


# # assumes: without_coord, results = [], failed = [] already exist
# for i, r in without_coord[["property_id","geocode_key"]].drop_duplicates().iterrows():
#     if i >= 121392:
#         temp_df = pd.DataFrame([r])
#         print(r["property_id"])

#         full_address = r["geocode_key"]
#         print(full_address)

#         if "/" in full_address:
#             full_address = full_address.split("/")[-1].strip()
#             print("Split Addy:", full_address)

#         # ---- Google Geocoding (swap-in for geolocator.geocode) ----
#         try:
#             resp = gmaps.geocode(full_address)  # add region="us" or components=... if you like
#         except Exception as e:
#             print("Geocode error:", e)
#             resp = []

#         if resp:
#             loc = resp[0]["geometry"]["location"]
#             temp_df["latitude"]  = loc["lat"]
#             temp_df["longitude"] = loc["lng"]
#             print(loc["lat"], loc["lng"])
#             results.append(temp_df)
#         else:
#             failed.append(temp_df)


3089718.0
84-17 125TH ST. ET. AL.||QUEENS|NY|
40.7072689 -73.826502
3108051.0
3400 TYRON AVE||BRONX|NY|
40.87956519999999 -73.8767307
3116937.0
854 WEST 181ST||NEW YORK|NY|
40.8511379 -73.9399545
3128084.0
||BROOKLYN|NY|
40.6781784 -73.9441579
3128087.0
||BROOKLYN|NY|
40.6781784 -73.9441579
3128113.0
||BRONX|NY|
40.8447819 -73.8648268
3128115.0
NA NA||BRONX|NY|
40.8447819 -73.8648268
3128128.0
||BRONX|NY|
40.8447819 -73.8648268
3128739.0
142-18 38TH ST||QUEENS|NY|
40.7616081 -73.82639309999999
3129467.0
301 EAST 22ND||NEW YORK|NY|
40.7371253 -73.9806613
3129938.0
54 WEST 94TH ST||NEW YORK|NY|
40.7908594 -73.9673689
3129939.0
||BROOKLYN|NY|
40.6781784 -73.9441579
3223085.0
152-154-156 EAST 171 ST||BRONX|NY|
40.8400493 -73.914232
3234049.0
3052-3054 KINGS BRIDGE AVE||BRONX|NY|
40.8794388 -73.9065129
3464982.0
1464-66, 72, 74 WATSON AVE||BRONX|NY|
40.8280409 -73.8636811
3465047.0
1484-86-92-94 WATSON AVE||BRONX|NY|
40.8255082 -73.8798845
3503004.0
18 34TH RD||QUEENS|NY|
40.7691161 -73.775

In [229]:
print(len(results))
print(len(failed)) ## Zero failures

583
0


In [230]:
# reading in past resu;ts
results_df = pd.concat(results)

In [233]:
without_coord_geo = without_coord.dropna(how='all',axis=1).merge(results_df, 
                                                                 on=["property_id","geocode_key","address_line1_clean","address_line2_clean","borough_clean","city_clean","county"],
                                                                 how='left')
without_coord_geo#[without_coord_geo['latitude'].isnull()]

Unnamed: 0,property_id,address_1,address_2,city,county,borough,address_line1_clean,address_line2_clean,borough_clean,city_clean,state_clean,needs_geocoding,geocode_key,latitude,longitude,geocode_tag,formatted_address,place_id
0,2638790.0,43-22,,Sunnyside,,,43-22,,QUEENS,Sunnyside,NY,True,43-22||SUNNYSIDE|NY|,40.728224,-73.794852,BOROUGH_ONLY,"Queens, NY, USA",ChIJK1kKR2lDwokRBXtcbIvRCUE
1,3524527.0,150-74th Street,,Brooklyn,,,150-74Th St,,BROOKLYN,Brooklyn,NY,True,150-74TH ST||BROOKLYN|NY|,40.678178,-73.944158,BOROUGH_ONLY,"Brooklyn, NY, USA",ChIJCSF8lBZEwokRhngABHRcdoI
2,2638326.0,308 West104,,New York,,,308 West 104Th St,,MANHATTAN,New York,NY,True,308 WEST 104TH ST||NEW YORK|NY|,40.768517,-73.982194,BOROUGH_ONLY,"Manhattan, New York, NY, USA",ChIJYeZuBI9YwokRjMDs_IEyCwo
3,2782781.0,106 & 114 West 143rd Street,,New York,,,106 & 114 West 143Rd St,,MANHATTAN,New York,NY,True,106 & 114 WEST 143RD ST||NEW YORK|NY|,40.768517,-73.982194,BOROUGH_ONLY,"Manhattan, New York, NY, USA",ChIJYeZuBI9YwokRjMDs_IEyCwo
4,2946930.0,777/783 Fox Street,,Bronx,,,783 Fox St,,BRONX,Bronx,NY,True,783 FOX ST||BRONX|NY|,40.844782,-73.864827,BOROUGH_ONLY,"Bronx, NY, USA",ChIJsXxpOlWLwokRd1zxj6dDblU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581,4125392.0,720 West 11th Street,,New York,,,720 West 11Th St,,MANHATTAN,New York,NY,True,720 WEST 11TH ST||NEW YORK|NY|,40.768517,-73.982194,BOROUGH_ONLY,"Manhattan, New York, NY, USA",ChIJYeZuBI9YwokRjMDs_IEyCwo
582,2642595.0,3099 brighton 6thStreet,,brooklyn,kings,,3099 Brighton 6Thstreet,,BROOKLYN,Brooklyn,NY,True,3099 BRIGHTON 6THSTREET||BROOKLYN|NY|,40.678178,-73.944158,BOROUGH_ONLY,"Brooklyn, NY, USA",ChIJCSF8lBZEwokRhngABHRcdoI
583,2647976.0,9002-9046 153rd. ave 9001-9055 Shore Parkway,Lindenwood,New York,,,9002-9046 153Rd. Ave 9001-9055 Shore Pkwy,APT LINDENWOOD,MANHATTAN,New York,NY,True,9002-9046 153RD. AVE 9001-9055 SHORE PKWY|APT ...,40.768517,-73.982194,BOROUGH_ONLY,"Manhattan, New York, NY, USA",ChIJYeZuBI9YwokRjMDs_IEyCwo
584,3579396.0,1990 Adam Clayton Powell Jr Boulevard,,New York,,,1990 Adam Clayton Powell Jr Blvd,,MANHATTAN,New York,NY,True,1990 ADAM CLAYTON POWELL JR BLVD||NEW YORK|NY|,40.768517,-73.982194,BOROUGH_ONLY,"Manhattan, New York, NY, USA",ChIJYeZuBI9YwokRjMDs_IEyCwo


In [234]:
 building_info_geocoded= pd.concat([with_coord,without_coord_geo])
# building_info_geocoded.to_csv("building_info_geocoded.csv",index=False)

In [3]:
building_info_geocoded = pd.read_csv("building_info_geocoded.csv")

In [5]:
### Limit the resutlsd based on Building-specific Lat Long, not borough centroids.
building_info_geocoded = building_info_geocoded[building_info_geocoded['geocode_tag'].isnull()]
### NEEED TO ADD THE BBL BY ADDRESS.
print(building_info_geocoded.shape)
building_info_geocoded.to_csv("building_info_geocoded.csv",index=False)

(9121, 20)


In [240]:
# import re
# import time
# import requests
# import pandas as pd

# GEOSVC_BASE = "https://geoservice.planning.nyc.gov"  # Function_1A
# # If you have a key: set GEOSERVICE_KEY in your env and pass api_key=None (it will be picked up here if you want).
# # Or pass api_key="YOUR_KEY" to the function.

# # Normalizes borough strings from your df
# _BORO_NORM = {
#     "1":"Manhattan","mn":"Manhattan","manhattan":"Manhattan","new york":"Manhattan",
#     "2":"Bronx","bx":"Bronx","bronx":"Bronx",
#     "3":"Brooklyn","bk":"Brooklyn","kings":"Brooklyn","brooklyn":"Brooklyn",
#     "4":"Queens","qn":"Queens","queens":"Queens",
#     "5":"Staten Island","si":"Staten Island","richmond":"Staten Island","staten island":"Staten Island",
# }

# def _norm_borough(boro, city=None):
#     """Prefer explicit borough, else infer from city."""
#     for v in (boro, city):
#         if v is None or (isinstance(v, float) and pd.isna(v)): 
#             continue
#         key = str(v).strip().lower()
#         if key in _BORO_NORM:
#             return _BORO_NORM[key]
#     return None

# def _split_address(addr):
#     """
#     Split a single-line street address into (house_number, street_name).
#     Handles Queens hyphen numbers (e.g., 31-12), trims trailing state/ZIP and unit text.
#     """
#     if addr is None or (isinstance(addr, float) and pd.isna(addr)):
#         return None, None
#     s = str(addr).strip()
#     # remove trailing ", NY", ZIP, etc.
#     s = re.sub(r",?\s*(NY|New York|USA|\d{5}(?:-\d{4})?)\s*$", "", s, flags=re.I)
#     # drop unit/suite after a keyword (#, apt, suite, unit, ste)
#     s = re.split(r"\b(apt|suite|ste|unit|#)\b", s, flags=re.I)[0].strip()
#     m = re.match(r"^\s*(\d[\d-]*)\s+(.*)$", s)
#     if not m:
#         return None, None
#     hn, street = m.group(1).strip(), m.group(2).strip()
#     return hn, street

# def _find(d, pat):
#     """Recursive tolerant finder for slightly varying JSON key names."""
#     if isinstance(d, dict):
#         for k, v in d.items():
#             if re.search(pat, str(k), re.I): 
#                 return v
#             out = _find(v, pat)
#             if out is not None:
#                 return out
#     elif isinstance(d, list):
#         for x in d:
#             out = _find(x, pat)
#             if out is not None:
#                 return out
#     return None

# def add_bbl_from_address(
#     df: pd.DataFrame,
#     address_col: str = "address_line1_clean",   # use "address_1" if you prefer
#     borough_col: str = "borough_clean",         # fallback to city if borough missing
#     city_col: str = "city_clean",
#     api_key: str | None = None,
#     pause_sec: float = 0.10,
#     retries: int = 2,
# ):
#     """
#     For each row, call Function_1A with (HouseNumber, Street, Borough),
#     and append columns: bbl, borough_code, tax_block, tax_lot.
#     Returns a *new* dataframe (does not mutate input).
#     """
#     out = df.copy()

#     def _row_to_bbl(row):
#         hn, st = _split_address(row.get(address_col))
#         bor = _norm_borough(row.get(borough_col), row.get(city_col))
#         if not (hn and st and bor):
#             return pd.Series({"bbl": None, "borough_code": None, "tax_block": None, "tax_lot": None, "bbl_status": "missing_inputs"})

#         params = {"HouseNumber": hn, "Street": st, "Borough": bor, "DisplayFormat": "true"}
#         if api_key:
#             params["Key"] = api_key

#         last_err = None
#         for attempt in range(1, retries + 1):
#             try:
#                 r = requests.get(f"{GEOSVC_BASE}/Function_1A", params=params, timeout=25)
#                 r.raise_for_status()
#                 j = r.json()
#                 bor_code = _find(j, r"^borough(code)?$") or _find(j, r"^boro$")
#                 block    = _find(j, r"^block$")
#                 lot      = _find(j, r"^lot$")
#                 if bor_code is None or block is None or lot is None:
#                     return pd.Series({"bbl": None, "borough_code": None, "tax_block": None, "tax_lot": None, "bbl_status": "not_found"})
#                 bbl = f"{int(bor_code)}{int(block):05d}{int(lot):04d}"
#                 return pd.Series({
#                     "bbl": bbl,
#                     "borough_code": str(int(bor_code)),
#                     "tax_block": str(int(block)),
#                     "tax_lot": str(int(lot)),
#                     "bbl_status": "ok",
#                 })
#             except Exception as e:
#                 last_err = str(e)
#                 time.sleep(pause_sec)
#         return pd.Series({"bbl": None, "borough_code": None, "tax_block": None, "tax_lot": None, "bbl_status": f"error:{last_err}"})
#     # Apply row-wise
#     out[["bbl", "borough_code", "tax_block", "tax_lot", "bbl_status"]] = out.apply(_row_to_bbl, axis=1)
#     return out



In [243]:
# if your address and borough columns are the cleaned ones in your sample:
building_info_geocoded_bbl = add_bbl_from_address(
    building_info_geocoded.head(n=20),
    address_col="address_line1_clean",
    borough_col="borough_clean",
    city_col="city_clean",
    api_key=None,        # or "YOUR_KEY" if you have one
    pause_sec=0.1,
    retries=2
)

building_info_geocoded_bbl[["property_id","address_line1_clean","borough_clean","bbl","borough_code","tax_block","tax_lot","bbl_status"]].head(10)


Unnamed: 0,property_id,address_line1_clean,borough_clean,bbl,borough_code,tax_block,tax_lot,bbl_status
85957,2707907.0,2626 Homecrest Ave,BROOKLYN,,,,,error:404 Client Error: Not Found for url: htt...
85974,3521602.0,3240 Henry Hudson Pkwy,BRONX,,,,,error:404 Client Error: Not Found for url: htt...
85975,3521883.0,900 Ave H,BROOKLYN,,,,,error:404 Client Error: Not Found for url: htt...
85976,3522892.0,1561 E. 13Th St,BROOKLYN,,,,,error:404 Client Error: Not Found for url: htt...
85977,4047231.0,115 West 35Th St,MANHATTAN,,,,,error:404 Client Error: Not Found for url: htt...
85978,3524325.0,2016 Ave N,BROOKLYN,,,,,error:404 Client Error: Not Found for url: htt...
85979,3524382.0,1701 W 3Rd St,BROOKLYN,,,,,error:404 Client Error: Not Found for url: htt...
85981,3524760.0,175-27Wexford Ter,QUEENS,,,,,missing_inputs
85985,3999874.0,3030 Middletown Rd,BRONX,,,,,error:404 Client Error: Not Found for url: htt...
85994,2638320.0,145 West 96Th St,MANHATTAN,,,,,error:404 Client Error: Not Found for url: htt...


In [242]:
# building_info_geocoded_bbl.to_csv("building_info_geocoded_bbl.csv",index=False)

In [30]:
import requests, time, re, pandas as pd

df = building_info_geocoded.copy()
rows = []

with requests.Session() as s:
    for i, r in df.iterrows():
        addr = str(r.get("address_line1_clean", "")).strip()
        boro = str(r.get("borough_clean", "")).strip()
        if not addr:
            rows.append((i, None, None, None, None, None)); continue

        q = f"{addr}, {boro}, NY" if boro else f"{addr}, NY"
        try:
            resp = s.get("https://geosearch.planninglabs.nyc/v2/search",
                         params={"text": q, "size": 5}, timeout=20)
            feats = resp.json().get("features", [])
            # pick the best feature: prefer layer=='address' and same borough; else highest confidence
            best, best_score = None, (-1, -1.0)
            for f in feats:
                p = f.get("properties", {})
                layer = p.get("layer")
                conf  = float(p.get("confidence", 0) or 0)
                fb    = (p.get("borough") or ((p.get("addendum") or {}).get("pad") or {}).get("boroughName") or "")
                score = (2 if layer == "address" else 0) + (1 if boro and boro.upper() in str(fb).upper() else 0)
                if (score, conf) > best_score:
                    best, best_score = f, (score, conf)

            if best:
                props = best.get("properties", {})
                add   = (props.get("addendum") or {}).get("pad") or {}
                bbl   = add.get("bbl") or props.get("bbl")
                bin_  = add.get("bin") or props.get("bin")

                # normalize BBL to a 10-digit string
                if isinstance(bbl, str):
                    digits = re.sub(r"\D", "", bbl)
                    bbl = digits.zfill(10) if digits else None
                elif bbl is not None:
                    try: bbl = str(int(bbl)).zfill(10)
                    except: bbl = None

                rows.append((i, bbl, bin_, props.get("confidence"), props.get("label") or props.get("name"), q))
            else:
                rows.append((i, None, None, None, None, q))

        except Exception as e:
            rows.append((i, None, None, None, f"ERR:{type(e).__name__}", q))

        time.sleep(0.1)  # tiny throttle to be polite

# assemble + join back
out = pd.DataFrame(rows, columns=["__idx","BBL_from_api","BIN_from_api","confidence","match_label","query"]).set_index("__idx")


In [31]:
out

Unnamed: 0_level_0,BBL_from_api,BIN_from_api,confidence,match_label,query
__idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3074560006,3204886,0.8,"2626 HOMECREST AVENUE, Brooklyn, NY, USA","2626 Homecrest Ave, BROOKLYN, NY"
1,2057890024,2084124,0.8,"3240 HENRY HUDSON PARKWAY, Bronx, NY, USA","3240 Henry Hudson Pkwy, BRONX, NY"
2,3065120001,3170547,0.8,"900 AVENUE H, Brooklyn, NY, USA","900 Ave H, BROOKLYN, NY"
3,3067600062,3181781,0.8,"1561 EAST 13 STREET, Brooklyn, NY, USA","1561 E. 13Th St, BROOKLYN, NY"
4,1008117502,1015226,0.8,"115 WEST 35 STREET, New York, NY, USA","115 West 35Th St, MANHATTAN, NY"
...,...,...,...,...,...
9116,2027137501,2092007,0.8,"955 EAST 163 STREET, Bronx, NY, USA","955 East 163Rd St, MANHATTAN, NY"
9117,2028730103,2008741,0.8,"1541 SHAKESPEARE AVENUE, Bronx, NY, USA","1541 Shakespeare Ave, MANHATTAN, NY"
9118,1019120043,1089103,0.8,"128 WEST 128 STREET, New York, NY, USA","128 West 128Th St, MANHATTAN, NY"
9119,1003500061,1004271,1.0,"315 EAST HOUSTON STREET, New York, NY, USA","315 E 102St St, MANHATTAN, NY"


In [32]:
BuildingInfoGeocoded = building_info_geocoded.join(out)
# BuildingInfoGeocoded.to_csv("BuildingInfoGeocoded_GEO_BBL.csv",index=False)
# quick peek
# BuildingInfoGeocoded[["address_line1_clean","borough_clean","BBL_from_api","BIN_from_api","confidence","match_label"]].head()


In [36]:
BuildingInfoGeocoded = BuildingInfoGeocoded.dropna(how='all',axis=1)

In [38]:
BuildingInfoGeocoded#.columns

Unnamed: 0,property_id,address_1,address_2,city,county,borough,latitude,longitude,address_line1_clean,address_line2_clean,borough_clean,city_clean,state_clean,needs_geocoding,geocode_key,BBL_from_api,BIN_from_api,confidence,match_label,query
0,2707907.0,2626 Homecrest Avenue,,Brooklyn,,BROOKLYN,40.587065,-73.957019,2626 Homecrest Ave,,BROOKLYN,Brooklyn,NY,False,2626 HOMECREST AVE||BROOKLYN|NY|,3074560006,3204886,0.8,"2626 HOMECREST AVENUE, Brooklyn, NY, USA","2626 Homecrest Ave, BROOKLYN, NY"
1,3521602.0,3240 Henry Hudson parkway,,Bronx,,BRONX,40.885365,-73.913345,3240 Henry Hudson Pkwy,,BRONX,Bronx,NY,False,3240 HENRY HUDSON PKWY||BRONX|NY|,2057890024,2084124,0.8,"3240 HENRY HUDSON PARKWAY, Bronx, NY, USA","3240 Henry Hudson Pkwy, BRONX, NY"
2,3521883.0,900 Avenue H,,Brooklyn,,BROOKLYN,40.629342,-73.967861,900 Ave H,,BROOKLYN,Brooklyn,NY,False,900 AVE H||BROOKLYN|NY|,3065120001,3170547,0.8,"900 AVENUE H, Brooklyn, NY, USA","900 Ave H, BROOKLYN, NY"
3,3522892.0,1561 E. 13th Street,,Brooklyn,,BROOKLYN,40.611279,-73.960634,1561 E. 13Th St,,BROOKLYN,Brooklyn,NY,False,1561 E. 13TH ST||BROOKLYN|NY|,3067600062,3181781,0.8,"1561 EAST 13 STREET, Brooklyn, NY, USA","1561 E. 13Th St, BROOKLYN, NY"
4,4047231.0,115 West 35th Street,,new York,,MANHATTAN,40.750878,-73.988270,115 West 35Th St,,MANHATTAN,New York,NY,False,115 WEST 35TH ST||NEW YORK|NY|,1008117502,1015226,0.8,"115 WEST 35 STREET, New York, NY, USA","115 West 35Th St, MANHATTAN, NY"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9116,3632773.0,955 East 163rd Street,,New York,,BRONX,40.821001,-73.895580,955 East 163Rd St,,MANHATTAN,New York,NY,False,955 EAST 163RD ST||NEW YORK|NY|,2027137501,2092007,0.8,"955 EAST 163 STREET, Bronx, NY, USA","955 East 163Rd St, MANHATTAN, NY"
9117,3632791.0,1541 Shakespeare Avenue,,New York,,BRONX,40.845719,-73.919252,1541 Shakespeare Ave,,MANHATTAN,New York,NY,False,1541 SHAKESPEARE AVE||NEW YORK|NY|,2028730103,2008741,0.8,"1541 SHAKESPEARE AVENUE, Bronx, NY, USA","1541 Shakespeare Ave, MANHATTAN, NY"
9118,3633523.0,128 West 128th Street,,New York,,MANHATTAN,40.810108,-73.945088,128 West 128Th St,,MANHATTAN,New York,NY,False,128 WEST 128TH ST||NEW YORK|NY|,1019120043,1089103,0.8,"128 WEST 128 STREET, New York, NY, USA","128 West 128Th St, MANHATTAN, NY"
9119,3633666.0,315 E 102 Street,,New York,,MANHATTAN,40.787834,-73.943766,315 E 102St St,,MANHATTAN,New York,NY,False,315 E 102ST ST||NEW YORK|NY|,1003500061,1004271,1.0,"315 EAST HOUSTON STREET, New York, NY, USA","315 E 102St St, MANHATTAN, NY"


# ---------------------------------

Unnamed: 0,report_year,property_id,property_name,year_ending,nyc_borough_block_and_lot,nyc_building_identification,address_1,city,postal_code,largest_property_use_type_1,...,energy_current_date,electricity_onsite_renewable,electricity_sourced_from,onsite_renewable_system,target_site_eui_kbtu_ft,other_use_kbtu,estimated_data_flag_other,bin,bbl,year_ending_year
21,2022.0,15327445.0,KM 1200 Union Ave/1204 Union Ave,2022-12-31,2026820001,2005057,1200 Union Ave/1204 Union Ave,Bronx,10459,37672.0,...,,,,,,,,,,2022
24,2022.0,15332746.0,KM 500 West 144th street/144 Hamilton Place,2022-12-31,1020750036,1061888,500 West 144th street/144 Hamilton Place,New York,10031,56688.0,...,,,,,,,,,,2022
84,2022.0,24125286.0,2820 Middle Town Road,2022-12-31,2053860017,2074435,2820 Middle Town Road,Bronx,10461,25200.0,...,,,,,,,,,,2022
161,2022.0,2771721.0,91-32/34 195th St LLC,2022-12-31,4108210060,4231514;4451927,91-32/34 195th Street,Queens,11423,79709.0,...,,,,,,,,,,2022
199,2022.0,2638326.0,Marben Realty,2022-12-31,1018900064,1057200,308 West104,New York,10025,56555.0,...,,,,,,,,,,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134654,,3539873.0,(7324) - East Midtown Plaza(2),2013-12-31,1-00929-0001,1084709; 1078824; 1084710; 1802187; 1084711,401 1ST AVE; 400 2ND AVE,New York,,,...,,,,,,,,,,2013
134660,,3956379.0,Carr Properties Inc,2013-12-31,,2008621,25-29 West Tremont,Bronx,,,...,,,,,,,,,,2013
134672,,3579396.0,1990 Adam Clayton Powell Jr Boulevard,2012-12-31,,,1990 Adam Clayton Powell Jr Boulevard,New York,,,...,2012-12-31,,,,,,,,,2012
134676,,3608447.0,Star 65 LLC,2012-12-31,,,665-88 Street,Brooklyn,,,...,,,,,,,,,,2012


In [41]:
list(without_coord.columns)

['report_year',
 'property_id',
 'property_name',
 'year_ending',
 'nyc_borough_block_and_lot',
 'nyc_building_identification',
 'address_1',
 'city',
 'postal_code',
 'largest_property_use_type_1',
 'year_built',
 'number_of_buildings',
 'occupancy',
 'metered_areas_energy',
 'metered_areas_water',
 'energy_star_score',
 'national_median_energy_star',
 'target_energy_star_score',
 'reason_s_for_no_score',
 'energy_star_certification',
 'energy_star_certification_1',
 'site_eui_kbtu_ft',
 'weather_normalized_site_eui',
 'national_median_site_eui',
 'site_energy_use_kbtu',
 'weather_normalized_site_energy',
 'electricity_weather_normalized',
 'electricity_weather_normalized_1',
 'natural_gas_weather_normalized',
 'natural_gas_weather_normalized_1',
 'source_eui_kbtu_ft',
 'weather_normalized_source',
 'national_median_source_eui',
 'source_energy_use_kbtu',
 'weather_normalized_source_1',
 'fuel_oil_1_use_kbtu',
 'fuel_oil_2_use_kbtu',
 'fuel_oil_4_use_kbtu',
 'fuel_oil_5_6_use_kbtu',
 

In [52]:
### 
# without_coord = without_coord.drop(columns=["latitude","longitude"])
without_coord_enr = without_coord.merge(all_results, on=["property_id","address_1","city","postal_code"],how='left')
print(without_coord_enr.shape)
print(without_coord_enr[without_coord_enr["latitude"].isnull()].shape)

(4377, 362)
(4133, 362)


In [58]:
## Sill in need 
print("initial Total",residential3.shape)
with_coord2 = pd.concat([with_coord,without_coord_enr[~without_coord_enr["latitude"].isnull()]]).drop_duplicates()
print("with coords:",with_coord2.shape)

need_coords = pd.concat([without_coord[without_coord["property_id"].isin(all_failed['property_id'].unique())],
                         without_coord_enr[without_coord_enr["latitude"].isnull()]]).drop_duplicates()
print("need_coords:",need_coords.shape)


initial Total (134698, 362)
with coords: (129820, 362)
need_coords: (4113, 362)


In [59]:
results = []
failed = []

In [60]:
## Second Cut 
gmaps = googlemaps.Client(key=google_api_key)

# assumes: need_coords, results = [], failed = [] already exist
for i, r in need_coords[["property_id","address_1","city","postal_code"]].drop_duplicates().iterrows():
    temp_df = pd.DataFrame([r])
    print(r["property_id"])

    full_address = str(r["address_1"]) + " " + str(r["city"]) + " " + str(r["postal_code"])
    full_address = full_address.replace("nan","").strip()
    print(full_address)

    if "/" in full_address:
        full_address = full_address.split("/")[-1].strip()
        print("Split Addy:", full_address)

    # ---- Google Geocoding (swap-in for geolocator.geocode) ----
    try:
        resp = gmaps.geocode(full_address)  # add region="us" or components=... if you like
    except Exception as e:
        print("Geocode error:", e)
        resp = []

    if resp:
        loc = resp[0]["geometry"]["location"]
        temp_df["latitude"]  = loc["lat"]
        temp_df["longitude"] = loc["lng"]
        print(loc["lat"], loc["lng"])
        results.append(temp_df)
    else:
        failed.append(temp_df)

24125286.0
2820 Middle Town Road Bronx 10461
40.8432564 -73.8358873
2638326.0
308 West104 New York 10025
40.8004228 -73.970244
5841271.0
2401 Davdison Avenue Bronx 10452
40.861769 -73.9032932
8705631.0
510 West 218 Street New York 11357
40.8712485 -73.9148179
9637009.0
2485 Morris Aveune Bronx 10468
40.8630438 -73.899755
2734772.0
105 Pinhurst Ave New York 10033
40.851816 -73.9387086
3522985.0
601 79th Steet Brooklyn 11209
40.6248125 -74.02066839999999
4040577.0
71-11 -71-23 162ND STREET NY 11365
40.7397205 -73.8062806
5834191.0
107-19 70st Ave Flushing 11375
40.7467285 -73.89544719999999
5863627.0
1675 E 21st Brooklyn 11210
40.6122215 -73.9525281
5965464.0
1440 Richmond Terracae STATEN ISLAND 10310
40.6408643 -74.1161858
6282532.0
54 Morningisde Drive New York 10025
40.8059287 -73.9596929
6282641.0
225 East 202nd StreetBronx New York 10458
40.8732705 -73.88585499999999
6282704.0
82-06 & 82-16 34th Avenue Jackson Heights 11372
40.7536074 -73.8848235
6282839.0
62 Clermont Street Brookly

In [61]:
results_df3 = pd.concat(results)
failed_df3= pd.concat(failed)

In [69]:
need_coords = need_coords.drop(columns=["latitude","longitude"])
without_coord_enr2 = need_coords.merge(results_df3, on=["property_id","address_1","city","postal_code"],how='left')
print(without_coord_enr2.shape)
print(without_coord_enr2[without_coord_enr2["latitude"].isnull()].shape)

(4113, 362)
(48, 362)


In [81]:
print("initial Total",residential3.shape)
with_coord3 = pd.concat([with_coord2,without_coord_enr2[~without_coord_enr2["latitude"].isnull()]]).drop_duplicates()
print("with coords:",with_coord3.shape)
need_coords2 = pd.concat([need_coords[need_coords["property_id"].isin(failed_df3['property_id'].unique())],
                         without_coord_enr2[without_coord_enr2["latitude"].isnull()]]).drop_duplicates()
print("need_coords:",need_coords2.shape)

initial Total (134698, 362)
with coords: (133885, 362)
need_coords: (57, 362)


0.6

In [96]:
### Making the decision to drop these and not deal with them, mostly corrupt address values. 
need_coords2[["property_id", 'address_1','address_2',
 'city',
 'postal_code']].drop_duplicates()

Unnamed: 0,property_id,address_1,address_2,city,postal_code
122015,3128085.0,,,,
122016,3128086.0,,,,
122018,3128108.0,,,,
122019,3128109.0,,,,
122020,3128110.0,,,,
122021,3128111.0,,,,
122022,3128112.0,,,,
122026,3128116.0,,,,
122027,3128117.0,,,,
122028,3128118.0,,,,


In [None]:
round(((134698 - 133885)/134698)*100,2) 
### DROPPING 0.6% of the data becaue not able to successfgully geocode

### saving geocoded cleaned data for now

In [84]:
working_residential_geo = with_coord3.copy()
# working_residential_geo.to_csv("working_residential_geo.csv",index=False)

### Spatial joinign with CT to pull ct into the buildings data. 

In [85]:
working_residential_geo

Unnamed: 0,report_year,property_id,property_name,year_ending,nyc_borough_block_and_lot,nyc_building_identification,address_1,city,postal_code,largest_property_use_type_1,...,energy_current_date,electricity_onsite_renewable,electricity_sourced_from,onsite_renewable_system,target_site_eui_kbtu_ft,other_use_kbtu,estimated_data_flag_other,bin,bbl,year_ending_year
0,2022.0,9793770.0,1870 Pelham Parkway South,2022-12-31,2042500026,2047795,1870 Pelham Parkway South,Bronx,10461,52941.0,...,,,,,,,,,,2022
1,2022.0,14377690.0,1680 Ocean Ave,2022-12-31,3067300001,3180535,1680 Ocean Ave,Brooklyn,11230,68400.0,...,,,,,,,,,,2022
2,2022.0,15176247.0,88-24 Merrick Blvd,2022-12-31,4098150067,4210063,88-24 Merrick Blvd,Jamaica,11432,82576.0,...,,,,,,,,,,2022
3,2022.0,15176327.0,90-11 149th Street,2022-12-31,4096790052,4206819,90-11 149 str,Jamaica,11435,136000.0,...,,,,,,,,,,2022
4,2022.0,15176328.0,148-25 89th Ave,2022-12-31,4096930051,4207100,148-25 89 Ave,Jamaica,11435,127200.0,...,,,,,,,,,,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4108,,2676297.0,(7226) - The Howard Owners Corp,2013-12-31,4-02118-7501,4432024;4432026;4432028;4432025;4432027;4432029,99-32 - 99-72 66 Rd,New York,,,...,,,,,,,,,,2013
4109,,2718861.0,213-02 42nd Ave,2013-12-31,4-06284-7501,4433427,213-02 42nd Ave,Queens,,,...,,,,,,,,,,2013
4110,,2917501.0,325 East 79 Street Owners,2013-12-31,1-01542-0014,1049250,325 East 79th Streeet,New York,,,...,,,,,,,,,,2013
4111,,3956379.0,Carr Properties Inc,2013-12-31,,2008621,25-29 West Tremont,Bronx,,,...,,,,,,,,,,2013


In [None]:
### pulling in ct shapefile

In [86]:
import io, requests, geopandas as gpd

CT2010_URL = "https://data.cityofnewyork.us/resource/bmjq-373p.geojson?$limit=50000"
SOCRATA_APP_TOKEN = None  # put your token string here if you have one

headers = {"X-App-Token": SOCRATA_APP_TOKEN} if SOCRATA_APP_TOKEN else {}
resp = requests.get(CT2010_URL, headers=headers, timeout=60)
resp.raise_for_status()

ct2010 = gpd.read_file(io.BytesIO(resp.content)).to_crs(2263)
ct2010.head()


Unnamed: 0,ntacode,shape_area,ntaname,shape_leng,boroname,puma,boroct2010,ct2010,borocode,cdeligibil,ctlabel,geometry
0,SI22,2497009.71359,West New Brighton-New Brighton-St. George,7729.01679383,Staten Island,3903,5000900,900,5,E,9,"MULTIPOLYGON (((962269.126 173705.5, 962288.72..."
1,MN17,1860992.68163,Midtown-Midtown South,5687.80243891,Manhattan,3807,1010200,10200,1,I,102,"MULTIPOLYGON (((992216.539 216507.687, 992091...."
2,MN17,1864600.43538,Midtown-Midtown South,5693.03636707,Manhattan,3807,1010400,10400,1,I,104,"MULTIPOLYGON (((991325.882 217001.689, 991199...."
3,MN17,1890907.25105,Midtown-Midtown South,5699.86064037,Manhattan,3807,1011300,11300,1,I,113,"MULTIPOLYGON (((988650.277 214286.402, 988517...."
4,MN40,1918144.56374,Upper East Side-Carnegie Hill,5807.97295649,Manhattan,3805,1013000,13000,1,I,130,"MULTIPOLYGON (((994920.11 221386.27, 994791.85..."


In [87]:
import geopandas as gpd

# --- 1) Points from your geocoded table ---
# assumes working_residential_geo has columns: property_id, latitude, longitude
pts = gpd.GeoDataFrame(
    working_residential_geo[["property_id", "latitude", "longitude"]].copy(),
    geometry=gpd.points_from_xy(
        working_residential_geo["longitude"], working_residential_geo["latitude"]
    ),
    crs=4326  # your geocodes are WGS84
)

# project points to match the CT layer (your CT layer is already to_crs(2263))
pts_2263 = pts.to_crs(2263)

# --- 2) Keep only what you need from the CT layer ---
# 'ct2010' and/or 'boroct2010' are the usual tract IDs in that NYC layer
ct_keep = ct2010[["ct2010", "boroct2010", "boroname", "geometry"]].copy()

# --- 3) Spatial join: which tract polygon contains each point ---
joined = gpd.sjoin(
    pts_2263,
    ct_keep,
    how="left",
    predicate="within"   # points that fall inside a tract polygon
)

# If a property_id appears multiple times (duplicates), keep the first tract hit
joined = joined.sort_index().drop_duplicates(subset=["property_id"])

# --- 4) Bring tract columns back to your original dataframe ---
cols_to_add = ["ct2010", "boroct2010", "boroname"]
working_residential_tract = working_residential_geo.merge(
    joined[["property_id"] + cols_to_add],
    on="property_id",
    how="left"
)

# working_with_tract now has ct2010 / boroct2010 for each row


In [18]:
## Saving information to Csv 
# working_residential_tract.to_csv("working_residential_tract.csv",index=False)
working_residential_tract = pd.read_csv("working_residential_tract.csv")

  working_residential_tract = pd.read_csv("working_residential_tract.csv")


In [None]:
## limiting to certain 2010 and 2017?

In [19]:
working_residential_tract.groupby(['year_ending_year']).agg({"year_ending":"count"}).reset_index()

Unnamed: 0,year_ending_year,year_ending
0,2012,32
1,2013,9256
2,2014,9064
3,2015,6882
4,2016,7050
5,2017,15657
6,2018,13474
7,2019,14265
8,2020,17240
9,2021,13114


In [20]:
working_residential_tract[["report_year","year_ending","year_ending_year"]].drop_duplicates()

Unnamed: 0,report_year,year_ending,year_ending_year
0,2022.0,2022-12-31,2022
13091,2023.0,2023-12-31,2023
27197,,2021-12-31,2021
40000,,2020-12-31,2020
56796,,2019-12-31,2019
70692,,2018-12-31,2018
83801,,2017-12-31,2017
98974,,2016-12-31,2016
105745,,2015-12-31,2015
112339,,2014-12-31,2014


In [22]:
buildings_2012_2013_ids = working_residential_tract_2012_2017[working_residential_tract_2012_2017["year_ending_year"].isin([2012,2013])]["property_id"].unique()
working_residential_tract_2012_2017[working_residential_tract_2012_2017['property_id'].isin(buildings_2012_2013_ids)].groupby(['year_ending_year']).agg({"year_ending":"count"}).reset_index()


Unnamed: 0,year_ending_year,year_ending
0,2012,32
1,2013,9256
2,2014,7456
3,2015,4429
4,2016,3908
5,2017,4906


In [23]:
### Limtiing to 2017 and before 
working_residential_tract_2012_2017 = working_residential_tract[working_residential_tract["year_ending_year"]<2018]
working_residential_tract_2012_2017['year_ending_year'].unique()
working_residential_tract_2012_2017[working_residential_tract_2012_2017['property_id'].isin(buildings_2012_2013_ids)]
# working_residential_tract_2012_2017.to_csv("working_residential_tract_2012_2017.csv",index=False)

Unnamed: 0,report_year,property_id,property_name,year_ending,nyc_borough_block_and_lot,nyc_building_identification,address_1,city,postal_code,largest_property_use_type_1,...,onsite_renewable_system,target_site_eui_kbtu_ft,other_use_kbtu,estimated_data_flag_other,bin,bbl,year_ending_year,ct2010,boroct2010,boroname
83802,,2707907.0,2626 Homecrest Avenue,2017-12-31,3-07456-0006,3204886,2626 Homecrest Avenue,Brooklyn,,127500.0,...,,,,,,,2017,60600.0,3060600.0,Brooklyn
83814,,3521602.0,3240 Henry Hudson parkway LLC,2017-12-31,2-05789-0024,2084124,3240 Henry Hudson parkway,Bronx,,163197.0,...,,,,,,,2017,29700.0,2029700.0,Bronx
83815,,3521883.0,900 Avenue H LLC,2017-12-31,3-06512-0001,3170547,900 Avenue H,Brooklyn,,77000.0,...,,,,,,,2017,45600.0,3045600.0,Brooklyn
83816,,3522892.0,1561 E. 13th Street LLC,2017-12-31,3-06760-0062,3181781,1561 E. 13th Street,Brooklyn,,71000.0,...,,,,,,,2017,54200.0,3054200.0,Brooklyn
83817,,4047231.0,Justin - 115 West 30th Street,2017-12-31,1-00806-0026,1015161,115 West 35th Street,new York,,151525.0,...,,,,,,,2017,10900.0,1010900.0,Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133880,,2676297.0,(7226) - The Howard Owners Corp,2013-12-31,4-02118-7501,4432024;4432026;4432028;4432025;4432027;4432029,99-32 - 99-72 66 Rd,New York,,,...,,,,,,,2013,71304.0,4071304.0,Queens
133881,,2718861.0,213-02 42nd Ave,2013-12-31,4-06284-7501,4433427,213-02 42nd Ave,Queens,,,...,,,,,,,2013,146700.0,4146700.0,Queens
133882,,2917501.0,325 East 79 Street Owners,2013-12-31,1-01542-0014,1049250,325 East 79th Streeet,New York,,,...,,,,,,,2013,13800.0,1013800.0,Manhattan
133883,,3956379.0,Carr Properties Inc,2013-12-31,,2008621,25-29 West Tremont,Bronx,,,...,,,,,,,2013,24300.0,2024300.0,Bronx


In [24]:
working_continuous_final = working_residential_tract_2012_2017[working_residential_tract_2012_2017['property_id'].isin(buildings_2012_2013_ids)]

# working_continuous_final.to_csv("working_continuous_final_2012_2017.csv",index=False)

In [25]:
# Path to your geodatabase (.gdb)
gdb_path = r"C:\Users\johnf\Downloads\Tree_Canopy_Change (1)\Tree_Canopy_Change\NYC_TreeCanopyChange_2010_2017.gdb"

# List all layers in the geodatabase
layers = fiona.listlayers(gdb_path)
print("Layers available:", layers)

# Read a specific layer into a GeoDataFrame
gdf = gpd.read_file(gdb_path, layer=layers[0])  # Replace [0] with your desired layer
print(gdf.head())

# Optional: save as shapefile or GeoJSON
#gdf.to_file("output.shp") 
#gdf.to_file("output.geojson", driver="GeoJSON")

canopy_change = gdf.copy()
canopy_change

Layers available: ['NYC_TreeCanopyChange_2010_2017']


  return ogr_read(


       Class  Shape_Length    Shape_Area  \
0  No Change    732.955182   5800.500001   
1  No Change   2079.456465  35005.875002   
2  No Change    228.557599   1984.500000   
3  No Change    125.726893    617.750000   
4  No Change     41.031601     90.750000   

                                            geometry  
0  MULTIPOLYGON (((1008879.93 272372.3, 1008878.4...  
1  MULTIPOLYGON (((1008749.43 271870.8, 1008748.9...  
2  MULTIPOLYGON (((1008704.43 272105.3, 1008701.9...  
3  MULTIPOLYGON (((1009015.43 272757.3, 1009013.4...  
4  MULTIPOLYGON (((1008794.93 270669.8, 1008781.4...  


Unnamed: 0,Class,Shape_Length,Shape_Area,geometry
0,No Change,732.955182,5800.500001,"MULTIPOLYGON (((1008879.93 272372.3, 1008878.4..."
1,No Change,2079.456465,35005.875002,"MULTIPOLYGON (((1008749.43 271870.8, 1008748.9..."
2,No Change,228.557599,1984.500000,"MULTIPOLYGON (((1008704.43 272105.3, 1008701.9..."
3,No Change,125.726893,617.750000,"MULTIPOLYGON (((1009015.43 272757.3, 1009013.4..."
4,No Change,41.031601,90.750000,"MULTIPOLYGON (((1008794.93 270669.8, 1008781.4..."
...,...,...,...,...
5692504,Loss,63.798210,286.735284,"MULTIPOLYGON (((1014610.46 188155.382, 1014610..."
5692505,Loss,63.429348,314.238467,"MULTIPOLYGON (((1014606.12 187969.835, 1014605..."
5692506,Loss,223.829388,2467.258600,"MULTIPOLYGON (((1014946.295 187646.6, 1014944...."
5692507,Loss,96.181802,538.668512,"MULTIPOLYGON (((1014656.958 187618.818, 101465..."


### Shaptial Join for LiDar Canopy and Buildings Lat Long for those in thwe Data from 2012/2013 through 2017

In [26]:
# --- 1) Points from your buildings (WGS84) ---
shape_bldg = gpd.GeoDataFrame(
    working_continuous_final[["property_id", "latitude", "longitude"]].copy(),
    geometry=gpd.points_from_xy(
        working_continuous_final["longitude"], working_continuous_final["latitude"]
    ),
    crs=4326
)

# --- 2) Put both layers in the SAME CRS (use the canopy CRS) ---
canopy = canopy_change[["Class", "geometry"]].copy()
if canopy.crs is None:
    # set if your canopy file didn’t come with a CRS; adjust if needed
    canopy = canopy.set_crs(2263)  # NYC StatePlane ft; change if different

shape_bldg = shape_bldg.to_crs(canopy.crs)

# (optional but recommended) fix invalid polygon rings that can break joins
canopy["geometry"] = canopy.buffer(0)

In [27]:
# --- 3) Spatial join: which canopy polygon contains each building point? ---
joined = gpd.sjoin(
    shape_bldg[["property_id", "geometry"]],
    canopy,
    how="left",
    predicate="within"
)

In [28]:
# If canopy polygons overlap and produce duplicates, keep the polygon with the largest area
if joined.duplicated("property_id").any():
    canopy_area = canopy.assign(_poly_area=canopy.area)
    joined = gpd.sjoin(
        shape_bldg[["property_id", "geometry"]],
        canopy_area,
        how="left",
        predicate="within"
    )
    joined = (joined
              .sort_values(["property_id", "_poly_area"], ascending=[True, False])
              .drop_duplicates(subset=["property_id"]))

In [31]:
# --- 4) Bring the canopy class back to your original table ---
working_with_canopy = working_continuous_final.merge(
    joined[["property_id", "Class"]].rename(columns={"Class": "canopy_change_class"}),
    on="property_id",
    how="left"
)


In [32]:
working_with_canopy["canopy_change_class"].unique()

array([nan, 'Gain', 'No Change', 'Loss'], dtype=object)

In [33]:
working_with_canopy.to_csv("working_with_canopy.csv",index=False)

In [34]:
### Before snapping seeing how many data poitns are null , not have a canopy value
working_with_canopy.groupby(["canopy_change_class"]).agg({"property_id":"nunique"})

Unnamed: 0_level_0,property_id
canopy_change_class,Unnamed: 1_level_1
Gain,700
Loss,154
No Change,976


In [35]:
# --- 5) (Optional) fill misses by snapping to nearest polygon within 50 ft ---
# This helps when a point lands just outside a sliver polygon.
miss_ids = working_with_canopy.loc[working_with_canopy["canopy_change_class"].isna(), "property_id"]
if len(miss_ids):
    nearest = gpd.sjoin_nearest(
        shape_bldg[shape_bldg["property_id"].isin(miss_ids)],
        canopy,
        how="left",
        max_distance=50  # units = CRS units; 50ft if CRS=2263
    )[["property_id", "Class"]].rename(columns={"Class": "canopy_change_class_nearest"})

    working_with_canopy_50ft = working_with_canopy.merge(nearest, on="property_id", how="left")
    working_with_canopy_50ft["canopy_change_class"] = (
        working_with_canopy_50ft["canopy_change_class"]
        .fillna(working_with_canopy_50ft["canopy_change_class_nearest"])
    )
    working_with_canopy_50ft.drop(columns=["canopy_change_class_nearest"], inplace=True)

In [36]:
working_with_canopy_50ft.groupby(["canopy_change_class"]).agg({"property_id":"nunique"})

Unnamed: 0_level_0,property_id
canopy_change_class,Unnamed: 1_level_1
Gain,6307
Loss,1101
No Change,1461


In [37]:
working_with_canopy_50ft.shape

(109537, 366)

In [38]:
working_with_canopy_50ft = working_with_canopy_50ft.dropna(how='all', axis=1)
working_with_canopy_50ft.to_csv("working_with_canopy_50ft.csv",index=False)

In [39]:
 working_with_canopy_50ft[['investment_in_energy_projects', 'investment_in_energy_projects_1']].drop_duplicates()

Unnamed: 0,investment_in_energy_projects,investment_in_energy_projects_1
0,,
56633,0.0,0.0
56830,575000.0,0.65
60649,20000.0,0.25


In [42]:
working_with_canopy_50ft[["green_power_onsite_kwh"]].drop_duplicates()

Unnamed: 0,green_power_onsite_kwh
0,
248,56620.0
261,38580.0
631,45420.0
36766,56900.0
36778,38000.0
57140,202815.0
79671,195375.5
104264,50990.0
105738,45100.0


In [52]:
noEnergy_investments = working_with_canopy_50ft[(working_with_canopy_50ft['investment_in_energy_projects'].isnull() 
                                                 & working_with_canopy_50ft['investment_in_energy_projects_1'].isnull()
                                                &working_with_canopy_50ft['green_power_onsite_kwh'].isnull())]
print(noEnergy_investments.groupby(["canopy_change_class"]).agg({"property_id":"nunique"}))
noEnergy_investments['borough'] =  noEnergy_investments['county'].combine_first(noEnergy_investments['borough']).astype(str).str.lower()
# noEnergy_investments = noEnergy_investments.drop(column=['city'])
# noEnergy_investments.to_csv("noEnergy_investments.csv",index=False)

                     property_id
canopy_change_class             
Gain                        3192
Loss                         563
No Change                    742


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noEnergy_investments['borough'] =  noEnergy_investments['county'].combine_first(noEnergy_investments['borough']).astype(str).str.lower()


In [66]:
pd.options.display.max_rows = 500

In [71]:

## Cleaning
noEnergy_investments[['city','borough']][(noEnergy_investments['borough']=='usa')].drop_duplicates()

noEnergy_investments['borough'][((noEnergy_investments['borough']=='usa')
                                          &(noEnergy_investments['city'].astype(str).str.upper()=='BRONX'))]='bronx'
noEnergy_investments['borough'][((noEnergy_investments['borough']=='usa')
                                          &(noEnergy_investments['city'].astype(str).str.upper()=='BROOKLYN'))]='brooklyn'
noEnergy_investments['borough'][((noEnergy_investments['borough']=='usa')
                                          &(noEnergy_investments['city'].astype(str).str.upper()=='NEW YORK'))]='manhattan'
noEnergy_investments['borough'][((noEnergy_investments['borough']=='usa')
                                          &(noEnergy_investments['city'].astype(str).str.title().isin(['Elmhurst','Sunnyside'])))]='queens'
noEnergy_investments['borough'][((noEnergy_investments['borough']=='usa')
                                          &(noEnergy_investments['city'].astype(str).str.title().isin(['Riverdale'])))]='queens'
noEnergy_investments['borough'][((noEnergy_investments['borough']=='nan')
                                          &(noEnergy_investments['city'].astype(str).str.upper().isin([
                                              "SUNNYSIDE","ASTORIA","JACKSON HEIGHTS","FLUSHING","QUEENS","REGO PARK",
                                              "OAKLAND GARDENS","FOREST HILLS","BAYSIDE","KEW GARDENS"])))]="queens"

noEnergy_investments['borough'][((noEnergy_investments['borough']=='nan')
                                          &(noEnergy_investments['city'].astype(str).str.upper().isin([
                                              "NEW YORK","NY","MANHATTAN"
                                              ])))]="manhattan"
noEnergy_investments['borough'][((noEnergy_investments['borough']=='nan')
                                          &(noEnergy_investments['city'].astype(str).str.upper().isin([
                                              "BROOKLYN"
                                              ])))]="brooklyn"
noEnergy_investments['borough'][((noEnergy_investments['borough']=='nan')
                                          &(noEnergy_investments['city'].astype(str).str.upper().isin([
                                              "BRONX"
                                              ])))]="bronx"
noEnergy_investments['borough'][((noEnergy_investments['borough']=='nan')
                                          &(noEnergy_investments['city'].astype(str).str.upper().isin([
                                              "STATEN ISLAND"
                                              ])))]="staten island"
# noEnergy_investments[['city','borough']][(noEnergy_investments['borough']=='usa')].drop_duplicates()


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  noEnergy_investments['borough'][((noEnergy_investments['borough']=='usa')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versu

Unnamed: 0,city,borough


In [131]:
noEnergy_investments[['county','borough']].drop_duplicates()

Unnamed: 0,county,borough
0,,brooklyn
5,,bronx
20,,manhattan
36,,queens
56,kings,brooklyn
99,Kings,brooklyn
164,NEW YORK,manhattan
195,Bronx,bronx
740,Queens,queens
1418,New York,manhattan


In [125]:
noEnergy_investments[['city','borough',"county"]][~noEnergy_investments['borough'].astype(str).str.lower().str.strip().isin([
    'brooklyn', 'bronx', 'manhattan', 'queens','staten island'])].drop_duplicates()
##### BROOKLYN
# noEnergy_investments['borough'][((noEnergy_investments['city'].astype(str).str.upper().isin([
#                                               "KINGS","BROOKLYN"]))
#                                 &(noEnergy_investments["borough"].astype(str).str.upper().isin([
#                                     "KINGS","US","NEW YORK","KING","KINGS COUNTY","UNITED STATES"
#                                 ]))
#                                 )]="brooklyn"
# noEnergy_investments['borough'][((noEnergy_investments["city"]=="New York")
#                                  &(noEnergy_investments["borough"]=="kings"))]="brooklyn"
# noEnergy_investments['city'][((noEnergy_investments["city"].astype(str).str.lower()!="brooklyn")
#                                  &(noEnergy_investments["borough"]=="brooklyn"))]="brooklyn"

##### BRONX
# noEnergy_investments['borough'][((noEnergy_investments['city'].astype(str).str.upper().isin([
#                                               "BRONX"]))
#                                 &(noEnergy_investments["borough"].astype(str).str.upper().isin([
#                                     "BRONC","NEW YORK","BX.","US","NEW YORK CITY"
#                                 ]))
#                                 )]="bronx"

##### STATEN ISLAND
# noEnergy_investments['borough'][((noEnergy_investments['city'].astype(str).str.upper().isin([
#                                               "STATEN ISLAND","STATEN"]))
#                                 &(noEnergy_investments["borough"].astype(str).str.upper().isin([
#                                     "STATEN IS","RICHMOND"
#                                 ]))
#                                 )]="staten island"

## Manhattan
# noEnergy_investments['borough'][((noEnergy_investments['city'].astype(str).str.upper().isin([
#                                               "NEW YORK","MANHATTAN","NY"]))
#                                 &(noEnergy_investments["borough"].astype(str).str.upper().isin([
#                                     "10025","NEW YORK","UNITED STATES","US","NEW YORK COUNTY","MAN"
#                                 ]))
#                                 )]="manhattan"

### QUEENS
# noEnergy_investments['borough'][((noEnergy_investments['city'].astype(str).str.upper().isin([
#                                               "QUEENS","REGO PARK","ELMHURST","GLEN OAKS","CORONA"]))
#                                 &(noEnergy_investments["borough"].astype(str).str.upper().isin([
#                                     "ELMHURST","NEW YORK","OUEENS","US","11368","KEW GARDENS"
#                                 ]))
#                                 )]="queens"
# noEnergy_investments[((noEnergy_investments['address_1']=='7-11 Seagirt Ave')
#                       &(noEnergy_investments['borough']=='kings')
                     &(noEnergy_investments['city']=='Queens'))]="queens"



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noEnergy_investments['borough'][((noEnergy_investments['city'].astype(str).str.upper().isin([
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noEnergy_investments['borough'][((noEnergy_investments['city'].astype(str).str.upper().isin([
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noEnergy_investments[((noEnergy_investments['address_1']=='7-11 Seagirt Ave')
  noEnergy_investments[((noEnergy_investments['addres

Unnamed: 0,city,borough,county


In [None]:
noEnergy_investments[['city','borough',"county"]][~noEnergy_investments['borough'].astype(str).str.lower().str.strip().isin([
    'brooklyn', 'bronx', 'manhattan', 'queens','staten island'])].drop_duplicates()

In [124]:
noEnergy_investments['address_1'][(noEnergy_investments['city']=="Queens") &( noEnergy_investments['borough']=="kings")]

34590    7-11 Seagirt Ave
34591    7-11 Seagirt Ave
34592    7-11 Seagirt Ave
34593    7-11 Seagirt Ave
43437    7-11 Seagirt Ave
43438    7-11 Seagirt Ave
43439    7-11 Seagirt Ave
43440    7-11 Seagirt Ave
Name: address_1, dtype: object

In [129]:
noEnergy_investments[["borough","county"]].drop_duplicates()

Unnamed: 0,borough,county
0,brooklyn,
5,bronx,
20,manhattan,
36,queens,
56,brooklyn,kings
99,brooklyn,Kings
164,manhattan,NEW YORK
195,bronx,Bronx
740,queens,Queens
1418,manhattan,New York


In [128]:

noEnergy_investments[["property_id","address_1","address_2","city","borough","county","postcode"]][~noEnergy_investments["address_2"].isnull()].drop_duplicates()

Unnamed: 0,property_id,address_1,address_2,city,borough,county,postcode
1233,2609529.0,3510 Bainbridge Avenue,Bronx,NY,bronx,Bronx,10467
1339,3286981.0,2609 Aqueduct Avenue,2610 University Avenue,Bronx,bronx,,10468
1844,3422658.0,2130 Adam C. Powell Jr. Blvd,Attn: Management,New York,manhattan,Manhattan,10027
3094,3280103.0,85-15 139th Street,Briarwood,Queens,queens,,11435
3189,2787415.0,221-229 Seaman Avenue,"31-41 Park Terrace West, NY, NY",New York,manhattan,,10034
3236,2716189.0,2190 Brigham St; 2170 Brigham St;,2171 Bragg St,Brooklyn,brooklyn,,11229
3237,2673740.0,3105 Ave V; 2140 Knapp St;,2165 Brigham St,Brooklyn,brooklyn,,11229
3686,3121027.0,319 E. 24th St,320 E. 25th St,New York,manhattan,,10010
4258,2977466.0,410 West 118th Street,B-230,New York,manhattan,,10027
4279,2977455.0,410 West 118th Street,B-230,New York,manhattan,,10027


In [126]:
## CLean Building Specific Dataframe for Tax Data 
info_buildings = noEnergy_investments[["property_id","address_1","address_2","city","borough","county","postcode"]].drop_duplicates()
info_buildings


Unnamed: 0,property_id,address_1,address_2,city,borough,county,postcode
0,2707907.0,2626 Homecrest Avenue,,Brooklyn,brooklyn,,11235
5,3521602.0,3240 Henry Hudson parkway,,Bronx,bronx,,10463
10,3521883.0,900 Avenue H,,Brooklyn,brooklyn,,11230
15,3522892.0,1561 E. 13th Street,,Brooklyn,brooklyn,,11230
20,4047231.0,115 West 35th Street,,new York,manhattan,,10001
...,...,...,...,...,...,...,...
106521,2817031.0,3845 Sedgwick,,Bronx,bronx,,10463
106555,4044352.0,3310-3320-3420-3510 Avenue H,,Brooklyn,brooklyn,,11210
106619,2711572.0,126 WEST FORDHAM,,BRONX,bronx,BRONC,10468
106717,2721640.0,160 Beach 177th Street,,Queens,queens,,11694


In [46]:
list(noEnergy_investments.columns)

['property_id',
 'property_name',
 'year_ending',
 'nyc_borough_block_and_lot',
 'nyc_building_identification',
 'address_1',
 'city',
 'largest_property_use_type_1',
 'year_built',
 'number_of_buildings',
 'occupancy',
 'metered_areas_energy',
 'metered_areas_water',
 'energy_star_score',
 'national_median_energy_star',
 'energy_star_certification',
 'energy_star_certification_1',
 'site_eui_kbtu_ft',
 'weather_normalized_site_eui',
 'national_median_site_eui',
 'site_energy_use_kbtu',
 'weather_normalized_site_energy',
 'source_eui_kbtu_ft',
 'weather_normalized_source',
 'national_median_source_eui',
 'source_energy_use_kbtu',
 'weather_normalized_source_1',
 'fuel_oil_1_use_kbtu',
 'fuel_oil_2_use_kbtu',
 'fuel_oil_4_use_kbtu',
 'fuel_oil_5_6_use_kbtu',
 'diesel_2_use_kbtu',
 'district_steam_use_kbtu',
 'natural_gas_use_kbtu',
 'electricity_use_grid_purchase',
 'electricity_use_grid_purchase_1',
 'electricity_use_grid_purchase_2',
 'electricity_use_generated',
 'electricity_use_gen

In [48]:
import re, time, requests, pandas as pd

BASE = "https://geoservice.planning.nyc.gov"

# Map borough/city variants to API values
BORO_NORM = {
    "manhattan":"Manhattan","new york":"Manhattan","mn":"Manhattan","1":"Manhattan",
    "bronx":"Bronx","bx":"Bronx","2":"Bronx",
    "brooklyn":"Brooklyn","kings":"Brooklyn","bk":"Brooklyn","3":"Brooklyn",
    "queens":"Queens","qn":"Queens","4":"Queens",
    "staten island":"Staten Island","richmond":"Staten Island","si":"Staten Island","5":"Staten Island",
}

def _clean_addr(addr):
    if pd.isna(addr): return None, None
    s = str(addr).strip()
    # strip trailing ", NY", zip, etc.
    s = re.sub(r",?\s*(NY|New York|USA|\d{5}(?:-\d{4})?)\s*$", "", s, flags=re.I)
    # remove unit/suite after a separator (e.g., "123 Main St Apt 4B" -> keep "123 Main St")
    s = re.split(r"\b(apt|suite|ste|unit|#)\b", s, flags=re.I)[0].strip()
    m = re.match(r"^\s*(\d[\d-]*)\s+(.*)$", s)
    if not m: return None, None
    return m.group(1), m.group(2)

def _norm_boro(boro, city=None):
    # prefer explicit borough; else use city
    for x in (boro, city):
        if x and not pd.isna(x):
            key = str(x).strip().lower()
            if key in BORO_NORM: return BORO_NORM[key]
    return None

def _fetch_bbl(row):
    hn, st = _clean_addr(row.get("address_1"))
    bor = _norm_boro(row.get("borough"), row.get("city"))
    if not (hn and st and bor):
        return pd.Series({"bbl": None, "borough_code": None, "tax_block": None, "tax_lot": None})
    try:
        r = requests.get(f"{BASE}/Function_1A",
                         params={"HouseNumber": hn, "Street": st, "Borough": bor, "DisplayFormat": "true"},
                         timeout=20)
        r.raise_for_status()
        j = r.json()
        # tiny finder
        def find(d, pat):
            if isinstance(d, dict):
                for k,v in d.items():
                    if re.search(pat, str(k), re.I): return v
                    out = find(v, pat)
                    if out is not None: return out
            elif isinstance(d, list):
                for x in d:
                    out = find(x, pat)
                    if out is not None: return out
            return None
        bor_code = find(j, r"^borough(code)?$") or find(j, r"^boro$")
        block    = find(j, r"^block$")
        lot      = find(j, r"^lot$")
        if bor_code is None or block is None or lot is None:
            return pd.Series({"bbl": None, "borough_code": None, "tax_block": None, "tax_lot": None})
        bbl = f"{int(bor_code)}{int(block):05d}{int(lot):04d}"
        return pd.Series({
            "bbl": bbl,
            "borough_code": str(int(bor_code)),
            "tax_block": str(int(block)),
            "tax_lot": str(int(lot)),
        })
    except Exception:
        return pd.Series({"bbl": None, "borough_code": None, "tax_block": None, "tax_lot": None})
    finally:
        time.sleep(0.1)  # be polite

# 🔧 Inline “loop” over your df:
noEnergy_investments[["bbl","borough_code","tax_block","tax_lot"]] = (
    noEnergy_investments.apply(_fetch_bbl, axis=1)
)


KeyboardInterrupt: 

In [152]:
### Limiting to the columns we need for the analysis
[["property_id","property_name","year_ending_year","address_1","city","borough","bbl","bin","census_tract","ct2010","nta","latitude","longitude",

 ]]
  

NameError: name 'nta' is not defined

In [43]:
working_with_canopy_50ft["number_of_buildings"].unique()

array([  1.,   6.,   8.,   2.,   5.,  10.,  52.,  11.,   4.,  35.,   3.,
        45., 107.,  22.,  32.,  12.,  26.,   7.,  14.,  16.,  19.,  15.,
        24.,  30.,  18.,  42.,  13.,   9.,  25.,  91., 140., 131., 126.,
         0.,  31.,  20.,  28.,  23.,  21.,  nan, 161.,  27.])

In [156]:
working_with_canopy_50ft[["property_gfa_calculated","number_of_buildings","property_gfa_epa_calculated",
                          "multifamily_housing_number","multifamily_housing_total","occupancy"]]

Unnamed: 0,property_gfa_calculated,number_of_buildings,property_gfa_epa_calculated,multifamily_housing_number,multifamily_housing_total,occupancy
0,127500.0,1.0,,185.0,139.0,100.0
1,127500.0,1.0,,185.0,139.0,100.0
2,127500.0,1.0,,185.0,139.0,100.0
3,127500.0,1.0,,185.0,139.0,100.0
4,127500.0,1.0,,185.0,139.0,100.0
...,...,...,...,...,...,...
109532,,,,0.0,,
109533,,,,0.0,,
109534,,,,0.0,,
109535,,,,0.0,,


In [153]:
"year_built","number_of_buildings","property_gfa_calculated"   # or property_gfa_epa_calculated if cleaner
multifamily_housing_number
multifamily_housing_total
occupancy

NameError: name 'multifamily_housing_number' is not defined

In [17]:
working_with_canopy_50ft

NameError: name 'working_with_canopy_50ft' is not defined

In [148]:
list(noEnergy_investments.dropna(how='all',axis=1).columns)

['property_id',
 'property_name',
 'year_ending',
 'nyc_borough_block_and_lot',
 'nyc_building_identification',
 'address_1',
 'city',
 'largest_property_use_type_1',
 'year_built',
 'number_of_buildings',
 'occupancy',
 'metered_areas_energy',
 'metered_areas_water',
 'energy_star_score',
 'national_median_energy_star',
 'energy_star_certification',
 'energy_star_certification_1',
 'site_eui_kbtu_ft',
 'weather_normalized_site_eui',
 'national_median_site_eui',
 'site_energy_use_kbtu',
 'weather_normalized_site_energy',
 'source_eui_kbtu_ft',
 'weather_normalized_source',
 'national_median_source_eui',
 'source_energy_use_kbtu',
 'weather_normalized_source_1',
 'fuel_oil_1_use_kbtu',
 'fuel_oil_2_use_kbtu',
 'fuel_oil_4_use_kbtu',
 'fuel_oil_5_6_use_kbtu',
 'district_steam_use_kbtu',
 'natural_gas_use_kbtu',
 'electricity_use_grid_purchase',
 'electricity_use_grid_purchase_1',
 'electricity_use_grid_purchase_2',
 'annual_maximum_demand_kw',
 'annual_maximum_demand_mm',
 'annual_maximu

In [47]:
noEnergy_investments.head(n=10).to_csv("sample.csv",index=False)

In [None]:
### Reading in Tax Zoning 

In [160]:
tax_zoning = {
    "ALL":{
        "api":"https://data.cityofnewyork.us/resource/fdkv-4t4z.json",
        "info":"https://data.cityofnewyork.us/City-Government/NYC-Zoning-Tax-Lot-Database/fdkv-4t4z/about_data"
}}

In [162]:
def fetch_all_rows_1k(api_url: str, source_years: str, source_info_url: str) -> pd.DataFrame:
    offset = 0
    frames = []

    while True:
        params = {"$limit": PAGE, "$offset": offset}

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                resp = session.get(api_url, params=params, headers=headers, timeout=TIMEOUT)
                if resp.status_code in (429, 502, 503, 504):
                    time.sleep(BACKOFF_BASE ** attempt * (0.1 * attempt))
                    continue
                resp.raise_for_status()

                if "json" not in resp.headers.get("Content-Type", "").lower():
                    preview = resp.text[:200]
                    raise ValueError(f"Non-JSON response (status {resp.status_code}): {preview}")

                data_chunk = resp.json()
                if not data_chunk:
                    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

                # Create DataFrame
                df = pd.DataFrame(data_chunk)

                # 🔑 Add your metadata columns here
                df["source_years"] = source_years
                df["source_api_url"] = api_url
                df["source_info_url"] = source_info_url

                frames.append(df)

                # If less than PAGE, stop; otherwise keep paginating
                if len(data_chunk) < PAGE:
                    return pd.concat(frames, ignore_index=True)

                offset += PAGE
                break  # Success, go to next page

            except (requests.RequestException, JSONDecodeError, ValueError) as e:
                if attempt == MAX_RETRIES:
                    print(f"⚠️ Failed fetching {api_url} at offset {offset}: {e}")
                    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
                time.sleep(BACKOFF_BASE ** attempt * (0.1 * attempt))

In [163]:
PAGE = 1000               # <-- enforce 1,000 rows per page
TIMEOUT = 30
MAX_RETRIES = 5
BACKOFF_BASE = 1.5

session = requests.Session()
headers = {}

agg_running_list = []
for k, v in tax_zoning.items():
    print(f"Fetching {k} -> {v['api']}")
    df = fetch_all_rows_1k(v["api"], k, v["info"])
    if not df.empty:
        agg_running_list.append(df)
    else:
        print(f"Warning: no rows returned for {k} ({v['api']}).")

nyc_tax_zoning = pd.concat(agg_running_list, ignore_index=True) if agg_running_list else pd.DataFrame()
print(f"Total rows: {len(nyc_tax_zoning)}")


Fetching ALL -> https://data.cityofnewyork.us/resource/fdkv-4t4z.json
Total rows: 857969


In [2]:
# nyc_tax_zoning.to_csv("nyc_tax_zoning.csv", index=False )
nyc_tax_zoning = pd.read_csv("nyc_tax_zoning.csv")
# https://www.nyc.gov/content/planning/pages/zoning/zoning-districts-guide/residence-districts

  nyc_tax_zoning = pd.read_csv("nyc_tax_zoning.csv")


In [11]:
nyc_tax_zoning

Unnamed: 0,borough_code,tax_block,tax_lot,bbl,zoning_district_1,zoning_district_2,special_district_1,zoning_map_number,zoning_map_code,commercial_overlay_1,source_years,source_api_url,source_info_url,commercial_overlay_2,zoning_district_3,special_district_2,limited_height_district,zoning_district_4
0,1,1,10,1000010010,R3-2,C4-1,GI,16A,Y,,ALL,https://data.cityofnewyork.us/resource/fdkv-4t...,https://data.cityofnewyork.us/City-Government/...,,,,,
1,1,1,101,1000010101,R3-2,,,16A,,,ALL,https://data.cityofnewyork.us/resource/fdkv-4t...,https://data.cityofnewyork.us/City-Government/...,,,,,
2,1,1,111,1000010111,R3-2,,GI,16A,,,ALL,https://data.cityofnewyork.us/resource/fdkv-4t...,https://data.cityofnewyork.us/City-Government/...,,,,,
3,1,1,112,1000010112,R3-2,,GI,16A,,,ALL,https://data.cityofnewyork.us/resource/fdkv-4t...,https://data.cityofnewyork.us/City-Government/...,,,,,
4,1,1,150,1000010150,R3-2,,GI,16A,,,ALL,https://data.cityofnewyork.us/resource/fdkv-4t...,https://data.cityofnewyork.us/City-Government/...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857964,5,8050,83,5080500083,R1-2,,SRD,35A,,,ALL,https://data.cityofnewyork.us/resource/fdkv-4t...,https://data.cityofnewyork.us/City-Government/...,,,,,
857965,5,8050,86,5080500086,R1-2,,SRD,35A,,,ALL,https://data.cityofnewyork.us/resource/fdkv-4t...,https://data.cityofnewyork.us/City-Government/...,,,,,
857966,5,8050,89,5080500089,R1-2,,SRD,35A,,,ALL,https://data.cityofnewyork.us/resource/fdkv-4t...,https://data.cityofnewyork.us/City-Government/...,,,,,
857967,5,8050,92,5080500092,R1-2,,SRD,35A,,,ALL,https://data.cityofnewyork.us/resource/fdkv-4t...,https://data.cityofnewyork.us/City-Government/...,,,,,


In [10]:
list(nyc_tax_zoning["zoning_district_1"][nyc_tax_zoning["zoning_district_1"].astype(str).str.contains('R')].unique()) #,	"zoning_district_2"

['R3-2',
 'PARK',
 'R8',
 'R7-2',
 'M1-5/R10',
 'M1-5/R9X',
 'M1-5/R7X',
 'R7A',
 'R8A',
 'R9-1',
 'R8B',
 'R7B',
 'R8X',
 'M1-6/R10',
 'M1-5/R7D',
 'R6',
 'M1-5/R9A',
 'R10',
 'R6A',
 'R9X',
 'R9A',
 'R9',
 'R10H',
 'R10A',
 'R7D',
 'R7X',
 'M1-6/R9',
 'M1-3/R8',
 'M1-5/R7-2',
 'M1-4/R9A',
 'M1-4/R7A',
 'R7-1',
 'R5',
 'R1-2',
 'M1-5/R8A',
 'M1-2/R6A',
 'M1-4/R7X',
 'M1-4/R6A',
 'M1-4/R8A',
 'M1-1/R7-2',
 'M1-4/R7D',
 'M1-2/R7-2',
 'R7-3',
 'R4A',
 'R6B',
 'R5B',
 'R5A',
 'R5D',
 'R3A',
 'R4',
 'R6-1',
 'M1-1A/R7-3',
 'R4-1',
 'R3-1',
 'R3X',
 'R2',
 'R1-1',
 'M1-6/R8X',
 'M1-2/R8',
 'M1-2/R8A',
 'M1-5/R9-1',
 'M1-2/R6',
 'M1-4/R6B',
 'M1-4/R7-2',
 'M1-1/R5',
 'M1-3/R7D',
 'M1-2A/R6A',
 'M1-3A/R7D',
 'M1-4A/R9A',
 'M1-1/R7D',
 'M1-1A/R6B',
 'M1-2/R7A',
 'M1-2/R6B',
 'M1-1/R6A',
 'R4B',
 'R2X',
 'M1-5/R7-3',
 'R6D',
 'M1-5/R9',
 'M1-5/R6A',
 'M1-2/R5D',
 'M1-2/R5B',
 'M1-3/R7X',
 'M1-4/R9',
 'M1-4/R7-3',
 'R2A',
 'R1-2A',
 'M1-2/R7-1']

In [12]:
import os
import re
import time
import json
import math
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

GEOSERVICE_BASE = "https://geoservice.planning.nyc.gov"
API_KEY = os.getenv("GEOSERVICE_KEY")  # set this in your environment

# Acceptable borough inputs include 1–5 or names like "mn", etc. We'll normalize from codes & BBL strings.
BORO_CODE_TO_NUM = {
    1: "1", 2: "2", 3: "3", 4: "4", 5: "5"
}
BORO_NUM_FROM_NAME = {
    "1":"1","mn":"1","manhattan":"1","new york":"1",
    "2":"2","bx":"2","bronx":"2",
    "3":"3","bk":"3","kings":"3","brooklyn":"3",
    "4":"4","qn":"4","queens":"4",
    "5":"5","si":"5","richmond":"5","staten island":"5",
}

def _normalize_boro(v):
    if v is None or (isinstance(v, float) and math.isnan(v)):
        return None
    s = str(v).strip().lower()
    if s.isdigit():
        return s
    return BORO_NUM_FROM_NAME.get(s, s)

def parse_bbl(bbl_str):
    """
    BBL is 10 digits: B(1) + Block(5) + Lot(4)
    e.g. 1000010010 -> b=1, block=00001 -> 1, lot=0010 -> 10
    """
    s = str(bbl_str).strip().zfill(10)
    boro = s[0]
    block = str(int(s[1:6]))  # remove leading zeros
    lot = str(int(s[6:10]))
    return boro, block, lot

def call_geoservice(path, params, *, pause=0.1, timeout=30):
    params = {k: v for k, v in params.items() if v not in (None, "", float("nan"))}
    if API_KEY:
        params.setdefault("Key", API_KEY)
    # DisplayFormat true returns beautified JSON; false returns raw WA fields.
    params.setdefault("DisplayFormat", "true")
    url = f"{GEOSERVICE_BASE}/{path}"
    r = requests.get(url, params=params, timeout=timeout)
    time.sleep(pause)  # be polite
    r.raise_for_status()
    return r.json()

def find_first(d, *predicates):
    """
    Search nested dict/list and return first value whose key matches any predicate (case-insensitive).
    predicate can be a regex compiled with re.I.
    """
    stack = [d]
    while stack:
        cur = stack.pop()
        if isinstance(cur, dict):
            for k, v in cur.items():
                if any(p.search(str(k)) for p in predicates):
                    return v
            stack.extend(cur.values())
        elif isinstance(cur, list):
            stack.extend(cur)
    return None

# -------- BBL -> Address list --------
def bbl_to_addresses(boro, block, lot):
    resp = call_geoservice(
        "Function_BBL",
        {"Borough": _normalize_boro(boro), "Block": block, "Lot": lot, "TPAD": "N"}
    )
    # Try common locations for address arrays/strings in the pretty JSON
    # This works even if field names vary a bit across versions.
    addr_predicates = [
        re.compile(r"address(es)?$", re.I),
        re.compile(r"addressList", re.I),
        re.compile(r"list.*address", re.I),
        re.compile(r"house.*street", re.I),
    ]
    addresses = find_first(resp, *addr_predicates)
    # Normalize: a list of strings
    if isinstance(addresses, list):
        # Sometimes each element is dict with HouseNumber/StreetName
        def to_str(a):
            if isinstance(a, dict):
                hn = a.get("HouseNumber") or a.get("Housenumber") or a.get("House", "")
                st = a.get("StreetName") or a.get("Street", "")
                b = a.get("Borough") or ""
                return " ".join([str(hn), str(st), str(b)]).strip()
            return str(a)
        addresses = [to_str(a) for a in addresses if a]
    elif isinstance(addresses, str):
        addresses = [addresses]
    else:
        addresses = []

    # Grab BINs too (often useful)
    bins = find_first(resp, re.compile(r"BINs?$", re.I), re.compile(r"Building.*Identification.*Number", re.I))
    if isinstance(bins, list):
        bins = [str(b) for b in bins]
    elif bins is not None:
        bins = [str(bins)]
    else:
        bins = []

    return addresses, bins, resp

# -------- Address -> lat/long + Census Tract --------
def geocode_address(address, borough=None):
    """
    Try Function_1B first (has both geographic & property info incl. CT & lat/long),
    fall back to Function_AP (address point) if needed.
    """
    params = {}
    # If a "free form" string is easiest, pass it directly; otherwise split number/name.
    # We'll try 'FreeForm' (supported on 1A/1B/1E/AP pages).
    params["FreeForm"] = address
    if borough:
        params["Borough"] = _normalize_boro(borough)

    # 1) Function 1B
    try:
        r1b = call_geoservice("Function_1B", params)
        lat = find_first(r1b, re.compile(r"^lat(itude)?$", re.I))
        lon = find_first(r1b, re.compile(r"^lon(gitude)?$", re.I))
        ct = (find_first(r1b, re.compile(r"census.*tract.*2020", re.I)) or
              find_first(r1b, re.compile(r"census.*tract.*2010", re.I)) or
              find_first(r1b, re.compile(r"census.*tract", re.I)))
        if lat and lon and ct:
            return {"latitude": float(lat), "longitude": float(lon), "census_tract": str(ct), "source": "1B"}
        elif (lat and lon) or ct:
            return {"latitude": float(lat) if lat else None,
                    "longitude": float(lon) if lon else None,
                    "census_tract": str(ct) if ct else None,
                    "source": "1B"}
    except Exception:
        pass

    # 2) Function AP (address point)
    try:
        rap = call_geoservice("Function_AP", params)
        lat = find_first(rap, re.compile(r"^lat(itude)?$", re.I))
        lon = find_first(rap, re.compile(r"^lon(gitude)?$", re.I))
        # AP may not include CT; if missing, try 1E quickly for CT using same address
        if lat and lon:
            ct = (find_first(rap, re.compile(r"census.*tract.*2020", re.I)) or
                  find_first(rap, re.compile(r"census.*tract.*2010", re.I)) or
                  None)
            if not ct:
                try:
                    r1e = call_geoservice("Function_1E", params)
                    ct = (find_first(r1e, re.compile(r"census.*tract.*2020", re.I)) or
                          find_first(r1e, re.compile(r"census.*tract.*2010", re.I)) or
                          find_first(r1e, re.compile(r"census.*tract", re.I)))
                except Exception:
                    ct = None
            return {"latitude": float(lat), "longitude": float(lon), "census_tract": str(ct) if ct else None, "source": "AP(+1E)"}
    except Exception:
        pass

    return {"latitude": None, "longitude": None, "census_tract": None, "source": "NA"}

def enrich_with_addresses_and_geo(df, bbl_col="bbl", boro_col="borough_code", block_col="tax_block", lot_col="tax_lot", max_workers=5):
    """
    Adds columns:
      - 'addr_list' (list of strings)
      - 'bin_list'  (list of BINs)
      - 'addr_best' (first address, if any)
      - 'latitude', 'longitude', 'census_tract', 'geo_source'
    Works if you have either a full 'bbl' string or separate borough/block/lot.
    """
    work = []

    for idx, row in df.iterrows():
        if pd.notna(row.get(bbl_col, None)):
            boro, block, lot = parse_bbl(row[bbl_col])
        else:
            boro = _normalize_boro(row.get(boro_col))
            block = str(int(row.get(block_col))) if pd.notna(row.get(block_col)) else None
            lot = str(int(row.get(lot_col))) if pd.notna(row.get(lot_col)) else None
        work.append((idx, boro, block, lot))

    # 1) BBL -> addresses
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futs = {ex.submit(bbl_to_addresses, b, bl, l): (i, b) for i, b, bl, l in work}
        for fut in as_completed(futs):
            i, boro = futs[fut]
            try:
                addrs, bins, raw = fut.result()
                results[i] = {"addr_list": addrs, "bin_list": bins, "raw_bbl_json": raw}
            except Exception as e:
                results[i] = {"addr_list": [], "bin_list": [], "raw_bbl_json": {"error": str(e)}}

    # 2) Address -> geo (lat/long & CT). Use the first address returned.
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futs = {}
        for i, boro, _, _ in work:
            addr_list = results[i]["addr_list"]
            addr_best = addr_list[0] if addr_list else None
            results[i]["addr_best"] = addr_best
            futs[ex.submit(geocode_address, addr_best, boro)] = i if addr_best else None

        for fut in as_completed(futs):
            i = futs[fut]
            if i is None:
                continue
            try:
                g = fut.result()
            except Exception as e:
                g = {"latitude": None, "longitude": None, "census_tract": None, "source": f"error:{e}"}
            results[i].update({
                "latitude": g.get("latitude"),
                "longitude": g.get("longitude"),
                "census_tract": g.get("census_tract"),
                "geo_source": g.get("source")
            })

    # 3) Write back to dataframe
    out = df.copy()
    out["addr_list"]    = [results[i]["addr_list"] for i in out.index]
    out["bin_list"]     = [results[i]["bin_list"] for i in out.index]
    out["addr_best"]    = [results[i].get("addr_best") for i in out.index]
    out["latitude"]     = [results[i].get("latitude") for i in out.index]
    out["longitude"]    = [results[i].get("longitude") for i in out.index]
    out["census_tract"] = [results[i].get("census_tract") for i in out.index]
    out["geo_source"]   = [results[i].get("geo_source") for i in out.index]
    return out


In [13]:
# nyc_tax_zoning is your dataframe from the screenshot.
# It has both `bbl` and separate borough/block/lot columns, so either works.

df_enriched = enrich_with_addresses_and_geo(nyc_tax_zoning, bbl_col="bbl",
                                            boro_col="borough_code",
                                            block_col="tax_block",
                                            lot_col="tax_lot",
                                            max_workers=5)

# peek:
df_enriched[["bbl", "addr_best", "latitude", "longitude", "census_tract", "geo_source"]].head()


Unnamed: 0,bbl,addr_best,latitude,longitude,census_tract,geo_source
0,1000010010,,,,,
1,1000010101,,,,,
2,1000010111,,,,,
3,1000010112,,,,,
4,1000010150,,,,,


In [15]:
df_enriched

Unnamed: 0,borough_code,tax_block,tax_lot,bbl,zoning_district_1,zoning_district_2,special_district_1,zoning_map_number,zoning_map_code,commercial_overlay_1,...,special_district_2,limited_height_district,zoning_district_4,addr_list,bin_list,addr_best,latitude,longitude,census_tract,geo_source
0,1,1,10,1000010010,R3-2,C4-1,GI,16A,Y,,...,,,,[],[],,,,,
1,1,1,101,1000010101,R3-2,,,16A,,,...,,,,[],[],,,,,
2,1,1,111,1000010111,R3-2,,GI,16A,,,...,,,,[],[],,,,,
3,1,1,112,1000010112,R3-2,,GI,16A,,,...,,,,[],[],,,,,
4,1,1,150,1000010150,R3-2,,GI,16A,,,...,,,,[],[],,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857964,5,8050,83,5080500083,R1-2,,SRD,35A,,,...,,,,[],[],,,,,
857965,5,8050,86,5080500086,R1-2,,SRD,35A,,,...,,,,[],[],,,,,
857966,5,8050,89,5080500089,R1-2,,SRD,35A,,,...,,,,[],[],,,,,
857967,5,8050,92,5080500092,R1-2,,SRD,35A,,,...,,,,[],[],,,,,


In [14]:
df_enriched.to_csv("df_enriched.csv",index=False)

In [None]:
import re, time, requests, pandas as pd

BASE = "https://geoservice.planning.nyc.gov"

# Normalize your 'borough' values to what the API expects
BORO_MAP = {
    "1":"Manhattan","mn":"Manhattan","manhattan":"Manhattan","new york":"Manhattan",
    "2":"Bronx","bx":"Bronx","bronx":"Bronx",
    "3":"Brooklyn","bk":"Brooklyn","kings":"Brooklyn","brooklyn":"Brooklyn",
    "4":"Queens","qn":"Queens","queens":"Queens",
    "5":"Staten Island","si":"Staten Island","richmond":"Staten Island","staten island":"Staten Island",
}

def _norm_boro(x):
    if pd.isna(x): return None
    s = str(x).strip().lower()
    return BORO_MAP.get(s, BORO_MAP.get(s.split(",")[0], None)) or (s.title() if s else None)

def _parse_addr(a):
    """Return (house_number, street) from a freeform address_1 string."""
    if pd.isna(a): return None, None
    a = str(a).strip()
    # number (handles Queens hyphen numbers like 31-12) + the rest
    m = re.match(r"^\s*(\d[\d-]*)\s+(.*)$", a)
    if not m: return None, None
    hn, street = m.group(1), m.group(2)
    # trim trailing city/state/zip if present
    street = re.sub(r",?\s*(NY|New York|USA|\d{5}(?:-\d{4})?)\s*$", "", street, flags=re.I).strip()
    return hn, street

def _row_to_bbl(row):
    hn, st = _parse_addr(row.get("address_1"))
    bor = _norm_boro(row.get("borough"))
    if not (hn and st and bor):
        return pd.Series({"bbl": None, "borough_code": None, "tax_block": None, "tax_lot": None})
    try:
        resp = requests.get(
            f"{BASE}/Function_1A",
            params={"HouseNumber": hn, "Street": st, "Borough": bor, "DisplayFormat": "true"},
            timeout=20,
        )
        resp.raise_for_status()
        j = resp.json()

        # tiny recursive finder to be robust to field-name tweaks
        def _find(d, pat):
            if isinstance(d, dict):
                for k,v in d.items():
                    if re.search(pat, str(k), re.I): return v
                    hit = _find(v, pat)
                    if hit is not None: return hit
            elif isinstance(d, list):
                for x in d:
                    hit = _find(x, pat)
                    if hit is not None: return hit
            return None

        boro_code = _find(j, r"^borough(code)?$") or _find(j, r"^boro$")
        block     = _find(j, r"^block$")
        lot       = _find(j, r"^lot$")

        if boro_code is None or block is None or lot is None:
            return pd.Series({"bbl": None, "borough_code": None, "tax_block": None, "tax_lot": None})

        bbl = f"{int(boro_code)}{int(block):05d}{int(lot):04d}"
        return pd.Series({
            "bbl": bbl,
            "borough_code": str(int(boro_code)),
            "tax_block": str(int(block)),
            "tax_lot": str(int(lot)),
        })
    except Exception:
        return pd.Series({"bbl": None, "borough_code": None, "tax_block": None, "tax_lot": None})
    finally:
        time.sleep(0.1)  # be polite to the API

# === Inline apply over your dataframe ===
noEnergy_investments[["bbl","borough_code","tax_block","tax_lot"]] = (
    noEnergy_investments.apply(_row_to_bbl, axis=1)
)


In [None]:
### 

In [248]:
import geopandas as gpd
import fiona

# 1) Point to the GDB folder (NOT an individual file)
GDB = r"C:\Users\johnf\Downloads\nyc_mappluto_25v3_fgdb\MapPLUTO25v3.gdb"  # adjust to your path

# 2) See what layers are inside (names vary by release)
print(fiona.listlayers(GDB))
# Typical: ["MapPLUTO"] or borough layers like ["BKMapPLUTO","MNMapPLUTO","QNMapPLUTO","BXMapPLUTO","SIMapPLUTO"]

# 3) Read the layer (pick "MapPLUTO" or your borough layer)
layer = "MapPLUTO"   # change if listlayers() shows borough layers
pluto = gpd.read_file(GDB, layer=layer)

# 4) Keep only what you need and set CRS
wanted = ["BBL","BIN","LandUse","NumFloors","UnitsRes","BldgClass","geometry"]
have = [c for c in wanted if c in pluto.columns] + ["geometry"]
pluto = pluto[have]
if pluto.crs is None or pluto.crs.to_epsg() != 4326:
    pluto = pluto.to_crs(4326)

# 5) (Optional) Save to something lighter/faster for repeated use
pluto.to_file("pluto.gpkg", layer="pluto", driver="GPKG")      # portable
pluto.to_parquet("pluto.parquet")                              # fastest for Python


['MapPLUTO_25v3_clipped']


DataLayerError: Layer 'MapPLUTO' could not be opened

In [None]:
### REading in the pluta data i need. Total thing crashed the kernel, limiting by lat long in the 

In [7]:
building_info_geocoded.longitude.min()

-74.171256

In [None]:
building_info_geocoded

In [None]:
## PLUTO

In [16]:
# SOURCE # https://www.nyc.gov/content/planning/pages/resources/datasets/mappluto-pluto-change#mappluto
import geopandas as gpd
import pyogrio
import pandas as pd
from shapely.geometry import box

SRC   = r"C:/Users/johnf/Downloads/nyc_mappluto_25v3_fgdb/MapPLUTO25v3.gdb"  # note forward slashes
LAYER = "MapPLUTO_25v3_clipped"

# 1) Prove we can read: take a tiny sample
pluto_head = gpd.read_file(SRC, layer=LAYER, rows=slice(0, 500))  # engine=pyogrio under the hood
print(pluto_head.shape, pluto_head.crs)          # e.g., EPSG:2263 or EPSG:4326
print(pluto_head.columns[:15])

# 2) Build a bbox from your LL84 points and transform it to the PLUTO CRS if needed
pad = 0.002  # ~200 m
bbox_4326 = (building_info_geocoded.longitude.min()-pad, building_info_geocoded.latitude.min()-pad,
        building_info_geocoded.longitude.max()+pad, building_info_geocoded.latitude.max()+pad)

if pluto_head.crs and pluto_head.crs.to_epsg() != 4326:
    # transform bbox to the dataset CRS
    g_bbox = gpd.GeoSeries([box(*bbox_4326)], crs=4326).to_crs(pluto_head.crs)
    minx, miny, maxx, maxy = g_bbox.geometry.iloc[0].bounds
    bbox = (minx, miny, maxx, maxy)
else:
    bbox = bbox_4326

# 3) Read ONLY the columns you need within that bbox (stable + light on memory)
cols = ["BBL","BIN","LandUse","NumFloors","UnitsRes","BldgClass"]
pluto = pyogrio.read_dataframe(
    SRC, layer=LAYER,
    columns=[c for c in cols if c in pluto_head.columns],
    bbox=bbox,
    force_2d=True,
)
# Reproject to WGS84 for joining to your lon/lat points
if pluto.crs is None or pluto.crs.to_epsg() != 4326:
    pluto = pluto.to_crs(4326)

print("Loaded:", pluto.shape)


(500, 96) EPSG:2263
Index(['Borough', 'Block', 'Lot', 'CD', 'BCT2020', 'BCTCB2020', 'CT2010',
       'CB2010', 'SchoolDist', 'Council', 'ZipCode', 'FireComp', 'PolicePrct',
       'HealthCenterDistrict', 'HealthArea'],
      dtype='object')
Loaded: (816021, 6)


In [17]:
### Spatial Join for PLuto and LL84 data with Lat Long 

Unnamed: 0,BBL,BldgClass,LandUse,NumFloors,UnitsRes,geometry
0,1.000010e+09,Y4,08,,0.0,"MULTIPOLYGON (((-74.01826 40.6927, -74.01842 4..."
1,1.000010e+09,Z9,,,0.0,"MULTIPOLYGON (((-74.03823 40.69836, -74.03868 ..."
2,1.000020e+09,Y7,07,1.0,0.0,"MULTIPOLYGON (((-74.01108 40.70096, -74.01108 ..."
3,1.000030e+09,Q1,09,1.0,0.0,"MULTIPOLYGON (((-74.01559 40.70459, -74.01531 ..."
4,1.000050e+09,D5,03,32.0,1320.0,"MULTIPOLYGON (((-74.01012 40.7026, -74.0112 40..."
...,...,...,...,...,...,...
816016,5.056530e+09,A5,01,3.0,1.0,"MULTIPOLYGON (((-74.17277 40.56113, -74.17285 ..."
816017,5.056530e+09,A5,01,3.0,1.0,"MULTIPOLYGON (((-74.17268 40.56114, -74.17277 ..."
816018,5.056530e+09,A5,01,3.0,1.0,"MULTIPOLYGON (((-74.17265 40.56101, -74.17274 ..."
816019,5.056530e+09,A5,01,3.0,1.0,"MULTIPOLYGON (((-74.17288 40.56098, -74.17295 ..."


In [19]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

# 0) You already have `pluto` loaded and to_crs(4326)
#    with columns like: BBL, LandUse, NumFloors, UnitsRes, BldgClass

# 1) Make a GeoDataFrame of your LL84 points (must be lon/lat in WGS84)
ll84 = building_info_geocoded.copy()  # your existing DF
g_ll84 = gpd.GeoDataFrame(
    ll84,
    geometry=[Point(xy) for xy in zip(ll84["longitude"], ll84["latitude"])],
    crs=4326
)

# 2) Spatial join: attach lot attributes to each LL84 point
#    (intersects is safer than within if a point lands on an edge)
ll84_pluto = gpd.sjoin(g_ll84, pluto, how="left", predicate="intersects")

# 3) Stories column (use PLUTO NumFloors)
# ll84_pluto["stories"] = ll84_pluto["NumFloors"]

# # 4) Residential + <5 stories filter
# res_codes = {"01","02","03","04"}  # 1–2 fam, walk-ups, elevator apt, mixed res/com
# is_res = (
#     ll84_pluto["LandUse"].astype(str).isin(res_codes)
#     | (ll84_pluto["UnitsRes"].fillna(0) > 0)
#     | ll84_pluto["BldgClass"].fillna("").str.startswith(("A","B"))
# )
# target = ll84_pluto[ is_res & ll84_pluto["stories"].notna() & (ll84_pluto["stories"] < 5) ].copy()

# print("Total LL84:", len(g_ll84))
# print("Matched PLUTO:", ll84_pluto["BBL"].notna().sum())
# print("Residential <5 stories:", len(target))

# # 5) Save & quick interactive preview
# target_cols = [c for c in ["BBL","LandUse","NumFloors","UnitsRes","BldgClass","stories"] if c in target.columns]
# target[target_cols + ["geometry"]].to_file("ll84_res_under5.geojson", driver="GeoJSON")

# m = target.explore(tooltip=target_cols, style_kwds={"fillOpacity":0.2,"weight":0.3})
# m.save("ll84_res_under5_preview.html")
# print("Wrote ll84_res_under5.geojson and ll84_res_under5_preview.html")


Total LL84: 9121
Matched PLUTO: 371
Residential <5 stories: 37
Wrote ll84_res_under5.geojson and ll84_res_under5_preview.html


In [27]:
building_info_geocoded

Unnamed: 0,property_id,address_1,address_2,city,postal_code,county,borough,latitude,longitude,address_line1_clean,address_line2_clean,borough_clean,city_clean,state_clean,postal_code_5_clean,needs_geocoding,geocode_key,geocode_tag,formatted_address,place_id
0,2707907.0,2626 Homecrest Avenue,,Brooklyn,,,BROOKLYN,40.587065,-73.957019,2626 Homecrest Ave,,BROOKLYN,Brooklyn,NY,,False,2626 HOMECREST AVE||BROOKLYN|NY|,,,
1,3521602.0,3240 Henry Hudson parkway,,Bronx,,,BRONX,40.885365,-73.913345,3240 Henry Hudson Pkwy,,BRONX,Bronx,NY,,False,3240 HENRY HUDSON PKWY||BRONX|NY|,,,
2,3521883.0,900 Avenue H,,Brooklyn,,,BROOKLYN,40.629342,-73.967861,900 Ave H,,BROOKLYN,Brooklyn,NY,,False,900 AVE H||BROOKLYN|NY|,,,
3,3522892.0,1561 E. 13th Street,,Brooklyn,,,BROOKLYN,40.611279,-73.960634,1561 E. 13Th St,,BROOKLYN,Brooklyn,NY,,False,1561 E. 13TH ST||BROOKLYN|NY|,,,
4,4047231.0,115 West 35th Street,,new York,,,MANHATTAN,40.750878,-73.988270,115 West 35Th St,,MANHATTAN,New York,NY,,False,115 WEST 35TH ST||NEW YORK|NY|,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9116,3632773.0,955 East 163rd Street,,New York,,,BRONX,40.821001,-73.895580,955 East 163Rd St,,MANHATTAN,New York,NY,,False,955 EAST 163RD ST||NEW YORK|NY|,,,
9117,3632791.0,1541 Shakespeare Avenue,,New York,,,BRONX,40.845719,-73.919252,1541 Shakespeare Ave,,MANHATTAN,New York,NY,,False,1541 SHAKESPEARE AVE||NEW YORK|NY|,,,
9118,3633523.0,128 West 128th Street,,New York,,,MANHATTAN,40.810108,-73.945088,128 West 128Th St,,MANHATTAN,New York,NY,,False,128 WEST 128TH ST||NEW YORK|NY|,,,
9119,3633666.0,315 E 102 Street,,New York,,,MANHATTAN,40.787834,-73.943766,315 E 102St St,,MANHATTAN,New York,NY,,False,315 E 102ST ST||NEW YORK|NY|,,,


In [None]:
https://geosearch.planninglabs.nyc/v2/search?text=955 EAST 163RD ST||NEW YORK|NY 	

In [24]:
print("Unsuccessful:")
print(ll84_pluto[ll84_pluto['BBL'].isnull()].shape)
print("success:")
print(ll84_pluto[~ll84_pluto['BBL'].isnull()].shape)

Unsuccessful:
(8750, 28)
success:
(371, 28)


In [25]:
# Compare bounds of your points vs. PLUTO
print("PLUTO bounds:", pluto.total_bounds)  # [minx, miny, maxx, maxy]

print("LL84 bounds:", g_ll84.total_bounds)
print("Sample coords:")
print(g_ll84.geometry.x.head(5).tolist(), g_ll84.geometry.y.head(5).tolist())

# Count points clearly in NYC-ish extent
in_nyc = (
    g_ll84.geometry.x.between(-74.30, -73.60) &
    g_ll84.geometry.y.between( 40.45,  40.95)
)
print("Points in NYC bbox:", in_nyc.sum(), "/", len(g_ll84))


PLUTO bounds: [-74.18605881  40.52954773 -73.70017605  40.91544196]
LL84 bounds: [-74.171256  40.556747 -73.700935  40.912869]
Sample coords:
[-73.957019, -73.913345, -73.967861, -73.960634, -73.98827] [40.587065, 40.885365, 40.629342, 40.611279, 40.750878]
Points in NYC bbox: 9121 / 9121


In [26]:
# start with your previous result
got_lot = ll84_pluto["BBL"].notna()
no_lot  = ll84_pluto[~got_lot].drop(columns=["index_right"], errors="ignore")

recovered = gpd.sjoin_nearest(
    no_lot,
    pluto[["geometry","BBL","LandUse","NumFloors","UnitsRes","BldgClass"]],
    how="left",
    max_distance=0.0005,   # ~55 m; start at 0.00025 (~28 m) and increase if needed
    distance_col="snap_dist_deg"
)

# merge recovered attributes back
cols = ["BBL","LandUse","NumFloors","UnitsRes","BldgClass"]
ll84_pluto.loc[recovered.index, cols] = recovered[cols].values

print("After nearest-snap, lot matches:", ll84_pluto["BBL"].notna().sum())





KeyError: "None of [Index(['BBL', 'LandUse', 'NumFloors', 'UnitsRes', 'BldgClass'], dtype='object')] are in the [columns]"

In [None]:
### Building Footprint data and Building Historic INFO and Height Ets
# BUILDING_HISTORIC
# https://data.cityofnewyork.us/City-Government/BUILDING_HISTORIC/ipkp-snf6/about_data

#Building Elevation and Subgrade (BES)
# https://data.cityofnewyork.us/City-Government/Building-Elevation-and-Subgrade-BES-/bsin-59hv/about_data