In [115]:
#Importing needed libraries
import requests
import pandas as pd
from datetime import datetime
import geopandas as gpd
import fiona
import time
from json import JSONDecodeError
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.cm as cm
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from statsmodels.discrete.count_model import ZeroInflatedPoisson
from personal_lib import general_functions as gf


from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="capstone_test")

In [None]:
### NYC ENERGY DATA BUILDINGS

In [None]:
### Pulling in the Data Via the APi online 

##### Skip here until reading in files from CSV, because this takes very long to run.

In [None]:
PAGE = 1000               # <-- enforce 1,000 rows per page
TIMEOUT = 30
MAX_RETRIES = 5
BACKOFF_BASE = 1.5

session = requests.Session()
headers = {}

def fetch_all_rows_1k(api_url: str, source_years: str, source_info_url: str) -> pd.DataFrame:
    offset = 0
    frames = []

    while True:
        params = {"$limit": PAGE, "$offset": offset}

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                resp = session.get(api_url, params=params, headers=headers, timeout=TIMEOUT)
                if resp.status_code in (429, 502, 503, 504):
                    time.sleep(BACKOFF_BASE ** attempt * (0.1 * attempt))
                    continue
                resp.raise_for_status()

                if "json" not in resp.headers.get("Content-Type", "").lower():
                    preview = resp.text[:200]
                    raise ValueError(f"Non-JSON response (status {resp.status_code}): {preview}")

                data_chunk = resp.json()
                if not data_chunk:
                    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

                # Create DataFrame
                df = pd.DataFrame(data_chunk)

                # 🔑 Add your metadata columns here
                df["source_years"] = source_years
                df["source_api_url"] = api_url
                df["source_info_url"] = source_info_url

                frames.append(df)

                # If less than PAGE, stop; otherwise keep paginating
                if len(data_chunk) < PAGE:
                    return pd.concat(frames, ignore_index=True)

                offset += PAGE
                break  # Success, go to next page

            except (requests.RequestException, JSONDecodeError, ValueError) as e:
                if attempt == MAX_RETRIES:
                    print(f"⚠️ Failed fetching {api_url} at offset {offset}: {e}")
                    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
                time.sleep(BACKOFF_BASE ** attempt * (0.1 * attempt))

# --- your existing mapping dict (unchanged) ---
# building_energy_LL84_sources = { ... }  # keep your dict as-is

In [None]:
agg_running_list = []
for k, v in building_energy_LL84_sources.items():
    print(f"Fetching {k} -> {v['api']}")
    df = fetch_all_rows_1k(v["api"], k, v["info"])
    if not df.empty:
        agg_running_list.append(df)
    else:
        print(f"Warning: no rows returned for {k} ({v['api']}).")

nyc_building_energy = pd.concat(agg_running_list, ignore_index=True) if agg_running_list else pd.DataFrame()
print(f"Total rows: {len(nyc_building_energy)}")

In [None]:
## There are mulitple sources of yearly data. Compiling the source URLs here as well as the API urls to grab
## privately owned buildings over 25,000 ft2 and in City-owned buildings over 10,000 ft2
building_energy_LL84_sources={"2022+":{
    "api":"https://data.cityofnewyork.us/resource/5zyy-y8am.json",
    "info":"https://data.cityofnewyork.us/Environment/NYC-Building-Energy-and-Water-Data-Disclosure-for-/5zyy-y8am/about_data"
                              },
                              "2021":{
    "api":"https://data.cityofnewyork.us/resource/7x5e-2fxh.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/7x5e-2fxh/about_data"
                              },
                              "2020":{
    "api":"https://data.cityofnewyork.us/resource/usc3-8zwd.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/usc3-8zwd/about_data"
                                  },
                              "2019":{
    "api":"https://data.cityofnewyork.us/resource/wcm8-aq5w.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/wcm8-aq5w/about_data"
                                  },
                              "2018":{
    "api":"https://data.cityofnewyork.us/resource/4tys-3tzj.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/4tys-3tzj/about_data"
                                  },
                              "2017":{
    "api":"https://data.cityofnewyork.us/resource/4t62-jm4m.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/4t62-jm4m/about_data"
                                  },
                              "2016":{
    "api":"https://data.cityofnewyork.us/resource/utpj-74fz.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/utpj-74fz/about_data"
                                  },
                              "2015":{
    "api":"https://data.cityofnewyork.us/resource/77q4-nkfh.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/77q4-nkfh/about_data"
                                  },
                              "2014":{
    "api":"https://data.cityofnewyork.us/resource/nbun-wekj.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/nbun-wekj/about_data"
                                  },
                              "2013":{
    "api":"https://data.cityofnewyork.us/resource/yr5p-wjer.json",
    "info":"http://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/yr5p-wjer"
                                  },
                              "2012":{
    "api":"https://data.cityofnewyork.us/resource/r6ub-zhff.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/r6ub-zhff/about_data"
                                  },
                              "2011":{
    "api":"https://data.cityofnewyork.us/resource/k7nh-aufb.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/k7nh-aufb/about_data"
                                  },
                              "2010":{
    "api":"https://data.cityofnewyork.us/resource/kswi-37bp.json",
    "info":"https://data.cityofnewyork.us/Environment/Energy-and-Water-Data-Disclosure-for-Local-Law-84-/kswi-37bp/about_data"
                                  },
#Monthly Data? 2018-2023
                              "Monthly":{
    "api":"https://data.cityofnewyork.us/resource/fvp3-gcb2.json",
    "info":"https://data.cityofnewyork.us/Environment/Local-Law-84-Monthly-Data/fvp3-gcb2/about_data"
    }}


#### End Skip here. Pick up and read in data

In [1]:
### If just want residential limist continue to skip in untill reading in residential 3
# nyc_building_energy=  pd.read_csv("nyc_dob_energy_2010_2024.csv")

In [None]:
## Formatting into proper df
# print(nyc_building_energy.shape)
## Saving to File
# nyc_building_energy.to_csv("nyc_dob_energy_2010_2024.csv",index=False)

### Starting Cleaning / Skip if you want ot just read in residential3

In [None]:
### Columns to Drop because potentially irrelevant or redundant for 
nyc_building_energy_processing = nyc_building_energy.copy()
# nyc_building_energy_processing = nyc_building_energy_processing.drop(columns=['data_center_ups_output_meter',
#  'data_center_pdu_input_meter',
#  'data_center_pdu_output_meter',
#  'data_center_it_equipment',
#  'data_center_it_site_energy',
#  'data_center_it_source_energy',
#  'data_center_pue',
#  'data_center_national_median',
#  'data_center_gross_floor_area',
#  'data_center_ups_system',
#  'data_center_it_energy',
#  'data_center_cooling_equipment',
# 'supermarkets_grocery_gross',
#  'supermarkets_grocery_number',
#  'supermarkets_grocery_number_1',
#  'supermarkets_grocery_number_2',
#  'supermarkets_grocery_percent',
#  'supermarkets_grocery_presence',
#  'supermarkets_grocery_walk',
#  'supermarkets_grocery_weekly',
#  'supermarkets_grocery_workers',
# 'house_of_worship_gross_floor',
#  'house_of_worship_pc_density',
#  'house_of_worship_weekly',
#  'house_of_worship_presence',
# 'residence_halls_dormitories',
# 'medical_office_percent_cooled',
#  'residence_halls_dormitories_1',
#  'residence_halls_dormitories_2',
#  'residence_halls_dormitories_3',
#  'residence_halls_dormitories_4',
# 'hotel_onsite_laundry_short',
#  'warehouse_unrefrigerated',
#  'warehouse_unrefrigerated_1',
#  'warehouse_unrefrigerated_2',
#  'warehouse_unrefrigerated_3',
#  'warehouse_unrefrigerated_4',
#  'warehouse_unrefrigerated_5',
#  'warehouse_unrefrigerated_6',
#  'hospital_gross_floor_area',
#  'hospital_laboratory_y_1_n',
#  'hospital_laundry_facility',
#  'hospital_maximum_number_of',
#  'hospital_number_of_buildings',
#  'warehouse_refrigerated_gross',
#  'warehouse_refrigerated_weekly',
#  'warehouse_refrigerated_workers',
#  'hospital_number_of_licensed',
#  'multifamily_home_dishwashers'])

## Fruther Limiting to those that are under 10 Stories. 
nyc_building_energy_processing = nyc_building_energy_processing.replace("Not Available",np.nan).dropna(how='all',axis=1)
nyc_building_energy_processing = nyc_building_energy_processing.dropna(how='all',axis=1)

nyc_building_energy_processing["primary_property_type"] = nyc_building_energy_processing["primary_property_type"].combine_first(nyc_building_energy_processing["primary_property_type_epa"])
nyc_building_energy_processing = nyc_building_energy_processing.drop(columns=["primary_property_type_epa"])
nyc_building_energy_processing["primary_property_type_self"] = nyc_building_energy_processing["primary_property_type_self"].combine_first(nyc_building_energy_processing["primary_property_type_self_selected"])
nyc_building_energy_processing = nyc_building_energy_processing.drop(columns=["primary_property_type_self_selected"])



## Beginning of Limiting to Multifamily Homes (Step 1)
residential = nyc_building_energy_processing[(
    (nyc_building_energy_processing["list_of_all_property_use"].isin([i for i in nyc_building_energy_processing["list_of_all_property_use"].unique() if 'Multifamily Housing' in str(i)]))
    |(nyc_building_energy_processing["list_of_all_property_use"].isnull() 
      & (nyc_building_energy_processing["primary_property_type_self"]=="Multifamily Housing")
      & ((nyc_building_energy_processing["primary_property_type"]=="Multifamily Housing")|
         (nyc_building_energy_processing["primary_property_type"].isnull()))))]

In [None]:
## Beginning of Limiting to Multifamily Homes (Step 1)
residential = nyc_building_energy_processing[(
    (nyc_building_energy_processing["list_of_all_property_use"].isin([i for i in nyc_building_energy_processing["list_of_all_property_use"].unique() if 'Multifamily Housing' in str(i)]))
    |(nyc_building_energy_processing["list_of_all_property_use"].isnull() 
      & (nyc_building_energy_processing["primary_property_type_self"]=="Multifamily Housing")
      & ((nyc_building_energy_processing["primary_property_type"]=="Multifamily Housing")|
         (nyc_building_energy_processing["primary_property_type"].isnull()))))]

In [None]:
print(residential.shape)

In [None]:
## Second Step is limiting to NON MIXED USE, so ONLY MultiFamily Residences
residential = residential[
    residential['list_of_all_property_use'].isnull() |
    residential['list_of_all_property_use'].apply(
        lambda x: len(x.split(",")) == 1 if isinstance(x, str) else False
    )
]
## Fruther Limiting to those that are under 10 Stories. 
residential = residential.replace("Not Available",np.nan).dropna(how='all',axis=1)
residential = residential.dropna(how='all',axis=1)

In [None]:
## Dropping Other Property Type Columns 
to_drop =[
## Banking oriented COlumns
'bank_branch_computer_density',
 'bank_branch_gross_floor_area',
 'bank_branch_gross_floor_area_ft',
 'bank_branch_number_of',
 'bank_branch_number_of_workers',
 'bank_branch_percent_that',
 'bank_branch_weekly_operating',
 'bank_branch_worker_density',
## College / Uni / School
'college_university_gross',
 'college_university_gross_floor_area_ft',
 'college_university_number',
    'k_12_school_computer_density',
 'k_12_school_cooking_facilities',
 'k_12_school_gross_floor_area',
 'k_12_school_gross_floor_area_ft',
 'k_12_school_high_school',
 'k_12_school_percent_that',
 'k_12_school_refrigeration',
 'k_12_school_weekend_operation',
 'laboratory_gross_floor_area_ft',
    'library_gross_floor_area',
# OTher
"automobile_dealership_gross",
'convenience_store_without',
'data_center_energy_estimates_applied',
 'data_center_gross_floor_area',
 'data_center_gross_floor_area_ft',
 'data_center_it_energy',
 'data_center_it_energy_configuration',
 'data_center_it_equipment_input_meter_kwh',
 'data_center_it_site_energy',
 'data_center_it_site_energy_kwh',
 'data_center_it_source_energy_kbtu',
 'data_center_national_median',
 'data_center_pdu_input_meter_kwh',
 'data_center_pdu_output_meter_kwh',
 'data_center_ups_output_meter_kwh',
 'enclosed_mall_gross_floor',
 'enclosed_mall_gross_floor_area_ft',
 'fast_food_restaurant_gross',
 'financial_office_gross_floor',
 'financial_office_gross_floor_area_ft',
 'financial_office_number_of',
 'financial_office_number_of_1',
 'financial_office_number_of_computers',
 'financial_office_number_of_workers_on_main_shift',
 'financial_office_weekly',
 'financial_office_weekly_operating_hours',
 'fitness_center_health_club',
 'fitness_center_health_club_gym_gross_floor_area_ft',
 'food_sales_gross_floor_area',
 'food_sales_gross_floor_area_ft',
 'food_service_gross_floor',
 'food_service_gross_floor_area_ft',
     'worship_facility_computer',
 'worship_facility_cooking',
 'worship_facility_gross_floor',
 'worship_facility_gross_floor_area_ft',
 'worship_facility_weekly',
    'supermarket_grocery_cooking',
 'supermarket_grocery_gross',
 'supermarket_grocery_gross_floor_area_ft',
 'supermarket_grocery_number',
 'supermarket_grocery_number_1',
 'supermarket_grocery_number_2',
 'supermarket_grocery_number_of_open_or_closed_refrigeration_freezer_units',
 'supermarket_grocery_number_of_walk_in_refrigeration_freezer_units',
 'supermarket_grocery_percent',
 'supermarket_grocery_walk',
 'supermarket_grocery_weekly',
 'supermarket_grocery_worker',
 'swimming_pool_approximate',
 'swimming_pool_location_of',
 'swimming_pool_months_in_use',
     'social_meeting_hall_gross',
 'social_meeting_hall_gross_floor_area_ft',
     'residence_hall_dormitory',
 'residence_hall_dormitory_1',
 'residence_hall_dormitory_2',
 'residence_hall_dormitory_3',
 'residence_hall_dormitory_4',
 'residence_hall_dormitory_gross_floor_area_ft',
 'restaurant_gross_floor_area',
 'restaurant_gross_floor_area_ft',
 'restaurant_weekly_operating',
 'restaurant_weekly_operating_hours',
 'restaurant_worker_density',
 'restaurant_worker_density_number_per_1_000_sq_ft',
 'retail_store_cash_register',
 'retail_store_computer_density',
 'retail_store_exterior_entrance',
 'retail_store_gross_floor',
 'retail_store_gross_floor_area_ft',
 'retail_store_number_of_open',
 'retail_store_number_of_open_or_closed_refrigeration_freezer_units',
 'retail_store_number_of_walk',
 'retail_store_number_of_walk_in_refrigeration_freezer_units',
 'retail_store_open_or_closed',
 'retail_store_percent_that',
 'retail_store_walk_in',
 'retail_store_weekly_operating',
 'retail_store_worker_density',
 'self_storage_facility_gross',
 'self_storage_facility_gross_floor_area_ft',
 'senior_care_community_average',
 'senior_care_community_gross',
 'senior_care_community_maximum',
 'senior_care_community_number',
 'senior_care_community_number_1',
 'senior_care_community_number_2',
 'senior_care_community_number_3',
 'senior_care_community_number_4',
 'senior_care_community_number_5',
 'senior_care_community_number_6',
 'senior_care_community_percent',
 'senior_living_community_gross_floor_area_ft',
 'senior_living_community_living_unit_density_number_per_1_000_sq_ft',
    'movie_theater_gross_floor_area_ft',
    'non_refrigerated_warehouse',
 'non_refrigerated_warehouse_1',
 'non_refrigerated_warehouse_2',
 'non_refrigerated_warehouse_3',
 'non_refrigerated_warehouse_4',
 'non_refrigerated_warehouse_5',
 'non_refrigerated_warehouse_gross_floor_area_ft',
     'parking_completely_enclosed',
 'parking_completely_enclosed_parking_garage_size_ft',
 'parking_gross_floor_area',
 'parking_gross_floor_area_ft',
 'parking_open_parking_lot',
 'parking_open_parking_lot_size_ft',
 'parking_partially_enclosed',
 'parking_partially_enclosed_parking_garage_size_ft',
     'strip_mall_gross_floor_area',
## Hostpial / Hotel
    'hospital_general_medical',
 'hospital_general_medical_1',
 'hospital_general_medical_10',
 'hospital_general_medical_11',
 'hospital_general_medical_12',
 'hospital_general_medical_13',
 'hospital_general_medical_14',
 'hospital_general_medical_15',
 'hospital_general_medical_16',
 'hospital_general_medical_17',
 'hospital_general_medical_2',
 'hospital_general_medical_3',
 'hospital_general_medical_4',
 'hospital_general_medical_5',
 'hospital_general_medical_6',
 'hospital_general_medical_7',
 'hospital_general_medical_8',
 'hospital_general_medical_9',
 'hotel_amount_of_laundry',
 'hotel_cooking_facilities',
 'hotel_full_service_spa_floor',
 'hotel_gross_floor_area_ft',
 'hotel_gym_fitness_center',
 'hotel_gym_fitness_center_floor_area_ft',
 'hotel_number_of_rooms',
 'hotel_percent_that_can_be',
 'hotel_room_density_number',
 'hotel_type_of_laundry_facility',
 'hotel_worker_density_number',
 'urgent_care_clinic_other',
 'urgent_care_clinic_other_outpatient_gross_floor_area_ft',
    'mailing_center_post_office_gross_floor_area_ft',
 'manufacturing_industrial_plant_gross_floor_area_ft',
 'medical_office_gross_floor',
 'medical_office_gross_floor_area_ft',
 'medical_office_mri_machine',
 'medical_office_number_of',
 'medical_office_number_of_1',
 'medical_office_number_of_computers',
 'medical_office_number_of_mri_machines',
 'medical_office_number_of_workers_on_main_shift',
 'medical_office_percent_that',
 'medical_office_percent_that_1',
 'medical_office_percent_that_can_be_cooled',
 'medical_office_percent_that_can_be_heated',
 'medical_office_weekly',
 'medical_office_weekly_operating_hours',
    'museum_gross_floor_area_ft',
    'office_computer_density_number',
 'office_gross_floor_area_ft',
 'office_number_of_computers',
 'office_number_of_workers',
 'office_number_of_workers_on_main_shift',
 'office_percent_that_can_be',
 'office_percent_that_can_be_1',
 'office_percent_that_can_be_cooled',
 'office_percent_that_can_be_heated',
 'office_weekly_operating_hours',
 'office_worker_density_number',
 'office_worker_density_number_per_1_000_sq_ft',
     'adult_education_gross_floor',
 'adult_education_gross_floor_area_ft',
]
residential2 = residential.drop(columns=to_drop)

In [None]:
# Convert year_ending to datetime
residential2["year_ending"] = pd.to_datetime(residential2["year_ending"], errors="coerce")
# Optional: extract just the year if that’s all you need
residential2["year_ending_year"] = residential2["year_ending"].dt.year
residential2[["report_year","year_ending_year","source_years","source_api_url","source_info_url"]][residential['report_year'].isnull()]
## Limtiing to the Metered Areas for whole property or whole building
residential3 = residential2[residential2["metered_areas_energy"].isin(['Whole Building', "Whole Property"])]
residential3 = residential3[residential3["primary_property_type_self"]=='Multifamily Housing'] ## confirming onlyl multifam. No nulls here

residential3 = residential3[(residential3["construction_status"]=='Existing')|(residential3["construction_status"].isnull())]
## Stand Alone Properties
residential3 = residential3[(residential3["parent_property_id"].isnull())|(residential3["parent_property_id"]=='Not Applicable: Standalone Property')]
residential3 = residential3.drop(columns=["primary_property_type_self",
                                          "primary_property_type",
                                          "national_median_reference",
                                          "list_of_all_property_use","largest_property_use_type","construction_status",
                                         "parent_property_id","parent_property_name"])



##### Edn second Skip

In [4]:
residential3 = pd.read_csv("residential3.csv") 

  residential3 = pd.read_csv("residential3.csv")


In [6]:
### Geocode The Null lat long 
with_coord = residential3[(~residential3['latitude'].isnull())&(~residential3['longitude'].isnull())]
print(with_coord.shape)
without_coord = residential3[(residential3['latitude'].isnull())|(residential3['longitude'].isnull())]
print(without_coord.shape)

(130321, 362)
(4377, 362)


### Start of Geocoding 

In [13]:
creds = gf.get_creds()
google_api_key = creds["Google"]["geocoding"]

In [None]:
# reading in past resu;ts
# results_df = pd.concat(results)
# results_df.to_csv("results_geo.csv",index=False)
results_df = pd.read_csv("results_geo.csv")
# failed_df = pd.concat(failed)
# failed_df.to_csv("failed_geo.csv",index=False)
failed_df = pd.read_csv("failed_geo.csv")

In [27]:
results = []
failed = []

In [None]:

for i, r in without_coord[["property_id","address_1","city","postal_code"]].drop_duplicates().iterrows():
    if i>= 7665:
        temp_df = pd.DataFrame([r])
        print(r["property_id"])
        full_address = str(r["address_1"])+" "+ str(r["city"])+" "+str(r["postal_code"])
        full_address = full_address.replace("nan","")
        print(full_address)
        if '/' in full_address:
            full_address = full_address.split('/')[-1]
            print("Split Addy:",full_address)
        location = geolocator.geocode(full_address)
        if location:
            temp_df["latitude"] = location.latitude
            temp_df["longitude"]= location.longitude
            print(location.latitude, location.longitude)
            results.append(temp_df)
        else:
            failed.append(temp_df)

In [22]:
i

121393

In [31]:
print(len(results))
print(len(failed))

152
19


In [32]:
# reading in past resu;ts
results_df2 = pd.concat(results)
results_df = pd.read_csv("results_geo.csv")
failed_df2 = pd.concat(failed)
failed_df = pd.read_csv("failed_geo.csv")

## pUTTING ALL RESULTS SO FAR TOGETHER
all_failed = pd.concat([failed_df,failed_df2])
all_results = pd.concat([results_df2,results_df])
failed_df.to_csv("failed_geo.csv",index=False)
results_df.to_csv("results_geo.csv",index=False)


In [33]:
print(without_coord.shape)
print(all_results.shape)
print(all_failed.shape)

(4377, 362)
(192, 6)
(50, 4)


In [28]:
import googlemaps
import os

gmaps = googlemaps.Client(key=google_api_key)

# assumes: without_coord, results = [], failed = [] already exist
for i, r in without_coord[["property_id","address_1","city","postal_code"]].drop_duplicates().iterrows():
    if i >= 121392:
        temp_df = pd.DataFrame([r])
        print(r["property_id"])

        full_address = str(r["address_1"]) + " " + str(r["city"]) + " " + str(r["postal_code"])
        full_address = full_address.replace("nan","").strip()
        print(full_address)

        if "/" in full_address:
            full_address = full_address.split("/")[-1].strip()
            print("Split Addy:", full_address)

        # ---- Google Geocoding (swap-in for geolocator.geocode) ----
        try:
            resp = gmaps.geocode(full_address)  # add region="us" or components=... if you like
        except Exception as e:
            print("Geocode error:", e)
            resp = []

        if resp:
            loc = resp[0]["geometry"]["location"]
            temp_df["latitude"]  = loc["lat"]
            temp_df["longitude"] = loc["lng"]
            print(loc["lat"], loc["lng"])
            results.append(temp_df)
        else:
            failed.append(temp_df)


3089718.0
84-17 125th St. et. Al. Queens
40.7072689 -73.826502
3108051.0
3400 Tyron Ave Bronx
40.87956519999999 -73.8767307
3113483.0
MNM: 2816 8th Ave New York
40.7510745 -73.994261
3116937.0
854 West 181st New York
40.8511379 -73.9399545
3128084.0
BROOKLYN
40.6781784 -73.9441579
3128085.0

Geocode error: HTTP Error: 400
3128086.0

Geocode error: HTTP Error: 400
3128087.0
BROOKLYN
40.6781784 -73.9441579
3128108.0

Geocode error: HTTP Error: 400
3128109.0

Geocode error: HTTP Error: 400
3128110.0

Geocode error: HTTP Error: 400
3128111.0

Geocode error: HTTP Error: 400
3128112.0

Geocode error: HTTP Error: 400
3128113.0
BRONX
40.8447819 -73.8648268
3128115.0
NA NA BRONX
40.8447819 -73.8648268
3128116.0

Geocode error: HTTP Error: 400
3128117.0

Geocode error: HTTP Error: 400
3128118.0

Geocode error: HTTP Error: 400
3128119.0

Geocode error: HTTP Error: 400
3128120.0

Geocode error: HTTP Error: 400
3128121.0

Geocode error: HTTP Error: 400
3128122.0

Geocode error: HTTP Error: 400
3128

In [37]:
without_coord

Unnamed: 0,report_year,property_id,property_name,year_ending,nyc_borough_block_and_lot,nyc_building_identification,address_1,city,postal_code,largest_property_use_type_1,...,energy_current_date,electricity_onsite_renewable,electricity_sourced_from,onsite_renewable_system,target_site_eui_kbtu_ft,other_use_kbtu,estimated_data_flag_other,bin,bbl,year_ending_year
21,2022.0,15327445.0,KM 1200 Union Ave/1204 Union Ave,2022-12-31,2026820001,2005057,1200 Union Ave/1204 Union Ave,Bronx,10459,37672.0,...,,,,,,,,,,2022
24,2022.0,15332746.0,KM 500 West 144th street/144 Hamilton Place,2022-12-31,1020750036,1061888,500 West 144th street/144 Hamilton Place,New York,10031,56688.0,...,,,,,,,,,,2022
84,2022.0,24125286.0,2820 Middle Town Road,2022-12-31,2053860017,2074435,2820 Middle Town Road,Bronx,10461,25200.0,...,,,,,,,,,,2022
161,2022.0,2771721.0,91-32/34 195th St LLC,2022-12-31,4108210060,4231514;4451927,91-32/34 195th Street,Queens,11423,79709.0,...,,,,,,,,,,2022
199,2022.0,2638326.0,Marben Realty,2022-12-31,1018900064,1057200,308 West104,New York,10025,56555.0,...,,,,,,,,,,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134654,,3539873.0,(7324) - East Midtown Plaza(2),2013-12-31,1-00929-0001,1084709; 1078824; 1084710; 1802187; 1084711,401 1ST AVE; 400 2ND AVE,New York,,,...,,,,,,,,,,2013
134660,,3956379.0,Carr Properties Inc,2013-12-31,,2008621,25-29 West Tremont,Bronx,,,...,,,,,,,,,,2013
134672,,3579396.0,1990 Adam Clayton Powell Jr Boulevard,2012-12-31,,,1990 Adam Clayton Powell Jr Boulevard,New York,,,...,2012-12-31,,,,,,,,,2012
134676,,3608447.0,Star 65 LLC,2012-12-31,,,665-88 Street,Brooklyn,,,...,,,,,,,,,,2012


In [41]:
list(without_coord.columns)

['report_year',
 'property_id',
 'property_name',
 'year_ending',
 'nyc_borough_block_and_lot',
 'nyc_building_identification',
 'address_1',
 'city',
 'postal_code',
 'largest_property_use_type_1',
 'year_built',
 'number_of_buildings',
 'occupancy',
 'metered_areas_energy',
 'metered_areas_water',
 'energy_star_score',
 'national_median_energy_star',
 'target_energy_star_score',
 'reason_s_for_no_score',
 'energy_star_certification',
 'energy_star_certification_1',
 'site_eui_kbtu_ft',
 'weather_normalized_site_eui',
 'national_median_site_eui',
 'site_energy_use_kbtu',
 'weather_normalized_site_energy',
 'electricity_weather_normalized',
 'electricity_weather_normalized_1',
 'natural_gas_weather_normalized',
 'natural_gas_weather_normalized_1',
 'source_eui_kbtu_ft',
 'weather_normalized_source',
 'national_median_source_eui',
 'source_energy_use_kbtu',
 'weather_normalized_source_1',
 'fuel_oil_1_use_kbtu',
 'fuel_oil_2_use_kbtu',
 'fuel_oil_4_use_kbtu',
 'fuel_oil_5_6_use_kbtu',
 

In [52]:
### 
# without_coord = without_coord.drop(columns=["latitude","longitude"])
without_coord_enr = without_coord.merge(all_results, on=["property_id","address_1","city","postal_code"],how='left')
print(without_coord_enr.shape)
print(without_coord_enr[without_coord_enr["latitude"].isnull()].shape)

(4377, 362)
(4133, 362)


In [58]:
## Sill in need 
print("initial Total",residential3.shape)
with_coord2 = pd.concat([with_coord,without_coord_enr[~without_coord_enr["latitude"].isnull()]]).drop_duplicates()
print("with coords:",with_coord2.shape)

need_coords = pd.concat([without_coord[without_coord["property_id"].isin(all_failed['property_id'].unique())],
                         without_coord_enr[without_coord_enr["latitude"].isnull()]]).drop_duplicates()
print("need_coords:",need_coords.shape)


initial Total (134698, 362)
with coords: (129820, 362)
need_coords: (4113, 362)


In [59]:
results = []
failed = []

In [60]:
## Second Cut 
gmaps = googlemaps.Client(key=google_api_key)

# assumes: need_coords, results = [], failed = [] already exist
for i, r in need_coords[["property_id","address_1","city","postal_code"]].drop_duplicates().iterrows():
    temp_df = pd.DataFrame([r])
    print(r["property_id"])

    full_address = str(r["address_1"]) + " " + str(r["city"]) + " " + str(r["postal_code"])
    full_address = full_address.replace("nan","").strip()
    print(full_address)

    if "/" in full_address:
        full_address = full_address.split("/")[-1].strip()
        print("Split Addy:", full_address)

    # ---- Google Geocoding (swap-in for geolocator.geocode) ----
    try:
        resp = gmaps.geocode(full_address)  # add region="us" or components=... if you like
    except Exception as e:
        print("Geocode error:", e)
        resp = []

    if resp:
        loc = resp[0]["geometry"]["location"]
        temp_df["latitude"]  = loc["lat"]
        temp_df["longitude"] = loc["lng"]
        print(loc["lat"], loc["lng"])
        results.append(temp_df)
    else:
        failed.append(temp_df)

24125286.0
2820 Middle Town Road Bronx 10461
40.8432564 -73.8358873
2638326.0
308 West104 New York 10025
40.8004228 -73.970244
5841271.0
2401 Davdison Avenue Bronx 10452
40.861769 -73.9032932
8705631.0
510 West 218 Street New York 11357
40.8712485 -73.9148179
9637009.0
2485 Morris Aveune Bronx 10468
40.8630438 -73.899755
2734772.0
105 Pinhurst Ave New York 10033
40.851816 -73.9387086
3522985.0
601 79th Steet Brooklyn 11209
40.6248125 -74.02066839999999
4040577.0
71-11 -71-23 162ND STREET NY 11365
40.7397205 -73.8062806
5834191.0
107-19 70st Ave Flushing 11375
40.7467285 -73.89544719999999
5863627.0
1675 E 21st Brooklyn 11210
40.6122215 -73.9525281
5965464.0
1440 Richmond Terracae STATEN ISLAND 10310
40.6408643 -74.1161858
6282532.0
54 Morningisde Drive New York 10025
40.8059287 -73.9596929
6282641.0
225 East 202nd StreetBronx New York 10458
40.8732705 -73.88585499999999
6282704.0
82-06 & 82-16 34th Avenue Jackson Heights 11372
40.7536074 -73.8848235
6282839.0
62 Clermont Street Brookly

In [61]:
results_df3 = pd.concat(results)
failed_df3= pd.concat(failed)

In [66]:
results_df3()

array(['Bronx', 'New York', 'Brooklyn', 'NY', 'Flushing', 'STATEN ISLAND',
       'Jackson Heights', 'Queens', 'Quuens', 'Manhattan', 'BRONX',
       'BROOKLYN', 'Forest Hills', 'NEW YORK', '90 North 5th', 'Glendale',
       'Kew Gardens', 'Corona', 'Briarwood', 'Astoria', 'Hollis',
       'Whitestone', 'Staten Island', 'Bayside', 'Far Rockwy',
       'Rego Park', 'Hempstead', 'Bellport', 'The Bronx', 'Sunnyside',
       'New York City', 'QUEENS', 'New Tork', 'Elmhurst',
       'Long Island City', 'WESTBURY', 'Buffalo', 'Jamaica', 'Glen Oaks',
       'Far Rockaway', 'Woodside', 'Flushinig', 'Philadelphia',
       'flushing', 'Rockaway Park', 'New Rochelle', 'brooklyn', 'Yonkers',
       'Boston', 'Niagara Falls', 'Ardsley', 'Bellerose', 'Monsey',
       'Oakland Gardens', 'New york', 'Broolyn', 'South Ozone Park',
       'New Hyde Park', 'Booklyn', 'Patchogue', 'MANHATTAN',
       'Howard Beach', 'Brookly'], dtype=object)

In [69]:
need_coords = need_coords.drop(columns=["latitude","longitude"])
without_coord_enr2 = need_coords.merge(results_df3, on=["property_id","address_1","city","postal_code"],how='left')
print(without_coord_enr2.shape)
print(without_coord_enr2[without_coord_enr2["latitude"].isnull()].shape)

(4113, 362)
(48, 362)


In [81]:
print("initial Total",residential3.shape)
with_coord3 = pd.concat([with_coord2,without_coord_enr2[~without_coord_enr2["latitude"].isnull()]]).drop_duplicates()
print("with coords:",with_coord3.shape)
need_coords2 = pd.concat([need_coords[need_coords["property_id"].isin(failed_df3['property_id'].unique())],
                         without_coord_enr2[without_coord_enr2["latitude"].isnull()]]).drop_duplicates()
print("need_coords:",need_coords2.shape)

initial Total (134698, 362)
with coords: (133885, 362)
need_coords: (57, 362)


0.6

In [96]:
### Making the decision to drop these and not deal with them, mostly corrupt address values. 
need_coords2[["property_id", 'address_1','address_2',
 'city',
 'postal_code']].drop_duplicates()

Unnamed: 0,property_id,address_1,address_2,city,postal_code
122015,3128085.0,,,,
122016,3128086.0,,,,
122018,3128108.0,,,,
122019,3128109.0,,,,
122020,3128110.0,,,,
122021,3128111.0,,,,
122022,3128112.0,,,,
122026,3128116.0,,,,
122027,3128117.0,,,,
122028,3128118.0,,,,


In [None]:
round(((134698 - 133885)/134698)*100,2) 
### DROPPING 0.6% of the data becaue not able to successfgully geocode

### saving geocoded cleaned data for now

In [84]:
working_residential_geo = with_coord3.copy()
# working_residential_geo.to_csv("working_residential_geo.csv",index=False)

### Spatial joinign with CT to pull ct into the buildings data. 

In [85]:
working_residential_geo

Unnamed: 0,report_year,property_id,property_name,year_ending,nyc_borough_block_and_lot,nyc_building_identification,address_1,city,postal_code,largest_property_use_type_1,...,energy_current_date,electricity_onsite_renewable,electricity_sourced_from,onsite_renewable_system,target_site_eui_kbtu_ft,other_use_kbtu,estimated_data_flag_other,bin,bbl,year_ending_year
0,2022.0,9793770.0,1870 Pelham Parkway South,2022-12-31,2042500026,2047795,1870 Pelham Parkway South,Bronx,10461,52941.0,...,,,,,,,,,,2022
1,2022.0,14377690.0,1680 Ocean Ave,2022-12-31,3067300001,3180535,1680 Ocean Ave,Brooklyn,11230,68400.0,...,,,,,,,,,,2022
2,2022.0,15176247.0,88-24 Merrick Blvd,2022-12-31,4098150067,4210063,88-24 Merrick Blvd,Jamaica,11432,82576.0,...,,,,,,,,,,2022
3,2022.0,15176327.0,90-11 149th Street,2022-12-31,4096790052,4206819,90-11 149 str,Jamaica,11435,136000.0,...,,,,,,,,,,2022
4,2022.0,15176328.0,148-25 89th Ave,2022-12-31,4096930051,4207100,148-25 89 Ave,Jamaica,11435,127200.0,...,,,,,,,,,,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4108,,2676297.0,(7226) - The Howard Owners Corp,2013-12-31,4-02118-7501,4432024;4432026;4432028;4432025;4432027;4432029,99-32 - 99-72 66 Rd,New York,,,...,,,,,,,,,,2013
4109,,2718861.0,213-02 42nd Ave,2013-12-31,4-06284-7501,4433427,213-02 42nd Ave,Queens,,,...,,,,,,,,,,2013
4110,,2917501.0,325 East 79 Street Owners,2013-12-31,1-01542-0014,1049250,325 East 79th Streeet,New York,,,...,,,,,,,,,,2013
4111,,3956379.0,Carr Properties Inc,2013-12-31,,2008621,25-29 West Tremont,Bronx,,,...,,,,,,,,,,2013


In [None]:
### pulling in ct shapefile

In [86]:
import io, requests, geopandas as gpd

CT2010_URL = "https://data.cityofnewyork.us/resource/bmjq-373p.geojson?$limit=50000"
SOCRATA_APP_TOKEN = None  # put your token string here if you have one

headers = {"X-App-Token": SOCRATA_APP_TOKEN} if SOCRATA_APP_TOKEN else {}
resp = requests.get(CT2010_URL, headers=headers, timeout=60)
resp.raise_for_status()

ct2010 = gpd.read_file(io.BytesIO(resp.content)).to_crs(2263)
ct2010.head()


Unnamed: 0,ntacode,shape_area,ntaname,shape_leng,boroname,puma,boroct2010,ct2010,borocode,cdeligibil,ctlabel,geometry
0,SI22,2497009.71359,West New Brighton-New Brighton-St. George,7729.01679383,Staten Island,3903,5000900,900,5,E,9,"MULTIPOLYGON (((962269.126 173705.5, 962288.72..."
1,MN17,1860992.68163,Midtown-Midtown South,5687.80243891,Manhattan,3807,1010200,10200,1,I,102,"MULTIPOLYGON (((992216.539 216507.687, 992091...."
2,MN17,1864600.43538,Midtown-Midtown South,5693.03636707,Manhattan,3807,1010400,10400,1,I,104,"MULTIPOLYGON (((991325.882 217001.689, 991199...."
3,MN17,1890907.25105,Midtown-Midtown South,5699.86064037,Manhattan,3807,1011300,11300,1,I,113,"MULTIPOLYGON (((988650.277 214286.402, 988517...."
4,MN40,1918144.56374,Upper East Side-Carnegie Hill,5807.97295649,Manhattan,3805,1013000,13000,1,I,130,"MULTIPOLYGON (((994920.11 221386.27, 994791.85..."


In [87]:
import geopandas as gpd

# --- 1) Points from your geocoded table ---
# assumes working_residential_geo has columns: property_id, latitude, longitude
pts = gpd.GeoDataFrame(
    working_residential_geo[["property_id", "latitude", "longitude"]].copy(),
    geometry=gpd.points_from_xy(
        working_residential_geo["longitude"], working_residential_geo["latitude"]
    ),
    crs=4326  # your geocodes are WGS84
)

# project points to match the CT layer (your CT layer is already to_crs(2263))
pts_2263 = pts.to_crs(2263)

# --- 2) Keep only what you need from the CT layer ---
# 'ct2010' and/or 'boroct2010' are the usual tract IDs in that NYC layer
ct_keep = ct2010[["ct2010", "boroct2010", "boroname", "geometry"]].copy()

# --- 3) Spatial join: which tract polygon contains each point ---
joined = gpd.sjoin(
    pts_2263,
    ct_keep,
    how="left",
    predicate="within"   # points that fall inside a tract polygon
)

# If a property_id appears multiple times (duplicates), keep the first tract hit
joined = joined.sort_index().drop_duplicates(subset=["property_id"])

# --- 4) Bring tract columns back to your original dataframe ---
cols_to_add = ["ct2010", "boroct2010", "boroname"]
working_residential_tract = working_residential_geo.merge(
    joined[["property_id"] + cols_to_add],
    on="property_id",
    how="left"
)

# working_with_tract now has ct2010 / boroct2010 for each row


In [90]:
## Saving information to Csv 
working_residential_tract.to_csv("working_residential_tract.csv",index=False)

In [None]:
## limiting to certain 2010 and 2017?

In [101]:
working_residential_tract.groupby(['year_ending_year']).agg({"year_ending":"count"}).reset_index()

Unnamed: 0,year_ending_year,year_ending
0,2012,32
1,2013,9256
2,2014,9064
3,2015,6882
4,2016,7050
5,2017,15657
6,2018,13474
7,2019,14265
8,2020,17240
9,2021,13114


In [99]:
working_residential_tract[["report_year","year_ending","year_ending_year"]].drop_duplicates()

Unnamed: 0,report_year,year_ending,year_ending_year
0,2022.0,2022-12-31,2022
13091,2023.0,2023-12-31,2023
27197,,2021-12-31,2021
40000,,2020-12-31,2020
56796,,2019-12-31,2019
70692,,2018-12-31,2018
83801,,2017-12-31,2017
98974,,2016-12-31,2016
105745,,2015-12-31,2015
112339,,2014-12-31,2014


In [103]:
### Limtiing to 2017 and before 
working_residential_tract_2012_2017 = working_residential_tract[working_residential_tract["year_ending_year"]<2018]
working_residential_tract_2012_2017['year_ending_year'].unique()
working_residential_tract_2012_2017[working_residential_tract_2012_2017['property_id'].isin(buildings_2012_2013_ids)]
# working_residential_tract_2012_2017.to_csv("working_residential_tract_2012_2017.csv",index=False)

In [112]:
buildings_2012_2013_ids = working_residential_tract_2012_2017[working_residential_tract_2012_2017["year_ending_year"].isin([2012,2013])]["property_id"].unique()
working_residential_tract_2012_2017[working_residential_tract_2012_2017['property_id'].isin(buildings_2012_2013_ids)].groupby(['year_ending_year']).agg({"year_ending":"count"}).reset_index()


Unnamed: 0,year_ending_year,year_ending
0,2012,32
1,2013,9256
2,2014,7456
3,2015,4429
4,2016,3908
5,2017,4906


In [113]:
working_continuous_final = working_residential_tract_2012_2017[working_residential_tract_2012_2017['property_id'].isin(buildings_2012_2013_ids)]
# working_continuous_final.to_csv("working_continuous_final_2012_2017.csv",index=False)

In [116]:
# Path to your geodatabase (.gdb)
gdb_path = r"C:\Users\johnf\Downloads\Tree_Canopy_Change (1)\Tree_Canopy_Change\NYC_TreeCanopyChange_2010_2017.gdb"

# List all layers in the geodatabase
layers = fiona.listlayers(gdb_path)
print("Layers available:", layers)

# Read a specific layer into a GeoDataFrame
gdf = gpd.read_file(gdb_path, layer=layers[0])  # Replace [0] with your desired layer
print(gdf.head())

# Optional: save as shapefile or GeoJSON
#gdf.to_file("output.shp") 
#gdf.to_file("output.geojson", driver="GeoJSON")

canopy_change = gdf.copy()
canopy_change

Layers available: ['NYC_TreeCanopyChange_2010_2017']


  return ogr_read(


       Class  Shape_Length    Shape_Area  \
0  No Change    732.955182   5800.500001   
1  No Change   2079.456465  35005.875002   
2  No Change    228.557599   1984.500000   
3  No Change    125.726893    617.750000   
4  No Change     41.031601     90.750000   

                                            geometry  
0  MULTIPOLYGON (((1008879.93 272372.3, 1008878.4...  
1  MULTIPOLYGON (((1008749.43 271870.8, 1008748.9...  
2  MULTIPOLYGON (((1008704.43 272105.3, 1008701.9...  
3  MULTIPOLYGON (((1009015.43 272757.3, 1009013.4...  
4  MULTIPOLYGON (((1008794.93 270669.8, 1008781.4...  


Unnamed: 0,Class,Shape_Length,Shape_Area,geometry
0,No Change,732.955182,5800.500001,"MULTIPOLYGON (((1008879.93 272372.3, 1008878.4..."
1,No Change,2079.456465,35005.875002,"MULTIPOLYGON (((1008749.43 271870.8, 1008748.9..."
2,No Change,228.557599,1984.500000,"MULTIPOLYGON (((1008704.43 272105.3, 1008701.9..."
3,No Change,125.726893,617.750000,"MULTIPOLYGON (((1009015.43 272757.3, 1009013.4..."
4,No Change,41.031601,90.750000,"MULTIPOLYGON (((1008794.93 270669.8, 1008781.4..."
...,...,...,...,...
5692504,Loss,63.798210,286.735284,"MULTIPOLYGON (((1014610.46 188155.382, 1014610..."
5692505,Loss,63.429348,314.238467,"MULTIPOLYGON (((1014606.12 187969.835, 1014605..."
5692506,Loss,223.829388,2467.258600,"MULTIPOLYGON (((1014946.295 187646.6, 1014944...."
5692507,Loss,96.181802,538.668512,"MULTIPOLYGON (((1014656.958 187618.818, 101465..."


### Shaptial Join for LiDar Canopy and Buildings Lat Long for those in thwe Data from 2012/2013 through 2017

In [114]:

## Increase or Decrease in Canopy Cover 
working_continuous_final

Unnamed: 0,report_year,property_id,property_name,year_ending,nyc_borough_block_and_lot,nyc_building_identification,address_1,city,postal_code,largest_property_use_type_1,...,onsite_renewable_system,target_site_eui_kbtu_ft,other_use_kbtu,estimated_data_flag_other,bin,bbl,year_ending_year,ct2010,boroct2010,boroname
83802,,2707907.0,2626 Homecrest Avenue,2017-12-31,3-07456-0006,3204886,2626 Homecrest Avenue,Brooklyn,,127500.0,...,,,,,,,2017,060600,3060600,Brooklyn
83814,,3521602.0,3240 Henry Hudson parkway LLC,2017-12-31,2-05789-0024,2084124,3240 Henry Hudson parkway,Bronx,,163197.0,...,,,,,,,2017,029700,2029700,Bronx
83815,,3521883.0,900 Avenue H LLC,2017-12-31,3-06512-0001,3170547,900 Avenue H,Brooklyn,,77000.0,...,,,,,,,2017,045600,3045600,Brooklyn
83816,,3522892.0,1561 E. 13th Street LLC,2017-12-31,3-06760-0062,3181781,1561 E. 13th Street,Brooklyn,,71000.0,...,,,,,,,2017,054200,3054200,Brooklyn
83817,,4047231.0,Justin - 115 West 30th Street,2017-12-31,1-00806-0026,1015161,115 West 35th Street,new York,,151525.0,...,,,,,,,2017,010900,1010900,Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133880,,2676297.0,(7226) - The Howard Owners Corp,2013-12-31,4-02118-7501,4432024;4432026;4432028;4432025;4432027;4432029,99-32 - 99-72 66 Rd,New York,,,...,,,,,,,2013,071304,4071304,Queens
133881,,2718861.0,213-02 42nd Ave,2013-12-31,4-06284-7501,4433427,213-02 42nd Ave,Queens,,,...,,,,,,,2013,146700,4146700,Queens
133882,,2917501.0,325 East 79 Street Owners,2013-12-31,1-01542-0014,1049250,325 East 79th Streeet,New York,,,...,,,,,,,2013,013800,1013800,Manhattan
133883,,3956379.0,Carr Properties Inc,2013-12-31,,2008621,25-29 West Tremont,Bronx,,,...,,,,,,,2013,024300,2024300,Bronx


In [120]:
# --- 1) Points from your buildings (WGS84) ---
shape_bldg = gpd.GeoDataFrame(
    working_continuous_final[["property_id", "latitude", "longitude"]].copy(),
    geometry=gpd.points_from_xy(
        working_continuous_final["longitude"], working_continuous_final["latitude"]
    ),
    crs=4326
)

# --- 2) Put both layers in the SAME CRS (use the canopy CRS) ---
canopy = canopy_change[["Class", "geometry"]].copy()
if canopy.crs is None:
    # set if your canopy file didn’t come with a CRS; adjust if needed
    canopy = canopy.set_crs(2263)  # NYC StatePlane ft; change if different

shape_bldg = shape_bldg.to_crs(canopy.crs)

# (optional but recommended) fix invalid polygon rings that can break joins
canopy["geometry"] = canopy.buffer(0)

In [None]:
# --- 3) Spatial join: which canopy polygon contains each building point? ---
joined = gpd.sjoin(
    shape_bldg[["property_id", "geometry"]],
    canopy,
    how="left",
    predicate="within"
)

In [122]:
# If canopy polygons overlap and produce duplicates, keep the polygon with the largest area
if joined.duplicated("property_id").any():
    canopy_area = canopy.assign(_poly_area=canopy.area)
    joined = gpd.sjoin(
        shape_bldg[["property_id", "geometry"]],
        canopy_area,
        how="left",
        predicate="within"
    )
    joined = (joined
              .sort_values(["property_id", "_poly_area"], ascending=[True, False])
              .drop_duplicates(subset=["property_id"]))

In [123]:
# --- 4) Bring the canopy class back to your original table ---
working_with_canopy = working_continuous_final.merge(
    joined[["property_id", "Class"]].rename(columns={"Class": "canopy_change_class"}),
    on="property_id",
    how="left"
)

In [125]:
working_with_canopy["canopy_change_class"].unique()

array([nan, 'Gain', 'No Change', 'Loss'], dtype=object)

In [126]:
### Before snapping seeing how many data poitns are null , not have a canopy value
working_with_canopy.groupby(["canopy_change_class"]).agg({"property_id":"nunique"})

Unnamed: 0_level_0,property_id
canopy_change_class,Unnamed: 1_level_1
Gain,700
Loss,154
No Change,976


In [132]:
# --- 5) (Optional) fill misses by snapping to nearest polygon within 50 ft ---
# This helps when a point lands just outside a sliver polygon.
miss_ids = working_with_canopy.loc[working_with_canopy["canopy_change_class"].isna(), "property_id"]
if len(miss_ids):
    nearest = gpd.sjoin_nearest(
        shape_bldg[shape_bldg["property_id"].isin(miss_ids)],
        canopy,
        how="left",
        max_distance=50  # units = CRS units; 50ft if CRS=2263
    )[["property_id", "Class"]].rename(columns={"Class": "canopy_change_class_nearest"})

    working_with_canopy_50ft = working_with_canopy.merge(nearest, on="property_id", how="left")
    working_with_canopy_50ft["canopy_change_class"] = (
        working_with_canopy_50ft["canopy_change_class"]
        .fillna(working_with_canopy_50ft["canopy_change_class_nearest"])
    )
    working_with_canopy_50ft.drop(columns=["canopy_change_class_nearest"], inplace=True)

In [133]:
working_with_canopy_50ft.groupby(["canopy_change_class"]).agg({"property_id":"nunique"})

Unnamed: 0_level_0,property_id
canopy_change_class,Unnamed: 1_level_1
Gain,6307
Loss,1101
No Change,1461


In [136]:
working_with_canopy_50ft.shape

(109537, 366)

In [137]:
working_with_canopy_50ft = working_with_canopy_50ft.dropna(how='all', axis=1)

In [141]:
 working_with_canopy_50ft[['investment_in_energy_projects', 'investment_in_energy_projects_1']].drop_duplicates()

Unnamed: 0,investment_in_energy_projects,investment_in_energy_projects_1
0,,
56633,0.0,0.0
56830,575000.0,0.65
60649,20000.0,0.25


In [149]:
noEnergy_investments = working_with_canopy_50ft[(working_with_canopy_50ft['investment_in_energy_projects'].isnull() 
                                                 & working_with_canopy_50ft['investment_in_energy_projects_1'].isnull()
                                                &working_with_canopy_50ft['green_power_onsite_kwh'].isnull())]
noEnergy_investments.groupby(["canopy_change_class"]).agg({"property_id":"nunique"})



Unnamed: 0_level_0,property_id
canopy_change_class,Unnamed: 1_level_1
Gain,192
Loss,38
No Change,55


array([ 0., nan])

In [151]:
working_with_canopy_50ft[["green_power_onsite_kwh"]].drop_duplicates()

Unnamed: 0,green_power_onsite_kwh
0,
248,56620.0
261,38580.0
631,45420.0
36766,56900.0
36778,38000.0
57140,202815.0
79671,195375.5
104264,50990.0
105738,45100.0


In [152]:
### Limiting to the columns we need for the analysis
[["property_id","property_name","year_ending_year","address_1","city","borough","bbl","bin","census_tract","ct2010","nta","latitude","longitude",

 ]]
  

NameError: name 'nta' is not defined

In [158]:
working_with_canopy_50ft["number_of_buildings"].unique()

array([  1.,   6.,   8.,   2.,   5.,  10.,  52.,  11.,   4.,  35.,   3.,
        45., 107.,  22.,  32.,  12.,  26.,   7.,  14.,  16.,  19.,  15.,
        24.,  30.,  18.,  42.,  13.,   9.,  25.,  91., 140., 131., 126.,
         0.,  31.,  20.,  28.,  23.,  21.,  nan, 161.,  27.])

In [156]:
working_with_canopy_50ft[["property_gfa_calculated","number_of_buildings","property_gfa_epa_calculated",
                          "multifamily_housing_number","multifamily_housing_total","occupancy"]]

Unnamed: 0,property_gfa_calculated,number_of_buildings,property_gfa_epa_calculated,multifamily_housing_number,multifamily_housing_total,occupancy
0,127500.0,1.0,,185.0,139.0,100.0
1,127500.0,1.0,,185.0,139.0,100.0
2,127500.0,1.0,,185.0,139.0,100.0
3,127500.0,1.0,,185.0,139.0,100.0
4,127500.0,1.0,,185.0,139.0,100.0
...,...,...,...,...,...,...
109532,,,,0.0,,
109533,,,,0.0,,
109534,,,,0.0,,
109535,,,,0.0,,


In [153]:
"year_built","number_of_buildings","property_gfa_calculated"   # or property_gfa_epa_calculated if cleaner
multifamily_housing_number
multifamily_housing_total
occupancy

NameError: name 'multifamily_housing_number' is not defined

In [148]:
list(noEnergy_investments.dropna(how='all',axis=1).columns)

['property_id',
 'property_name',
 'year_ending',
 'nyc_borough_block_and_lot',
 'nyc_building_identification',
 'address_1',
 'city',
 'largest_property_use_type_1',
 'year_built',
 'number_of_buildings',
 'occupancy',
 'metered_areas_energy',
 'metered_areas_water',
 'energy_star_score',
 'national_median_energy_star',
 'energy_star_certification',
 'energy_star_certification_1',
 'site_eui_kbtu_ft',
 'weather_normalized_site_eui',
 'national_median_site_eui',
 'site_energy_use_kbtu',
 'weather_normalized_site_energy',
 'source_eui_kbtu_ft',
 'weather_normalized_source',
 'national_median_source_eui',
 'source_energy_use_kbtu',
 'weather_normalized_source_1',
 'fuel_oil_1_use_kbtu',
 'fuel_oil_2_use_kbtu',
 'fuel_oil_4_use_kbtu',
 'fuel_oil_5_6_use_kbtu',
 'district_steam_use_kbtu',
 'natural_gas_use_kbtu',
 'electricity_use_grid_purchase',
 'electricity_use_grid_purchase_1',
 'electricity_use_grid_purchase_2',
 'annual_maximum_demand_kw',
 'annual_maximum_demand_mm',
 'annual_maximu

In [None]:
### Reading in Tax Zoning 

In [160]:
tax_zoning = {
    "ALL":{
        "api":"https://data.cityofnewyork.us/resource/fdkv-4t4z.json",
        "info":"https://data.cityofnewyork.us/City-Government/NYC-Zoning-Tax-Lot-Database/fdkv-4t4z/about_data"
}}

In [162]:
def fetch_all_rows_1k(api_url: str, source_years: str, source_info_url: str) -> pd.DataFrame:
    offset = 0
    frames = []

    while True:
        params = {"$limit": PAGE, "$offset": offset}

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                resp = session.get(api_url, params=params, headers=headers, timeout=TIMEOUT)
                if resp.status_code in (429, 502, 503, 504):
                    time.sleep(BACKOFF_BASE ** attempt * (0.1 * attempt))
                    continue
                resp.raise_for_status()

                if "json" not in resp.headers.get("Content-Type", "").lower():
                    preview = resp.text[:200]
                    raise ValueError(f"Non-JSON response (status {resp.status_code}): {preview}")

                data_chunk = resp.json()
                if not data_chunk:
                    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

                # Create DataFrame
                df = pd.DataFrame(data_chunk)

                # 🔑 Add your metadata columns here
                df["source_years"] = source_years
                df["source_api_url"] = api_url
                df["source_info_url"] = source_info_url

                frames.append(df)

                # If less than PAGE, stop; otherwise keep paginating
                if len(data_chunk) < PAGE:
                    return pd.concat(frames, ignore_index=True)

                offset += PAGE
                break  # Success, go to next page

            except (requests.RequestException, JSONDecodeError, ValueError) as e:
                if attempt == MAX_RETRIES:
                    print(f"⚠️ Failed fetching {api_url} at offset {offset}: {e}")
                    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
                time.sleep(BACKOFF_BASE ** attempt * (0.1 * attempt))

In [163]:
PAGE = 1000               # <-- enforce 1,000 rows per page
TIMEOUT = 30
MAX_RETRIES = 5
BACKOFF_BASE = 1.5

session = requests.Session()
headers = {}

agg_running_list = []
for k, v in tax_zoning.items():
    print(f"Fetching {k} -> {v['api']}")
    df = fetch_all_rows_1k(v["api"], k, v["info"])
    if not df.empty:
        agg_running_list.append(df)
    else:
        print(f"Warning: no rows returned for {k} ({v['api']}).")

nyc_tax_zoning = pd.concat(agg_running_list, ignore_index=True) if agg_running_list else pd.DataFrame()
print(f"Total rows: {len(nyc_tax_zoning)}")


Fetching ALL -> https://data.cityofnewyork.us/resource/fdkv-4t4z.json
Total rows: 857969


In [164]:
nyc_tax_zoning.to_csv("nyc_tax_zoning.csv", index=False )