# LL84 Processing

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
pd.set_option('display.max_columns', None)

## Step 0: Load in raw data

In [2]:
#load in raw data

ll_raw = pd.read_csv('../data/raw_data/DOB/LL84 Energy Efficiency/NYC_Building_Energy_and_Water_Data_Disclosure_for_Local_Law_84__2022-Present__20250915.csv')
schools = gpd.read_file('../data/processed_data/school_points_with_lcgms.geojson')

  ll_raw = pd.read_csv('../data/raw_data/DOB/LL84 Energy Efficiency/NYC_Building_Energy_and_Water_Data_Disclosure_for_Local_Law_84__2022-Present__20250915.csv')


## Step 1: Match schools to LL84 data using building codes

In [4]:
#go through and match buildings based on codes first
matches = []
for bc in schools['Building Code']:
    #find matches using building codes
    df = ll_raw[ll_raw['Property Name'].str.contains(bc)].copy()
    #drop buildings that are not schools, daycare centers, or educational facilities, or worship facilities but that might occupy the same lot
    df = df[df['List of All Property Use Types (GFA) (ft²)'].str.contains('|'.join(['K-12 School', 'Pre-school/Daycare', 'Other - Education', 'Worship Facility']), regex=True)]

    #if no matches are found, try to use BBL to match
    if df.shape[0] == 0:
        try:
            addr = schools[schools['Building Code'] == bc]['Borough Block Lot'].values[0].casefold()
        except:
            addr = None
        df = ll_raw[ll_raw['NYC Borough, Block and Lot (BBL)'].str.casefold() == addr].copy()

    #once aggain, drop buildings that are not schools, daycare centers, or educational facilities, but that might occupy the same lot
    df = df[df['List of All Property Use Types (GFA) (ft²)'].str.contains('|'.join(['K-12 School', 'Pre-school/Daycare', 'Other - Education', 'Worship Facility']), regex=True)]
    
    #if multiple entries are found, select the most recent ones
    if df.shape[0] > 1:
        maxyear = df['Calendar Year'].unique().max().item()
        df = df[df['Calendar Year'] == maxyear]
    
        #for matches with multiple buildings, drop any at alternate addresses
        """
        if df.shape[0] > 1:
            #if address does not exist in school table, will return None
            try:
                addr = schools[schools['Building Code'] == bc]['Primary Address'].values[0].casefold()
            except:
                addr = None
            df = df[df['Address 1'].str.casefold() == addr]
            try:
                addr = schools[schools['Building Code'] == bc]['Borough Block Lot'].values[0].casefold()
            except:
                addr = None
            df = df[df['NYC Borough, Block and Lot (BBL)'].str.casefold() == addr].copy()
        #if there are still multiple entries, choose the one that corresponds to a single building
        """
        if df.shape[0] > 1:
            df = df[~df['NYC Building Identification Number (BIN)'].str.contains(',')]

    #if there are still multiple matched buildings, skip the school
    if df.shape[0] > 1:
        df = df[[]]        

    
    df['Building Code'] = bc
    matches.append(df)


ll84_matched_bc = pd.concat(matches)

### Manual matching to correct for missed matches

In [5]:
# the first four entries here correspond to schools that were initially filtered out of matches (generally due to multiple buildings), but still had a valid match in the ll84 database, whereas the rest correspond to schools with multiple matched buildings
manual_matches = {
    'K241': 36863,
    'Q774': 47929,
    'M838': 5179,
    'X445': 36862,
    'K286' : 52066,
    'K092' : 38626,
    'K490' : 52758,
    'M170' : 49040,
    'MCDT' : 46251,
    'M908' : 34331,
    'K535' : 52810,
    'K312' : 36864,
    'Q136' : 36867,
    'XCKQ' : 57812,
    'X862' : 51985,
    'X063' : 51984,
    'X008' : 52538,
    'X122' : 52376,
    'X826' : 39016
    
}


manual_add = ll_raw.loc[manual_matches.values()]
manual_add['Building Code'] = manual_matches.keys()

ll84_matched_bc = pd.concat([ll84_matched_bc, manual_add])

## Step 3: Merge LL84 dataframe with the schools dataframe

We choose a subset of columns to retain in the merged dataframe. These can be changed by editing `data columns` below.

In [9]:
data_columns = [
           'ENERGY STAR Score',
           'Site EUI (kBtu/ft²)',
           'Site Energy Use (kBtu)',
           'Percent Electricity',
           'Direct GHG Emissions (Metric Tons CO2e)',
           'Direct GHG Emissions Intensity (kgCO2e/ft²)',
           'Water Use (All Water Sources) (kgal)']

ll84_matched_bc[data_columns] = ll84_matched_bc[data_columns].apply(pd.to_numeric, errors='coerce')


merged = schools.merge(ll84_matched_bc[['Building Code'] + data_columns]
                       .drop_duplicates(subset='Building Code', keep='first'), on="Building Code", how='left')

In [14]:
#Here, we shorten column names for the final shapefile

#pre-existing shortened names
short_col_map = {'Location Name': 'Loc_Name',
    'Managed By Name': 'Managed_By',
    'Location Code': 'Loc_Code',
    'Building Code': 'Bldg_Code',
    'ATS': 'ATS',
    'Primary Address': 'Address',
    'City': 'City',
    'State': 'State',
    'Zip': 'Zip',
    'Borough Block Lot': 'BBL',
    'Census Tract': 'C_Tract',
    'Community District': 'Comm_Dist',
    'Council District': 'Council_Di',
    'geometry': 'geometry',
    'BEDS Number': 'BEDS_Num',
    'Location Type Description': 'Loc_Type_D',
    'Location Category Description': 'Loc_Cat_D',
    'Grades': 'Grades',
    'Grades Final': 'Grades_Fin',
    'Open Date': 'Open_Date',
    'NTA': 'NTA',
    'NTA_Name': 'NTA_Name',
    'Principal Name': 'Princ_Name',
    'Principal Title': 'Princ_Titl',
    'Principal Phone Number': 'Princ_Phon',
    'Fax Number': 'Fax_Num',
    'Geographical District Code': 'GeoDisCode',
    'Administrative District Code': 'AdDistCode',
    'Administrative District Location Code': 'AdDistLocC',
    'Administrative District Name': 'AdDistName',
    'Community School Sup Name': 'ComScSupNa',
    'BCO Location Code': 'BCOLocCode',
    'in_LCGMS': 'in_LCGMS',
    'full_address': 'full_addr',
    'google_location_type': 'g_loc_type',
    'lat': 'lat',
    'lng': 'lng',
    'distance_to_nearest_peaker_miles': 'peaker_mi'
}

#shortened names for our columns
short_col_map = short_col_map | {
    'ENERGY STAR Score' : 'eng_star',
    'Site EUI (kBtu/ft²)' : 'eui_norm',
    'Site Energy Use (kBtu)': 'eui_raw',
    'Percent Electricity': 'pct_elec',
    'Direct GHG Emissions (Metric Tons CO2e)' : 'ghg_raw',
    'Direct GHG Emissions Intensity (kgCO2e/ft²)' : 'ghg_norm',
    'Water Use (All Water Sources) (kgal)' : 'water_use'}


In [15]:
short_col_map

{'Location Name': 'Loc_Name',
 'Managed By Name': 'Managed_By',
 'Location Code': 'Loc_Code',
 'Building Code': 'Bldg_Code',
 'ATS': 'ATS',
 'Primary Address': 'Address',
 'City': 'City',
 'State': 'State',
 'Zip': 'Zip',
 'Borough Block Lot': 'BBL',
 'Census Tract': 'C_Tract',
 'Community District': 'Comm_Dist',
 'Council District': 'Council_Di',
 'geometry': 'geometry',
 'BEDS Number': 'BEDS_Num',
 'Location Type Description': 'Loc_Type_D',
 'Location Category Description': 'Loc_Cat_D',
 'Grades': 'Grades',
 'Grades Final': 'Grades_Fin',
 'Open Date': 'Open_Date',
 'NTA': 'NTA',
 'NTA_Name': 'NTA_Name',
 'Principal Name': 'Princ_Name',
 'Principal Title': 'Princ_Titl',
 'Principal Phone Number': 'Princ_Phon',
 'Fax Number': 'Fax_Num',
 'Geographical District Code': 'GeoDisCode',
 'Administrative District Code': 'AdDistCode',
 'Administrative District Location Code': 'AdDistLocC',
 'Administrative District Name': 'AdDistName',
 'Community School Sup Name': 'ComScSupNa',
 'BCO Location

In [None]:
#A couple kBTU values may get dropped due to errors in kBTU calculation in original dataset
merged.rename(columns=short_col_map).to_file(
    '../data/processed_data/energy_water/school_points_with_lcgms_ll84.shp',
      )

  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
