# NYC DOE Lead-Based Paint Safety Dataset
## This website details the school lead-based paint safety data:
## https://www.schools.nyc.gov/school-life/space-and-facilities/space-and-facilities-reports/lead-based-paint
#### The information comes from the 2025 Cohort 1 and 2 data and the 2024 Cohort 1, 2, and 3 data (2025 data has not yet been published for Cohort 3). Where 2025 data was available for a building and room, the 2024 data was not used in the analysis.

The dataset has 909 buildings.

In [79]:
!pip3 install pandas openpyxl


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [80]:
import pandas as pd
import geopandas as gpd
import numpy as np
from pathlib import Path
import warnings
from openpyxl import load_workbook
warnings.filterwarnings('ignore')

In [81]:
# Folder the cohort spreadsheets are read from, and folder results are written to.
data_dir, output_dir = Path("../input_data"), Path("../processed_data")

# Make sure the output folder (and any missing parent folders) exists.
output_dir.mkdir(parents=True, exist_ok=True)

In [82]:
# School point layer, read from the processed-data folder.
gdf = gpd.read_file(output_dir / "school_points_with_lcgms.geojson")

# Open every cohort workbook in read-only mode, in the order
# 2024 cohorts 1-3 followed by 2025 cohorts 1-2.
wb_1_24, wb_2_24, wb_3_24, wb_1_25, wb_2_25 = (
    load_workbook(data_dir / workbook_name, read_only=True)
    for workbook_name in (
        "2024-Cohort-1-LBP.xlsx",
        "2024-Cohort-2-LBP.xlsx",
        "2024-Cohort-3-LBP.xlsx",
        "2025-Cohort-1-LBP.xlsx",
        "2025-Cohort-2-LBP.xlsx",
    )
)

In [83]:
# Pick the "Sheet1" worksheet out of each opened workbook.
ws_1_2024, ws_2_2024, ws_3_2024, ws_1_2025, ws_2_2025 = (
    workbook["Sheet1"]
    for workbook in (wb_1_24, wb_2_24, wb_3_24, wb_1_25, wb_2_25)
)

In [84]:
# Grab the cell-value row iterator of each worksheet.
(
    cohort_1_2024_data,
    cohort_2_2024_data,
    cohort_3_2024_data,
    cohort_1_2025_data,
    cohort_2_2025_data,
) = (
    sheet.values
    for sheet in (ws_1_2024, ws_2_2024, ws_3_2024, ws_1_2025, ws_2_2025)
)

In [85]:
def _header_and_frame(rows):
    """Consume the first row of *rows* as column names and build a DataFrame
    from the remaining rows; return ``(column_names, frame)``."""
    header = next(rows)
    return header, pd.DataFrame(rows, columns=header)


# 2024 cohorts
cohort_1_2024_cols, cohort_1_2024_df = _header_and_frame(cohort_1_2024_data)
cohort_2_2024_cols, cohort_2_2024_df = _header_and_frame(cohort_2_2024_data)
cohort_3_2024_cols, cohort_3_2024_df = _header_and_frame(cohort_3_2024_data)

# 2025 cohorts
cohort_1_2025_cols, cohort_1_2025_df = _header_and_frame(cohort_1_2025_data)
cohort_2_2025_cols, cohort_2_2025_df = _header_and_frame(cohort_2_2025_data)

In [86]:
# The sheets have been copied into DataFrames, so release every workbook handle.
for _workbook in (wb_1_24, wb_2_24, wb_3_24, wb_1_25, wb_2_25):
    _workbook.close()

In [87]:
# Normalise the join key on the school layer: string type, no surrounding whitespace.
gdf['Building Code'] = gdf['Building Code'].astype(str).str.strip()

# Give every cohort frame the same cleaned 'Building Code' key, derived from
# its 'BUILDING CODE' column.
for _frame in (
    cohort_1_2024_df,
    cohort_2_2024_df,
    cohort_3_2024_df,
    cohort_1_2025_df,
    cohort_2_2025_df,
):
    _frame['Building Code'] = _frame['BUILDING CODE'].astype(str).str.strip()

# Remove the column whose label is None from each cohort frame.
(
    cohort_1_2024_df,
    cohort_2_2024_df,
    cohort_3_2024_df,
    cohort_1_2025_df,
    cohort_2_2025_df,
) = (
    frame.drop(columns=[None])
    for frame in (
        cohort_1_2024_df,
        cohort_2_2024_df,
        cohort_3_2024_df,
        cohort_1_2025_df,
        cohort_2_2025_df,
    )
)

In [88]:
# Combine the five cohort sheets into one room-level table (a 2025 row
# replaces any 2024 row with the same building + room), summarise the rooms
# per building, and merge that summary onto gdf.

# standardize
cohort_dfs = {
    'cohort_1_2024': cohort_1_2024_df,
    'cohort_2_2024': cohort_2_2024_df,
    'cohort_3_2024': cohort_3_2024_df,
    'cohort_1_2025': cohort_1_2025_df,
    'cohort_2_2025': cohort_2_2025_df
}

# label written to the 'cohort' column and shown in the printed totals
cohort_labels = {
    'cohort_1_2024': 'Cohort 1 2024',
    'cohort_2_2024': 'Cohort 2 2024',
    'cohort_3_2024': 'Cohort 3 2024',
    'cohort_1_2025': 'Cohort 1 2025',
    'cohort_2_2025': 'Cohort 2 2025'
}

for name, df in cohort_dfs.items():
    # The stray column is labelled with the object None, not the string
    # 'None', so the label must be None for this drop to match anything.
    # errors='ignore' keeps the cell re-runnable once the column is gone.
    df.drop(columns=[None], errors='ignore', inplace=True)
    df['Building Code'] = df['BUILDING CODE'].astype(str).str.strip()
    df['Room'] = df['ROOM'].astype(str).str.strip()
    # record which cohort every row came from
    df['cohort'] = cohort_labels[name]

# see totals per cohort
print("Individual Cohort Totals:")
for name, df in cohort_dfs.items():
    print(f"{cohort_labels[name]}: {df['Building Code'].nunique()} buildings, {len(df)} rooms")

# combine by year
batch_2024 = pd.concat([
    cohort_1_2024_df,
    cohort_2_2024_df,
    cohort_3_2024_df
], ignore_index=True)

batch_2025 = pd.concat([
    cohort_1_2025_df,
    cohort_2_2025_df
], ignore_index=True)

print(f"\n2024 Batch Total: {batch_2024['Building Code'].nunique()} buildings, {len(batch_2024)} rooms")
print(f"2025 Batch Total: {batch_2025['Building Code'].nunique()} buildings, {len(batch_2025)} rooms")

# if duplicate - keep 2025 values, drop 2024 duplicates
# (a room is identified by its building code plus its room label)
batch_2024['room_key'] = batch_2024['Building Code'] + '_' + batch_2024['Room']
batch_2025['room_key'] = batch_2025['Building Code'] + '_' + batch_2025['Room']

duplicate_keys = set(batch_2024['room_key']) & set(batch_2025['room_key'])
print(f"\nDuplicate Building+Room combinations across batches: {len(duplicate_keys)}")

# remove duplicates from 2024 (keep 2025 values instead)
batch_2024_deduped = batch_2024[~batch_2024['room_key'].isin(duplicate_keys)].copy()

print(f"2024 Batch after removing duplicates: {batch_2024_deduped['Building Code'].nunique()} buildings, {len(batch_2024_deduped)} rooms")
print(f"Rows removed from 2024 batch: {len(batch_2024) - len(batch_2024_deduped)}")

# combine all years (with 2025 taking precedence)
all_cohorts = pd.concat([batch_2024_deduped, batch_2025], ignore_index=True)

print(f"\nTotal Combined (deduplicated): {all_cohorts['Building Code'].nunique()} buildings, {len(all_cohorts)} rooms")

# clean up temporary key column
all_cohorts = all_cohorts.drop(columns=['room_key'])

print("\n" + "="*60)
print("UNIQUE VALUES IN EACH COLUMN:")
print("="*60)

# show the distinct values (including missing ones) of every status column
for status_column in (
    'DETERIORATED PAINT',
    'PRESENCE OF LBP IN DETERIORATED PAINT',
    'REMEDIATION REQUIRED',
    'REMEDIATION COMPLETE',
):
    print(f"\n{status_column} values:")
    print(all_cohorts[status_column].value_counts(dropna=False))

print("="*60)


def _count_yes(values):
    """Return how many entries of *values* equal the string 'YES'."""
    return (values == 'YES').sum()


# aggregate by building: number of room rows plus a 'YES' count per status column
building_summary = all_cohorts.groupby('Building Code').agg(
    **{
        'Total Rooms': ('Room', 'count'),
        'Deteriorated Paint': ('DETERIORATED PAINT', _count_yes),
        'Lead Based Paint': ('PRESENCE OF LBP IN DETERIORATED PAINT', _count_yes),
        'Remediation Required': ('REMEDIATION REQUIRED', _count_yes),
        'Remediation Complete': ('REMEDIATION COMPLETE', _count_yes),
    }
).reset_index()

print(f"\nBuildings in final summary: {len(building_summary)}")

# add nested room details (includes which cohort each room is from)
# The records are looked up by building code instead of being attached by
# position, so the result does not depend on two separate groupbys happening
# to produce their groups in the same order.
room_detail_columns = ['ROOM', 'DETERIORATED PAINT', 'PRESENCE OF LBP IN DETERIORATED PAINT',
                       'REMEDIATION REQUIRED', 'REMEDIATION COMPLETE', 'cohort']
room_details_by_building = {
    building_code: rooms[room_detail_columns].to_dict('records')
    for building_code, rooms in all_cohorts.groupby('Building Code')
}
building_summary['Room Details'] = building_summary['Building Code'].map(room_details_by_building.get)

# merge into gdf
gdf['Building Code'] = gdf['Building Code'].astype(str).str.strip()

# Drop summary columns left over from a previous run of this cell so the merge
# does not produce suffixed duplicate columns.
cols_to_drop = [col for col in gdf.columns if col in building_summary.columns and col != 'Building Code']
gdf = gdf.drop(columns=cols_to_drop, errors='ignore')

# left merge: every gdf row is kept; rows without cohort data get NaN summary values
gdf = gdf.merge(building_summary, on='Building Code', how='left')

print("\nFinal gdf shape:", gdf.shape)
print(f"Buildings with cohort data: {gdf['Total Rooms'].notna().sum()}")

Individual Cohort Totals:
Cohort 1 2024: 860 buildings, 16787 rooms
Cohort 2 2024: 875 buildings, 8618 rooms
Cohort 3 2024: 806 buildings, 7072 rooms
Cohort 1 2025: 850 buildings, 15982 rooms
Cohort 2 2025: 851 buildings, 9368 rooms

2024 Batch Total: 906 buildings, 32477 rooms
2025 Batch Total: 861 buildings, 25350 rooms

Duplicate Building+Room combinations across batches: 18127
2024 Batch after removing duplicates: 238 buildings, 1035 rooms
Rows removed from 2024 batch: 31442

Total Combined (deduplicated): 909 buildings, 26385 rooms

UNIQUE VALUES IN EACH COLUMN:

DETERIORATED PAINT values:
DETERIORATED PAINT
NO     26039
YES      346
Name: count, dtype: int64

PRESENCE OF LBP IN DETERIORATED PAINT values:
PRESENCE OF LBP IN DETERIORATED PAINT
NA     26039
NO       198
YES      148
Name: count, dtype: int64

REMEDIATION REQUIRED values:
REMEDIATION REQUIRED
NA     26039
NO       198
YES      148
Name: count, dtype: int64

REMEDIATION COMPLETE values:
REMEDIATION COMPLETE
NA     262

In [104]:
import folium
from folium import plugins
import pandas as pd

# Keep only the rows that have a geometry and reproject them to EPSG:4326.
has_geometry = gdf.geometry.notna()
map_data = gdf.loc[has_geometry].copy().to_crs(epsg=4326)

# Marker position: the centroid of each geometry (y -> Latitude, x -> Longitude).
centroids = map_data.geometry.centroid
map_data['Latitude'] = centroids.y
map_data['Longitude'] = centroids.x

has_room_data = map_data['Total Rooms'].notna()
print(f"Buildings with geometry: {len(map_data)}")
print(f"Buildings with Total Rooms data: {has_room_data.sum()}")

# Only rows that received a room count are drawn on the map.
map_data = map_data.loc[has_room_data].copy()

print(f"Buildings to map: {len(map_data)}")
for summary_text, count_column in (
    ("\nBuildings with Lead Based Paint", 'Lead Based Paint'),
    ("Buildings with Deteriorated Paint", 'Deteriorated Paint'),
    ("Buildings requiring Remediation", 'Remediation Required'),
    ("Buildings with Remediation Complete", 'Remediation Complete'),
):
    print(f"{summary_text}: {(map_data[count_column] > 0).sum()}")

if len(map_data):
    first_building = map_data.iloc[0]
    print(f"\nFirst building coordinates: {first_building['Latitude']}, {first_building['Longitude']}")

# Base map with the initial view set to latitude 40.7128, longitude -74.0060.
nyc_map = folium.Map(location=[40.7128, -74.0060], zoom_start=11, tiles='OpenStreetMap')

def get_color_for_status(row):
    """Return the hex marker colour for one building row.

    The checks run in priority order and the first match wins:
    more rooms requiring remediation than completed -> red, any completed
    remediation -> green, any lead-based paint -> orange, any deteriorated
    paint -> yellow, any rooms at all -> blue, otherwise grey.
    """
    # Outstanding remediation outranks every other status.
    if row['Remediation Required'] > row['Remediation Complete']:
        return '#e74c3c'
    if row['Remediation Complete'] > 0:
        return '#27ae60'
    if row['Lead Based Paint'] > 0:
        return '#f39c12'
    if row['Deteriorated Paint'] > 0:
        return '#e8b623'
    if row['Total Rooms'] > 0:
        return '#3498db'
    # Nothing matched: no rooms recorded for this row.
    return '#bdc3c7'

# Count columns that get a popup line only when the count is above zero,
# in the order the lines appear in the popup.
popup_count_fields = (
    'Deteriorated Paint',
    'Lead Based Paint',
    'Remediation Required',
    'Remediation Complete',
)

# Draw one circle marker per building row, coloured by its status.
for idx, building in map_data.iterrows():
    marker_color = get_color_for_status(building)

    # Popup header: building code and total room count.
    popup_html = f"""
    <b>Building Code: {building['Building Code']}</b><br>
    <b>Total Rooms: {int(building['Total Rooms'])}</b><br>
    <hr style="margin: 5px 0;">
    """

    # Append one line for every non-zero count.
    popup_html += "".join(
        f"<b>{field}: {int(building[field])} rooms</b><br>"
        for field in popup_count_fields
        if building[field] > 0
    )

    marker = folium.CircleMarker(
        location=[building['Latitude'], building['Longitude']],
        radius=6,
        weight=2,
        color=marker_color,
        fill=True,
        fillColor=marker_color,
        fillOpacity=0.8,
        popup=folium.Popup(popup_html, max_width=300),
    )
    marker.add_to(nyc_map)

# Static HTML legend: a fixed-position white box near the bottom-right corner
# with one coloured bullet per status label. The colours are the same hex
# codes that get_color_for_status returns.
legend_html = '''
<div style="position: fixed; 
            bottom: 50px; right: 50px; width: 240px; height: 180px; 
            background-color: white; border:2px solid grey; z-index:9999; 
            font-size:14px; padding: 10px">
<p style="margin-bottom: 8px;"><b>Lead Paint Status</b></p>
<p style="margin: 5px;"><span style="color: #27ae60;">●</span> Remediation Complete</p>
<p style="margin: 5px;"><span style="color: #e74c3c;">●</span> Remediation Required</p>
<p style="margin: 5px;"><span style="color: #f39c12;">●</span> Lead Based Paint Present</p>
<p style="margin: 5px;"><span style="color: #e8b623;">●</span> Deteriorated Paint Only</p>
<p style="margin: 5px;"><span style="color: #3498db;">●</span> No Issues Found</p>
<p style="margin: 5px;"><span style="color: #bdc3c7;">●</span> No Data</p>
</div>
'''
# Attach the legend markup to the map's root HTML element.
nyc_map.get_root().html.add_child(folium.Element(legend_html))

# Bare expression as the last statement of the cell — presumably so the
# notebook renders the map as the cell output.
nyc_map

Buildings with geometry: 1950
Buildings with Total Rooms data: 1047
Buildings to map: 1047

Buildings with Lead Based Paint: 83
Buildings with Deteriorated Paint: 141
Buildings requiring Remediation: 83
Buildings with Remediation Complete: 83

First building coordinates: 40.64895900007969, -74.01142000011585


In [106]:
# Write the merged table (school layer plus per-building summary) to CSV,
# leaving out the DataFrame index.
output_file = output_dir.joinpath("schools_with_lead_in_paint.csv")
gdf.to_csv(path_or_buf=output_file, index=False)