# Join Processed Data to School Points

In [None]:
import pandas as pd
import geopandas as gpd

# School point layer that every later join in this notebook attaches to.
school_points_path = '../data/processed_data/school_points_with_lcgms.shp'
schools = gpd.read_file(school_points_path)

In [None]:
# Lift the column display cap so wide frames are shown in full, then preview
# the school points.
pd.options.display.max_columns = None
schools

## Join Boroughs

Borough names were lost at an earlier processing stage, so they are brought back in here via a spatial join.

In [None]:
# Borough polygons: keep only the name and geometry, reprojected to the CRS of
# the school points so the spatial predicate compares like with like.
borough_shapes = gpd.read_file('../data/raw_data/NYC Planning/nybb_25c/nybb.shp')
boroughs = borough_shapes[['BoroName', 'geometry']].to_crs(schools.crs)

# Left join keeps every school; a school that falls inside no borough polygon
# gets a missing BoroName. The join's bookkeeping column is dropped.
schools_with_borough = gpd.sjoin(schools, boroughs, how='left', predicate='within')
master_schools = schools_with_borough.drop(columns=['index_right'])

## Join DACs to Schools

In [None]:
# Load the DAC polygons and reproject them to the school CRS, as is done for the
# borough and council-district layers. Spatial predicates compare raw
# coordinates, so both layers must share a CRS; this is a no-op when the file is
# already stored in that CRS.
dacs = gpd.read_file('../data/processed_data/dac_nyc_lite.geojson').to_crs(schools.crs)

In [None]:
# Check that there aren't any public schools exactly on the border of a DAC.
# "covered_by" counts boundary points while "within" does not, so equal counts
# mean no school lies on a boundary. The union is dissolved once and reused for
# both vectorised predicates.
dac_union = dacs.union_all()
n_schools_covered = schools.geometry.covered_by(dac_union).sum()
n_schools_within = schools.geometry.within(dac_union).sum()
assert n_schools_covered == n_schools_within, (
    f"{n_schools_covered - n_schools_within} school(s) lie exactly on a DAC boundary"
)

In [None]:
# Join onto master_schools (which already carries BoroName from the borough
# join) rather than the raw schools layer; starting again from `schools` would
# silently discard the borough column.
master_schools = gpd.sjoin(master_schools, dacs, how='left', predicate='within')
master_schools = master_schools.drop(columns=['index_right', 'county', 'geoid'])

# Schools that fall inside no DAC polygon come back with a missing designation;
# record them explicitly as not being in a DAC.
master_schools['dac_designation'] = master_schools['dac_designation'].fillna(False)

## Join Election Results to Schools

In [None]:
# Polygons carrying the ZohranFirstRoundFrac value used by the join below.
primary_results_path = '../data/processed_data/zohran_first_round_frac.geojson'
primary_results = gpd.read_file(primary_results_path)

In [None]:
# sjoin_nearest needs a projected (planar) CRS to measure distances, so both
# layers are temporarily moved to Web Mercator (EPSG:3857) and restored below.
# NOTE(review): Web Mercator inflates absolute distances at NYC's latitude. It
# is only used here to pick the closest polygon; switch to a local CRS (e.g.
# EPSG:2263) if the distances themselves are ever reported.
master_schools_og_crs = master_schools.crs
primary_results_og_crs = primary_results.crs
master_schools = master_schools.to_crs('EPSG:3857')
primary_results = primary_results.to_crs('EPSG:3857')

# First do regular spatial join
master_schools = gpd.sjoin(master_schools, primary_results, how='left', predicate='within').drop(columns=['index_right'])

# Find unmatched schools
unmatched_mask = master_schools['ZohranFirstRoundFrac'].isna()
unmatched_schools = master_schools[unmatched_mask].copy()

print(f"Found {unmatched_mask.sum()} schools without polygon matches, using nearest neighbor...")

if unmatched_mask.any():
    # Use sjoin_nearest for unmatched schools. Only the value column and the
    # geometry are taken from the right side so no other columns collide.
    nearest_join = gpd.sjoin_nearest(
        unmatched_schools.drop(columns='ZohranFirstRoundFrac'),
        primary_results[['ZohranFirstRoundFrac', primary_results.geometry.name]],
        how='left',
    )

    # A school equidistant from several polygons comes back once per polygon.
    # Keep one value per school and align on the index rather than assigning
    # positionally, so ties can neither raise on a length mismatch nor shift
    # values onto the wrong school.
    nearest_frac = nearest_join['ZohranFirstRoundFrac']
    nearest_frac = nearest_frac[~nearest_frac.index.duplicated(keep='first')]
    master_schools.loc[unmatched_mask, 'ZohranFirstRoundFrac'] = (
        nearest_frac.reindex(master_schools.index[unmatched_mask]).to_numpy()
    )

assert not master_schools['ZohranFirstRoundFrac'].isna().any()

# Reproject back to original CRS
master_schools = master_schools.to_crs(master_schools_og_crs)
primary_results = primary_results.to_crs(primary_results_og_crs)

## Join IBO School Barriers Data

This dataset has capacity/utilization, percentage of space with A/C, building accessibility, and some other really cool fields, and they're all already joined to location code, so hopefully it will join well.

NOTE: looks like IBO only included schools that were in all of the datasets they were joining (i.e. inner join for every join). This means if we go to the source data and do left joins instead, we might get better results. See footnotes on data sources [here](https://www.ibo.nyc.gov/content/publications/2025-march-barriers-to-learning-age-accessibility-space-usage-and-air-conditioning-in-nyc-school-buildings)

In [None]:
# TODO: go back and get the original sources of all the data in IBO dataset to see if we can get better coverage.
ibo_workbook_path = '../data/raw_data/IBO/IBO-barriers-to-learning-data-file.xlsx'
ibo_barriers = pd.read_excel(ibo_workbook_path, sheet_name='DATA')

# NOTE: there are only 1309 records in IBO data.
# Share of IBO building codes that also appear among the school points.
ibo_code_found = ibo_barriers['building_code'].isin(master_schools['Bldg_Code'])
print("Pct match from IBO to master_schools:", ibo_code_found.sum() / len(ibo_barriers))

# Recode the Y/N flag as 1/0; any other value becomes missing.
ibo_barriers['central_ac'] = ibo_barriers['central_ac'].map({'Y': 1, 'N': 0})

# Columns carried into the master table, assembled group by group. The order
# matches the order wanted in the output.
ibo_building_cols = ['building_ownership_description', 'yearbuilt', 'age', 'bap_rating']
ibo_access_cols = ['Accessibility_Description']
ibo_capacity_cols = ['bldg_enroll', 'target_bldg_cap', 'utilization', 'overutilized']
ibo_space_cols = ['per_area_PHYS_ED', 'central_ac', 'per_ac_area_total']
ibo_cols_of_interest = [
    'building_code',  # join key, dropped again after the merge
    *ibo_building_cols,
    *ibo_access_cols,
    *ibo_capacity_cols,
    *ibo_space_cols,
]
ibo_barriers = ibo_barriers[ibo_cols_of_interest]

In [None]:
# Left merge keeps every school row; schools whose building code is absent from
# the IBO file get missing values in the IBO columns. The duplicate key column
# from the right side is dropped afterwards.
schools_with_ibo = master_schools.merge(
    ibo_barriers,
    how='left',
    left_on='Bldg_Code',
    right_on='building_code',
)
master_schools = schools_with_ibo.drop(columns=['building_code'])

## Join City Council Districts

In [None]:
# Council district polygons, reprojected to the CRS of the school points.
council_districts = gpd.read_file('../data/processed_data/city_council_districts.geojson')
council_districts = council_districts.to_crs(master_schools.crs)

# Attach the containing district to each school, then drop the join's
# bookkeeping column and the district columns that are not needed downstream.
unwanted_district_cols = ['index_right', 'BOROUGH', 'Shape_Leng', 'Shape_Area']
schools_with_council = gpd.sjoin(master_schools, council_districts, how='left', predicate='within')
master_schools = schools_with_council.drop(columns=unwanted_district_cols)

# Export Joined Data

## Shorten Columns for Shapefile Limit

In [None]:
# Shapefile field names are capped at 10 characters, so every longer column
# gets a short alias. The aliases are grouped by the dataset they came from.
dac_aliases = {
    'dac_designation': 'in_dac',
    'combined_score': 'comb_score',
    'percentile_rank_combined_nyc': 'pctl_comb',
    'burden_score': 'burd_score',
    'burden_score_percentile': 'pctl_burd',
    'vulnerability_score': 'vuln_score',
    'vulnerability_score_percentile': 'pctl_vuln',
}
primary_aliases = {
    'ZohranFirstRoundFrac': 'ZohrPrimR1',
}
ibo_aliases = {
    'age': 'Bldg_Age',
    'building_ownership_description': 'Bldg_Owner',
    'bldg_enroll': 'BldgEnroll',
    'target_bldg_cap': 'BldgCapac',
    'utilization': 'Util',
    'overutilized': 'Overutil',
    'Accessibility_Description': 'Accessible',
    'per_area_PHYS_ED': 'PctAreaPE',
    'per_ac_area_total': 'PctAreaAC',
}
council_aliases = {
    'NAME': 'CouncName',
    'POLITICAL PARTY': 'CouncParty',
    'DISTRICT OFFICE ADDRESS': 'CouncAddr',
    'DISTRICT OFFICE PHONE': 'CouncPhone',
}
shortened_cols = {**dac_aliases, **primary_aliases, **ibo_aliases, **council_aliases}

# Dry run of the rename: report any column that would still exceed the limit.
renamed_preview = master_schools.rename(columns=shortened_cols)
for col in renamed_preview.columns:
    if len(col) <= 10:
        continue
    print(f"{col} too long: currently {len(col)} chars")

In [None]:
# Apply the short aliases so every column name fits the Shapefile field limit.
master_schools = master_schools.rename(shortened_cols, axis='columns')

## Export to Shapefile

In [None]:
import zipfile
import os
# Write the shapefile first, rows ordered by location code.
shp_path = '../data/processed_data/master_schools.shp'
schools_by_loc_code = master_schools.sort_values('Loc_Code')
schools_by_loc_code.to_file(shp_path, driver='ESRI Shapefile')

# Then bundle the shapefile's component files into one zip archive.
base_name = '../data/processed_data/master_schools'
zip_path = '../data/processed_data/master_schools.zip'

# Component files to bundle; any that were not written are skipped.
extensions = ['.shp', '.shx', '.dbf', '.prj', '.cpg']

with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for ext in extensions:
        file_path = base_name + ext
        if not os.path.exists(file_path):
            continue
        # Store under the bare file name so the archive has no directory prefix
        archive_name = os.path.basename(file_path)
        zipf.write(file_path, archive_name)
        print(f"Added {archive_name} to zip")

print(f"Shapefile saved as zip: {zip_path}")

## Export to GeoJSON

In [None]:
# Write the same table, ordered by location code, as GeoJSON.
geojson_path = '../data/processed_data/master_schools.geojson'
master_schools.sort_values('Loc_Code').to_file(geojson_path, driver='GeoJSON')