In [None]:
import geopandas as gpd
import numpy as np
import pandas as pd

## Load Public School Points

In [None]:
# Load the processed school points layer.
# NOTE: `gpd.read_file` has no `dtype` parameter. Extra keyword arguments are
# forwarded to the I/O engine as dataset open options, so the previous
# `dtype={'Zip': str, 'Borough Block Lot': str}` mapping never changed any
# column type. Code-like columns may therefore still be numeric after loading
# and have to be converted to strings explicitly wherever that matters.
school_points = gpd.read_file('../data/processed_data/school_points_with_lcgms.geojson')

# Work in WGS84 (EPSG:4326); reproject only when the file uses another CRS.
if school_points.crs != "EPSG:4326":
    school_points = school_points.to_crs("EPSG:4326")


In [None]:
# Keep only the rows whose "Managed By Name" is 'DOE' and renumber them from 0.
is_doe_managed = school_points["Managed By Name"].eq('DOE')
public_school_points = school_points.loc[is_doe_managed].reset_index(drop=True)

## Load DACs

In [None]:
# TODO: fix the dtypes in DAC exports so that BBL, ZIP, and other code cols are string
dacs = gpd.read_file('../data/processed_data/dac_nyc_lite.geojson')

# The school points are normalised to EPSG:4326 right after loading; do the same
# here so the spatial predicates and joins below compare geometries in one CRS.
if dacs.crs != "EPSG:4326":
    dacs = dacs.to_crs("EPSG:4326")

### Add Flag to Public Schools if in a DAC

In [None]:
# Check that there aren't any public schools exactly on the border of a DAC.
# `covered_by` counts points in the interior OR on the boundary of the dissolved
# DAC geometry, while `within` excludes boundary points, so the two counts are
# equal only when no school point lies exactly on a DAC border.
dac_union = dacs.union_all()  # dissolve once and reuse for both predicates
n_covered = public_school_points.geometry.covered_by(dac_union).sum()
n_within = public_school_points.geometry.within(dac_union).sum()
assert n_covered == n_within, (
    f"{n_covered - n_within} public school point(s) lie exactly on a DAC border"
)

In [None]:
# Flag each public school whose point is within the dissolved (unioned) DAC geometry.
dac_footprint = dacs.union_all()
public_school_points['in_dac'] = public_school_points.geometry.within(dac_footprint)

In [None]:
# Share of public schools flagged as in a DAC: flagged rows divided by all rows.
n_schools_in_dac = public_school_points['in_dac'].sum()
n_public_schools = len(public_school_points)
print("proportion of public schools in a DAC:", n_schools_in_dac / n_public_schools)

Show the top 25 schools by composite DAC score (ordered by `percentile_rank_combined_nyc`, highest first)

In [None]:
# Left spatial join: every public school row is kept, and the DAC attributes are
# attached wherever the school point is within a DAC polygon.
public_schools_with_dacs = public_school_points.sjoin(dacs, how='left', predicate='within')

In [None]:
# Schools that matched no DAC in the left join have a missing 'dac_designation';
# treat them as not designated. Comparing against True (missing -> False) yields
# a genuine boolean column, whereas `fillna(False)` on the object column produced
# by the join relies on pandas silently downcasting the result, which is deprecated.
# NOTE(review): assumes the DAC layer stores 'dac_designation' as booleans - confirm.
public_schools_with_dacs['dac_designation'] = public_schools_with_dacs['dac_designation'].eq(True)

In [None]:
# Keep only the rows flagged by 'dac_designation', ordered from the highest to the
# lowest 'percentile_rank_combined_nyc'.
is_designated = public_schools_with_dacs['dac_designation']
public_schools_with_dacs = (
    public_schools_with_dacs
    .loc[is_designated]
    .sort_values(by='percentile_rank_combined_nyc', ascending=False)
)

In [None]:
# TODO: fix this in the processed school points export
def _zero_padded_code(values, width):
    """Return numeric-looking codes as zero-padded strings of ``width`` characters.

    Parameters
    ----------
    values : pandas.Series
        Codes stored as numbers or as digit strings.
    width : int
        Minimum length of each resulting string; shorter codes are left-padded with '0'.

    Returns
    -------
    pandas.Series
        Codes as strings. Missing values stay missing instead of raising, which
        a plain ``astype(int)`` would do on NaN.

    Raises
    ------
    ValueError
        If a non-missing value cannot be parsed as a number.
    """
    numeric = pd.to_numeric(values)
    # na_action='ignore' skips missing entries so int() is only applied to real codes
    return numeric.map(lambda code: str(int(code)).zfill(width), na_action='ignore')


public_schools_with_dacs['Zip'] = _zero_padded_code(public_schools_with_dacs['Zip'], 5)
public_schools_with_dacs['Borough Block Lot'] = _zero_padded_code(public_schools_with_dacs['Borough Block Lot'], 10)
# Mark every row whose geometry also appears on at least one other row
# (keep=False flags all members of a duplicate group, not just the repeats).
public_schools_with_dacs['duplicate_geometry'] = public_schools_with_dacs['geometry'].duplicated(keep=False)

In [None]:
# Display the rows for these two location names.
barbosa_school_names = ['P.S. 112 Jose Celso Barbosa', 'P.S. 206 Jose Celso Barbosa']
public_school_points.loc[public_school_points['Location Name'].isin(barbosa_school_names)]

In [None]:
# Columns to display that come from the school points layer.
cols_to_keep_from_schools = [
    'Location Code',
    'Location Name',
    'Administrative District Name',
    'Principal Name',
    'Principal Phone Number',
    'Primary Address',
    'Zip',
    'Borough Block Lot',
]
# Score and percentile columns that come from the DAC layer.
cols_to_keep_from_dacs = [
    'combined_score',
    'percentile_rank_combined_nyc',
    'burden_score',
    'burden_score_percentile',
    'vulnerability_score',
    'vulnerability_score_percentile',
]
all_cols_to_keep = [*cols_to_keep_from_schools, *cols_to_keep_from_dacs]

# First 25 rows, with the geometry and the duplicate-geometry flag appended.
table_cols = [*all_cols_to_keep, 'geometry', 'duplicate_geometry']
public_schools_with_dacs[table_cols].head(25)

In [None]:
# Interactive map of the first 25 rows; the popup lists every kept column and the
# tooltip shows only the location name.
top_schools_for_map = public_schools_with_dacs[[*all_cols_to_keep, 'geometry']].head(25)
top_schools_for_map.explore(
    tiles='CartoDB positron',
    popup=all_cols_to_keep,
    tooltip=['Location Name'],
    legend=True,
    style_kwds={'fillOpacity': 0.7, 'weight': 1},
)

## Load Enrollment Capacity and Utilization Data


Questions: 
- Do we use building enrollment/capacity or school enrollment/capacity?
- What threshold do we use for pct utilization to indicate burden?
    - The language in the policy memo is "whether it meets the capacity of enrollment demands", which I interpret as utilization less than or equal to 100%. That still leaves us with a lot of options for where to set the threshold.

In [None]:
# Read the raw enrollment / capacity / utilization report and print quick size checks.
capacity_csv_path = '../data/raw_data/SCA/Capacity and Utilization/Enrollment_Capacity_And_Utilization_Reports_20250915.csv'
capacity_utilization_df = pd.read_csv(capacity_csv_path)

for summary_label, summary_count in (
    ('total records:', len(capacity_utilization_df)),
    ('unique buildings:', capacity_utilization_df['Bldg ID'].nunique()),
    ('unique organizations:', capacity_utilization_df['Organization Name'].nunique()),
):
    print(summary_label, summary_count)

In [None]:
# Fix data types: parse 'Data As Of' (month/day/4-digit-year text) into datetimes.
parsed_data_as_of = pd.to_datetime(capacity_utilization_df['Data As Of'], format='%m/%d/%Y')
capacity_utilization_df['Data As Of'] = parsed_data_as_of


In [None]:
# Deduplicate by taking the most recent record for each organization/building combination.
# - na_position='first': rows with a missing 'Data As Of' sort before dated rows, so
#   keep='last' never picks an undated record over a dated one (the default puts
#   missing dates last, which would make them win).
# - kind='stable': rows sharing the same date keep their original relative order,
#   so the retained record is deterministic across runs.
capacity_utilization_df = (
    capacity_utilization_df
    .sort_values('Data As Of', na_position='first', kind='stable')
    .drop_duplicates(subset=['Organization Name', 'Bldg ID'], keep='last')
)

In [None]:
# TODO: need to get 5K records down to 1.5K unique buildings

# Count the distinct buildings that have a record whose organization name equals
# the building name (837 buildings when this was written).
# ASSUMPTION: when org name==building name, this is the school whose capacity we care about.
org_is_building = capacity_utilization_df['Bldg Name'] == capacity_utilization_df['Organization Name']
capacity_utilization_df.loc[org_is_building, 'Bldg ID'].nunique()

In [None]:
# TODO: find buildings that don't have a matching organization name
# For each building name, report whether any of its records has an
# 'Organization Name' equal to that building name.
#
# The row-level comparison is done once, vectorised, and then reduced per building.
# This avoids `DataFrameGroupBy.apply` with a Python lambda, which is slower and
# emits a deprecation warning on recent pandas because it operates on the
# grouping column.
org_matches_building = capacity_utilization_df['Organization Name'] == capacity_utilization_df['Bldg Name']
org_matches_building.groupby(capacity_utilization_df['Bldg Name']).any()

In [None]:
# Inspect all records for one building name.
is_1368_fulton = capacity_utilization_df['Bldg Name'].eq('1368 FULTON STREET')
capacity_utilization_df.loc[is_1368_fulton]

In [None]:
# Show how many school codes / building IDs are unique to each source or shared.
# Outer-merge school 'Location Code' against capacity 'Bldg ID' (one row per
# building ID) and count the merge indicator categories.
school_codes = public_school_points[['Location Code', 'Location Name']]
capacity_buildings = capacity_utilization_df[['Bldg ID', 'Bldg Name']].drop_duplicates(subset='Bldg ID')
schools_capacity_merged = school_codes.merge(
    capacity_buildings,
    how='outer',
    left_on='Location Code',
    right_on='Bldg ID',
    indicator=True,
)
schools_capacity_merged['_merge'].value_counts()

In [None]:
# Coordinates and geometry for up to 20 schools whose 'Location Code' starts with "K414"
# (missing codes are treated as empty strings so they never match).
location_code_text = public_school_points['Location Code'].fillna('')
starts_with_k414 = location_code_text.str.startswith("K414")
public_school_points.loc[starts_with_k414].head(20)[['Latitude', 'Longitude', 'geometry']]

In [None]:
# Rows that appear only on the capacity/utilization side of the merge
# (a building ID with no matching school 'Location Code').
only_in_capacity = schools_capacity_merged['_merge'] == 'right_only'
schools_capacity_merged.loc[only_in_capacity]