# Disadvantaged Communities (DAC)
Source: https://data.ny.gov/Energy-Environment/Final-Disadvantaged-Communities-DAC-2023/2e6c-s6fp/about_data

Last updated: July 1, 2024

In [None]:
import geopandas as gpd
import numpy as np
import pandas as pd

## Process Data

In [None]:
# Load the raw statewide DAC census-tract layer.
dac_gdf = gpd.read_file('..//data/raw_data/NYSERDA/Final Disadvantaged Communities (DAC) 2023_20250731.geojson')

# The five counties (boroughs) that make up New York City.
nyc_counties = ['New York', 'Kings', 'Queens', 'Bronx', 'Richmond']
in_nyc_region = dac_gdf['nyc_region'] == 'NYC'
in_nyc_county = dac_gdf['county'].isin(nyc_counties)

# Sanity check that the two ways of identifying NYC tracts agree in both directions:
# every 'nyc_region' == 'NYC' record lies in an NYC county ...
assert in_nyc_county[in_nyc_region].all(), \
    "Found records with nyc_region == 'NYC' whose county is not one of the five NYC counties"
# ... and every record in an NYC county is flagged 'nyc_region' == 'NYC'.
assert in_nyc_region[in_nyc_county].all(), \
    "Found records in an NYC county whose nyc_region is not 'NYC'"

# Filter to only DAC census tracts within NYC.
# Take an explicit copy so the column edits made later operate on an independent
# frame instead of a selection of the raw data (avoids pandas' SettingWithCopyWarning).
dac_gdf = dac_gdf[in_nyc_region].copy()

dac_gdf = dac_gdf.rename(columns={
    # This rename of percentile_rank_combined cols is based on order of appearance in data dictionary. NYSERDA_FINALDisadvantagedCommuniesDAC_DataDictionary.pdf
    'percentile_rank_combined': 'percentile_rank_combined_statewide',
    'percentile_rank_combined_1': 'percentile_rank_combined_nyc',
    'percentile_rank_combined_2': 'percentile_rank_combined_ros'
})

Drop unnecessary columns

In [None]:
# Drop all columns that have only one unique value (they carry no information
# once the data is restricted to NYC). Collect the names first and drop them in
# a single call instead of mutating the frame while looping over its columns.
constant_cols = []
for col in dac_gdf.columns:
    distinct_values = dac_gdf[col].unique()
    if distinct_values.size == 1:
        print(f"Dropping column `{col}` because it has only one unique value: {distinct_values[0]}")
        constant_cols.append(col)
dac_gdf = dac_gdf.drop(columns=constant_cols)

# Drop unnecessary columns.
# errors='ignore' keeps this step from raising KeyError when one of these
# columns was already removed above for being constant.
cols_to_drop = ['percentile_rank_combined_statewide', 'urban_rural']
dac_gdf = dac_gdf.drop(columns=cols_to_drop, errors='ignore')

Fix dtypes

In [None]:
# Convert dac_designation and household_low_count_flag to boolean by comparing
# against the text label that means "true" in the source data.
# The dtype guard makes the cell safe to re-run: comparing an already-boolean
# column to the label string would otherwise set every row to False.
true_label_by_col = {
    'dac_designation': 'Designated as DAC',
    'household_low_count_flag': 'Yes',
}
for col, true_label in true_label_by_col.items():
    if not pd.api.types.is_bool_dtype(dac_gdf[col]):
        dac_gdf[col] = dac_gdf[col] == true_label

# Convert all columns to numeric except those that are non-numeric, based on data dict
non_numeric_cols = ['county', 'dac_designation', 'geoid', 'household_low_count_flag', 'geometry']
for col in dac_gdf.columns:
    if col in non_numeric_cols:
        continue
    missing_before = dac_gdf[col].isna().sum()
    try:
        converted = pd.to_numeric(dac_gdf[col], errors='coerce')
    except ValueError:
        print(f"Column `{col}` could not be converted to numeric.")
        continue
    # errors='coerce' turns unparseable values into NaN without raising, so
    # report any newly introduced NaNs instead of losing data silently.
    newly_missing = converted.isna().sum() - missing_before
    if newly_missing > 0:
        print(f"Column `{col}`: {newly_missing} non-numeric value(s) were coerced to NaN.")
    dac_gdf[col] = converted


Re-order columns

In [None]:
# Column groups used to order the final table. Several of these lists are also
# reused by later cells (map popup, lite export), so their names must stay stable.
core_cols = ['dac_designation', 'combined_score', 'percentile_rank_combined_nyc', 'burden_score', 'burden_score_percentile', 'vulnerability_score', 'vulnerability_score_percentile']
geographic_cols = ['county', 'geoid', 'geometry']
general_demo_cols = ['household_count', 'household_low_count_flag', 'population_count']

#############################################################################################
##### Indicators Considered to Represent Environmental Burdens and Climate Change Risks #####
#############################################################################################
pollution_exposure_cols = ['benzene_concentration', 'homes_built_before_1960', 'particulate_matter_25', 'traffic_number_vehicles', 'traffic_truck_highways', 'wastewater_discharge']
land_use_cols = ['housing_vacancy_rate', 'industrial_land_use', 'oil_storage', 'power_generation_facilities', 'remediation_sites', 'rmp_sites', 'scrap_metal_processing']
climate_risk_cols = ['agricultural_land_use', 'coastal_flooding_storm_risk', 'days_above_90_degrees_2050', 'drive_time_healthcare', 'inland_flooding_risk', 'low_vegetative_cover']

####################################################################################
##### Indicators Considered to Represent Population and Health Vulnerabilities #####
####################################################################################
# NOTE: Z campaign might really like using 'renter_percent' and 'home_energy_affordability' columns
income_edu_empl_cols = ['household_single_parent', 'lmi_80_ami', 'lmi_poverty_federal', 'population_no_college', 'unemployment_rate']
race_lang_cols = ['asian_percent', 'black_african_american_percent', 'english_proficiency', 'latino_percent', 'native_indigenous', 'redlining_updated']
health_cols = ['age_over_65','asthma_ed_rate', 'copd_ed_rate', 'health_insurance_rate', 'households_disabled', 'low_birth_weight', 'mi_hospitalization_rate', 'premature_deaths']
housing_cols = ['home_energy_affordability', 'renter_percent', 'rent_percent_income', 'mobile_homes', 'internet_access']


# Combine all defined column lists, in the order the columns should appear
all_defined_cols_in_order = core_cols + pollution_exposure_cols + land_use_cols + climate_risk_cols + income_edu_empl_cols + race_lang_cols + health_cols + housing_cols + general_demo_cols + geographic_cols

# Validate the lists against the frame in both directions, naming the offending
# columns so a schema change in the source data is easy to diagnose.
defined_col_set = set(all_defined_cols_in_order)
uncategorized_cols = [col for col in dac_gdf.columns if col not in defined_col_set]
assert not uncategorized_cols, f"Columns not assigned to any category list: {uncategorized_cols}"
absent_cols = [col for col in all_defined_cols_in_order if col not in dac_gdf.columns]
assert not absent_cols, f"Columns listed in a category but missing from the data: {absent_cols}"
# A name listed twice would be selected twice and produce duplicate columns.
assert len(defined_col_set) == len(all_defined_cols_in_order), "A column appears in more than one category list"

# Re-order columns
# TODO: Should I do multi-index to capture the different categories of columns?
dac_gdf = dac_gdf[all_defined_cols_in_order]

In [None]:
# Display the module that defines dac_gdf's class
# (presumably to confirm it is still a geopandas object after the column re-selection).
type(dac_gdf).__module__

In [None]:
# Preview the first five rows of the processed table.
dac_gdf.head(n=5)

## Sanity Check: Visualize Data with Geopandas

In [None]:
# Interactive map of the NYC tracts, shaded by DAC designation.
# `column=` selects the variable to shade by; `color=` expects literal colour
# values, and `legend=True` only takes effect when `column=` is given.
dac_gdf.explore(
    column='dac_designation',
    categorical=True,  # treat True/False as two classes rather than a numeric scale
    tiles='CartoDB positron',
    popup=housing_cols,  # Show these fields on click
    tooltip=['dac_designation', 'percentile_rank_combined_nyc', 'combined_score'],  # Show on hover
    legend=True,
    style_kwds={'fillOpacity': 0.7, 'weight': 1},
)

## Export to `processed_data` folder as GeoJSON

In [None]:
# Full export: every categorized column for all rows currently in dac_gdf.
full_output_path = '../data/processed_data/dac_nyc_full.geojson'
dac_gdf.to_file(full_output_path, driver='GeoJSON')

# Lite export: only rows where dac_designation is True, restricted to the
# core and geographic columns.
lite_cols = core_cols + geographic_cols
is_designated = dac_gdf['dac_designation'] == True
designated_gdf = dac_gdf[is_designated]
dac_gdf_lite = designated_gdf[lite_cols]
lite_output_path = '../data/processed_data/dac_nyc_lite.geojson'
dac_gdf_lite.to_file(lite_output_path, driver='GeoJSON')