In [1]:
import geopandas
import numpy as np
import pandas as pd
import json
from collections import defaultdict

from constants import *

### Global configuration
`geopandas.options.io_engine` - Determines the engine used for reading shapefiles. `fiona` is used by default; `pyogrio` needs to be installed separately and is faster.

In [2]:
# Read shapefiles with the pyogrio engine; it has to be installed separately from geopandas.
geopandas.options.io_engine = "pyogrio"

### Merging demographic and election data
Precinct-level data from the census dataset is merged with block-level data from the election dataset.

This will be done by determining which precinct each census block is contained within, and merging all blocks which are contained in the same precinct. The fields from each census block will be summed up to get the precinct-level election data.

The precinct boundaries in the census data file will be treated as canonical, since they are clean and do not contain holes or overlapping boundaries.

In [13]:
def get_precinct_data(
                        census_df, 
                        election_df, 
                        census_column_format=None, 
                        election_column_format=None, 
                        verbose=False, 
                        output_format=("merged_data",)
                    ):
    '''
    Merges a state's precinct-level census dataset and census block level voting dataset.

    Each election block is assigned to the census precinct that covers it (spatial join
    with the "covered_by" predicate) and the selected election columns are summed per precinct.
    Neither input frame is modified.

    Parameters
    ----------
    census_df : GeoDataFrame
        GeoDataFrame containing the state's precinct-level census data.
        Must have a "GEOID20" column and a "geometry" column.
    election_df : GeoDataFrame
        GeoDataFrame containing the state's block-level election data.
        Must have a "GEOID20" column and an active geometry.
    census_column_format : dict
        dict mapping census column names to the names they should be renamed to.
    election_column_format : dict
        dict mapping election column names to the names they should be renamed to.
        Its values also determine which columns are summed up when grouping under precincts.
        If None, no columns are renamed and every numeric column of `election_df` is summed.
    verbose : bool
        Indicates whether or not descriptive data for manual double-checking should be printed.
    output_format : list[str] or str
        Gives a list of items to be returned (a single string is treated as a one-item list).
        Allowed values include:
        'merged_data' - GeoDataFrame consisting of data merged to precinct-level
        'mappings' - dict mapping each precinct GeoID to the list of block GeoIDs assigned to it
                     (blocks without a precinct are left out; see 'unused_blocks')
        'unused_blocks' - GeoDataFrame containing blocks which could not be mapped to a precinct (if any)
    
    Returns
    -------
    List of selected output items (or a single object if only one output is selected)

    Raises
    ------
    ValueError
        If `output_format` contains a value that is not one of the allowed values.
    '''
    # Validate the requested outputs before doing any expensive work; an unknown entry
    # used to be dropped silently, which shifted the positions of the returned items.
    allowed_outputs = ("merged_data", "mappings", "unused_blocks")
    if isinstance(output_format, str):
        output_format = [output_format]
    else:
        output_format = list(output_format)
    unknown_outputs = [entry for entry in output_format if entry not in allowed_outputs]
    if unknown_outputs:
        raise ValueError(
            f"Unknown output_format entries {unknown_outputs}; allowed values are {list(allowed_outputs)}"
        )

    # Set up census dataframe (assign/set_index/rename all return new frames,
    # so the caller's GeoDataFrame is left untouched)
    census_df = census_df.assign(vtd_geo_id=census_df["GEOID20"]).set_index("GEOID20")
    if census_column_format:
        census_df = census_df.rename(columns=census_column_format)

    # Set up election dataframe
    election_df = election_df.set_index("GEOID20")
    if election_column_format is None:
        # No explicit selection: aggregate every numeric election column
        election_cols = list(election_df.select_dtypes(include="number").columns)
    else:
        if election_column_format:
            election_df = election_df.rename(columns=election_column_format)
        election_cols = list(election_column_format.values())
    
    # Join blocks onto the precinct-level DataFrame.
    # how="left" keeps blocks that are not covered by any precinct; their vtd_geo_id is NaN.
    joined_block_df = geopandas.sjoin(
                        election_df, 
                        census_df[["vtd_geo_id", "geometry"]], 
                        how="left", 
                        predicate="covered_by"
                    )
    
    # Sum the election columns per precinct; assignment aligns on the precinct GeoID index,
    # so precincts that received no blocks end up with NaN in these columns.
    precinct_df = census_df
    precinct_df.index = precinct_df.index.rename("vtd_geo_id")
    if election_cols:
        precinct_df[election_cols] = joined_block_df.groupby("vtd_geo_id")[election_cols].sum()

    unmatched_blocks = joined_block_df["vtd_geo_id"].isna()

    # Verbose output
    if verbose:
        print("Unused blocks: ", unmatched_blocks.sum())

    # Return output
    output = list()
    for entry in output_format:
        if entry == "merged_data":
            output.append(precinct_df)
        elif entry == "mappings":
            # Create dict indicating the blocks that correspond to each precinct.
            # Only matched blocks are included, so no NaN keys are created.
            blocks_by_precinct = dict()
            matched_precincts = joined_block_df.loc[~unmatched_blocks, "vtd_geo_id"]
            for block_geoid, precinct_geoid in matched_precincts.items():
                blocks_by_precinct.setdefault(precinct_geoid, []).append(block_geoid)
            output.append(blocks_by_precinct)
        else:  # "unused_blocks" (guaranteed by the validation above)
            output.append(joined_block_df.loc[unmatched_blocks])

    if len(output) == 1:
        return output[0]
    return output


In [4]:
def get_percentages(precinct_df, inplace=False):
    '''
    Adds share-of-population columns for each demographic group and two-party vote share columns.

    For every group, `pct_<group>` is `pop_<group> / pop_total` and `pct_<group>_18` is
    `pop_<group>_18 / pop_total_18`. `pct_vote_dem` and `pct_vote_rep` are each party's votes
    divided by the combined Democrat + Republican votes. All values are stored as fractions.

    Parameters
    ----------
    precinct_df : GeoDataFrame
        GeoDataFrame containing the state's merged precinct-level data.
    inplace: bool
        If True the columns are added to `precinct_df` itself; otherwise a copy is made first.
    
    Returns
    -------
    The frame holding the new columns (`precinct_df` itself when `inplace` is True, else the copy)
    '''
    result = precinct_df if inplace else precinct_df.copy()

    demographic_groups = (
        "white",
        "black",
        "native",
        "asian",
        "pacific",
        "two_or_more",
        "hispanic",
    )

    # First pass covers the whole population, second pass the 18-and-over population
    for age_suffix in ("", "_18"):
        for group in demographic_groups:
            group_share = result[f"pop_{group}{age_suffix}"] / result[f"pop_total{age_suffix}"]
            result[f"pct_{group}{age_suffix}"] = group_share

    # Vote shares only take Democrat and Republican voters into account
    two_party_votes = result["vote_dem"] + result["vote_rep"]
    for party in ("dem", "rep"):
        result[f"pct_vote_{party}"] = result[f"vote_{party}"] / two_party_votes

    return result

### Calculating adjacency

Converts adjacency data stored in CSV format to a dictionary of adjacency lists. Minimum thresholds can be set on the length of the shared border (relative to each precinct's boundary length, and in absolute terms) required for two precincts to be considered adjacent.

In [5]:
def validate_adjacency_data(adj_df, 
                            precinct_df, 
                            pct_adj_thresh=0.0, 
                            abs_adj_thresh=0.0, 
                            verbose=True, 
                            return_rejections=False):
    '''
    Checks an adjacency file, removing edges between precincts whose shared boundary is below some threshold.

    An edge between a precinct and one of its listed neighbors is kept when at least one
    of the following holds, and rejected otherwise:
      * `shared_boundary.length / precinct_boundary.length > pct_adj_thresh` for the current precinct
      * `shared_boundary.length / precinct_boundary.length > pct_adj_thresh` for the neighbor
      * `shared_boundary.length > abs_adj_thresh`
    All lengths are measured after reprojecting to the `{"proj": "cea"}` CRS.

    Parameters
    ----------
    adj_df : DataFrame
        DataFrame containing the state's precinct adjacency data. Needs a "GEOID20" column and
        an "ADJ_GEOMS" column holding a bracketed, comma-separated list of quoted GeoIDs
        (e.g. "['A', 'B']"; "[]" means no neighbors).
    precinct_df : GeoDataFrame
        GeoDataFrame indexed by GeoID containing the state's precinct boundaries.
    pct_adj_thresh : float
        Minimum share of a precinct's boundary that the shared boundary must exceed (see above).
    abs_adj_thresh : float
        Minimum shared boundary length that must be exceeded (see above), in the units of the
        "cea" projection (presumably meters - confirm against the CRS of the input data).
    verbose : bool
        Indicates whether debugging information should be printed.
    return_rejections : bool
        Indicates whether the list of rejected adjacency edges should be returned.
    
    Returns
    -------
    dict containing adjacency lists for each precinct (and dict of rejected edges if specified)

    Raises
    ------
    ValueError
        If, after filtering, some edge is present in only one direction (this includes edges
        pointing at a precinct that has no row of its own in `adj_df`).
    '''
    length_crs = {"proj": "cea"}

    # Pre-compute precinct boundary lengths
    prec_boundary_lengths = precinct_df.to_crs(crs=length_crs).length

    # Iterate over each adjacency list in the CSV file
    adjacency_lists = dict()
    rejections = dict()
    for _, row in adj_df.iterrows():
        # Parse GeoIDs; empty tokens are dropped so that "[]" yields no neighbors
        # instead of a single empty-string GeoID
        curr_geoid = str(row["GEOID20"])
        adj_geoids = [x.strip("\'") for x in row["ADJ_GEOMS"].strip("[]").split(", ")]
        adj_geoids = [x for x in adj_geoids if x]

        curr_prec_boundary_length = prec_boundary_lengths.loc[curr_geoid]

        # Precinct without any listed neighbor: nothing to measure
        if not adj_geoids:
            adjacency_lists[curr_geoid] = []
            if verbose:
                print(f"Warning: Precinct {curr_geoid} does not have neighboring precincts fulfilling requirements")
                print(f"Precinct {curr_geoid} boundary length: {curr_prec_boundary_length}")
            continue

        # Calculate boundaries and check if they surpass the minimum length
        adj_precs_boundary_lengths = prec_boundary_lengths.loc[adj_geoids]
        adj_precs = precinct_df.loc[adj_geoids]

        # The current precinct is repeated once per neighbor so that the row-wise
        # (align=False) intersection pairs it with each neighbor in turn
        curr_prec_repeated = precinct_df.loc[[curr_geoid] * len(adj_geoids)]
        shared_boundaries = adj_precs.intersection(curr_prec_repeated, align=False)
        shared_boundary_lengths = shared_boundaries.to_crs(crs=length_crs).length
        curr_prec_shared_prop = shared_boundary_lengths / curr_prec_boundary_length
        adj_precs_shared_prop = shared_boundary_lengths / adj_precs_boundary_lengths

        is_valid_border = (
            (curr_prec_shared_prop > pct_adj_thresh)
            | (adj_precs_shared_prop > pct_adj_thresh)
            | (shared_boundary_lengths > abs_adj_thresh)
        )
        adjacency_lists[curr_geoid] = list(adj_precs[is_valid_border].index)

        if verbose and len(adjacency_lists[curr_geoid]) == 0:
            print(f"Warning: Precinct {curr_geoid} does not have neighboring precincts fulfilling requirements")
            print(f"Precinct {curr_geoid} boundary length: {curr_prec_boundary_length}")
            # One row per listed neighbor
            print(pd.DataFrame({
                "adj_precs_shared_prop": adj_precs_shared_prop,
                "adj_prec_boundary_lengths": adj_precs_boundary_lengths
            }))

        # Determine which adjacencies were rejected
        if (~is_valid_border).sum() > 0:
            rejections[curr_geoid] = list(adj_precs[~is_valid_border].index)

    # Verify that there are no edges which only go one way.
    # A neighbor without its own entry counts as one-way rather than raising a bare KeyError.
    bad_edges = list()
    for prec, neighbors in adjacency_lists.items():
        for neighbor in neighbors:
            if prec not in adjacency_lists.get(neighbor, ()):
                bad_edges.append((prec, neighbor))
    if len(bad_edges) > 0:
        raise ValueError(f"The following edges only go one way: {bad_edges}")

    if verbose:
        # Determine number of rejected edges
        rejection_count = sum(len(x) for x in rejections.values())
        print(f"{rejection_count} adjacency edges rejected")

    if return_rejections:
        return (adjacency_lists, rejections)
    return adjacency_lists

### Running preprocessing functions on selected states

In [6]:
# Options shared by every state processed below

# Census table fields -> descriptive column names
census_column_format = dict(
    P0010001="pop_total",            # Total population
    P0010003="pop_white",            # White alone
    P0010004="pop_black",            # Black or African American alone
    P0010005="pop_native",           # American Indian and Alaska Native alone
    P0010006="pop_asian",            # Asian alone
    P0010007="pop_pacific",          # Native Hawaiian and Other Pacific Islander alone
    P0010009="pop_two_or_more",      # Population of two or more races
    P0020002="pop_hispanic",         # Hispanic or Latino
    P0030001="pop_total_18",         # Total population (age 18 and over)
    P0030003="pop_white_18",         # White alone (age 18 and over)
    P0030004="pop_black_18",         # Black or African American alone (age 18 and over)
    P0030005="pop_native_18",        # American Indian and Alaska Native alone (age 18 and over)
    P0030006="pop_asian_18",         # Asian alone (age 18 and over)
    P0030007="pop_pacific_18",       # Native Hawaiian and Other Pacific Islander alone (age 18 and over)
    P0030009="pop_two_or_more_18",   # Population of two or more races (age 18 and over)
    P0040002="pop_hispanic_18",      # Hispanic or Latino (age 18 and over)
)

# Election fields -> descriptive column names
election_column_format = dict(
    G20PREDBID="vote_dem",           # Voted for Biden in 2020 presidential election
    G20PRERTRU="vote_rep",           # Voted for Trump in 2020 presidential election
)

# Columns written to the precinct output files: renamed census and election columns plus the boundaries
output_columns = [
    *census_column_format.values(),
    *election_column_format.values(),
    "geometry",
]

# Adjacency thresholds passed to validate_adjacency_data
pct_adj_thresh = 0.05
abs_adj_thresh = 200 * 0.3048 # Equal to 200 ft

In [14]:
# Arizona: build and export the merged precinct-level file
print("Creating AZ precinct data file:")

az_census_df = geopandas.read_file(AZ_DEMOGRAPHIC_PATH)
az_election_df = geopandas.read_file(AZ_ELECTION_PATH)

# Sum the block-level election data onto the census precincts
az_precinct_df = get_precinct_data(
    census_df=az_census_df,
    election_df=az_election_df,
    census_column_format=census_column_format,
    election_column_format=election_column_format,
    verbose=True,
)

# Only the renamed columns and the boundaries are exported
az_precinct_df[output_columns].to_file(AZ_PRECINCT_OUTPUT_PATH, driver="GeoJSON")

Creating AZ precinct data file:
Unused blocks:  0


In [None]:
# Preview only the first rows rather than rendering the whole precinct table
az_precinct_df.head()

In [8]:
# Arizona: filter the adjacency CSV and export the adjacency lists as JSON
print("Creating AZ adjacency list file:")

az_adj_df = pd.read_csv(AZ_ADJACENCY_PATH)

# Drop edges whose shared boundary does not pass the configured thresholds
az_adj_list = validate_adjacency_data(
    az_adj_df,
    az_precinct_df,
    pct_adj_thresh=pct_adj_thresh,
    abs_adj_thresh=abs_adj_thresh,
    verbose=True,
)

with open(AZ_ADJACENCY_OUTPUT_PATH, "w") as output_file:
    output_file.write(json.dumps(az_adj_list))

Creating AZ adjacency list file:
84 adjacency edges rejected


In [9]:
# Virginia: build and export the merged precinct-level file
print("Creating VA precinct data file:")

va_census_df = geopandas.read_file(VA_DEMOGRAPHIC_PATH)
va_election_df = geopandas.read_file(VA_ELECTION_PATH)

# Sum the block-level election data onto the census precincts
va_precinct_df = get_precinct_data(
    census_df=va_census_df,
    election_df=va_election_df,
    census_column_format=census_column_format,
    election_column_format=election_column_format,
    verbose=True,
)

# Only the renamed columns and the boundaries are exported
va_precinct_df[output_columns].to_file(VA_PRECINCT_OUTPUT_PATH, driver="GeoJSON")

Creating VA precinct data file:
Unused blocks:  0


In [10]:
# Virginia: filter the adjacency CSV and export the adjacency lists as JSON
print("Creating VA adjacency list file:")

va_adj_df = pd.read_csv(VA_ADJACENCY_PATH)

# Drop edges whose shared boundary does not pass the configured thresholds
va_adj_list = validate_adjacency_data(
    va_adj_df,
    va_precinct_df,
    pct_adj_thresh=pct_adj_thresh,
    abs_adj_thresh=abs_adj_thresh,
    verbose=True,
)

with open(VA_ADJACENCY_OUTPUT_PATH, "w") as output_file:
    output_file.write(json.dumps(va_adj_list))

Creating VA adjacency list file:
202 adjacency edges rejected


In [11]:
# Wisconsin: build and export the merged precinct-level file
print("Creating WI precinct data file:")

wi_census_df = geopandas.read_file(WI_DEMOGRAPHIC_PATH)
wi_election_df = geopandas.read_file(WI_ELECTION_PATH)

# Sum the block-level election data onto the census precincts
wi_precinct_df = get_precinct_data(
    census_df=wi_census_df,
    election_df=wi_election_df,
    census_column_format=census_column_format,
    election_column_format=election_column_format,
    verbose=True,
)

# Only the renamed columns and the boundaries are exported
wi_precinct_df[output_columns].to_file(WI_PRECINCT_OUTPUT_PATH, driver="GeoJSON")

Creating WI precinct data file:
Unused blocks:  0


In [12]:
# Wisconsin: filter the adjacency CSV and export the adjacency lists as JSON
print("Creating WI adjacency list file:")

wi_adj_df = pd.read_csv(WI_ADJACENCY_PATH)

# Drop edges whose shared boundary does not pass the configured thresholds
wi_adj_list = validate_adjacency_data(
    wi_adj_df,
    wi_precinct_df,
    pct_adj_thresh=pct_adj_thresh,
    abs_adj_thresh=abs_adj_thresh,
    verbose=True,
)

with open(WI_ADJACENCY_OUTPUT_PATH, "w") as output_file:
    output_file.write(json.dumps(wi_adj_list))

Creating WI adjacency list file:
954 adjacency edges rejected
