In [1]:
import geopandas
import numpy as np
import pandas as pd
from collections import defaultdict

from constants import *

### Configuration
`OVERLAP_IGNORE_THRESHOLD` - If the overlap, expressed as a fraction (e.g. `0.1` = 10%), is less than this value, then the overlap is ignored.

`OVERLAP_CONFIRM_THRESHOLD` - If the overlap, expressed as a fraction (e.g. `0.8` = 80%), is greater than this value, then the districts are considered clearly overlapping.

`geopandas.options.io_engine` - Determines the engine used for reading shapefiles. `fiona` is used by default; `pyogrio` needs to be installed separately and is faster.

In [2]:
# Overlap thresholds. Presumably fractions in [0, 1] (0.1 = 10%, 0.8 = 80%) rather than
# literal percentages — TODO confirm against the code that consumes them.
OVERLAP_IGNORE_THRESHOLD = 0.1
OVERLAP_CONFIRM_THRESHOLD = 0.8

# Read shapefiles with pyogrio (must be installed separately) instead of the default engine.
geopandas.options.io_engine = "pyogrio"

In [3]:
# Temporary hard-coded (relative) paths to the Virginia input archives.
# NOTE(review): these are assigned after `from constants import *`, so they shadow any
# same-named constants — presumably they belong in constants.py once settled; confirm.
# NOTE(review): the data-loading cell reads VA_BLOCK_ELECTION_PATH, not VA_ELECTION_PATH — verify
# which election file is intended.
VA_DEMOGRAPHIC_PATH = "../../Data/va_pl2020_vtd.zip"
VA_ELECTION_PATH = "../../Data/va_vest_20.zip"

### Overlap Checking
District data from the census dataset is merged with district data from the VEST dataset.

This will be done by determining how the districts in the census dataset overlap with districts in the VEST dataset. The set of district boundaries should be relatively similar, though a district in the census dataset may be broken up into multiple districts in the VEST dataset (or vice versa).

The boundaries in the census data file will be treated as canonical, since they contain fewer overlapping sections and empty holes. Data from districts in the VEST file will be mapped to the district in the census file which covers that area.

In [56]:
# NOTE(review): `va_election_df` is only assigned in the "Load data files" cell further down,
# so this inspection cell raises NameError on a fresh Restart & Run All. Move it below that
# cell or delete it (the saved output may also be stale — confirm).
va_election_df.dtypes

STATEFP         object
COUNTYFP        object
PRECINCTID      object
VAP_MOD          int64
vote_dem       float64
vote_rep       float64
G20PRELJOR     float64
G20PREOWRI     float64
G20USSDWAR     float64
G20USSRGAD     float64
G20USSOWRI     float64
geometry      geometry
vote_other     float64
dtype: object

In [21]:
def get_precinct_data(census_df, election_df, verbose=False, output_format=("merged_data",)):
    '''
    Merges a state's precinct-level census dataset with its census-block-level voting dataset.

    Every election block is spatially joined to the census precinct (VTD) that covers it.
    Block-level values are then summed per precinct, while precinct-level census values are
    carried over once per precinct (they are NOT summed across blocks).

    Neither input frame is modified; both are re-indexed and renamed on copies.

    Parameters
    ----------
    census_df : GeoDataFrame
        GeoDataFrame containing the state's precinct-level census data.
        Must have a "GEOID20" column identifying each precinct.
    election_df : GeoDataFrame
        GeoDataFrame containing the state's block-level election data.
        Must have a "GEOID20" column identifying each block and a geometry column
        (required for the spatial join).
    verbose : bool
        Indicates whether or not descriptive data for manual double-checking should be printed.
    output_format : str or iterable of str
        Gives the items to be returned, in order. A single string is treated as a one-item list.
        Allowed values include:
        'merged_data' - DataFrame indexed by precinct GEOID holding the numeric columns
                        aggregated to precinct level (no geometry column)
        'mappings' - dict mapping each precinct GEOID to the list of block GEOIDs assigned to it
                     (blocks without a precinct are omitted; see 'unused_blocks')
        'unused_blocks' - GeoDataFrame containing blocks which could not be mapped to a precinct (if any)

    Returns
    -------
    List of selected output items (or a single object if only one output is selected)

    Raises
    ------
    ValueError
        If output_format contains an entry that is not one of the allowed values.
    '''

    # Normalise and validate the requested outputs up front, before doing any expensive work.
    allowed_outputs = ("merged_data", "mappings", "unused_blocks")
    if isinstance(output_format, str):
        # A bare string would otherwise be iterated character by character.
        output_format = [output_format]
    else:
        output_format = list(output_format)
    unknown_entries = [entry for entry in output_format if entry not in allowed_outputs]
    if unknown_entries:
        raise ValueError(
            f"Unknown output_format entries {unknown_entries}; allowed values are {list(allowed_outputs)}"
        )

    # Set index to GeoID and format census data
    census_df_format = {
        "P0030001": "pop_total",
        "P0030003": "pop_white",
        "P0030004": "pop_black",
        "P0030005": "pop_native",
        "P0030006": "pop_asian",
        "P0030007": "pop_pacific",
        "P0030009": "pop_two_or_more",
        "P0040002": "pop_hispanic"
    }
    census_df = census_df.set_index("GEOID20").rename(columns=census_df_format)

    # Set index to GeoID and format election data
    election_df_format = {
        "G20PREDBID": "vote_dem",
        "G20PRERTRU": "vote_rep"
    }
    election_df = election_df.set_index("GEOID20").rename(columns=election_df_format)

    # Join blocks onto the precinct-level DataFrame.
    # The precinct id is copied into a regular column so it survives the join under a known name.
    census_df["VTD_GEOID20"] = census_df.index
    merged_df = geopandas.sjoin(election_df, census_df, how="left", predicate="covered_by")

    # The join repeats every census (precinct-level) column on each block row of that precinct.
    # Summing those would multiply e.g. pop_total by the number of blocks, so census columns are
    # taken once per precinct ("first") and only block-level columns are summed.
    # sjoin appends "_right" to census column names that also exist in the election frame.
    election_columns = set(election_df.columns)
    census_columns = {
        f"{column}_right" if column in election_columns else column
        for column in census_df.columns
    }
    # Restrict to numeric columns: identifiers and geometry cannot be meaningfully aggregated.
    numeric_columns = merged_df.select_dtypes(include=["number", "bool"]).columns
    aggregations = {
        column: ("first" if column in census_columns else "sum")
        for column in numeric_columns
    }
    blocks_by_vtd = merged_df.groupby("VTD_GEOID20")
    if aggregations:
        precinct_df = blocks_by_vtd.agg(aggregations)
    else:
        # No numeric data at all: fall back to an empty per-precinct frame.
        precinct_df = blocks_by_vtd.sum(numeric_only=True)

    unmatched_mask = merged_df["VTD_GEOID20"].isna()

    if verbose:
        print(f"Election blocks: {len(election_df)}")
        print(f"Precincts with at least one block: {len(precinct_df)} of {len(census_df)}")
        print(f"Blocks not covered by any precinct: {int(unmatched_mask.sum())}")

    # Return output
    output = []
    for entry in output_format:
        if entry == "merged_data":
            output.append(precinct_df)
        elif entry == "mappings":
            # Create dict indicating the blocks that correspond to each precinct.
            # Unmatched blocks are skipped: their precinct id is NaN, which is not a usable dict key.
            blocks_per_precinct = defaultdict(list)
            for block_id, precinct_id in merged_df.loc[~unmatched_mask, "VTD_GEOID20"].items():
                blocks_per_precinct[precinct_id].append(block_id)
            output.append(dict(blocks_per_precinct))
        else:  # entry == "unused_blocks" (validated above)
            output.append(merged_df.loc[unmatched_mask])

    if len(output) == 1:
        return output[0]
    return output


In [35]:
def get_percentages(precinct_df):
    '''
    Adds demographic-share and two-party vote-share columns to a precinct-level DataFrame.

    For every demographic group a "pct_<group>" column is added, computed as
    "pop_<group>" / "pop_total". "pct_vote_dem" and "pct_vote_rep" are each party's share of
    the combined Democratic + Republican vote (other candidates are not taken into account).

    Parameters
    ----------
    precinct_df : DataFrame
        Must contain "pop_total", the seven "pop_<group>" columns listed below,
        "vote_dem" and "vote_rep".

    Returns
    -------
    The same DataFrame object that was passed in; the new columns are added in place.
    '''
    demographic_groups = (
        "white",
        "black",
        "native",
        "asian",
        "pacific",
        "two_or_more",
        "hispanic",
    )

    # Share of the total population for each demographic group
    total_population = precinct_df["pop_total"]
    for group in demographic_groups:
        precinct_df[f"pct_{group}"] = precinct_df[f"pop_{group}"] / total_population

    # Share of the two-party (Democrat + Republican) vote for each party
    two_party_votes = precinct_df["vote_dem"] + precinct_df["vote_rep"]
    for party in ("dem", "rep"):
        precinct_df[f"pct_vote_{party}"] = precinct_df[f"vote_{party}"] / two_party_votes

    return precinct_df

In [7]:
# Load data files
# NOTE(review): VA_BLOCK_ELECTION_PATH is not assigned in any visible cell (only
# VA_DEMOGRAPHIC_PATH and VA_ELECTION_PATH are) — presumably it is provided by
# `from constants import *`; confirm, otherwise this raises NameError.
va_census_df = geopandas.read_file(VA_DEMOGRAPHIC_PATH)
va_election_df = geopandas.read_file(VA_BLOCK_ELECTION_PATH)

# Get merged precinct data (default output_format returns only the precinct-level merged frame)
va_precinct_df = get_precinct_data(va_census_df, va_election_df)
# Disabled export of the merged result to GeoJSON:
#va_precinct_df.to_file("va_precinct_data.json", driver="GeoJSON")