In [1]:
import extract
import pandas as pd
import os
import geopandas as gpd

In [2]:
# Re-import the local `extract` module so that edits made to extract.py after
# the kernel first imported it are picked up without restarting the kernel.
import importlib
# Kept as the cell's last expression, so the reloaded module object is displayed.
importlib.reload(extract)

<module 'extract' from 'd:\\Users\\User\\Downloads\\RA Work\\acs\\extract.py'>

In [3]:
# Load the elections spreadsheet into a DataFrame and display it.
elections_path = "elections.xlsx"
elections = pd.read_excel(elections_path)
elections

Unnamed: 0,filename,city,state,Geography,District #,office,year,geo_filename
0,NewYorkCity_06222021_DEMBoroughPresidentBronx.csv,New York,NY,boroughcounty,Bronx,Borough president,2021,
1,NewYorkCity_06222021_DEMBoroughPresidentKings.csv,New York,NY,boroughcounty,Kings,Borough president,2021,
2,NewYorkCity_06222021_DEMBoroughPresidentNewYor...,New York,NY,boroughcounty,New York,Borough president,2021,
3,NewYorkCity_06222021_DEMBoroughPresidentQueens...,New York,NY,boroughcounty,Queens,Borough president,2021,
4,NewYorkCity_06222021_DEMBoroughPresidentRichmo...,New York,NY,boroughcounty,Richmond,Borough president,2021,
...,...,...,...,...,...,...,...,...
404,Alaska_11082022_SenateDistrictP.csv,,AK,upper state legislative,P,State senate,2022,alaska_upper_districts_2022.csv
405,Alaska_11082022_SenateDistrictQ.csv,,AK,upper state legislative,Q,State senate,2022,alaska_upper_districts_2022.csv
406,Alaska_11082022_SenateDistrictR.csv,,AK,upper state legislative,R,State senate,2022,alaska_upper_districts_2022.csv
407,Alaska_11082022_SenateDistrictS.csv,,AK,upper state legislative,S,State senate,2022,alaska_upper_districts_2022.csv


In [4]:
# add tracts and percents columns to elections
# Only create a column when it is missing, so that re-running this cell (or
# loading a spreadsheet that already carries results) does not wipe values
# that the overlap loop would otherwise skip as already computed.
for column in ('tracts', 'percents'):
    if column not in elections.columns:
        elections[column] = None

In [5]:
# Maps two-letter USPS abbreviations to full names for the 50 states, the
# federal district and the inhabited territories.
abbreviation_to_name = {
    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#States.
    "AK": "Alaska",
    "AL": "Alabama",
    "AR": "Arkansas",
    "AZ": "Arizona",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "IA": "Iowa",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "MA": "Massachusetts",
    "MD": "Maryland",
    "ME": "Maine",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MO": "Missouri",
    "MS": "Mississippi",
    "MT": "Montana",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "NE": "Nebraska",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NV": "Nevada",
    "NY": "New York",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VA": "Virginia",
    "VT": "Vermont",
    "WA": "Washington",
    "WI": "Wisconsin",
    "WV": "West Virginia",
    "WY": "Wyoming",
    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Federal_district.
    "DC": "District of Columbia",
    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Inhabited_territories.
    "AS": "American Samoa",
    # Guam and Puerto Rico previously had their abbreviation pasted into the
    # name ("Guam GU", "Puerto Rico PR"); the values are now the plain names.
    "GU": "Guam",
    "MP": "Northern Mariana Islands",
    "PR": "Puerto Rico",
    "VI": "U.S. Virgin Islands",
}

In [6]:
# getting tracts and percent overlap for each district
#
# For every election row that names a district polygon file, look up the
# district's geometry, ask extract.overlap for the overlapping census tracts
# and their percentages, and store both on the row as comma-separated strings.

# Many rows share the same polygon files, so each CSV is read and parsed from
# WKT once and then reused instead of being re-read on every row.
polygon_cache = {}


def load_polygons(path):
    """Read a polygon CSV into a GeoDataFrame, caching the result by path.

    A "geometry" column, if present, is renamed to "WKT"; the "WKT" column is
    then parsed and used as the GeoDataFrame's geometry.
    """
    if path not in polygon_cache:
        frame = pd.read_csv(path).rename(columns={"geometry": "WKT"})
        polygon_cache[path] = gpd.GeoDataFrame(
            frame, geometry=gpd.GeoSeries.from_wkt(frame['WKT'])
        )
    return polygon_cache[path]


# for each row in elections
for index, row in elections.iterrows():
    if index % 10 == 0:
        print(f"Processing row {index}")

    # rows without a district polygon file cannot be processed
    if pd.isna(row['geo_filename']):
        continue
    # skip rows whose tracts and percents are both already filled in
    if not (pd.isna(row['tracts']) or pd.isna(row['percents'])):
        continue

    district = row['District #']

    # district polygons come from the geo_polygons folder
    geo_df = load_polygons(os.path.join("geo_polygons", row['geo_filename']))

    # get state from row and convert to full name
    state = abbreviation_to_name[row['state']].lower()
    # round year down to nearest 10 (tract files are named by that year)
    year = str(int(row['year']) // 10 * 10)
    tract_df = load_polygons(os.path.join("tract_polygons", f"{state}_tracts_{year}.csv"))

    # get geometry in geo_df where district is district
    try:
        district_geo = geo_df[geo_df['district'] == district].geometry.iloc[0]
    except (IndexError, KeyError):
        # IndexError: no row matched the district; KeyError: the file has no
        # 'district' column. Anything else (including Ctrl-C) now propagates
        # instead of being swallowed by a bare except.
        print(f"District {district} not found in {row['geo_filename']}")
        # stops the whole run at the first missing district, as before
        break

    # extract.overlap is defined outside this notebook; hand it a copy so the
    # cached tract frame cannot be altered between rows.
    tracts, percents = extract.overlap(tract_df.copy(), district_geo)

    # add tracts and percents to elections
    # convert tracts and percents to string representations
    elections.at[index, 'tracts'] = ", ".join(str(tract) for tract in tracts)
    elections.at[index, 'percents'] = ", ".join(str(percent) for percent in percents)

Processing row 0
Processing row 10
Processing row 20
Processing row 30
Processing row 40
Processing row 50
Processing row 60
Processing row 70
Processing row 80
Processing row 90
Processing row 100
Processing row 110
Processing row 120
Processing row 130
Processing row 140
Processing row 150
Processing row 160
Processing row 170
Processing row 180
Processing row 190
Processing row 200
Processing row 210
Processing row 220
Processing row 230
Processing row 240
Processing row 250
Processing row 260
Processing row 270
Processing row 280
Processing row 290
Processing row 300
Processing row 310
Processing row 320
Processing row 330
Processing row 340
Processing row 350
Processing row 360
Processing row 370
Processing row 380
Processing row 390
Processing row 400


In [7]:
# Write the augmented elections table to an Excel file, leaving out the
# DataFrame index.
output_path = "elections_with_tracts.xlsx"
elections.to_excel(output_path, index=False)