In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd

## Downloading Bounding Boxes to match Jenny's Counties to a STATEFP code

In [None]:
# State/territory names whose counties are excluded from the lookup table.
states_to_drop = [
    'American Samoa',
    'Puerto Rico',
    'Alaska',
    'Hawaii',
    'Commonwealth of the Northern Mariana Islands',
    'United States Virgin Islands',
    'Guam',
]

# Read the county bounding-box CSV from the pinned gist revision, then keep
# only the rows whose STATE_NAME is not in the exclusion list above.
county_bounding_boxes = pd.read_csv(
    'https://gist.githubusercontent.com/a8dx/7e550680f7ea6a68f20da00e21d7ce9b/raw/5f88f425bb362f8e200456fb8181ecb0a8fd55d6/US_County_Boundingboxes.csv'
).loc[lambda boxes: ~boxes['STATE_NAME'].isin(states_to_drop)]
county_bounding_boxes.head()

## Read in shapefile of Census bounding boxes to get exact shapely polygons for each county

In [None]:
# Load the county boundary shapefile (path is relative to the notebook's
# working directory) and cast STATEFP to int in the same step.
# NOTE(review): the cast presumably aligns the dtype with the STATEFP column
# of the bounding-box CSV used as the merge key later — confirm.
county_bounding_boxes_full = gpd.read_file(
    'US County Boundary 2018/cb_2018_us_county_500k.shp'
).assign(STATEFP=lambda counties: counties['STATEFP'].astype(int))
county_bounding_boxes_full.head()

## Merge both dataframes on STATEFP

In [None]:
# Attach STATE_NAME to every county by joining on STATEFP.
# The right-hand side is reduced to one row per distinct (STATE_NAME, STATEFP)
# pair so the join does not multiply county rows. The join is an inner join
# (pandas' default, spelled out here), so counties whose STATEFP is absent
# from the filtered bounding-box table are dropped.
county_bounding_boxes_full = county_bounding_boxes_full.merge(
    county_bounding_boxes[['STATE_NAME', 'STATEFP']].drop_duplicates(),
    how='inner',
    left_on='STATEFP',
    right_on='STATEFP',
)
county_bounding_boxes_full.head()

## Calculate the area of each county polygon
- EPSG:5070 (NAD83 / Conus Albers) is an equal-area projection for the contiguous US; it is used here so that the area of each county polygon is computed with minimal distortion

In [None]:
# Unit-conversion factors used for the two area columns.
SQ_M_PER_SQ_KM = 10**6        # square metres in one square kilometre
SQ_MI_PER_SQ_KM = 0.386102    # square miles in one square kilometre

# Area of each county in km^2: reproject the geometries to EPSG:5070 and use
# the vectorized GeoSeries.area instead of mapping a Python lambda over every
# polygon. The projected area is divided by 10**6 to express it in km^2.
county_bounding_boxes_full['area km2'] = (
    county_bounding_boxes_full['geometry'].to_crs(epsg=5070).area / SQ_M_PER_SQ_KM
)

# Order the counties by state code, then county code. The original row labels
# are kept (the index is not reset).
county_bounding_boxes_full = county_bounding_boxes_full.sort_values(['STATEFP', 'COUNTYFP'])

# Same area expressed in square miles.
county_bounding_boxes_full['area mi2'] = county_bounding_boxes_full['area km2'] * SQ_MI_PER_SQ_KM
county_bounding_boxes_full.head()

## Get relevant columns and remove duplicate counties

In [None]:
# Keep only the identifier, name and area columns, and rename the two name
# columns to shorter labels.
county_bounding_boxes_full = county_bounding_boxes_full[[
    "GEOID", "STATE_NAME", "NAME", "area km2", "area mi2"
]].rename(columns={
    "STATE_NAME": "state",
    "NAME": "county",
})

# Show every row whose (state, county) pair occurs more than once
# (keep=False marks all members of a duplicated group, not just the repeats).
# Only the last expression of a cell is rendered automatically, so this
# mid-cell lookup needs an explicit display() call; as a bare expression its
# result was silently discarded.
display(
    county_bounding_boxes_full[
        county_bounding_boxes_full.duplicated(subset=['state', 'county'], keep=False)
    ].sort_values(['state', 'county'])
)

county_bounding_boxes_full.head()

In [None]:
# Row labels of the duplicated entries judged incorrect; each one was checked
# by hand against its area.
# NOTE(review): these are index labels produced by the upstream cells, so they
# are only valid for the exact input files and row order used when they were
# picked. Re-running this cell without re-running the earlier ones raises
# KeyError, because the labels have already been removed.
idxs = [1703, 1318, 2646, 2617, 2626, 2605]
county_bounding_boxes_full = county_bounding_boxes_full.drop(index=idxs)

## Save to CSV

In [None]:
# The export below is currently commented out, so running this cell writes
# nothing. Uncomment it to write the cleaned table without its index column.
# NOTE(review): presumably requires the fixed_BB/ directory to exist — confirm.
# county_bounding_boxes_full.to_csv("fixed_BB/county_bounding_boxes_full.csv", index=False)