In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd

# Base data directory used to build every input/output path in this notebook.
# It is a relative path and ends with "/" because later cells append
# sub-paths to it by plain f-string interpolation.
# NOTE(review): presumably resolved against the notebook's working directory -- confirm.
RAW_DATA_PATH = "../../data/"
# Name of the block-group shapefile, interpolated after "block_group_raw/" when
# the data is loaded. Left empty here; set it before running the load cell.
shapefile = "" # path to shapefile (Too large so not in repo)

In [2]:
# Load the block-group geometries. The shapefile itself is not in the repo
# due to its size, so `shapefile` must point at a local copy.
block_group_path = f"{RAW_DATA_PATH}block_group_raw/{shapefile}"
bg_bb = gpd.read_file(block_group_path)
bg_bb.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,GEOIDFQ,GEOID,NAME,NAMELSAD,LSAD,ALAND,AWATER,geometry
0,1,113,30300,1,1500000US011130303001,11130303001,1,Block Group 1,BG,829037,0,"POLYGON ((-85.00365 32.47885, -85.00133 32.480..."
1,4,15,953406,2,1500000US040159534062,40159534062,2,Block Group 2,BG,577144904,15330641,"POLYGON ((-114.32649 34.43788, -114.31824 34.4..."
2,4,1,945002,1,1500000US040019450021,40019450021,1,Block Group 1,BG,607817889,362533,"POLYGON ((-109.32335 35.54182, -109.31495 35.5..."
3,4,27,301,2,1500000US040270003012,40270003012,2,Block Group 2,BG,2041298,71837,"POLYGON ((-114.66727 32.72505, -114.66541 32.7..."
4,5,7,20803,3,1500000US050070208033,50070208033,3,Block Group 3,BG,3545865,53379,"POLYGON ((-94.27908 36.48882, -94.27656 36.491..."


In [3]:
# States / territories that are excluded from the county lookup table.
states_to_drop = ['American Samoa', 'Puerto Rico', 'Alaska', 'Hawaii', 'Commonwealth of the Northern Mariana Islands', 'United States Virgin Islands', 'Guam']

fips_raw = pd.read_csv(f"{RAW_DATA_PATH}extras/US_FIPS_Codes.csv")

# The real column names sit in the first data row: promote that row to the
# header, then discard it together with the last column.
fips_raw.columns = fips_raw.iloc[0]
fips_trimmed = fips_raw[1:].iloc[:, :-1]

# Keep only the rows whose 'State' is not in the exclusion list.
is_kept_state = ~fips_trimmed['State'].isin(states_to_drop)
county_bounding_boxes_FIPS = fips_trimmed[is_kept_state]

county_bounding_boxes_FIPS.head()

Unnamed: 0,State,County Name,FIPS State,FIPS County
1,Alabama,Autauga,1,1
2,Alabama,Baldwin,1,3
3,Alabama,Barbour,1,5
4,Alabama,Bibb,1,7
5,Alabama,Blount,1,9


In [4]:
# Attach the state and county names to every block group by matching the
# (state, county) FIPS code pair. The inner join keeps only block groups
# whose codes appear in the county lookup table.
block_groups_with_names = pd.merge(
    bg_bb,
    county_bounding_boxes_FIPS,
    how='inner',
    left_on=['STATEFP', 'COUNTYFP'],
    right_on=['FIPS State', 'FIPS County'],
)

# The right-hand key columns repeat STATEFP / COUNTYFP after the join.
merged = block_groups_with_names.drop(columns=['FIPS State', 'FIPS County'])
merged.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,GEOIDFQ,GEOID,NAME,NAMELSAD,LSAD,ALAND,AWATER,geometry,State,County Name
0,1,113,30300,1,1500000US011130303001,11130303001,1,Block Group 1,BG,829037,0,"POLYGON ((-85.00365 32.47885, -85.00133 32.480...",Alabama,Russell
1,1,113,31200,1,1500000US011130312001,11130312001,1,Block Group 1,BG,252578677,909711,"POLYGON ((-85.31136 32.35557, -85.30299 32.377...",Alabama,Russell
2,1,113,30403,2,1500000US011130304032,11130304032,2,Block Group 2,BG,840428,0,"POLYGON ((-85.03047 32.49030, -85.02532 32.492...",Alabama,Russell
3,1,113,30500,2,1500000US011130305002,11130305002,2,Block Group 2,BG,1457772,0,"POLYGON ((-85.03723 32.49351, -85.03429 32.493...",Alabama,Russell
4,1,113,30601,1,1500000US011130306011,11130306011,1,Block Group 1,BG,2692957,6893,"POLYGON ((-85.03005 32.43032, -85.02710 32.432...",Alabama,Russell


In [16]:
# Duplicate check: if the join produced no duplicated block groups, the row
# count in the shape equals the number of distinct GEOID values.
merged_shape = merged.shape
print(merged_shape)

distinct_geoid_count = merged['GEOID'].nunique()
print(distinct_geoid_count)


(235092, 14)
235092


## Calculate the area of each block group polygon

In [9]:
# Conversion factor from square kilometres to square miles.
KM2_TO_MI2 = 0.386102

# Reproject the geometries to EPSG:5070 a single time and reuse the result.
# Reprojecting every polygon is the expensive step, so it should not be
# repeated for each output column.
# NOTE(review): EPSG:5070 is expected to be an equal-area CRS in metres, which
# is what the division by 10**6 below relies on -- confirm.
geometry_5070 = merged['geometry'].to_crs(epsg=5070)

# GeoSeries.area computes all polygon areas in one vectorised call (no
# per-row Python lambda); dividing by 10**6 turns square metres into km2.
merged['area km2'] = geometry_5070.area / 10**6
# Same arithmetic as area / 10**6 * 0.386102, derived from the km2 column.
merged['area mi2'] = merged['area km2'] * KM2_TO_MI2
merged.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,GEOIDFQ,GEOID,NAME,NAMELSAD,LSAD,ALAND,AWATER,geometry,State,County Name,area km2,area mi2
0,1,113,30300,1,1500000US011130303001,11130303001,1,Block Group 1,BG,829037,0,"POLYGON ((-85.00365 32.47885, -85.00133 32.480...",Alabama,Russell,0.802873,0.309991
1,1,113,31200,1,1500000US011130312001,11130312001,1,Block Group 1,BG,252578677,909711,"POLYGON ((-85.31136 32.35557, -85.30299 32.377...",Alabama,Russell,253.475811,97.867518
2,1,113,30403,2,1500000US011130304032,11130304032,2,Block Group 2,BG,840428,0,"POLYGON ((-85.03047 32.49030, -85.02532 32.492...",Alabama,Russell,0.842279,0.325205
3,1,113,30500,2,1500000US011130305002,11130305002,2,Block Group 2,BG,1457772,0,"POLYGON ((-85.03723 32.49351, -85.03429 32.493...",Alabama,Russell,1.459928,0.563681
4,1,113,30601,1,1500000US011130306011,11130306011,1,Block Group 1,BG,2692957,6893,"POLYGON ((-85.03005 32.43032, -85.02710 32.432...",Alabama,Russell,2.682887,1.035868


In [11]:
# Columns carried into the exported table: identifiers, names and the two
# computed areas. The geometry column is not among them.
export_columns = [
    'GEOID',
    'STATEFP',
    'COUNTYFP',
    'TRACTCE',
    'BLKGRPCE',
    'State',
    'County Name',
    'area km2',
    'area mi2',
]
bounding_box_bg = merged[export_columns]
bounding_box_bg.head()

Unnamed: 0,GEOID,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,State,County Name,area km2,area mi2
0,11130303001,1,113,30300,1,Alabama,Russell,0.802873,0.309991
1,11130312001,1,113,31200,1,Alabama,Russell,253.475811,97.867518
2,11130304032,1,113,30403,2,Alabama,Russell,0.842279,0.325205
3,11130305002,1,113,30500,2,Alabama,Russell,1.459928,0.563681
4,11130306011,1,113,30601,1,Alabama,Russell,2.682887,1.035868


In [13]:
# Write the cleaned block-group table to CSV without the DataFrame index.
clean_csv_path = f"{RAW_DATA_PATH}block_group_clean/bounding_box_full_bg.csv"
bounding_box_bg.to_csv(clean_csv_path, index=False)