In [7]:
!pip install censusgeocode


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import glob
import json
import requests
import pandas as pd
from pprint import pprint

# Census Examples 

This notebook uses the `censusgeocode` package in Python (which is simply a wrapper around the US Census Bureau's official Geocoder API) to get census geographies for a list of addresses or lat/longs.

- https://pypi.org/project/censusgeocode/

### Step 1 | Grab your data at the address level

In [9]:
# Read the source records; later cells use the `lat` and `long` columns of this frame.
lmop_csv = 'baltimore-lmop.csv'
df = pd.read_csv(lmop_csv)

In [14]:
# A row can only be geocoded when both coordinates are present,
# so keep the rows where neither `lat` nor `long` is null.
has_coords = df['lat'].notna() & df['long'].notna()
df = df[has_coords]

### Step 2 | Geocode Lat/Long if they're not already present

The lat/long columns already exist in this dataset. Census geocode has a function to go from address --> lat/long, but I haven't had time to implement it here. This dataset already has lat/longs. Message me if you're struggling with this step.

### Step 3 | Get Census Geographies

In [15]:
# Code adapted from:
# https://gis.stackexchange.com/questions/363830/applying-the-censusgeocode-package-to-an-entire-dataframe-of-geocoded-data
# Defines a geocode function that accepts lat/long and spits out geographies
# The code then runs that function in parallel (for speed).

import pandas as pd
import censusgeocode as cg
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
import glob
import json
import requests
import pandas as pd
from pprint import pprint
from tqdm import tqdm


import requests_cache
# HTTP session used by geocode() below. It is created with the cache name
# "geocode_cache" and the "filesystem" backend, so responses can be reused
# instead of re-requesting the same coordinates from the API.
cache = requests_cache.CachedSession("geocode_cache", backend="filesystem")

def geocode(lat, lng, timeout=30):
    """
    Look up the 2020 Census block for a latitude/longitude point.

    Sends the point to the Census Geocoder "geographies/coordinates" endpoint
    through the module-level `cache` session and returns the first entry of
    the "Census Blocks" list in the response.

    Parameters
    ----------
    lat : latitude of the point (sent as the API's "y" parameter).
    lng : longitude of the point (sent as the API's "x" parameter).
    timeout : seconds to wait on the HTTP request (default 30). Without a
        timeout a stalled connection would keep its worker thread waiting
        indefinitely.

    Returns
    -------
    dict or None
        The first "Census Blocks" record, or None when the request fails,
        the response does not have the expected structure, or the API returns
        no block for the point. Failures are printed, not raised, so a single
        bad point does not abort a bulk run.
    """
    try:
        url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
        params = {
            "x": lng,
            "y": lat,
            "benchmark": "Public_AR_Census2020",
            "vintage": "Census2020_Census2020",
            "format": "json"
        }
        response = cache.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        blocks = response.json()['result']['geographies']['Census Blocks']
        if not blocks:
            # Report an empty match explicitly instead of surfacing it as
            # an opaque "list index out of range".
            print(f"Error geocoding ({lat}, {lng}): no Census block returned")
            return None
        return blocks[0]
    except Exception as e:
        # Deliberate best-effort: log the failure and let the caller continue.
        print(f"Error geocoding ({lat}, {lng}): {e}")
        return None

def bulk_geocode(latitudes, longitudes):
    """
    Geocode a list of latitudes and longitudes in parallel (for speed).

    Parameters
    ----------
    latitudes : iterable of latitudes (for example a DataFrame column).
    longitudes : iterable of longitudes, paired with `latitudes` by position.

    Returns
    -------
    pandas.DataFrame
        One row per input pair, in input order, built from the records
        returned by geocode(). A pair whose lookup failed yields a row of
        missing values, so the result can be joined back to the inputs by
        position.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of elements.
    """
    # Work from the arguments (not a global frame) and materialise them so
    # their lengths are known and any iterable is accepted.
    lats = list(latitudes)
    lngs = list(longitudes)
    if len(lats) != len(lngs):
        raise ValueError(
            f"latitudes and longitudes differ in length: {len(lats)} != {len(lngs)}"
        )

    with ThreadPoolExecutor() as tpe:
        # Executor.map yields results in the order of the inputs.
        mapped_results = tpe.map(geocode, lats, lngs)
        data = list(tqdm(mapped_results, total=len(lats)))

    # geocode() returns None on failure; an empty record keeps that row in the
    # frame so the output stays aligned with the inputs.
    records = [row if row is not None else {} for row in data]
    return pd.DataFrame(records)

# Geocode the coordinates of every row left in df, then preview the result.
census_geos_df = bulk_geocode(latitudes=df['lat'], longitudes=df['long'])
census_geos_df.head()


100%|██████████| 48/48 [00:00<00:00, 172.95it/s]


Unnamed: 0,SUFFIX,POP100,GEOID,CENTLAT,BLOCK,AREAWATER,STATE,BASENAME,OID,LSADC,...,TRACT,CENTLON,BLKGRP,AREALAND,HU100,INTPTLON,MTFCC,LWBLKTYP,UR,COUNTY
0,,303,240276030013008,39.3107759,3008,20102,24,3008,210701008433656,BK,...,603001,-76.9165175,3,3082515,85,-76.9181838,G5040,B,R,27
1,,40,240037516002006,38.9992836,2006,43753,24,2006,210701008460394,BK,...,751600,-76.577941,2,2964312,10,-76.5780772,G5040,B,R,3
2,,121,240098610033000,38.3826801,3000,0,24,3000,210701008422291,BK,...,861003,-76.4343048,3,612423,88,-76.4343048,G5040,L,U,9
3,,138,240199702002028,38.6669238,2028,130246,24,2028,210701007975810,BK,...,970200,-75.8859696,2,4509149,61,-75.8860477,G5040,B,R,19
4,,0,245102604022006,39.3120516,2006,0,24,2006,210701007704285,BK,...,260402,-76.5458312,2,409395,0,-76.5458312,G5040,L,U,510


In [16]:
# Narrow the geocoder output to the geography identifier columns.
to_keep = ['GEOID', 'STATE', 'COUNTY', 'TRACT', 'BLOCK']
census_geos_df = census_geos_df.loc[:, to_keep]
census_geos_df

Unnamed: 0,GEOID,STATE,COUNTY,TRACT,BLOCK
0,240276030013008,24,27,603001,3008
1,240037516002006,24,3,751600,2006
2,240098610033000,24,9,861003,3000
3,240199702002028,24,19,970200,2028
4,245102604022006,24,510,260402,2006
5,240338006071007,24,33,800607,1007
6,240338006071007,24,33,800607,1007
7,240338006071007,24,33,800607,1007
8,240150309033011,24,15,30903,3011
9,240479512001053,24,47,951200,1053


In [17]:
# Put the geography columns next to the original records. Both frames get a
# fresh 0..n-1 index first so the rows are paired by position.
df_with_geos = pd.concat(
    [frame.reset_index(drop=True) for frame in (df, census_geos_df)],
    axis="columns",
)

df_with_geos.head()

Unnamed: 0,GHGRP ID,Landfill ID,Landfill Name,State,Physical Address,City,County,Zip Code,lat,long,...,Actual MW Generation,Rated MW Capacity,LFG Flow to Project (mmscfd),Current Year Emission Reductions (MMTCO2e/yr) - Direct,Current Year Emission Reductions (MMTCO2e/yr) - Avoided,GEOID,STATE,COUNTY,TRACT,BLOCK
0,1007291.0,734,Alpha Ridge SLF,MD,2350 Marriottsville Road,Marriottsville,Howard,21104.0,39.305776,-76.898803,...,0.58,1.059,0.28,0.0294,0.0028,240276030013008,24,27,603001,3008
1,,735,Annapolis SLF,MD,Defense Highway,Annapolis,Anne Arundel,21401.0,38.992,-76.573,...,,,,,,240037516002006,24,3,751600,2006
2,,736,Appeal SLF,MD,,Lusby,Calvert,20657.0,38.381112,-76.438334,...,,,,,,240098610033000,24,9,861003,3000
3,1000331.0,10120,Beulah Municipal Landfill,MD,6815 East New Market Ellwood Road,Hurlock,Dorchester,21643.0,38.6735,-75.8994,...,,,,,,240199702002028,24,19,970200,2028
4,,740,Bowley's Lane LF,MD,Bowley's Lane,Baltimore,Baltimore city,21206.0,39.3138,-76.5444,...,,,,,,245102604022006,24,510,260402,2006


In [18]:
# Write the combined table to disk; index=False keeps the row index out of the file.
output_csv = 'baltimore-lmop-with-geocodes.csv'
df_with_geos.to_csv(output_csv, index=False)

### Step 4 | Pick a geographical level and get Census data
Do you want Census data at the state level? county? tract? block?

1. Pick a geographical level.
2. See `census-example.ipynb` if you want to learn how to get Census data at your desired level

In [None]:
# Chosen geographic level: tract
# NOTE(review): the original note read "state level: tract"; confirm that tract
# is the intended level.

# Convert the GEOID column to str — presumably so it is handled as a text
# identifier rather than a number; confirm against the follow-up notebook.
df_with_geos['GEOID'] = df_with_geos['GEOID'].astype(str)

# Hope that helps!