In [1]:
# !pip install censusgeocode

In [2]:
import glob
import json
import requests
import pandas as pd
from pprint import pprint

# Census Examples 

This notebook uses the `censusgeocode` package in Python (which is simply a wrapper around the US Census' official Geocoder API) to get census geographies for a list of addresses or lat/longs.

- https://pypi.org/project/censusgeocode/

### Step 1 | Grab your data at the address level

In [3]:
# Draw a 100-row sample of the example data. The fixed random_state makes the
# sample reproducible, so a Restart & Run All geocodes the same rows each time.
df = pd.read_parquet('example-data.parquet').sample(100, random_state=42)
df

Unnamed: 0,year,borough,zip,incident_address,lat,long,num_complaints
53246,2021,MANHATTAN,10031,260 CONVENT AVENUE,40.821582,-73.948482,17
23694,2020,QUEENS,11367,150-29 72 ROAD,40.727588,-73.816146,1
66441,2022,BRONX,10456,1378 COLLEGE AVENUE,40.837432,-73.910625,3
88333,2022,MANHATTAN,10003,3 EAST 9 STREET,40.732714,-73.995573,1
92044,2022,MANHATTAN,10027,158 WEST 122 STREET,40.806495,-73.948475,4
...,...,...,...,...,...,...,...
87650,2022,BROOKLYN,11249,109 NORTH 7 STREET,40.718610,-73.958944,127
67352,2022,BRONX,10457,363 EAST 180 STREET,40.852148,-73.898973,8
88983,2022,MANHATTAN,10010,209 EAST 25 STREET,40.739926,-73.981744,20
44477,2021,BROOKLYN,11225,481 CROWN STREET,40.665881,-73.943220,1


### Step 2 | Geocode Lat/Long if they're not already present

This dataset already has lat/longs, so there is nothing to geocode here. `censusgeocode` has a function to go from address --> lat/long, but I haven't had time to implement it here. Message me if you're struggling with this step.

### Step 3 | Get Census Geographies

In [4]:
# Code adapted from the Stack Exchange answer linked below. Defines a geocode function and then runs it in parallel (for speed)
# https://gis.stackexchange.com/questions/363830/applying-the-censusgeocode-package-to-an-entire-dataframe-of-geocoded-data

import pandas as pd
import censusgeocode as cg
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm

def geocode(lat, lng):
    """Look up the 2020 Census block for a latitude/longitude point.

    Parameters
    ----------
    lat : float
        Latitude of the point.
    lng : float
        Longitude of the point.

    Returns
    -------
    dict
        Keys ``geoid``, ``state``, ``county``, ``tract`` and ``block``, taken
        from the first entry under ``'2020 Census Blocks'`` in the geocoder
        response. Every value is ``None`` when either coordinate is missing
        or the response contains no 2020 Census block, so the caller still
        receives exactly one record per input point.
    """
    fields = ('geoid', 'state', 'county', 'tract', 'block')

    # A missing coordinate cannot be geocoded; skip the lookup entirely.
    if pd.isna(lat) or pd.isna(lng):
        return dict.fromkeys(fields)

    # Note the argument order: longitude is passed first, then latitude.
    geographies = cg.coordinates(lng, lat)

    try:
        census = geographies['2020 Census Blocks'][0]
    except (KeyError, IndexError):
        # No block came back for this point. Return an empty record instead
        # of raising, so one bad row does not abort a whole batch run.
        return dict.fromkeys(fields)

    # Response keys are the upper-case versions of our output keys.
    return {name: census[name.upper()] for name in fields}


# One latitude/longitude pair per row of df, fed to geocode() in parallel.
latitudes, longitudes = df['lat'], df['long']

with ThreadPoolExecutor() as tpe:
    # Executor.map yields results in input order, so record i belongs to
    # row i of df. tqdm shows progress as the results are consumed.
    mapped_results = tpe.map(geocode, latitudes, longitudes)
    data = list(tqdm(mapped_results, total=len(df)))

# One row per geocoded point, one column per key returned by geocode().
census_geos_df = pd.DataFrame(data)
census_geos_df.head()


  0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,geoid,state,county,tract,block
0,360610227001004,36,61,22700,1004
1,360810779061002,36,81,77906,1002
2,360050177023001,36,5,17702,3001
3,360610059001000,36,61,5900,1000
4,360610220001000,36,61,22000,1000


In [5]:
# Reset both indexes to 0..n-1 so the frames line up by position: df still
# carries its original (sampled) row labels, while census_geos_df is numbered
# in geocoding order.
complaints_part = df.reset_index(drop=True)
geos_part = census_geos_df.reset_index(drop=True)

# Attach the census columns to the right of the original columns.
df_with_geos = pd.concat([complaints_part, geos_part], axis=1)

df_with_geos.head()

Unnamed: 0,year,borough,zip,incident_address,lat,long,num_complaints,geoid,state,county,tract,block
0,2021,MANHATTAN,10031,260 CONVENT AVENUE,40.821582,-73.948482,17,360610227001004,36,61,22700,1004
1,2020,QUEENS,11367,150-29 72 ROAD,40.727588,-73.816146,1,360810779061002,36,81,77906,1002
2,2022,BRONX,10456,1378 COLLEGE AVENUE,40.837432,-73.910625,3,360050177023001,36,5,17702,3001
3,2022,MANHATTAN,10003,3 EAST 9 STREET,40.732714,-73.995573,1,360610059001000,36,61,5900,1000
4,2022,MANHATTAN,10027,158 WEST 122 STREET,40.806495,-73.948475,4,360610220001000,36,61,22000,1000


### Step 4 | Pick a geographical level and get Census data
Do you want Census data at the state level? county? tract? block?

1. Pick a geographical level.
2. See `census-example.ipynb` if you want to learn how to get Census data at your desired level

# Hope that helps!