In [10]:
import glob
import json
import requests
import pandas as pd
from pprint import pprint

In [18]:
# ! pip install requests_cache


Collecting requests_cache
  Downloading requests_cache-1.0.1-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.7/58.7 kB[0m [31m658.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting cattrs>=22.2
  Downloading cattrs-22.2.0-py3-none-any.whl (35 kB)
Collecting url-normalize>=1.4
  Using cached url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: url-normalize, cattrs, requests_cache
Successfully installed cattrs-22.2.0 requests_cache-1.0.1 url-normalize-1.4.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [15]:
# Load the open-streets table: one row per registered open street, with the
# lat/lon of the street's mean center (one point per street, calculated in QGIS).
# The file is tab-separated, hence sep='\t'.

df = pd.read_csv('./mean_coords_table.tsv', sep='\t')

In [16]:
# Display the loaded table to eyeball its columns (the geocoding step below reads 'lat' and 'lon').
df

Unnamed: 0,wkt_geom,MEAN_X,MEAN_Y,object_id,lat,lon
0,Point (932357.50813676416873932 129799.6779506...,9.323575e+05,129799.677951,1.0,40.52279,-74.186652
1,Point (952222.48087393492460251 147937.0785196...,9.522225e+05,147937.078520,384.0,40.57267,-74.115286
2,Point (957307.78094421629793942 167722.3556344...,9.573078e+05,167722.355634,480.0,40.62699,-74.097060
3,Point (961974.16859200922772288 165665.9565205...,9.619742e+05,165665.956521,501.0,40.62136,-74.080242
4,Point (962028.83035567402839661 168307.7564461...,9.620288e+05,168307.756446,6.0,40.62861,-74.080054
...,...,...,...,...,...,...
537,Point (1056074.98569662659429014 158679.738558...,1.056075e+06,158679.738558,325.0,40.60192,-73.741347
538,Point (1056284.10559147596359253 158493.067578...,1.056284e+06,158493.067578,326.0,40.60141,-73.740596
539,Point (1056446.62167064100503922 158342.209940...,1.056447e+06,158342.209940,320.0,40.60099,-73.740012
540,Point (1056606.48847688734531403 158161.324475...,1.056606e+06,158161.324475,321.0,40.60050,-73.739439


In [21]:
# Code adapted from:
# https://gis.stackexchange.com/questions/363830/applying-the-censusgeocode-package-to-an-entire-dataframe-of-geocoded-data
# Defines a geocode function that accepts a lat/long pair and returns its Census geographies.
# The code then runs that function in parallel (for speed).

import pandas as pd
import censusgeocode as cg
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm

import requests_cache
cache = requests_cache.CachedSession("geocode_cache", backend="filesystem")

def geocode(lat, lng, timeout=30):
    """
    Look up the 2020 Census block record for one coordinate pair.

    Sends a GET request to the Census Bureau geocoder's
    ``geographies/coordinates`` endpoint through the module-level ``cache``
    session and returns the first entry of the "Census Blocks" list found in
    the JSON response.

    Parameters
    ----------
    lat : float
        Latitude; sent to the service as ``y``.
    lng : float
        Longitude; sent to the service as ``x``.
    timeout : float, optional
        Seconds to wait for the service before giving up (default 30).
        Without a timeout a single stalled request would block its worker
        thread, and therefore the whole bulk run, indefinitely.

    Returns
    -------
    dict or None
        The first "Census Blocks" record, or ``None`` when the lookup fails
        for any reason (HTTP error, timeout, unexpected or empty payload).
        Failures are reported with ``print`` rather than raised, so that one
        bad point does not abort a bulk run.
    """
    url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
    params = {
        "x": lng,
        "y": lat,
        "benchmark": "Public_AR_Census2020",
        "vintage": "Census2020_Census2020",
        "format": "json"
    }
    try:
        response = cache.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        blocks = response.json()['result']['geographies']['Census Blocks']
        # An empty list raises IndexError here and is reported like any other failure.
        return blocks[0]
    except Exception as e:
        # Deliberately broad: this is a best-effort lookup, callers check for None.
        print(f"Error geocoding ({lat}, {lng}): {e}")
        return None

def bulk_geocode(latitudes, longitudes):
    """
    Geocode a list of latitudes and longitudes in parallel (for speed).

    Parameters
    ----------
    latitudes, longitudes : iterable of float
        Coordinates to look up, paired by position.

    Returns
    -------
    pandas.DataFrame
        One row per input pair, in input order. A pair whose lookup failed
        (``geocode`` returned ``None``) yields a row of missing values, so the
        result stays positionally aligned with the inputs.
    """
    # Use the arguments that were passed in rather than a global frame, and
    # materialize them so the progress bar total can be computed.
    latitudes = list(latitudes)
    longitudes = list(longitudes)

    with ThreadPoolExecutor() as tpe:
        # Executor.map yields results in submission order.
        mapped_results = tpe.map(geocode, latitudes, longitudes)
        data = list(tqdm(mapped_results, total=len(latitudes)))

    # A None entry would break DataFrame construction from a list of dicts;
    # an empty record keeps the row in place with all-missing values.
    records = [record if record is not None else {} for record in data]
    return pd.DataFrame(records)

# Geocode every row's lat/lon pair and preview the first few returned records.
census_geos_df = bulk_geocode(df['lat'], df['lon']) 
census_geos_df.head()

In [None]:
# Narrow the geocoder output to the identifier columns listed below,
# then display the trimmed frame.
to_keep = ['GEOID', 'STATE', 'COUNTY', 'TRACT', 'BLOCK']
census_geos_df = census_geos_df.loc[:, to_keep]
census_geos_df

In [None]:
# Put the original table and the census columns side by side. Both frames get
# a fresh 0..n-1 index first so that rows are matched purely by position.
df_with_geos = pd.concat(
    [frame.reset_index(drop=True) for frame in (df, census_geos_df)],
    axis=1,
)

df_with_geos.head()