In [None]:
!pip install -U -q pip tensorflow geopandas folium mapclassify google-cloud-storage google-cloud-bigquery[pandas] db-dtypes

In [None]:
import os
import shutil
import tempfile
import zipfile

import folium
import geopandas as gpd
import pandas as pd
import requests
from google.cloud import storage, bigquery
from tqdm import tqdm


The shapefiles are available at https://www2.census.gov/geo/tiger/TIGER2019/

In [None]:
# Service-account key file used to authenticate against Google Cloud.
# The path is relative, so it is resolved against the current working directory.
service_account_json = 'nyu-datasets-77d8cc8e92d6.json'

# Publish the key location through the environment variable that the
# Google Cloud client libraries read to locate credentials.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS=service_account_json)

# US States and US Counties

In [None]:
# Nationwide state-boundary archive (TIGER/Line 2019); read straight from the URL.
state_url = 'https://www2.census.gov/geo/tiger/TIGER2019/STATE/tl_2019_us_state.zip'
states_gdf = gpd.read_file(state_url)

In [None]:
# Nationwide county-boundary archive (TIGER/Line 2019); read straight from the URL.
counties_url = 'https://www2.census.gov/geo/tiger/TIGER2019/COUNTY/tl_2019_us_county.zip'
counties_gdf = gpd.read_file(counties_url)

In [None]:
# State FIPS codes, used below to build the per-state archive names.
states = states_gdf.loc[:, 'STATEFP'].values

In [None]:
# (STATEFP, COUNTYFP) pairs, used below to build the per-county archive names.
counties = counties_gdf.filter(items=['STATEFP', 'COUNTYFP']).values

# Download state, county, tract, block group, block, and water-area shapefiles to a Google Cloud Storage bucket


In [None]:
def download_file_and_upload_to_gcs(url, local_path, bucket_name, destination_blob_name):
    """
    Downloads a file from a URL and uploads it to Google Cloud Storage.

    The payload is first written to ``local_path`` (missing parent directories
    are created) and then uploaded from that file. Nothing is uploaded when
    the download fails.

    Args:
    url (str): URL of the file to download.
    local_path (str): Local path to save the file.
    bucket_name (str): Name of the GCS bucket.
    destination_blob_name (str): Destination blob name in the GCS bucket.

    Raises:
    requests.HTTPError: If the server answers with an HTTP error status.
    """
    # Create the target directory up front; open() below would otherwise fail
    # with FileNotFoundError when e.g. 'maps/' does not exist yet.
    parent_dir = os.path.dirname(local_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Download file. The timeout bounds connection/read stalls, not the total
    # transfer time, so large archives are still allowed to finish.
    response = requests.get(url, timeout=300)
    # Fail loudly on 4xx/5xx instead of saving and uploading an error page as a ".zip".
    response.raise_for_status()
    with open(local_path, 'wb') as file:
        file.write(response.content)

    # Upload to Google Cloud Storage
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(local_path)

In [None]:
# Download states
# This is the first cell that writes into 'maps/'; create the directory so the
# download does not fail with FileNotFoundError on a fresh checkout.
os.makedirs('maps', exist_ok=True)
download_file_and_upload_to_gcs(state_url, 'maps/tl_2019_us_state.zip', 'census_shapefiles', 'states/tl_2019_us_state.zip')

In [None]:
# Mirror the national county archive into the bucket under counties/
download_file_and_upload_to_gcs(
    url=counties_url,
    local_path='maps/tl_2019_us_county.zip',
    bucket_name='census_shapefiles',
    destination_blob_name='counties/tl_2019_us_county.zip',
)

In [None]:
# Mirror one census-tract archive per state into the bucket under tracts/
for STATEFP in tqdm(states):
    filename = f"tl_2019_{STATEFP}_tract.zip"
    tract_url = f"https://www2.census.gov/geo/tiger/TIGER2019/TRACT/{filename}"
    download_file_and_upload_to_gcs(
        url=tract_url,
        local_path=f'maps/{filename}',
        bucket_name='census_shapefiles',
        destination_blob_name=f'tracts/{filename}',
    )


In [None]:
# Mirror one block-group archive per state into the bucket under blockgroups/
for STATEFP in tqdm(states):
    filename = f"tl_2019_{STATEFP}_bg.zip"
    # The variable keeps its historical name although it holds the block-group URL.
    tract_url = f"https://www2.census.gov/geo/tiger/TIGER2019/BG/{filename}"
    download_file_and_upload_to_gcs(
        url=tract_url,
        local_path=f'maps/{filename}',
        bucket_name='census_shapefiles',
        destination_blob_name=f'blockgroups/{filename}',
    )

In [None]:
# Mirror one tabulation-block archive per state into the bucket under blocks/
for STATEFP in tqdm(states):
    filename = f"tl_2019_{STATEFP}_tabblock10.zip"
    # The variable keeps its historical name although it holds the block URL.
    tract_url = f"https://www2.census.gov/geo/tiger/TIGER2019/TABBLOCK/{filename}"
    download_file_and_upload_to_gcs(
        url=tract_url,
        local_path=f'maps/{filename}',
        bucket_name='census_shapefiles',
        destination_blob_name=f'blocks/{filename}',
    )


In [None]:
# Mirror one water-area archive per county into the bucket under areawater/
for STATEFP, COUNTYFP in tqdm(counties):
    # Archive names are keyed by the concatenated state + county FIPS code.
    filename = f"tl_2019_{STATEFP+COUNTYFP}_areawater.zip"
    # The variable keeps its historical name although it holds the water-area URL.
    tract_url = f"https://www2.census.gov/geo/tiger/TIGER2019/AREAWATER/{filename}"
    download_file_and_upload_to_gcs(
        url=tract_url,
        local_path=f'maps/{filename}',
        bucket_name='census_shapefiles',
        destination_blob_name=f'areawater/{filename}',
    )

# Import shapefiles into BigQuery

In [None]:
def upload_shapefile_to_bigquery(bucket_name, shapefile_blob_name, bigquery_dataset_id, bigquery_table_id, temp_dir='temp'):
    """
    Loads a zipped shapefile stored in Google Cloud Storage into a BigQuery table.

    The archive is downloaded and extracted into a private scratch directory
    created under ``temp_dir``. The first ``.shp`` file found there is read,
    its geometries are serialised to WKT strings (column ``geometry``), and the
    rows are loaded into ``{bigquery_dataset_id}.{bigquery_table_id}``. The
    scratch directory is removed afterwards, whether or not the load succeeded.

    No explicit load-job configuration is passed, so the client library's
    default write behaviour applies.

    Args:
    bucket_name (str): Name of the GCS bucket holding the archive.
    shapefile_blob_name (str): Blob name of the shapefile ZIP inside the bucket.
    bigquery_dataset_id (str): Dataset part of the destination table id.
    bigquery_table_id (str): Table part of the destination table id.
    temp_dir (str, optional): Parent directory for the scratch space. It is
        created if missing; its other contents are left untouched.

    Raises:
    FileNotFoundError: If the archive does not contain a ``.shp`` file.
    """
    # Initialize Google Cloud clients
    storage_client = storage.Client()
    bigquery_client = bigquery.Client()

    os.makedirs(temp_dir, exist_ok=True)
    # One unique scratch directory per call: leftovers from an earlier failed
    # run can never be mistaken for this archive's .shp file, and cleanup only
    # ever deletes what this call created.
    work_dir = tempfile.mkdtemp(dir=temp_dir)
    try:
        # Blob names may contain '/' (e.g. 'tracts/x.zip'), so create the sub-folders too.
        zip_path = os.path.join(work_dir, shapefile_blob_name)
        os.makedirs(os.path.dirname(zip_path), exist_ok=True)
        storage_client.bucket(bucket_name).blob(shapefile_blob_name).download_to_filename(zip_path)

        # Extract the ZIP file
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(work_dir)

        # Locate the first .shp file anywhere below the scratch directory
        shp_path = next(
            (
                os.path.join(root, name)
                for root, _dirs, files in os.walk(work_dir)
                for name in files
                if name.endswith('.shp')
            ),
            None,
        )
        if shp_path is None:
            raise FileNotFoundError("No .shp file found in the extracted ZIP.")

        # Read the shapefile and switch to a plain DataFrame *before* replacing
        # the geometries, so text is never stored in a GeoDataFrame geometry column.
        gdf = gpd.read_file(shp_path)
        df = pd.DataFrame(gdf)
        # Serialise to WKT; missing geometries stay missing instead of raising.
        df['geometry'] = df['geometry'].apply(lambda geom: None if geom is None else geom.wkt)

        # Load the DataFrame into the destination table and wait for completion
        table_id = f"{bigquery_dataset_id}.{bigquery_table_id}"
        job = bigquery_client.load_table_from_dataframe(df, table_id)
        job.result()
    finally:
        # Always remove the scratch files, including when a step above failed.
        shutil.rmtree(work_dir, ignore_errors=True)

            
def convert_wkt_to_geograpphy(tablename):
    """
    Replaces a table's WKT ``geometry`` column with a ``geography`` column.

    The table is rewritten in place: every column except ``geometry`` is kept,
    and a new ``geography`` column is added by parsing the WKT text with
    ``ST_GEOGFROMTEXT``.

    Args:
    tablename (str): Table to rewrite, as it should appear between backticks
        in SQL (e.g. ``'nyu-datasets.shapefiles.us_tracts'``).
    """
    bigquery_client = bigquery.Client()
    # Both table references need a closing backtick. A single statement with
    # SELECT * EXCEPT replaces the earlier create-then-drop-column pair.
    sql = f"""
        CREATE OR REPLACE TABLE `{tablename}` AS
        SELECT * EXCEPT (geometry), ST_GEOGFROMTEXT(geometry) AS geography
        FROM `{tablename}`;
    """
    query_job = bigquery_client.query(sql)
    query_job.result()  # Wait for the query to complete

In [None]:
# Load the national state shapefile into the us_states table.
# NOTE(review): unlike the tract / block-group / block cells, no WKT -> GEOGRAPHY
# conversion follows this load — confirm that is intended.
upload_shapefile_to_bigquery(
    bucket_name='census_shapefiles',
    shapefile_blob_name='states/tl_2019_us_state.zip',
    bigquery_dataset_id='nyu-datasets.shapefiles',
    bigquery_table_id='us_states',
)

In [None]:
# Load the national county shapefile into the us_counties table.
# NOTE(review): unlike the tract / block-group / block cells, no WKT -> GEOGRAPHY
# conversion follows this load — confirm that is intended.
upload_shapefile_to_bigquery(
    bucket_name='census_shapefiles',
    shapefile_blob_name='counties/tl_2019_us_county.zip',
    bigquery_dataset_id='nyu-datasets.shapefiles',
    bigquery_table_id='us_counties',
)

In [None]:
# Load every state's tract shapefile into the shared us_tracts table ...
for STATEFP in tqdm(states):
    filename = f"tl_2019_{STATEFP}_tract.zip"
    upload_shapefile_to_bigquery(
        bucket_name='census_shapefiles',
        shapefile_blob_name=f'tracts/{filename}',
        bigquery_dataset_id='nyu-datasets.shapefiles',
        bigquery_table_id='us_tracts',
    )

# ... then swap the WKT text column for a GEOGRAPHY column.
convert_wkt_to_geograpphy(tablename='nyu-datasets.shapefiles.us_tracts')

In [None]:
# Load every state's block-group shapefile into the shared us_blockgroups table ...
for STATEFP in tqdm(states):
    filename = f"tl_2019_{STATEFP}_bg.zip"
    upload_shapefile_to_bigquery(
        bucket_name='census_shapefiles',
        shapefile_blob_name=f'blockgroups/{filename}',
        bigquery_dataset_id='nyu-datasets.shapefiles',
        bigquery_table_id='us_blockgroups',
    )

# ... then swap the WKT text column for a GEOGRAPHY column.
convert_wkt_to_geograpphy(tablename='nyu-datasets.shapefiles.us_blockgroups')

In [None]:
# Load every state's tabulation-block shapefile into the shared us_blocks table ...
for STATEFP in tqdm(states):
    filename = f"tl_2019_{STATEFP}_tabblock10.zip"
    upload_shapefile_to_bigquery(
        bucket_name='census_shapefiles',
        shapefile_blob_name=f'blocks/{filename}',
        bigquery_dataset_id='nyu-datasets.shapefiles',
        bigquery_table_id='us_blocks',
    )

# ... then swap the WKT text column for a GEOGRAPHY column.
convert_wkt_to_geograpphy(tablename='nyu-datasets.shapefiles.us_blocks')

In [None]:
# Mirror one water-area archive per county into the bucket under areawater/
# NOTE(review): this cell repeats the earlier water-area download verbatim. Given the
# section it sits in, it was presumably meant to load the archives into BigQuery —
# confirm the intent before re-running.
for STATEFP, COUNTYFP in tqdm(counties):
    # Archive names are keyed by the concatenated state + county FIPS code.
    filename = f"tl_2019_{STATEFP+COUNTYFP}_areawater.zip"
    tract_url = f"https://www2.census.gov/geo/tiger/TIGER2019/AREAWATER/{filename}"
    download_file_and_upload_to_gcs(
        url=tract_url,
        local_path=f'maps/{filename}',
        bucket_name='census_shapefiles',
        destination_blob_name=f'areawater/{filename}',
    )

# Area Water

In [None]:
def _download_file(url, local_path, timeout=300):
    """Saves the content served at ``url`` to ``local_path``; raises on an HTTP error status."""
    response = requests.get(url, timeout=timeout)
    # Never write an HTTP error page to disk under a ".zip" name.
    response.raise_for_status()
    with open(local_path, 'wb') as file:
        file.write(response.content)


def _read_zipped_shapefile(url, filename, out_dir='maps'):
    """Downloads a zipped shapefile into ``out_dir``, extracts it there and reads the .shp."""
    zip_path = f'{out_dir}/{filename}'
    _download_file(url, zip_path)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(out_dir)

    # The shapefile is expected to share the archive's base name.
    return gpd.read_file(f"{out_dir}/{filename[:-4]}.shp")


def download_and_process_geodata(STATEFP, COUNTYFP, YEAR=2019):
    """
    Downloads and processes geospatial data for a specified state and county.

    Fetches the state's census-tract shapefile and the county's water-area
    shapefile into the local ``maps`` directory, keeps the tracts of the
    requested county, and subtracts the water areas from them.

    Args:
    STATEFP (str): State FIPS code.
    COUNTYFP (str): County FIPS code.
    YEAR (int, optional): Year for the dataset. Defaults to 2019.

    Returns:
    GeoDataFrame: A GeoDataFrame containing the processed geospatial data,
        restricted to the columns STATEFP, COUNTYFP, TRACTCE, GEOID, NAMELSAD,
        ALAND and geometry (those that are present).

    Raises:
    requests.HTTPError: If either archive cannot be downloaded.
    """
    # Creating directory for maps
    os.makedirs('maps', exist_ok=True)

    tiger_base = f"https://www2.census.gov/geo/tiger/TIGER{YEAR}"

    # Tract boundaries for the whole state, narrowed to the county of interest.
    filename = f"tl_{YEAR}_{STATEFP}_tract.zip"
    geo_df = _read_zipped_shapefile(f"{tiger_base}/TRACT/{filename}", filename)
    # A boolean mask avoids building a query string from the argument.
    geo_df = geo_df[geo_df['COUNTYFP'] == COUNTYFP]

    # Water areas for the county, so they can be cut out of the tract polygons.
    water_filename = f"tl_{YEAR}_{STATEFP+COUNTYFP}_areawater.zip"
    water_geodf = _read_zipped_shapefile(f"{tiger_base}/AREAWATER/{water_filename}", water_filename)

    # Remove water areas from the tracts
    geo_df = gpd.overlay(geo_df, water_geodf, how="difference")

    # Keep only variables of interest
    geo_df = geo_df.filter(['STATEFP', 'COUNTYFP', 'TRACTCE', 'GEOID', 'NAMELSAD', 'ALAND', 'geometry'])

    return geo_df


In [None]:

# Example: census tracts for state FIPS "36" / county FIPS "061" (New York, Manhattan)
geo_df = download_and_process_geodata(STATEFP="36", COUNTYFP="061")

# Draw the tract outlines: thin black edges on white polygons
geo_df.plot(
    figsize=(10, 10),
    color="white",
    facecolor="white",
    edgecolor="black",
    linewidth=0.5,
    zorder=0,
)