# Crop Sequence Boundary data extraction - states

## import libraries

In [1]:
import fiona
import geopandas as gpd
import shapely  # shapely 2.0
import pyogrio
import pyarrow
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from pyproj import CRS
# from mpl_toolkits.basemap import Basemap
from scipy.spatial import KDTree
from shapely.geometry import Point, LineString, shape
import itertools
from operator import itemgetter
from pprint import pprint
import time
import psutil
import os
from datetime import datetime


In [2]:
# Set GeoPandas to use pyogrio as its file I/O engine for the rest of the session
gpd.options.io_engine = "pyogrio"

In [3]:
# Print the versions of GeoPandas and of the geospatial stack it runs on (GEOS, GDAL, PROJ, ...)
gpd.show_versions()


SYSTEM INFO
-----------
python     : 3.12.4 | packaged by conda-forge | (main, Jun 17 2024, 10:13:44) [Clang 16.0.6 ]
executable : /Users/jwhite/miniforge3/envs/siads699b/bin/python
machine    : macOS-14.5-arm64-arm-64bit

GEOS, GDAL, PROJ INFO
---------------------
GEOS       : 3.12.2
GEOS lib   : None
GDAL       : 3.9.1
GDAL data dir: /Users/jwhite/miniforge3/envs/siads699b/share/gdal/
PROJ       : 9.4.0
PROJ data dir: /Users/jwhite/miniforge3/envs/siads699b/share/proj

PYTHON DEPENDENCIES
-------------------
geopandas  : 1.0.1
numpy      : 2.0.0
pandas     : 2.2.2
pyproj     : 3.6.1
shapely    : 2.0.5
pyogrio    : 0.9.0
geoalchemy2: 0.15.2
geopy      : 2.4.1
matplotlib : 3.9.1
mapclassify: 2.6.1
fiona      : 1.9.6
psycopg    : 3.2.1
psycopg2   : 2.9.9 (dt dec pq3 ext lo64)
pyarrow    : 16.1.0


In [4]:
# List the names exposed by the pyogrio module.
# NOTE(review): exploratory leftover — nothing in the visible cells uses this output; consider removing.
dir(pyogrio)

['__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__gdal_geos_version__',
 '__gdal_version__',
 '__gdal_version_string__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_compat',
 '_env',
 '_err',
 '_geometry',
 '_io',
 '_ogr',
 '_version',
 '_vsi',
 'core',
 'detect_write_driver',
 'errors',
 'geopandas',
 'get_gdal_config_option',
 'get_gdal_data_path',
 'list_drivers',
 'list_layers',
 'open_arrow',
 'raw',
 'read_arrow',
 'read_bounds',
 'read_dataframe',
 'read_info',
 'set_gdal_config_options',
 'shapely',
 'util',
 'write_arrow',
 'write_dataframe']

## set coordinate reference system for Crop Sequence Boundary

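The CRS is taken from the dataset's metadata: an Albers Equal Area Conic projection on the NAD83 datum, with units in metres.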

In [5]:
# Set the CRS using PROJ string
# custom coordinate reference system for CropSequenceBoundaries
# (Albers equal-area conic, NAD83 datum, metres — see the +proj/+datum/+units terms).
# NOTE(review): despite its name, `crs_string` is bound to a pyproj CRS object here;
# the extraction cell further down rebinds the same name to the raw PROJ string.
crs_string = CRS.from_proj4("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs +ellps=GRS80")

## features look like...

```

[{'type': 'Feature',
  'id': '1',
  'properties': OrderedDict([('CSBID', '351623000000001'),
               ('CSBYEARS', '1623'),
               ('CSBACRES', 2.777984416614868),
               ('CDL2016', 24),
               ('CDL2017', 24),
               ('CDL2018', 24),
               ('CDL2019', 152),
               ('CDL2020', 152),
               ('CDL2021', 152),
               ('CDL2022', 152),
               ('CDL2023', 152),
               ('STATEFIPS', '35'),
               ('STATEASD', '3530'),
               ('ASD', '30'),
               ('CNTY', 'Union'),
               ('CNTYFIPS', '059'),
               ('INSIDE_X', -650336.707800001),
               ('INSIDE_Y', 1447440.5062),
               ('Shape_Length', 628.3153094395814),
               ('Shape_Area', 11242.14904625621)]),
  'geometry': {'type': 'MultiPolygon',
   'coordinates': [[[(-650269.8805, 1447270.0971000008),
      (-650293.9328000005, 1447271.9411999993),
      (-650317.9850999992, 1447273.7852999996),
      (-650315.6782000009, 1447303.8725000005),
      (-650291.6260000002, 1447302.0284000002),
      (-650286.2434999999, 1447372.2317999993),
      (-650307.9886000007, 1447404.1631000005),
      (-650364.1096000001, 1447408.4662999995),
      (-650361.0335000008, 1447448.5824999996),
      (-650334.6746999994, 1447476.8254000004),
      (-650254.5023999996, 1447470.6784000006),
      (-650269.8805, 1447270.0971000008)]]]}}]


```

## Extract CSBs for Lower 48 States

State-level FIPS codes are listed here:
https://transition.fcc.gov/oet/info/maps/census/fips/fips.txt

### extraction by state - lower 48 states

In [6]:
   # state-level    place
   #  FIPS code     name
   # -----------   -------
   #     01        ALABAMA
   #     02        ALASKA
   #     04        ARIZONA
   #     05        ARKANSAS
   #     06        CALIFORNIA
   #     08        COLORADO
   #     09        CONNECTICUT
   #     10        DELAWARE
   #     11        DISTRICT OF COLUMBIA
   #     12        FLORIDA
   #     13        GEORGIA
   #     15        HAWAII
   #     16        IDAHO
   #     17        ILLINOIS
   #     18        INDIANA
   #     19        IOWA
   #     20        KANSAS
   #     21        KENTUCKY
   #     22        LOUISIANA
   #     23        MAINE
   #     24        MARYLAND
   #     25        MASSACHUSETTS
   #     26        MICHIGAN
   #     27        MINNESOTA
   #     28        MISSISSIPPI
   #     29        MISSOURI
   #     30        MONTANA
   #     31        NEBRASKA
   #     32        NEVADA
   #     33        NEW HAMPSHIRE
   #     34        NEW JERSEY
   #     35        NEW MEXICO
   #     36        NEW YORK
   #     37        NORTH CAROLINA
   #     38        NORTH DAKOTA
   #     39        OHIO
   #     40        OKLAHOMA
   #     41        OREGON
   #     42        PENNSYLVANIA
   #     44        RHODE ISLAND
   #     45        SOUTH CAROLINA
   #     46        SOUTH DAKOTA
   #     47        TENNESSEE
   #     48        TEXAS
   #     49        UTAH
   #     50        VERMONT
   #     51        VIRGINIA
   #     53        WASHINGTON
   #     54        WEST VIRGINIA
   #     55        WISCONSIN
   #     56        WYOMING

In [16]:
# State FIPS codes (two-character strings) to extract in this run; commented-out
# entries are skipped.
# NOTE(review): besides Alaska, DC and Hawaii, the list also leaves out Arizona (04),
# Colorado (08), New Mexico (35) and Utah (49), so it holds 44 of the lower 48 states.
# Presumably those four were extracted in a separate run (see the commented-out
# `statefips` overrides in the extraction cell) — confirm.
statefips = [
    '01',        # ALABAMA
    # '02',        # ALASKA
    # '04',        # ARIZONA
    '05',        # ARKANSAS
    '06',        # CALIFORNIA
    # '08',        # COLORADO
    '09',        # CONNECTICUT
    '10',        # DELAWARE
    # '11',        # DISTRICT OF COLUMBIA
    '12',        # FLORIDA
    '13',        # GEORGIA
    # '15',        # HAWAII
    '16',        # IDAHO
    '17',        # ILLINOIS
    '18',        # INDIANA
    '19',        # IOWA
    '20',        # KANSAS
    '21',        # KENTUCKY
    '22',        # LOUISIANA
    '23',        # MAINE
    '24',        # MARYLAND
    '25',        # MASSACHUSETTS
    '26',        # MICHIGAN
    '27',        # MINNESOTA
    '28',        # MISSISSIPPI
    '29',        # MISSOURI
    '30',        # MONTANA
    '31',        # NEBRASKA
    '32',        # NEVADA
    '33',        # NEW HAMPSHIRE
    '34',        # NEW JERSEY
    # '35',        # NEW MEXICO
    '36',        # NEW YORK
    '37',        # NORTH CAROLINA
    '38',        # NORTH DAKOTA
    '39',        # OHIO
    '40',        # OKLAHOMA
    '41',        # OREGON
    '42',        # PENNSYLVANIA
    '44',        # RHODE ISLAND
    '45',        # SOUTH CAROLINA
    '46',        # SOUTH DAKOTA
    '47',        # TENNESSEE
    '48',        # TEXAS
    # '49',        # UTAH
    '50',        # VERMONT
    '51',        # VIRGINIA
    '53',        # WASHINGTON
    '54',        # WEST VIRGINIA
    '55',        # WISCONSIN
    '56',        # WYOMING
]

In [None]:
# File paths (relative to the notebook's working directory):
# the input geodatabase and the directory that receives the per-state exports.
csb_filepath = '../data/agricultural/CSB/NationalCSB_2016-2023_rev23/CSB1623.gdb/'
output_dir = '../data/agricultural/CSB/siads699/'

# Define the custom CRS using the provided PROJ string
# (this rebinds `crs_string`, which an earlier cell bound to a CRS object, to the plain string;
# `custom_crs` is the pyproj CRS object built from it)
crs_string = "+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs +ellps=GRS80"
custom_crs = CRS.from_proj4(crs_string)

# Define the STATEFIPS values to filter
# (uncomment one of these to override the `statefips` list defined in an earlier cell)
# statefips = ['04', '08', '35', '49']  # Arizona, Colorado, New Mexico, Utah
# statefips = ['04', '08']  # Arizona, Colorado - to check FIPS
# statefips = ['35'] #  New Mexico
# statefips = ['04'] #  Arizona

# Initialize a dictionary to store the features by state:
# one empty list per FIPS code in `statefips` (which must already be defined by an earlier cell).
# Re-running this cell resets the accumulator.
state_features = {fips: [] for fips in statefips}

# Function to read the file in chunks, starting at a specific chunk
def read_chunks(filepath, layer, chunk_size, total_features, start_chunk=0):
    """Yield consecutive slices of a layer, read with ``pyogrio.read_dataframe``.

    Parameters
    ----------
    filepath : path of the dataset handed to pyogrio.
    layer : layer name or index handed to pyogrio.
    chunk_size : number of features requested per read.
    total_features : feature count at which reading stops.
    start_chunk : index of the first chunk to read; ``start_chunk * chunk_size``
        features are skipped before the first read (default 0).

    Yields
    ------
    The frame returned by each read. Iteration ends once ``total_features``
    has been reached or a read comes back empty.
    """
    offset = start_chunk * chunk_size
    while offset < total_features:
        # Never request more than what is left before total_features.
        chunk_size = min(chunk_size, total_features - offset)
        frame = pyogrio.read_dataframe(
            filepath,
            layer=layer,
            skip_features=offset,
            max_features=chunk_size,
        )
        if frame.empty:
            return
        yield frame
        offset += chunk_size

# Function to monitor memory usage
def print_memory_usage():
    """Print the current process's ``rss`` memory figure (from psutil), scaled by 1024**2."""
    rss_bytes = psutil.Process().memory_info().rss
    print(f"Memory usage: {rss_bytes / 1024**2:.2f} MB")

# Timing the execution
start_time = time.time()

# Get the total number of features in the dataset
info = pyogrio.read_info(csb_filepath, layer=0)

# Print the dataset info in a nicely formatted way
print("Dataset info:")
pprint(info)

# Resolve the feature count with explicit None checks: an `a or b` lookup would treat a
# legitimate count of 0 as "missing" and fall through to the error below.
total_features = info.get('features')
if total_features is None:
    total_features = info.get('feature_count')
if total_features is not None and total_features < 0:
    # pyogrio reports -1 when the driver cannot count features cheaply. A negative
    # total would make the chunk loop silently do nothing, so request a full count.
    total_features = pyogrio.read_info(
        csb_filepath, layer=0, force_feature_count=True
    ).get('features')
if total_features is None or total_features < 0:
    raise KeyError("Unable to determine the total number of features in the dataset")

print(f"Total number of features in the dataset: {total_features}")

# Set chunk size (number of features requested per read)
chunk_size = 1000

# Set the starting chunk index (0 reads the layer from the beginning;
# a larger value skips start_chunk * chunk_size features)
# start_chunk = 15092
start_chunk=0
# Running counter used only for the progress message printed per chunk
chunk_index = start_chunk

# Read and process the file in chunks
# NOTE(review): every matching feature stays in memory as a Python dict until the export
# step runs, so memory grows with the size of the whole selection (see the printed figures).
for chunk in read_chunks(csb_filepath, layer=0, chunk_size=chunk_size, total_features=total_features, start_chunk=start_chunk):
    chunk_index += 1

    # Filter the chunk by STATEFIPS
    chunk_filtered = chunk[chunk['STATEFIPS'].isin(statefips)]

    # Collect the filtered features by state. A single groupby pass replaces one boolean-mask
    # scan per state code, and each row is appended exactly once even if `statefips`
    # happens to contain a duplicate code.
    for state_fips, state_rows in chunk_filtered.groupby('STATEFIPS', sort=False):
        state_features[state_fips].extend(state_rows.to_dict('records'))

    print(f"Processed chunk {chunk_index:>6}, total features collected: {sum(len(features) for features in state_features.values()):>8}")

    # Print memory usage
    print_memory_usage()

# Get the current timestamp (shared by every file written in this run)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Convert the collected features to a GeoDataFrame per state and write it out
for state_fips, features in state_features.items():
    if not features:
        # Nothing was collected for this state (e.g. it is absent from the chunks that
        # were read); skip it instead of writing empty output files.
        print(f"No features collected for STATEFIPS {state_fips}; skipping export")
        continue

    # Build the attribute table in one pass. Each record already carries its geometry
    # object under 'geometry', so it is used as the geometry column directly instead of
    # creating a pd.Series per row and rebuilding every geometry with shape().
    records = pd.DataFrame(features)

    # Create a GeoDataFrame from the selected features using the custom CRS
    # (the CRS is assigned, not reprojected)
    gdf_selected = gpd.GeoDataFrame(records, geometry='geometry', crs=custom_crs)

    # Rename columns to avoid truncation issues with Shapefile format
    gdf_selected = gdf_selected.rename(columns={
        'Shape_Length': 'Shp_Len',
        'Shape_Area': 'Shp_Area',
        # Add other columns if needed
    })

    # Create a subfolder for .shp files for each state
    shapefile_dir = os.path.join(output_dir, f'{timestamp}_selected_features_fips_{state_fips}_shape/')
    os.makedirs(shapefile_dir, exist_ok=True)

    # Save the GeoDataFrame as Shapefile, GeoJSON and Parquet, with a timestamp in the name.
    # Paths are joined with os.path.join so they stay correct even if output_dir
    # is given without a trailing slash.
    # NOTE(review): the GeoJSON is written in the projected custom CRS, not WGS84 —
    # confirm downstream readers expect that.
    gdf_selected.to_file(os.path.join(shapefile_dir, f'selected_features_fips_{state_fips}.shp'))
    gdf_selected.to_file(os.path.join(output_dir, f'{timestamp}_selected_features_fips_{state_fips}.geojson'), driver='GeoJSON')
    gdf_selected.to_parquet(os.path.join(output_dir, f'{timestamp}_selected_features_fips_{state_fips}.parquet'))

# Print the execution time (measured from `start_time`, set before the dataset info was read)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Script execution time: {elapsed_time:.2f} seconds")

Dataset info:
{'capabilities': {'fast_feature_count': True,
                  'fast_set_next_by_index': True,
                  'fast_spatial_filter': True,
                  'fast_total_bounds': True,
                  'random_read': True},
 'crs': 'PROJCS["Albers_Conic_Equal_Area",GEOGCS["NAD83",DATUM["North_American_Datum_1983",SPHEROID["GRS '
        '1980",6378137,298.257222101,AUTHORITY["EPSG","7019"]],AUTHORITY["EPSG","6269"]],PRIMEM["Greenwich",0],UNIT["Degree",0.0174532925199433]],PROJECTION["Albers_Conic_Equal_Area"],PARAMETER["latitude_of_center",23],PARAMETER["longitude_of_center",-96],PARAMETER["standard_parallel_1",29.5],PARAMETER["standard_parallel_2",45.5],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH]]',
 'dataset_metadata': None,
 'driver': 'OpenFileGDB',
 'dtypes': array(['object', 'object', 'float64', 'int32', 'int32', 'int32', 'int32',
       'int32', 'int32', 'int32', 

In [13]:
gdf_selected.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 86415 entries, 0 to 86414
Data columns (total 21 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   CSBID      86415 non-null  object  
 1   CSBYEARS   86415 non-null  object  
 2   CSBACRES   86415 non-null  float64 
 3   CDL2016    86415 non-null  int64   
 4   CDL2017    86415 non-null  int64   
 5   CDL2018    86415 non-null  int64   
 6   CDL2019    86415 non-null  int64   
 7   CDL2020    86415 non-null  int64   
 8   CDL2021    86415 non-null  int64   
 9   CDL2022    86415 non-null  int64   
 10  CDL2023    86415 non-null  int64   
 11  STATEFIPS  86415 non-null  object  
 12  STATEASD   86415 non-null  object  
 13  ASD        86415 non-null  object  
 14  CNTY       86415 non-null  object  
 15  CNTYFIPS   86415 non-null  object  
 16  INSIDE_X   86415 non-null  float64 
 17  INSIDE_Y   86415 non-null  float64 
 18  Shp_Len    86415 non-null  float64 
 19  Shp_Area   86415 