## import libraries

In [1]:
import pandas as pd
import geopandas as gpd
import folium
import folium.plugins
import branca.colormap as cm

from shapely.geometry import Polygon, Point
import numpy as np
import random

import rasterio
from rasterio.transform import from_origin

In [2]:
# Select pyogrio as the engine GeoPandas uses for its file I/O.
gpd.options.io_engine = "pyogrio"

## folium map generator function

In [3]:
# Basemap catalogue: each entry gives the tile source ("tiles"), the attribution
# text ("attr") and the label shown in the layer control ("name").
# The three USGS basemaps come first, followed by OpenStreetMap and Stamen Toner Lite.
tiles_list = [
    dict(
        tiles="https://basemap.nationalmap.gov/arcgis/rest/services/USGSTopo/MapServer/tile/{z}/{y}/{x}",
        attr='U.S. Department of the Interior | U.S. Geological Survey',
        name="USGS Topo",
    ),
    dict(
        tiles="https://basemap.nationalmap.gov/arcgis/rest/services/USGSImageryOnly/MapServer/tile/{z}/{y}/{x}",
        attr='U.S. Department of the Interior | U.S. Geological Survey',
        name="USGS Imagery",
    ),
    dict(
        tiles="https://basemap.nationalmap.gov/arcgis/rest/services/USGSImageryTopo/MapServer/tile/{z}/{y}/{x}",
        attr='U.S. Department of the Interior | U.S. Geological Survey',
        name="USGS Imagery Topo",
    ),
    dict(
        tiles="OpenStreetMap",
        attr="© OpenStreetMap contributors",
        name="OpenStreetMap",
    ),
    dict(
        tiles="https://tiles.stadiamaps.com/tiles/stamen_toner_lite/{z}/{x}/{y}{r}.png",
        attr='&copy; <a href="https://stadiamaps.com/" target="_blank">Stadia Maps</a> <a href="https://stamen.com/" target="_blank">&copy; Stamen Design</a> &copy; <a href="https://openmaptiles.org/" target="_blank">OpenMapTiles</a> &copy; <a href="https://www.openstreetmap.org/copyright" target="_blank">OpenStreetMap</a>',
        name="Stamen Toner Lite",
    ),
]

def generate_folium_map_with_csb_centroids_and_colors(gdf, zoom=6, decimal_places=3):
    """Render crop sequence boundary polygons and their centroids on a folium map.

    The input is reduced to the columns the map needs and reprojected to
    EPSG:4326 when its CRS differs. Two GeoJSON overlays are added on top of
    every basemap in ``tiles_list``: the boundary polygons, and one circle
    marker per row at its (``Longitude``, ``Latitude``) position. Both overlays
    take their colour from the ``color`` column and show the same popup fields.

    The caller's GeoDataFrame is not modified; all work happens on a copy.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Must provide the columns ``geometry``, ``CDL2023``, ``Longitude``,
        ``Latitude``, ``color``, ``Crop`` and ``Elevation``.
        NOTE(review): ``Longitude``/``Latitude`` are used as-is for the marker
        positions, so they are presumably WGS84 degrees -- confirm upstream.
    zoom : int, default 6
        Initial zoom level of the map.
    decimal_places : int, default 3
        Number of decimals shown for latitude/longitude in the popups.

    Returns
    -------
    folium.Map
        Map with all basemaps, both overlays, a fullscreen button and a layer
        control.

    Raises
    ------
    KeyError
        If a required column is missing from ``gdf``.
    """
    popup_fields = ['CDL2023', 'Crop', 'Latitude_formatted', 'Longitude_formatted', 'Elevation']
    popup_aliases = ['Crop ID:', 'Crop:', 'Latitude:', 'Longitude:', 'Elevation:']

    def build_popup():
        # Each GeoJson layer needs its own popup element, so build a fresh one per layer.
        return folium.GeoJsonPopup(fields=popup_fields, aliases=popup_aliases)

    def style_function(feature):
        # Polygons are outlined and filled with the per-row colour.
        return {
            'fillColor': feature['properties']['color'],
            'color': feature['properties']['color'],
            'weight': 1,
            'fillOpacity': 0.6
        }

    # Work on a copy restricted to the needed columns. Without this, the
    # formatted columns below would be written into the caller's frame whenever
    # no reprojection (and therefore no implicit copy) takes place.
    gdf = gdf[['geometry', 'CDL2023', 'Longitude', 'Latitude', 'color', 'Crop', 'Elevation']].copy()

    # Check CRS and convert to EPSG:4326 if needed
    if gdf.crs != "EPSG:4326":
        gdf = gdf.to_crs(epsg=4326)

    # Right-aligned fixed-point text for the popups
    field_width = decimal_places + 4

    def format_coordinate(value):
        return f'{value:>{field_width}.{decimal_places}f}'

    gdf['Longitude_formatted'] = gdf['Longitude'].map(format_coordinate)
    gdf['Latitude_formatted'] = gdf['Latitude'].map(format_coordinate)

    # Fix the property order written to the GeoJSON
    gdf = gdf[['geometry', 'CDL2023', 'Longitude', 'Latitude', 'Longitude_formatted', 'Latitude_formatted', 'color', 'Crop', 'Elevation']]

    # Same attributes, but with point geometry at the stored centroid coordinates
    centroids_gdf = gdf.copy()
    centroids_gdf['geometry'] = gpd.points_from_xy(centroids_gdf['Longitude'], centroids_gdf['Latitude'])

    # Convert geometries to GeoJSON
    geojson_data = gdf.to_json()
    centroids_geojson_data = centroids_gdf.to_json()

    # Centre the map on the middle of the data's bounding box
    minx, miny, maxx, maxy = gdf.total_bounds
    center_longitude = (minx + maxx) / 2
    center_latitude = (miny + maxy) / 2

    # tiles=None: basemaps are added explicitly below so each one appears in the layer control
    m = folium.Map(location=[center_latitude, center_longitude], zoom_start=zoom, tiles=None)

    for tile_info in tiles_list:
        folium.TileLayer(
            tiles=tile_info["tiles"],
            attr=tile_info["attr"],
            name=tile_info["name"],
        ).add_to(m)

    # Crop sequence boundary polygons
    folium.GeoJson(
        geojson_data,
        name='Crop Sequence Boundaries',
        style_function=style_function,
        popup=build_popup(),
    ).add_to(m)

    # Centroids drawn as circle markers: grey outline, fill taken from the row colour
    folium.GeoJson(
        centroids_geojson_data,
        name='CSB Centroids',
        marker=folium.CircleMarker(
            radius=4,
            fill=True,
            fill_opacity=0.8,
            weight=0.2
        ),
        style_function=lambda x: {'fillColor': x['properties']['color'], 'color': 'grey'},
        popup=build_popup(),
    ).add_to(m)

    folium.plugins.Fullscreen(
        position="topleft",
        title="Fullscreen",
        title_cancel="Exit Fullscreen",
        force_separate_button=True,
    ).add_to(m)

    # Add layer control to the map
    folium.LayerControl().add_to(m)

    return m

## load and sample data

In [8]:
# Load the combined CSB table from parquet
# (an earlier layout kept this file under ../data/agricultural/CSB/siads696/).
csb_combined = gpd.read_parquet('../datasets/fields/csb/20240715_165400_gdf_four_corners_combined.parquet')

In [9]:
# Drop the vestigial 'index' column. Reassigning (rather than inplace=True)
# together with errors='ignore' keeps this cell safe to re-run once the column
# is already gone.
csb_combined = csb_combined.drop(columns=['index'], errors='ignore')

In [10]:
# Number of rows in the loaded table
len(csb_combined)

618694

## include elevations

In [11]:
# Raster opened with rasterio further down to sample an elevation per row
# (presumably the PRISM 4 km DEM, judging by the file name).
digital_elevation_file = '../data/elevation/PRISM_us_dem_4km_bil/PRISM_us_dem_4km_bil.bil'

In [12]:
# Extract the latitude and longitude values as plain arrays.
# NOTE(review): neither array is referenced again in the visible cells --
# confirm nothing else needs them before removing.
latitudes = csb_combined['Latitude'].values
longitudes = csb_combined['Longitude'].values

# Build a point-geometry frame from csb_combined: same rows and attribute
# columns, but the geometry is set to (Longitude, Latitude) points tagged as
# EPSG:4326. This is the frame reprojected and sampled against the DEM below.
gdf = gpd.GeoDataFrame(
    csb_combined,
    geometry=gpd.points_from_xy(csb_combined.Longitude, csb_combined.Latitude),
    crs='EPSG:4326'  # Assuming the coordinates are in WGS84
)

In [13]:
with rasterio.open(digital_elevation_file) as dem:
    # Report the raster's CRS so the reprojection below can be sanity-checked
    dem_crs = dem.crs
    print(f"DEM CRS: {dem_crs}")

    # Bring the centroid points into the raster's CRS
    gdf = gdf.to_crs(dem_crs)

    # One (x, y) pair per row, in raster coordinates
    coords = list(zip(gdf.geometry.x, gdf.geometry.y))

    # dem.sample yields one array of band values per coordinate; keep the first entry
    elevations = [band_values[0] for band_values in dem.sample(coords)]


# Attach the sampled values positionally: gdf was built from csb_combined, row for row
csb_combined['Elevation'] = elevations

DEM CRS: OGC:CRS83


In [14]:
# Quick sanity check: (lowest, highest) sampled elevation
csb_combined['Elevation'].min(), csb_combined['Elevation'].max()

(np.int32(26), np.int32(3665))

In [15]:
# Target column layout for csb_combined, grouped by kind:
# CSB identifiers, yearly CDL columns, state/county codes, shape fields,
# geometry, then centroid coordinates, elevation and the display columns.
reorder_columns = [
    'CSBID', 'CSBYEARS', 'CSBACRES',
    'CDL2016', 'CDL2017', 'CDL2018', 'CDL2019',
    'CDL2020', 'CDL2021', 'CDL2022', 'CDL2023',
    'STATEFIPS', 'STATEASD', 'ASD', 'CNTY', 'CNTYFIPS',
    'INSIDE_X', 'INSIDE_Y', 'Shp_Len', 'Shp_Area',
    'geometry',
    'Longitude', 'Latitude', 'Elevation',
    'color', 'Crop',
]

In [16]:
# Apply the layout from reorder_columns; any column not listed there is dropped.
csb_combined = csb_combined[reorder_columns]

In [17]:
# list(csb_combined.columns)

In [18]:
# Persist the elevation-augmented table next to the input parquet.
# The target previously pointed at '../datesets/...', which does not match the
# '../datasets/' tree this notebook reads from.
csb_combined.to_parquet('../datasets/fields/csb/csb_combined_with_elevation.parquet')

In [19]:
# Plain random sample (no spatial balancing); the fixed random_state makes the
# draw repeatable across runs.
sample_size = 1200
csb_samples = csb_combined.sample(n=sample_size, random_state=42)
csb_samples

Unnamed: 0,CSBID,CSBYEARS,CSBACRES,CDL2016,CDL2017,CDL2018,CDL2019,CDL2020,CDL2021,CDL2022,...,INSIDE_X,INSIDE_Y,Shp_Len,Shp_Area,geometry,Longitude,Latitude,Elevation,color,Crop
194047,081623003680567,1623,24.595302,61,24,61,24,42,24,42,...,-1.120167e+06,1.711077e+06,1778.756494,99534.055948,"MULTIPOLYGON (((-1120006.725 1711204.851, -111...",-108.874516,37.752494,2114,#a87000,Winter Wheat
180917,081623001172341,1623,5.813687,1,24,61,1,36,36,1,...,-6.973936e+05,1.704912e+06,1018.698379,23527.251774,"MULTIPOLYGON (((-697143.136 1704945.584, -6971...",-104.039539,38.112161,1328,#bfbf7a,Fallow/Idle Cropland
392157,081623009660832,1623,68.025049,36,36,36,36,36,36,36,...,-1.017694e+06,1.772620e+06,3075.720099,275288.707532,"MULTIPOLYGON (((-1017862.847 1772957.076, -101...",-107.796641,38.415305,1912,#ffa8e3,Alfalfa
120215,351623002569319,1623,6.808717,36,190,36,152,36,36,37,...,-9.885239e+05,1.308874e+06,830.708511,27554.010266,"MULTIPOLYGON (((-988371.522 1308907.044, -9883...",-106.850683,34.334479,1465,#e9ffbe,Grass/Pasture
411765,081623010117052,1623,3.723023,176,24,4,61,24,61,24,...,-6.430940e+05,1.835682e+06,550.657171,15066.598593,"MULTIPOLYGON (((-643066.725 1835749.68, -64303...",-103.535552,39.314620,1699,#bfbf7a,Fallow/Idle Cropland
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110152,351623001801779,1623,5.777508,1,2,61,205,1,205,1,...,-6.489027e+05,1.298843e+06,1085.739826,23380.840552,"MULTIPOLYGON (((-648702.809 1298873.001, -6487...",-103.132301,34.532521,1317,#a87000,Winter Wheat
101320,351623000927136,1623,2.950810,4,4,61,24,152,4,4,...,-7.031136e+05,1.215967e+06,622.167642,11941.553840,"MULTIPOLYGON (((-703135.074 1216033.76, -70308...",-103.652638,33.755502,1358,#ff9e0f,Sorghum
247352,081623006700815,1623,7.830029,36,36,176,36,36,195,36,...,-7.206152e+05,1.849777e+06,897.404967,31687.130011,"MULTIPOLYGON (((-720613.545 1849880.018, -7206...",-104.454050,39.381817,1947,#ffa8e3,Alfalfa
450533,081623011958926,1623,2.755083,1,1,1,176,61,176,61,...,-7.229287e+05,1.934624e+06,474.070288,11149.471426,"MULTIPOLYGON (((-722869.696 1934689.09, -72287...",-104.570576,40.134105,1503,#a87000,Winter Wheat


In [22]:
# Map of the plain random sample
sample_map = generate_folium_map_with_csb_centroids_and_colors(csb_samples, zoom=6, decimal_places=3)

In [23]:
# Render the map inline
sample_map

In [24]:
def generate_grid(gdf, cell_size):
    """Cover the bounding box of ``gdf`` with square cells of side ``cell_size``.

    Cells are laid out from the lower-left corner of ``gdf.total_bounds`` in
    steps of ``cell_size``, so the last row/column may extend past the upper
    bounds. The grid carries the CRS of ``gdf`` because its coordinates are
    expressed in that CRS.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Frame whose total bounds define the area to cover.
    cell_size : float
        Side length of each cell, in the units of ``gdf``'s CRS. Must be > 0.

    Returns
    -------
    geopandas.GeoDataFrame
        One ``geometry`` column holding the cell polygons, ordered column by
        column (x outer, y inner). Empty when ``gdf`` has no finite bounds.

    Raises
    ------
    ValueError
        If ``cell_size`` is not a positive number.
    """
    if not cell_size > 0:
        raise ValueError(f"cell_size must be a positive number, got {cell_size!r}")

    bounds = gdf.total_bounds
    grid_cells = []

    # An empty frame reports NaN bounds, which np.arange cannot step over;
    # in that case the grid simply stays empty.
    if np.isfinite(bounds).all():
        minx, miny, maxx, maxy = bounds
        x_coords = np.arange(minx, maxx, cell_size)
        y_coords = np.arange(miny, maxy, cell_size)
        grid_cells = [
            Polygon([(x, y), (x + cell_size, y), (x + cell_size, y + cell_size), (x, y + cell_size)])
            for x in x_coords
            for y in y_coords
        ]

    return gpd.GeoDataFrame(geometry=grid_cells, crs=gdf.crs)

def spatially_balanced_sample(gdf, num_samples, cell_size, random_state=42):
    """Draw at most one row per grid cell, then thin the result to ``num_samples``.

    A square grid from ``generate_grid`` is laid over ``gdf``. For every cell,
    one row is sampled from the geometries lying entirely within that cell;
    geometries that straddle a cell edge are never candidates. If more cells
    produced a pick than ``num_samples``, the picks are randomly reduced to
    ``num_samples``; otherwise all picks are returned, so the result can hold
    fewer than ``num_samples`` rows.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Source geometries, in a CRS whose units match ``cell_size``.
    num_samples : int
        Upper bound on the number of rows returned.
    cell_size : float
        Grid cell side length, in the units of ``gdf``'s CRS.
    random_state : int or None, default 42
        Seed for both the per-cell pick and the final reduction. The default
        reproduces the per-cell picks of the earlier hard-coded seed; ``None``
        makes both steps non-deterministic.

    Returns
    -------
    geopandas.GeoDataFrame
        Sampled rows with their original index labels. Empty (same columns as
        ``gdf``) when no cell contains any geometry.
    """
    grid = generate_grid(gdf, cell_size)
    per_cell_picks = []

    # Query the spatial index per cell instead of testing every row against
    # every cell. "contains" is evaluated as cell.contains(geometry), i.e. the
    # same rows as geometry.within(cell).
    spatial_index = gdf.sindex
    for cell in grid.geometry:
        # Sort so candidates keep the frame's row order, which the seeded pick depends on
        positions = np.sort(spatial_index.query(cell, predicate="contains"))
        if positions.size:
            per_cell_picks.append(gdf.iloc[positions].sample(n=1, random_state=random_state))

    # Thin with a locally seeded generator so repeated runs give the same
    # subset and the global `random` state is left alone.
    if len(per_cell_picks) > num_samples:
        per_cell_picks = random.Random(random_state).sample(per_cell_picks, num_samples)

    # pd.concat raises on an empty list; hand back an empty frame with the same schema
    if not per_cell_picks:
        return gdf.iloc[0:0].copy()

    # Combine sampled rows into a single GeoDataFrame
    return gpd.GeoDataFrame(pd.concat(per_cell_picks, ignore_index=False))

In [25]:
# Inspect the CRS to learn which units cell_size must be expressed in
csb_combined.crs

<Projected CRS: {"$schema": "https://proj.org/schemas/v0.7/projjso ...>
Name: unknown
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- undefined
Coordinate Operation:
- name: unknown
- method: Albers Equal Area
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [26]:
# Grid cell side length, in CRS units. The CRS shown above reports metre axes,
# so 10000 corresponds to 10 km cells (adjust as needed).
cell_size = 10000  # in the units of your CRS, e.g., meters

# Upper bound on the number of sampled rows (at most one is drawn per grid cell)
num_samples = 1200

# Get the spatially balanced sample
geo_balanced_samples = spatially_balanced_sample(csb_combined, num_samples, cell_size)

In [27]:
# Map of the spatially balanced sample
m_geo_balanced = generate_folium_map_with_csb_centroids_and_colors(geo_balanced_samples, zoom=6, decimal_places=3)

In [28]:
# Render the map inline
m_geo_balanced

In [29]:
# Persist the spatially balanced sample next to the input parquet
# (an earlier layout wrote it under ../data/agricultural/CSB/siads696/).
# The target previously pointed at '../datesets/...', which does not match the
# '../datasets/' tree this notebook reads from.
geo_balanced_samples.to_parquet('../datasets/fields/csb/geo_balanced_sample.parquet')

In [30]:
# Show the sampled rows (pandas truncates the display for large frames)
geo_balanced_samples

Unnamed: 0,CSBID,CSBYEARS,CSBACRES,CDL2016,CDL2017,CDL2018,CDL2019,CDL2020,CDL2021,CDL2022,...,INSIDE_X,INSIDE_Y,Shp_Len,Shp_Area,geometry,Longitude,Latitude,Elevation,color,Crop
264164,081623007010852,1623,42.172442,61,21,28,61,61,152,152,...,-8.485271e+05,1.598009e+06,3190.104461,170666.499331,"MULTIPOLYGON (((-848450.608 1598183.245, -8484...",-105.646388,37.037619,2304,#bfbf7a,Fallow/Idle Cropland
252748,081623006852860,1623,4.860087,152,142,152,152,152,152,152,...,-1.028570e+06,1.760491e+06,846.919361,19668.154155,"MULTIPOLYGON (((-1028621.215 1760502.389, -102...",-107.903925,38.295863,2390,#a5f58d,Other Hay/Non Alfalfa
84983,491623015988100,1623,9.501533,36,36,36,36,36,36,37,...,-1.532537e+06,1.703774e+06,1280.502899,38451.493501,"MULTIPOLYGON (((-1532428.499 1703868.12, -1532...",-113.506127,37.102293,823,#ffa8e3,Alfalfa
235900,081623006279628,1623,4.157208,61,61,24,152,176,176,152,...,-6.397665e+05,1.751563e+06,916.143112,16823.690749,"MULTIPOLYGON (((-639769.777 1751684.884, -6397...",-103.420132,38.568996,1386,#c7d79e,Shrubland
257008,081623006998069,1623,2.822090,176,176,205,176,176,176,176,...,-6.313422e+05,1.648109e+06,593.762657,11420.640325,"MULTIPOLYGON (((-631348.266 1648020.38, -63137...",-103.231477,37.654274,1347,#e9ffbe,Grass/Pasture
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123484,351623003238938,1623,3.675615,36,36,36,152,36,36,36,...,-7.822644e+05,1.476764e+06,523.612316,14874.747364,"MULTIPOLYGON (((-782216.336 1476841.652, -7822...",-104.770515,36.017002,1941,#a5f58d,Other Hay/Non Alfalfa
186470,081623001860231,1623,4.106779,36,36,36,36,36,37,176,...,-1.080237e+06,1.682268e+06,661.441173,16619.612976,"MULTIPOLYGON (((-1080172.097 1682327.009, -108...",-108.378030,37.545014,2395,#e9ffbe,Grass/Pasture
606785,041623013878701,1623,6.756011,61,29,61,176,61,176,176,...,-1.581930e+06,1.233847e+06,997.719649,27340.715046,"MULTIPOLYGON (((-1581843.713 1233888.426, -158...",-113.102678,32.890867,225,#e9ffbe,Grass/Pasture
10124,491623005262153,1623,14.987005,1,1,37,36,36,36,36,...,-1.446571e+06,1.793114e+06,1962.223433,60650.501468,"MULTIPOLYGON (((-1446575.805 1793250.042, -144...",-112.722394,38.023977,1782,#ffa8e3,Alfalfa
