## load libraries

In [1]:
import pandas as pd
import geopandas as gpd
import folium
import fiona
import rasterio

from shapely.geometry import Polygon, Point
import numpy as np
import xarray as xr
import random
import os
import time
import re
import math

import matplotlib.pyplot as plt
import altair as alt

from osgeo import ogr

In [2]:
# Set GeoPandas to use pyogrio
gpd.options.io_engine = "pyogrio"

In [3]:
gpd.show_versions()


SYSTEM INFO
-----------
python     : 3.12.4 | packaged by conda-forge | (main, Jun 17 2024, 10:13:44) [Clang 16.0.6 ]
executable : /Users/jwhite/miniforge3/envs/siads699b/bin/python
machine    : macOS-14.5-arm64-arm-64bit

GEOS, GDAL, PROJ INFO
---------------------
GEOS       : 3.12.2
GEOS lib   : None
GDAL       : 3.9.1
GDAL data dir: /Users/jwhite/miniforge3/envs/siads699b/share/gdal/
PROJ       : 9.4.0
PROJ data dir: /Users/jwhite/miniforge3/envs/siads699b/share/proj

PYTHON DEPENDENCIES
-------------------
geopandas  : 1.0.1
numpy      : 2.0.0
pandas     : 2.2.2
pyproj     : 3.6.1
shapely    : 2.0.5
pyogrio    : 0.9.0
geoalchemy2: 0.15.2
geopy      : 2.4.1
matplotlib : 3.9.1
mapclassify: 2.6.1
fiona      : 1.9.6
psycopg    : 3.2.1
psycopg2   : 2.9.9 (dt dec pq3 ext lo64)
pyarrow    : 16.1.0


## load crop sequence boundaries

In [4]:
sample_data_file = '../data/agricultural/CSB/siads696/geo_balanced_sample.parquet'
# sample_data_file = '../data/agricultural/CSB/siads696/csb_sample_with_growing_season_data.parquet'
geo_balanced_csb_samples = gpd.read_parquet(sample_data_file)
geo_balanced_csb_samples

Unnamed: 0,CSBID,CSBYEARS,CSBACRES,CDL2016,CDL2017,CDL2018,CDL2019,CDL2020,CDL2021,CDL2022,...,INSIDE_X,INSIDE_Y,Shp_Len,Shp_Area,geometry,Longitude,Latitude,Elevation,color,Crop
478801,081623012787392,1623,3.791540,4,24,61,61,24,24,61,...,-6.035779e+05,1.704774e+06,532.211689,15343.877410,"MULTIPOLYGON (((-603525.799 1704829.066, -6035...",-102.962080,38.176922,1231,#bfbf7a,Fallow/Idle Cropland
107446,351623001627247,1623,4.325446,225,1,225,152,152,152,36,...,-7.967410e+05,1.149389e+06,723.964572,17504.529797,"MULTIPOLYGON (((-796679.747 1149498.895, -7966...",-104.600667,33.089835,1138,#ffa8e3,Alfalfa
121162,351623002777545,1623,3.222010,61,176,1,176,36,36,1,...,-1.086137e+06,1.590736e+06,549.623304,13039.063523,"MULTIPOLYGON (((-1086091.187 1590803.212, -108...",-108.309751,36.728108,1676,#ffd400,Corn
88813,351623000046581,1623,10.204036,36,36,36,36,152,37,37,...,-9.788489e+05,1.595846e+06,1274.741723,41294.433943,"MULTIPOLYGON (((-978851.432 1595726.11, -97884...",-107.112180,36.891083,2252,#a5f58d,Other Hay/Non Alfalfa
516266,081623013377366,1623,2.918481,24,29,29,24,1,61,61,...,-6.203751e+05,1.992249e+06,556.566951,11810.721459,"MULTIPOLYGON (((-620252.778 1992330.141, -6202...",-103.412893,40.723858,1330,#bfbf7a,Fallow/Idle Cropland
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119071,351623002568175,1623,2.957066,176,176,176,61,24,24,24,...,-9.138551e+05,1.348095e+06,599.379620,11966.871324,"MULTIPOLYGON (((-913829.788 1348147.68, -91380...",-106.082615,34.756107,1877,#e9ffbe,Grass/Pasture
540278,081623014401456,1623,9.374885,61,24,61,1,1,1,1,...,-6.892023e+05,1.934718e+06,988.353218,37938.965442,"MULTIPOLYGON (((-689103.22 1934811.717, -68911...",-104.172849,40.161427,1417,#a5f58d,Other Hay/Non Alfalfa
591949,041623013556134,1623,3.770585,72,72,72,190,190,190,190,...,-1.707418e+06,1.273220e+06,523.387689,15259.076277,"MULTIPOLYGON (((-1707373.732 1273277.961, -170...",-114.508535,33.028054,101,#80b3b3,Woody Wetlands
217842,081623005639708,1623,4.307316,61,24,1,61,24,29,61,...,-6.004245e+05,1.898598e+06,634.862780,17431.160158,"MULTIPOLYGON (((-600373.153 1898692.428, -6003...",-103.092389,39.903658,1396,#a87000,Winter Wheat


## load ecocrop data

Imputed, includes USDA Hardiness Zones

In [5]:
ecocrop_file = '../data/agricultural/EcoCrop/siads699/EcoCrop_Clean_Imputed_All.pickle'
ecocrop_all = pd.read_pickle(ecocrop_file)
ecocrop_all

Unnamed: 0,Crop_Code,Scientific_Name,Genus,Species,Variety,Life_Form,Habit,Life_Span,Physiology,Category,...,Crop_Cycle_Max,Use_Main,Use_Detailed,Use_Part,Climate_Zone_Trewartha,USDA_Hardiness_Zone,USDA_Hardiness_Zone_Min,USDA_Hardiness_Zone_Max,Datasheet_URL,PFAF_URL
0,289,Abelmoschus esculentus,Abelmoschus,esculentus,,herb,erect,annual,single stem,vegetables,...,180.0,food & beverage,vitamins,fruits,"tropical wet & dry (Aw), tropical wet (Ar), st...",5-11,5,11,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Abe...
1,290,Abelmoschus manihot,Abelmoschus,manihot,,shrub,erect,"annual, perennial","deciduous, multi stem","vegetables, ornamentals/turf, medicinals & aro...",...,365.0,food & beverage,vitamins,leaves,"tropical wet & dry (Aw), tropical wet (Ar)",8-11,8,11,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Abe...
2,291,Abelmoschus moschatus,Abelmoschus,moschatus,,"herb, sub-shrub",prostrate/procumbent/semi-erect,"annual, biennial, perennial","deciduous, multi stem","ornamentals/turf, medicinals & aromatic",...,0.0,environmental,ornamental/turf,entire plant,tropical wet & dry (Aw),8-11,8,11,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Abe...
3,295,Acacia auriculiformis,Acacia,auriculiformis,,tree,erect,perennial,single stem,forest/wood,...,240.0,material,dye/tannin,stems,"tropical wet & dry (Aw), tropical wet (Ar)",10-12,10,12,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Aca...
4,297,Acacia farnesiana,Acacia,farnesiana,,tree,erect,perennial,single stem,"materials, ornamentals/turf, medicinals & arom...",...,240.0,environmental,ornamental/turf,entire plant,"tropical wet & dry (Aw), steppe or semiarid (B...",9-11,9,11,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Aca...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2563,400000,Chamaecrista rotundifolia,Chamaecrista,rotundifolia,,herb,prostrate/procumbent/semi-erect,"annual, perennial","deciduous, multi stem","forage/pasture, medicinals & aromatic",...,0.0,animal food (feed),vitamins,entire plant,"tropical wet & dry (Aw), tropical wet (Ar), su...",9-11,9,11,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Cha...
2564,400001,Acacia polyacantha,Acacia,polyacantha,,"shrub, tree",erect,perennial,"single stem, multi stem","forest/wood, environmental",...,0.0,fuels,fuelwood,bark,"desert or arid (Bw), steppe or semiarid (Bs)",10-12,10,12,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Aca...
2565,400002,Prosopis affinis,Prosopis,affinis,,tree,erect,perennial,"deciduous, single stem, C3 photosynthesis",forest/wood,...,0.0,food & beverage,vitamins,bark,"tropical wet & dry (Aw), desert or arid (Bw), ...",10-12,10,12,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Pro...
2566,400003,Vicia dasycarpa,Vicia,dasycarpa,,herb,climber/scrambler/scadent,annual,multi stem,forage/pasture,...,0.0,animal food (feed),minerals,entire plant,"tropical wet & dry (Aw), tropical wet (Ar), st...",10-12,10,12,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Vic...


In [6]:
list(ecocrop_all.columns)

['Crop_Code',
 'Scientific_Name',
 'Genus',
 'Species',
 'Variety',
 'Life_Form',
 'Habit',
 'Life_Span',
 'Physiology',
 'Category',
 'Plant_Attributes',
 'Temp_Opt_Min',
 'Temp_Opt_Max',
 'Temp_Opt_Min_F',
 'Temp_Opt_Max_F',
 'Temp_Abs_Min',
 'Temp_Abs_Max',
 'Temp_Abs_Min_F',
 'Temp_Abs_Max_F',
 'Killing_Temp_Rest',
 'Killing_Temp_Growth',
 'Killing_Temp_Rest_F',
 'Killing_Temp_Growth_F',
 'Rain_Opt_Min',
 'Rain_Opt_Max',
 'Rain_Abs_Min',
 'Rain_Abs_Max',
 'Lat_Opt_Min',
 'Lat_Opt_Max',
 'Lat_Abs_Min',
 'Lat_Abs_Max',
 'Alt_Opt_Min',
 'Alt_Opt_Max',
 'Alt_Abs_Min',
 'Alt_Abs_Max',
 'pH_Opt_Min',
 'pH_Opt_Max',
 'pH_Abs_Min',
 'pH_Abs_Max',
 'Light_Opt_Min',
 'Light_Opt_Max',
 'Light_Abs_Min',
 'Light_Abs_Max',
 'Depth_Opt',
 'Depth_Abs',
 'Texture_Ops',
 'Texture_Abs',
 'Fertility_Ops',
 'Fertility_Abs',
 'Al_Toxicity_Opt',
 'Al_Toxicity_Abs',
 'Salinity_Ops',
 'Salinity_Abs',
 'Drainage_Opt',
 'Drainage_Abs',
 'Photoperiod',
 'Abiotic_Tolererance',
 'Abiotic_Susceptibility',
 'Intr

In [7]:
crop_columns = [
    'Crop_Code',
    'Scientific_Name',
    'Genus',
    'Species',
    # 'Variety',
    # 'Life_Form',
    # 'Habit',
    # 'Life_Span',
    # 'Physiology',
    # 'Category',
    # 'Plant_Attributes',
    # 'Temp_Opt_Min',
    # 'Temp_Opt_Max',
    # 'Temp_Opt_Min_F',
    # 'Temp_Opt_Max_F',
    # 'Temp_Abs_Min',
    # 'Temp_Abs_Max',
    # 'Temp_Abs_Min_F',
    # 'Temp_Abs_Max_F',
    # 'Killing_Temp_Rest',
    # 'Killing_Temp_Growth',
    # 'Killing_Temp_Rest_F',
    # 'Killing_Temp_Growth_F',
    # 'Rain_Opt_Min',
    # 'Rain_Opt_Max',
    # 'Rain_Abs_Min',
    # 'Rain_Abs_Max',
    # 'Lat_Opt_Min',
    # 'Lat_Opt_Max',
    # 'Lat_Abs_Min',
    # 'Lat_Abs_Max',
    # 'Alt_Opt_Min',
    # 'Alt_Opt_Max',
    # 'Alt_Abs_Min',
    # 'Alt_Abs_Max',
    'pH_Opt_Min',
    'pH_Opt_Max',
    'pH_Abs_Min',
    'pH_Abs_Max',
    # 'Light_Opt_Min',
    # 'Light_Opt_Max',
    # 'Light_Abs_Min',
    # 'Light_Abs_Max',
    # 'Depth_Opt',
    # 'Depth_Abs',
    # 'Texture_Ops',
    # 'Texture_Abs',
    # 'Fertility_Ops',
    # 'Fertility_Abs',
    # 'Al_Toxicity_Opt',
    # 'Al_Toxicity_Abs',
    # 'Salinity_Ops',
    # 'Salinity_Abs',
    # 'Drainage_Opt',
    # 'Drainage_Abs',
    # 'Photoperiod',
    # 'Abiotic_Tolererance',
    # 'Abiotic_Susceptibility',
    # 'Introduction_Risks',
    # 'Production_System',
    # 'Cropping_System',
    # 'Subsystem',
    # 'Companion_Species',
    # 'Level_of_Mechanization',
    # 'Labour_Intensity',
    # 'Crop_Cycle_Min',
    # 'Crop_Cycle_Max',
    # 'Use_Main',
    # 'Use_Detailed',
    # 'Use_Part',
    # 'Climate_Zone_Trewartha',
    # 'USDA_Hardiness_Zone',
    # 'USDA_Hardiness_Zone_Min',
    # 'USDA_Hardiness_Zone_Max',
    # 'Datasheet_URL',
    # 'PFAF_URL'
]

In [8]:
crops = ecocrop_all[crop_columns]

In [9]:
crops

Unnamed: 0,Crop_Code,Scientific_Name,Genus,Species,pH_Opt_Min,pH_Opt_Max,pH_Abs_Min,pH_Abs_Max
0,289,Abelmoschus esculentus,Abelmoschus,esculentus,5.500000,7.000000,4.500000,8.700000
1,290,Abelmoschus manihot,Abelmoschus,manihot,5.500000,7.000000,5.000000,8.000000
2,291,Abelmoschus moschatus,Abelmoschus,moschatus,5.500000,7.000000,5.000000,8.000000
3,295,Acacia auriculiformis,Acacia,auriculiformis,6.000000,8.000000,3.000000,9.500000
4,297,Acacia farnesiana,Acacia,farnesiana,6.500000,7.000000,4.300000,8.000000
...,...,...,...,...,...,...,...,...
2563,400000,Chamaecrista rotundifolia,Chamaecrista,rotundifolia,5.000000,6.500000,4.500000,7.000000
2564,400001,Acacia polyacantha,Acacia,polyacantha,5.766304,7.021739,5.041304,7.698913
2565,400002,Prosopis affinis,Prosopis,affinis,5.900000,7.050000,5.300000,7.770000
2566,400003,Vicia dasycarpa,Vicia,dasycarpa,6.227273,7.090909,5.063636,8.072727


In [10]:
crops.columns

Index(['Crop_Code', 'Scientific_Name', 'Genus', 'Species', 'pH_Opt_Min',
       'pH_Opt_Max', 'pH_Abs_Min', 'pH_Abs_Max'],
      dtype='object')

## look up soil data

In [11]:
! ls '../data/soil/wss_gsmsoil_US_[2016-10-13]/spatial/gsmsoilmu_a_us.shp'
# ! ls  '../data/soil/pH/'

../data/soil/wss_gsmsoil_US_[2016-10-13]/spatial/gsmsoilmu_a_us.shp


In [12]:
# soil_data_folder = '../data/soil/pH/'
soil_data_folder = '../data/soil/wss_gsmsoil_US_[2016-10-13]/spatial/'

soil_filenames = [   
    # 'phgrid.e00',
    'gsmsoilmu_a_us.shp',
    # 'mu_ph.e00',     
]

In [13]:
soil_filepaths = [f'{soil_data_folder}{filename}' for filename in soil_filenames ]
soil_filepath = soil_filepaths[0]
soil_filepath

'../data/soil/wss_gsmsoil_US_[2016-10-13]/spatial/gsmsoilmu_a_us.shp'

In [14]:
# Open the shapefile using GDAL
data_source = ogr.Open(soil_filepath)

# Check the number of layers
layer_count = data_source.GetLayerCount()
print(f"Number of layers: {layer_count}")

# Iterate through layers and print their names
for i in range(layer_count):
    layer = data_source.GetLayerByIndex(i)
    print(f"Layer {i}: {layer.GetName()}")

Number of layers: 1
Layer 0: gsmsoilmu_a_us




In [15]:
soil_gdf = gpd.read_file(soil_filepath)
soil_gdf

Unnamed: 0,AREASYMBOL,SPATIALVER,MUSYM,MUKEY,geometry
0,US,3,s6280,671321,"POLYGON ((-98.38087 34.83919, -98.38163 34.838..."
1,US,3,s6198,671239,"POLYGON ((-99.77481 34.82467, -99.77134 34.824..."
2,US,3,s6301,671342,"POLYGON ((-96.68445 34.84652, -96.68029 34.844..."
3,US,3,s7400,673361,"POLYGON ((-102.26232 34.81751, -102.2618 34.81..."
4,US,3,s4671,664806,"POLYGON ((-76.82996 34.82448, -76.83107 34.824..."
...,...,...,...,...,...
81765,US,3,s3714,669389,"POLYGON ((-95.36505 43.33629, -95.35744 43.336..."
81766,US,3,s9103,677045,"POLYGON ((-108.88566 43.32691, -108.88323 43.3..."
81767,US,3,s5996,666331,"POLYGON ((-74.89392 43.29477, -74.89079 43.297..."
81768,US,3,E26W,2790280,"POLYGON ((180 51.79107, 179.99998 51.79106, 17..."


In [16]:
soil_gdf.columns

Index(['AREASYMBOL', 'SPATIALVER', 'MUSYM', 'MUKEY', 'geometry'], dtype='object')

In [17]:
# # Path to the GeoTIFF file
# geotiff_path = '../data/soil/pH/phgrid.tif'

In [18]:
# soil_filepath = '../data/soil/pH/mu_ph.e00'
# geotiff_path = '../data/soil/pH/mu_ph.tif'

# # Run the ogr2ogr command to convert the .e00 file to a GeoTIFF
# !ogr2ogr -f "GTiff" {geotiff_path} {soil_filepath}

In [19]:
# # Check if the file was created successfully
# !ls -l output.tif

In [20]:
# # Open the GeoTIFF file
# with rasterio.open(geotiff_path) as src:
#     # Read the data into a numpy array
#     data = src.read(1)  # Assuming the pH data is in the first band

#     # Display some metadata
#     print(f"Width: {src.width}, Height: {src.height}")
#     print(f"Coordinate Reference System: {src.crs}")
#     print(f"Transform: {src.transform}")
    
#     # Replace no-data values with NaN for better processing
#     data = np.where(data == -2147483647, np.nan, data)
    
#     # Print out some of the data values
#     print(data)

# # Now `data` contains the grid values, possibly including the soil pH data

In [21]:
# # soil_gdf = gpd.read_file(f'AVCE00:{soil_filepath}')
# # soil_gdf = gpd.read_file(soil_filepath, driver='AVCE00')
# # Open the E00 file using GDAL
# driver = ogr.GetDriverByName('AVCE00')
# driver

In [22]:
# e00_file_path = soil_filepath
# data_source = driver.Open(e00_file_path, 0)  # 0 means read-only mode

In [23]:
# if data_source is None:
#     raise RuntimeError(f"Failed to open file: {e00_file_path}. Please ensure the file exists and is a valid E00 file.")

In [24]:
# # Iterate through the layers and print their names
# for i in range(data_source.GetLayerCount()):
#     layer = data_source.GetLayerByIndex(i)
#     print(f"Layer {i}: {layer.GetName()}")

#     # Convert to a GeoDataFrame for further processing
#     gdf = gpd.GeoDataFrame.from_features(layer)
#     print(gdf.head())

In [25]:
# Path to your decompressed .e00 file
file_path = '../data/soil/pH/mu_ph.e00'



# Initialize lists to store the data
muid_list = []
ph_data = {f'L{i}_PH': [] for i in range(1, 12)}  # Assuming up to L11_PH

# Function to clean and convert pH values
def clean_ph_value(value):
    try:
        return float(value)
    except ValueError:
        # Attempt to clean up and convert incomplete scientific notation
        if 'E+' in value:
            value = value.replace('E+', 'E+0')
        elif 'E-' in value:
            value = value.replace('E-', 'E-0')
        return float(value)

# Read the entire content of the file
with open(file_path, 'r') as file:
    content = file.read()

# Regex pattern to find the start of each line (two capitals followed by three digits)
pattern = re.compile(r'([A-Z]{2}\d{3})')

# Split the content based on the pattern and reinsert the pattern at the beginning of each line
lines = pattern.split(content)

# Reassemble lines to ensure they start with the pattern
lines = [lines[i] + lines[i+1] for i in range(1, len(lines) - 1, 2)]

# Strip any leading/trailing whitespace
lines = [line.strip() for line in lines]

# Print the last 20 lines to verify
print(lines[-20:])

['WY346  10480 6.3000002E+00 6.3000002E+00 6.3000002E+00 6.3000002E+00 6.4000001E+\n00 6.5000000E+00 6.5000000E+00 6.4000001E+00 6.6999998E+00 7.3000002E+00 0.00000\n00E+00', 'WY347  10481 7.0999999E+00 7.0999999E+00 7.1999998E+00 7.0000000E+00 7.0000000E+\n00 7.0000000E+00 7.5000000E+00 7.5999999E+00 7.5999999E+00 7.5999999E+00 0.00000\n00E+00', 'WY348  10482 8.1999998E+00 8.1999998E+00 8.3000002E+00 8.5000000E+00 8.6000004E+\n00 8.6000004E+00 8.6000004E+00 8.6000004E+00 8.5000000E+00 8.3999996E+00 0.00000\n00E+00', 'WY349  10483 7.8000002E+00 7.8000002E+00 7.9000001E+00 8.0000000E+00 8.1000004E+\n00 8.3000002E+00 8.3000002E+00 8.3999996E+00 8.3999996E+00 8.3999996E+00 0.00000\n00E+00', 'WY350  10484 7.8000002E+00 7.8000002E+00 8.0000000E+00 8.1999998E+00 8.3000002E+\n00 8.3999996E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.00000\n00E+00', 'WY351  10485 7.0000000E+00 7.0000000E+00 7.0000000E+00 7.0000000E+00 7.1999998E+\n00 7.1999998E+00 7.3000002E+00 7.4000001E+00 7

In [26]:
for line in lines:
    print(line)
    print('***')

AL001      1 5.0000000E+00 5.0000000E+00 5.0000000E+00 5.0000000E+00 5.0000000E+
00 5.0000000E+00 5.0000000E+00 5.0000000E+00 5.0000000E+00 4.9000001E+00 5.00000
00E+00
***
AL002      2 5.3000002E+00 5.3000002E+00 5.3000002E+00 5.1999998E+00 5.1999998E+
00 5.1999998E+00 5.1999998E+00 5.0999999E+00 5.0999999E+00 5.0999999E+00 5.00000
00E+00
***
AL003      3 5.0999999E+00 5.0999999E+00 5.0999999E+00 5.0999999E+00 5.0999999E+
00 5.0999999E+00 5.0999999E+00 5.0999999E+00 5.0999999E+00 5.0999999E+00 5.00000
00E+00
***
AL004      4 5.0000000E+00 5.0000000E+00 5.0000000E+00 5.0000000E+00 5.0000000E+
00 5.0000000E+00 5.0000000E+00 5.0000000E+00 5.0000000E+00 4.9000001E+00 5.00000
00E+00
***
AL005      5 5.3000002E+00 5.3000002E+00 5.3000002E+00 5.3000002E+00 5.3000002E+
00 5.3000002E+00 5.3000002E+00 5.3000002E+00 5.3000002E+00 5.5000000E+00 0.00000
00E+00
***
AL006      6 5.3000002E+00 5.3000002E+00 5.3000002E+00 5.3000002E+00 5.3000002E+
00 5.3000002E+00 5.3000002E+00 5.3000002E+00 5.3000002

In [27]:
len(lines)

10451

In [28]:
lines[-20:]

['WY346  10480 6.3000002E+00 6.3000002E+00 6.3000002E+00 6.3000002E+00 6.4000001E+\n00 6.5000000E+00 6.5000000E+00 6.4000001E+00 6.6999998E+00 7.3000002E+00 0.00000\n00E+00',
 'WY347  10481 7.0999999E+00 7.0999999E+00 7.1999998E+00 7.0000000E+00 7.0000000E+\n00 7.0000000E+00 7.5000000E+00 7.5999999E+00 7.5999999E+00 7.5999999E+00 0.00000\n00E+00',
 'WY348  10482 8.1999998E+00 8.1999998E+00 8.3000002E+00 8.5000000E+00 8.6000004E+\n00 8.6000004E+00 8.6000004E+00 8.6000004E+00 8.5000000E+00 8.3999996E+00 0.00000\n00E+00',
 'WY349  10483 7.8000002E+00 7.8000002E+00 7.9000001E+00 8.0000000E+00 8.1000004E+\n00 8.3000002E+00 8.3000002E+00 8.3999996E+00 8.3999996E+00 8.3999996E+00 0.00000\n00E+00',
 'WY350  10484 7.8000002E+00 7.8000002E+00 8.0000000E+00 8.1999998E+00 8.3000002E+\n00 8.3999996E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.00000\n00E+00',
 'WY351  10485 7.0000000E+00 7.0000000E+00 7.0000000E+00 7.0000000E+00 7.1999998E+\n00 7.1999998E+00 7.3000002E+00 7.4000001E

In [29]:
# muid_list

In [30]:
ph_data.keys()

dict_keys(['L1_PH', 'L2_PH', 'L3_PH', 'L4_PH', 'L5_PH', 'L6_PH', 'L7_PH', 'L8_PH', 'L9_PH', 'L10_PH', 'L11_PH'])

In [31]:
ph_data['L11_PH'][:20]

[]

In [32]:
len(ph_data['L1_PH'])

0

In [33]:
len(muid_list)

0

In [34]:
soil_gdf

Unnamed: 0,AREASYMBOL,SPATIALVER,MUSYM,MUKEY,geometry
0,US,3,s6280,671321,"POLYGON ((-98.38087 34.83919, -98.38163 34.838..."
1,US,3,s6198,671239,"POLYGON ((-99.77481 34.82467, -99.77134 34.824..."
2,US,3,s6301,671342,"POLYGON ((-96.68445 34.84652, -96.68029 34.844..."
3,US,3,s7400,673361,"POLYGON ((-102.26232 34.81751, -102.2618 34.81..."
4,US,3,s4671,664806,"POLYGON ((-76.82996 34.82448, -76.83107 34.824..."
...,...,...,...,...,...
81765,US,3,s3714,669389,"POLYGON ((-95.36505 43.33629, -95.35744 43.336..."
81766,US,3,s9103,677045,"POLYGON ((-108.88566 43.32691, -108.88323 43.3..."
81767,US,3,s5996,666331,"POLYGON ((-74.89392 43.29477, -74.89079 43.297..."
81768,US,3,E26W,2790280,"POLYGON ((180 51.79107, 179.99998 51.79106, 17..."


In [35]:
# Path to your .gdb folder
gdb_path = "../data/soil/gNATSGO_CONUS/gNATSGO_CONUS.gdb"  # Replace with your .gdb folder path

# List the layers available in the .gdb file
layers = fiona.listlayers(gdb_path)
print("Available layers:", layers)

Available layers: ['chaashto', 'chconsistence', 'chdesgnsuffix', 'chfrags', 'chorizon', 'chpores', 'chstruct', 'chstructgrp', 'chtext', 'chtexture', 'chtexturegrp', 'chtexturemod', 'chunified', 'cocanopycover', 'cocropyld', 'codiagfeatures', 'coecoclass', 'coeplants', 'coerosionacc', 'coforprod', 'coforprodo', 'cogeomordesc', 'cohydriccriteria', 'cointerp', 'comonth', 'component', 'copm', 'copmgrp', 'copwindbreak', 'corestrictions', 'cosoilmoist', 'cosoiltemp', 'cosurffrags', 'cosurfmorphgc', 'cosurfmorphhpp', 'cosurfmorphmr', 'cosurfmorphss', 'cotaxfmmin', 'cotaxmoistcl', 'cotext', 'cotreestomng', 'cotxfmother', 'distinterpmd', 'distlegendmd', 'distmd', 'featdesc', 'laoverlap', 'legend', 'legendtext', 'mapunit', 'month', 'muaggatt', 'muaoverlap', 'mucropyld', 'mutext', 'sacatalog', 'sainterp', 'sdvalgorithm', 'sdvattribute', 'sdvfolder', 'sdvfolderattribute', 'mdstatdomdet', 'mdstatdommas', 'mdstatidxdet', 'mdstatidxmas', 'mdstatrshipdet', 'mdstatrshipmas', 'mdstattabcols', 'mdstattab

In [36]:
candidate_layers = [
    'chaashto',
    'chconsistence',
    'chdesgnsuffix',
    'chfrags',
    'chorizon',
    'chpores',
    'chstruct',
    'chstructgrp',
    'chtext',
    'chtexture',
    'chtexturegrp',
    'chtexturemod',
    'chunified',
    'cocanopycover',
    'cocropyld',
    'codiagfeatures',
    'coecoclass',
    'coeplants',
    'coerosionacc',
    'coforprod',
    'coforprodo',
    'cogeomordesc',
    'cohydriccriteria',
    'cointerp',
    'comonth',
    'component',
    'copm',
    'copmgrp',
    'copwindbreak',
    'corestrictions',
    'cosoilmoist',
    'cosoiltemp',
    'cosurffrags',
    'cosurfmorphgc',
    'cosurfmorphhpp',
    'cosurfmorphmr',
    'cosurfmorphss',
    'cotaxfmmin',
    'cotaxmoistcl',
    'cotext',
    'cotreestomng',
    'cotxfmother',
    'distinterpmd',
    'distlegendmd',
    'distmd',
    'featdesc',
    'laoverlap',
    'legend',
    'legendtext',
    'mapunit',
    'month',
    'muaggatt',
    'muaoverlap',
    'mucropyld',
    'mutext',
    'sacatalog',
    'sainterp',
    'sdvalgorithm',
    'sdvattribute',
    'sdvfolder',
    'sdvfolderattribute',
    'mdstatdomdet',
    'mdstatdommas',
    'mdstatidxdet',
    'mdstatidxmas',
    'mdstatrshipdet',
    'mdstatrshipmas',
    'mdstattabcols',
    'mdstattabs',
    'SAPOLYGON',
    'DominantComponent'
]

In [37]:
# list(candidate_layers)

In [38]:
# List of candidate layers to inspect
candidate_layers = ['chorizon', 'component', 'muaggatt']

# dataframes = []
for layer in candidate_layers:
    print(f"Inspecting layer: {layer}")
    dataframe_name = f'{layer}_gdf'
    dataframe_name = gpd.read_file(gdb_path, layer=layer)
    print(dataframe_name.columns)
    print(dataframe_name.head(), "\n")
    # dataframes.append(dataframe_name)

Inspecting layer: chorizon
Index(['hzname', 'desgndisc', 'desgnmaster', 'desgnmasterprime', 'desgnvert',
       'hzdept_l', 'hzdept_r', 'hzdept_h', 'hzdepb_l', 'hzdepb_r',
       ...
       'ph2osoluble_l', 'ph2osoluble_r', 'ph2osoluble_h', 'ptotal_l',
       'ptotal_r', 'ptotal_h', 'excavdifcl', 'excavdifms', 'cokey', 'chkey'],
      dtype='object', length=171)
    hzname  desgndisc desgnmaster desgnmasterprime  desgnvert  hzdept_l  \
0        C        NaN           C             None        2.0       NaN   
1        A        NaN           A             None        1.0       NaN   
2  C1...C5        NaN           C             None        2.0       NaN   
3       Ap        NaN           A             None        1.0       NaN   
4       Bt        NaN           B             None        2.0       NaN   

   hzdept_r  hzdept_h  hzdepb_l  hzdepb_r  ...  ph2osoluble_l  ph2osoluble_r  \
0        20       NaN       NaN       152  ...            NaN            NaN   
1         0       NaN   

In [41]:
# Load a specific layer into a GeoDataFrame

gdf_soil = gpd.read_file(gdb_path, layer='chorizon')  
# gdf_soil = gpd.read_parquet("../data/soil/gNATSGO_CONUS/chorizon_gdb.parquet")

gdf_soil

Unnamed: 0,hzname,desgndisc,desgnmaster,desgnmasterprime,desgnvert,hzdept_l,hzdept_r,hzdept_h,hzdepb_l,hzdepb_r,...,ph2osoluble_l,ph2osoluble_r,ph2osoluble_h,ptotal_l,ptotal_r,ptotal_h,excavdifcl,excavdifms,cokey,chkey
0,C,,C,,2.0,,20,,,152,...,,,,,,,,,24629637,73203533
1,A,,A,,1.0,,0,,,20,...,,,,,,,,,24629637,73203532
2,C1...C5,,C,,2.0,,20,,,152,...,,,,,,,,,24630059,73204462
3,Ap,,A,,1.0,,0,,,20,...,,,,,,,,,24630059,73204461
4,Bt,,B,,2.0,,13,,,23,...,,,,,,,,,24630061,73204464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3563603,2A,2.0,A,,,118.0,168,200.0,148.0,179,...,,,,,,,,,3314515:2808111,3314515:11333113
3563604,2Bt,2.0,B,,,148.0,179,200.0,200.0,200,...,,,,,,,,,3314515:2808111,3314515:11333112
3563605,Oi,,O,,,0.0,0,0.0,0.0,30,...,,,,,,,,,3314515:2808112,3314515:11333115
3563606,Oa,,O,,,0.0,30,46.0,30.0,91,...,,,,,,,,,3314515:2808112,3314515:11333116


In [42]:
list(gdf_soil.columns)

['hzname',
 'desgndisc',
 'desgnmaster',
 'desgnmasterprime',
 'desgnvert',
 'hzdept_l',
 'hzdept_r',
 'hzdept_h',
 'hzdepb_l',
 'hzdepb_r',
 'hzdepb_h',
 'hzthk_l',
 'hzthk_r',
 'hzthk_h',
 'fraggt10_l',
 'fraggt10_r',
 'fraggt10_h',
 'frag3to10_l',
 'frag3to10_r',
 'frag3to10_h',
 'sieveno4_l',
 'sieveno4_r',
 'sieveno4_h',
 'sieveno10_l',
 'sieveno10_r',
 'sieveno10_h',
 'sieveno40_l',
 'sieveno40_r',
 'sieveno40_h',
 'sieveno200_l',
 'sieveno200_r',
 'sieveno200_h',
 'sandtotal_l',
 'sandtotal_r',
 'sandtotal_h',
 'sandvc_l',
 'sandvc_r',
 'sandvc_h',
 'sandco_l',
 'sandco_r',
 'sandco_h',
 'sandmed_l',
 'sandmed_r',
 'sandmed_h',
 'sandfine_l',
 'sandfine_r',
 'sandfine_h',
 'sandvf_l',
 'sandvf_r',
 'sandvf_h',
 'silttotal_l',
 'silttotal_r',
 'silttotal_h',
 'siltco_l',
 'siltco_r',
 'siltco_h',
 'siltfine_l',
 'siltfine_r',
 'siltfine_h',
 'claytotal_l',
 'claytotal_r',
 'claytotal_h',
 'claysizedcarb_l',
 'claysizedcarb_r',
 'claysizedcarb_h',
 'om_l',
 'om_r',
 'om_h',
 'dbte

In [43]:
candidate_layers = [
    'mapunit',
    'muaggatt',
    'muaoverlap',
    'mucropyld',
    'mutext',
    'SAPOLYGON',
]
for layer in candidate_layers:
    print(f"Inspecting layer: {layer}")
    dataframe_name = f'{layer}_gdf'
    dataframe_name = gpd.read_file(gdb_path, layer=layer)
    print(dataframe_name.columns)
    print(dataframe_name.head(), "\n")
    # dataframes.append(dataframe_name)

Inspecting layer: mapunit
Index(['musym', 'muname', 'mukind', 'mustatus', 'muacres', 'mapunitlfw_l',
       'mapunitlfw_r', 'mapunitlfw_h', 'mapunitpfa_l', 'mapunitpfa_r',
       'mapunitpfa_h', 'farmlndcl', 'muhelcl', 'muwathelcl', 'muwndhelcl',
       'interpfocus', 'invesintens', 'iacornsr', 'nhiforsoigrp', 'nhspiagr',
       'vtsepticsyscl', 'mucertstat', 'lkey', 'mukey'],
      dtype='object')
  musym                                             muname        mukind  \
0  1030                    Udorthents-Pits, gravel complex       Complex   
1  1033                            Beaches-Menahga complex       Complex   
2  1059                                     Wega silt loam  Consociation   
3  1066            Rock outcrop-Garnes complex, very stony       Complex   
4  1067  Waupaca-Eutroboralfs complex, 0 to 60 percent ...   Association   

  mustatus  muacres  mapunitlfw_l  mapunitlfw_r  mapunitlfw_h  mapunitpfa_l  \
0     None    785.0           NaN           NaN           NaN 

  return ogr_read(


Index(['AREASYMBOL', 'SPATIALVER', 'LKEY', 'Shape_Length', 'Shape_Area',
       'SOURCE', 'geometry'],
      dtype='object')
  AREASYMBOL  SPATIALVER   LKEY   Shape_Length    Shape_Area  SOURCE  \
0      CO636         3.0  14371  148538.744211  3.062635e+08  SSURGO   
1      KS145        11.0  10599  194571.262320  1.954684e+09  SSURGO   
2      CO099        13.0  10655  262716.959114  4.258098e+09  SSURGO   
3      CO011        12.0  10644  254960.334741  3.991455e+09  SSURGO   
4      CO089        11.0  10653  265676.565583  3.286150e+09  SSURGO   

                                            geometry  
0  MULTIPOLYGON (((-778418.1 1630725.1, -778446.3...  
1  MULTIPOLYGON (((-262470.1 1706056.6, -262483.9...  
2  MULTIPOLYGON (((-523488.3 1709604.5, -523532.5...  
3  MULTIPOLYGON (((-583830.5 1713440.2, -583877.8...  
4  MULTIPOLYGON (((-649477.6 1718195.3, -647923 1...   



In [44]:
list(gdf_soil.columns)

['hzname',
 'desgndisc',
 'desgnmaster',
 'desgnmasterprime',
 'desgnvert',
 'hzdept_l',
 'hzdept_r',
 'hzdept_h',
 'hzdepb_l',
 'hzdepb_r',
 'hzdepb_h',
 'hzthk_l',
 'hzthk_r',
 'hzthk_h',
 'fraggt10_l',
 'fraggt10_r',
 'fraggt10_h',
 'frag3to10_l',
 'frag3to10_r',
 'frag3to10_h',
 'sieveno4_l',
 'sieveno4_r',
 'sieveno4_h',
 'sieveno10_l',
 'sieveno10_r',
 'sieveno10_h',
 'sieveno40_l',
 'sieveno40_r',
 'sieveno40_h',
 'sieveno200_l',
 'sieveno200_r',
 'sieveno200_h',
 'sandtotal_l',
 'sandtotal_r',
 'sandtotal_h',
 'sandvc_l',
 'sandvc_r',
 'sandvc_h',
 'sandco_l',
 'sandco_r',
 'sandco_h',
 'sandmed_l',
 'sandmed_r',
 'sandmed_h',
 'sandfine_l',
 'sandfine_r',
 'sandfine_h',
 'sandvf_l',
 'sandvf_r',
 'sandvf_h',
 'silttotal_l',
 'silttotal_r',
 'silttotal_h',
 'siltco_l',
 'siltco_r',
 'siltco_h',
 'siltfine_l',
 'siltfine_r',
 'siltfine_h',
 'claytotal_l',
 'claytotal_r',
 'claytotal_h',
 'claysizedcarb_l',
 'claysizedcarb_r',
 'claysizedcarb_h',
 'om_l',
 'om_r',
 'om_h',
 'dbte

In [46]:
pH_columns_to_keep = [
    'hzname',
    'desgndisc',
    'desgnmaster',
    # 'desgnmasterprime',
    # 'desgnvert',
    # 'hzdept_l',
    # 'hzdept_r',
    # 'hzdept_h',
    # 'hzdepb_l',
    # 'hzdepb_r',
    # 'hzdepb_h',
    # 'hzthk_l',
    # 'hzthk_r',
    # 'hzthk_h',
    # 'fraggt10_l',
    # 'fraggt10_r',
    # 'fraggt10_h',
    # 'frag3to10_l',
    # 'frag3to10_r',
    # 'frag3to10_h',
    # 'sieveno4_l',
    # 'sieveno4_r',
    # 'sieveno4_h',
    # 'sieveno10_l',
    # 'sieveno10_r',
    # 'sieveno10_h',
    # 'sieveno40_l',
    # 'sieveno40_r',
    # 'sieveno40_h',
    # 'sieveno200_l',
    # 'sieveno200_r',
    # 'sieveno200_h',
    # 'sandtotal_l',
    # 'sandtotal_r',
    # 'sandtotal_h',
    # 'sandvc_l',
    # 'sandvc_r',
    # 'sandvc_h',
    # 'sandco_l',
    # 'sandco_r',
    # 'sandco_h',
    # 'sandmed_l',
    # 'sandmed_r',
    # 'sandmed_h',
    # 'sandfine_l',
    # 'sandfine_r',
    # 'sandfine_h',
    # 'sandvf_l',
    # 'sandvf_r',
    # 'sandvf_h',
    # 'silttotal_l',
    # 'silttotal_r',
    # 'silttotal_h',
    # 'siltco_l',
    # 'siltco_r',
    # 'siltco_h',
    # 'siltfine_l',
    # 'siltfine_r',
    # 'siltfine_h',
    # 'claytotal_l',
    # 'claytotal_r',
    # 'claytotal_h',
    # 'claysizedcarb_l',
    # 'claysizedcarb_r',
    # 'claysizedcarb_h',
    # 'om_l',
    # 'om_r',
    # 'om_h',
    # 'dbtenthbar_l',
    # 'dbtenthbar_r',
    # 'dbtenthbar_h',
    # 'dbthirdbar_l',
    # 'dbthirdbar_r',
    # 'dbthirdbar_h',
    # 'dbfifteenbar_l',
    # 'dbfifteenbar_r',
    # 'dbfifteenbar_h',
    # 'dbovendry_l',
    # 'dbovendry_r',
    # 'dbovendry_h',
    # 'partdensity',
    # 'ksat_l',
    # 'ksat_r',
    # 'ksat_h',
    # 'awc_l',
    # 'awc_r',
    # 'awc_h',
    # 'wtenthbar_l',
    # 'wtenthbar_r',
    # 'wtenthbar_h',
    # 'wthirdbar_l',
    # 'wthirdbar_r',
    # 'wthirdbar_h',
    # 'wfifteenbar_l',
    # 'wfifteenbar_r',
    # 'wfifteenbar_h',
    # 'wsatiated_l',
    # 'wsatiated_r',
    # 'wsatiated_h',
    # 'lep_l',
    # 'lep_r',
    # 'lep_h',
    # 'll_l',
    # 'll_r',
    # 'll_h',
    # 'pi_l',
    # 'pi_r',
    # 'pi_h',
    # 'aashind_l',
    # 'aashind_r',
    # 'aashind_h',
    # 'kwfact',
    # 'kffact',
    # 'caco3_l',
    # 'caco3_r',
    # 'caco3_h',
    # 'gypsum_l',
    # 'gypsum_r',
    # 'gypsum_h',
    # 'sar_l',
    # 'sar_r',
    # 'sar_h',
    # 'ec_l',
    # 'ec_r',
    # 'ec_h',
    # 'cec7_l',
    # 'cec7_r',
    # 'cec7_h',
    # 'ecec_l',
    # 'ecec_r',
    # 'ecec_h',
    # 'sumbases_l',
    # 'sumbases_r',
    # 'sumbases_h',
    'ph1to1h2o_l',
    'ph1to1h2o_r',
    'ph1to1h2o_h',
    'ph01mcacl2_l',
    'ph01mcacl2_r',
    'ph01mcacl2_h',
    # 'freeiron_l',
    # 'freeiron_r',
    # 'freeiron_h',
    # 'feoxalate_l',
    # 'feoxalate_r',
    # 'feoxalate_h',
    # 'extracid_l',
    # 'extracid_r',
    # 'extracid_h',
    # 'extral_l',
    # 'extral_r',
    # 'extral_h',
    # 'aloxalate_l',
    # 'aloxalate_r',
    # 'aloxalate_h',
    # 'pbray1_l',
    # 'pbray1_r',
    # 'pbray1_h',
    # 'poxalate_l',
    # 'poxalate_r',
    # 'poxalate_h',
    # 'ph2osoluble_l',
    # 'ph2osoluble_r',
    # 'ph2osoluble_h',
    # 'ptotal_l',
    # 'ptotal_r',
    # 'ptotal_h',
    # 'excavdifcl',
    # 'excavdifms',
    'cokey',
    'chkey'   
]

In [47]:
gdf_soil[pH_columns_to_keep]

Unnamed: 0,hzname,desgndisc,desgnmaster,ph1to1h2o_l,ph1to1h2o_r,ph1to1h2o_h,ph01mcacl2_l,ph01mcacl2_r,ph01mcacl2_h,cokey,chkey
0,C,,C,5.6,6.7,7.8,,,,24629637,73203533
1,A,,A,4.5,5.5,6.5,,,,24629637,73203532
2,C1...C5,,C,5.6,7.0,8.4,,,,24630059,73204462
3,Ap,,A,5.6,6.7,7.8,,,,24630059,73204461
4,Bt,,B,6.6,7.2,7.8,,,,24630061,73204464
...,...,...,...,...,...,...,...,...,...,...,...
3563603,2A,2.0,A,5.1,6.0,6.5,,,,3314515:2808111,3314515:11333113
3563604,2Bt,2.0,B,5.1,6.5,7.3,,,,3314515:2808111,3314515:11333112
3563605,Oi,,O,,,,3.5,4.0,4.5,3314515:2808112,3314515:11333115
3563606,Oa,,O,,,,3.5,4.0,4.5,3314515:2808112,3314515:11333116


In [48]:
# list(gdf_soil[columns_to_keep]['desgndisc'].unique())
gdf_soil[pH_columns_to_keep]['ph1to1h2o_l'].unique()

array([ 5.6,  4.5,  6.6,  6.1,  7.4,  5.1,  nan,  5.5,  3.5,  3.6,  5.8,
        7.9,  7.5,  6.2,  7.8,  7. ,  6.4,  6.8,  7.6,  6. ,  5.2,  6.5,
        5. ,  6.3,  7.1,  7.2,  8. ,  4.8,  5.4,  4.2,  4.4,  4. ,  4.3,
        4.9,  4.6,  5.3,  4.7,  7.3,  6.9,  8.2,  8.4,  8.6,  7.7,  8.5,
        5.7,  5.9,  3.8,  9.1,  6.7,  3. ,  9. ,  8.8,  8.1,  3.2,  3.7,
        4.1,  3.4,  3.9,  3.3,  2.5,  2.7,  3.1,  2.1,  9.5,  2.6,  2.8,
        2. ,  8.3,  8.7,  8.9,  9.2,  1.8,  9.4, 10. ,  2.9,  1. ,  2.2,
        9.3,  1.9,  9.6,  9.9, 10.2], dtype=float32)

In [49]:
gdf_soil[pH_columns_to_keep]['ph1to1h2o_r'].unique()

array([ 6.7,  5.5,  7. ,  7.2,  7.9,  7.5,  5.9,  5.6,  6.6,  6.1,  8. ,
        nan,  6.5,  7.3,  5.4,  5. ,  6.2,  7.6,  5.8,  6. ,  6.3,  8.2,
        7.4,  7.8,  8.3,  6.4,  5.3,  4.6,  4.8,  4.3,  4.5,  4. ,  5.1,
        5.2,  5.7,  4.1,  6.9,  6.8,  8.4,  7.7,  7.1,  8.1,  4.4,  4.7,
        4.9,  8.5,  8.6,  4.2,  8.7,  9. ,  8.8,  9.1,  9.2,  9.3,  8.9,
       10.1,  9.4,  9.6,  3.9,  3.8,  3.6,  9.8, 10. ,  9.5,  3.5,  3.3,
        3.7,  3.1,  2.8,  3.2, 10.7, 10.8, 10.2,  3.4,  9.7,  2. ,  2.9,
       10.3,  9.9, 10.5,  3. , 10.6,  2.7, 10.4], dtype=float32)

In [50]:
gdf_soil[pH_columns_to_keep]['ph1to1h2o_h'].unique()

array([ 7.8,  6.5,  8.4,  7.3,  6. ,  nan,  5.5,  5. ,  4.4,  4.5,  7. ,
        9. ,  8.2,  6.8,  7.4,  7.2,  8. ,  8.5,  7.6,  7.5,  5.9,  6.6,
        7.7,  6.2,  5.8,  5.6,  6.4,  6.3,  6.1,  5.7,  6.7,  7.1,  5.4,
        6.9,  8.6,  8.3,  8.8,  8.1,  9.6, 10. ,  9.4, 11. ,  7.9,  8.7,
        4.7,  5.2,  5.3,  4.8,  8.9,  9.5,  9.3,  9.2,  4.6,  4.9,  5.1,
        4.1,  4.2,  9.1, 10.5,  4. ,  3.8,  3.6,  9.9,  9.8,  9.7, 10.1,
        3.4, 10.4, 10.2,  4.3, 10.8, 10.3,  3.5,  3.7,  3.2, 10.7, 10.6],
      dtype=float32)

In [51]:
gdf_soil[pH_columns_to_keep]['ph01mcacl2_l'].unique()

array([nan, 4.5, 5.6, 5.1, 3.5, 3. , 6.6, 3.6, 6.1, 5.4, 4. , 6. , 5.3,
       5. , 4.7, 4.4, 4.2, 4.6, 5.5, 7.4, 7.2, 7.9, 7.1, 6.3, 7. , 6.5,
       7.5, 6.9, 7.3, 7.7, 8.5, 6.2, 6.8, 6.4, 7.8, 8. , 3.7, 4.1, 8.6,
       8.3, 5.7, 2. , 7.6, 2.5, 6.7, 8.2, 4.9, 3.8, 2.9, 2.7, 3.4, 3.1,
       4.8, 8.4, 3.2, 3.3, 3.9, 4.3, 5.2, 5.8, 8.1, 5.9, 1.8, 9. , 8.7,
       2.8, 1.5, 2.3, 2.4, 1.9, 2.6], dtype=float32)

In [52]:
gdf_soil[pH_columns_to_keep]['ph01mcacl2_r'].unique()

array([ nan,  6.2,  7. ,  6.7,  6.1,  4.6,  5.1,  5.4,  5. ,  5.7,  5.5,
        6.4,  4. ,  6.5,  7.2,  7.5,  4.3,  5.3,  4.8,  4.5,  6. ,  4.7,
        5.8,  5.2,  4.9,  5.6,  6.3,  7.8,  7.9,  6.8,  7.6,  7.1,  6.9,
        7.4,  8.1,  8.5,  8. ,  7.7,  6.6,  8.2,  7.3,  8.3,  5.9,  8.4,
        8.9,  3.9,  4.1,  4.2,  3.5,  8.8,  8.6,  9. ,  8.7,  3.2,  3. ,
        2. ,  3.1,  4.4,  3.8,  3.7,  3.3,  3.4,  3.6,  2.4,  9.2,  2.8,
        2.7,  2.9,  2.6,  9.1,  9.5,  9.6,  9.4,  9.7, 11. ,  2.3,  2.2,
        1.8,  1. ], dtype=float32)

In [53]:
gdf_soil[pH_columns_to_keep]['ph01mcacl2_h'].unique()

array([ nan,  7.8,  8.4,  7.3,  6. ,  6.5,  4.5,  5. ,  5.5,  6.2,  5.8,
        7.1,  5.9,  6.1,  6.7,  7. ,  9. ,  8.1,  8. ,  7.5,  8.5,  7.9,
        7.6,  8.6, 11. ,  6.3,  7.7,  6.9,  8.2,  8.3,  9.5,  4. ,  4.4,
        4.6,  6.8,  7.2,  6.6,  4.8,  4.9,  9.1,  8.7,  3.7,  5.6,  7.4,
        9.6,  8.8,  5.1,  5.2,  5.4,  5.3,  4.7,  6.4,  4.2,  3.5,  5.7,
        8.9, 10.2,  3.3, 10.5,  3.9,  9.3, 10. ,  3.8,  4.1,  4.3, 10.4,
        9.4,  3.4,  9.9,  3.6], dtype=float32)

In [54]:
mapunit_df = gpd.read_file(gdb_path, layer='mapunit')
mapunit_df

Unnamed: 0,musym,muname,mukind,mustatus,muacres,mapunitlfw_l,mapunitlfw_r,mapunitlfw_h,mapunitpfa_l,mapunitpfa_r,...,muwndhelcl,interpfocus,invesintens,iacornsr,nhiforsoigrp,nhspiagr,vtsepticsyscl,mucertstat,lkey,mukey
0,1030,"Udorthents-Pits, gravel complex",Complex,,785.0,,,,,,...,,,,,,,,,13785,398852
1,1033,Beaches-Menahga complex,Complex,,598.0,,,,,,...,,,,,,,,,13785,398853
2,1059,Wega silt loam,Consociation,,2875.0,,,,,,...,,,,,,,,,13785,398854
3,1066,"Rock outcrop-Garnes complex, very stony",Complex,,733.0,,,,,,...,,,,,,,,,13785,398855
4,1067,"Waupaca-Eutroboralfs complex, 0 to 60 percent ...",Association,,8224.0,,,,,,...,,,,,,,,,13785,398856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317725,308y0,"Ronneby-Giese, frequently ponded, complex, 0 t...",Complex,Correlated,5407.0,,,,,,...,,Wildlife Management,Order 2,,,,,"certified, all components",114876,3314511
317726,308xy,"Millward-Mora complex, 2 to 20 percent slopes,...",Complex,Correlated,577.0,,,,,,...,,Forest Management,Order 2,,,,,"certified, all components",114876,3314512
317727,308y8,"Milaca and Greysolon soils, 2 to 20 percent sl...",Undifferentiated group,Correlated,14930.0,,,,,,...,,Forest Management,Order 2,,,,,"certified, all components",114876,3314513
317728,308xm,"Milaca silt loam, 5 to 30 percent slopes, stony",Consociation,Correlated,108.0,,,,,,...,,Forest Management,Order 2,,,,,"certified, all components",114876,3314514


In [55]:
len(mapunit_df['lkey'].unique())

3220

In [56]:
SAPOLYGON_gdf = gpd.read_file(gdb_path, layer='SAPOLYGON')
SAPOLYGON_gdf

Unnamed: 0,AREASYMBOL,SPATIALVER,LKEY,Shape_Length,Shape_Area,SOURCE,geometry
0,CO636,3.0,14371,1.485387e+05,3.062635e+08,SSURGO,"MULTIPOLYGON (((-778418.1 1630725.1, -778446.3..."
1,KS145,11.0,10599,1.945713e+05,1.954684e+09,SSURGO,"MULTIPOLYGON (((-262470.1 1706056.6, -262483.9..."
2,CO099,13.0,10655,2.627170e+05,4.258098e+09,SSURGO,"MULTIPOLYGON (((-523488.3 1709604.5, -523532.5..."
3,CO011,12.0,10644,2.549603e+05,3.991455e+09,SSURGO,"MULTIPOLYGON (((-583830.5 1713440.2, -583877.8..."
4,CO089,11.0,10653,2.656766e+05,3.286150e+09,SSURGO,"MULTIPOLYGON (((-649477.6 1718195.3, -647923 1..."
...,...,...,...,...,...,...,...
16587,MN,0.0,87281,1.080000e+03,2.790000e+04,DRSS,"MULTIPOLYGON (((-46185 2497485, -46185 2497425..."
16588,MN,0.0,87281,1.020000e+03,2.700000e+04,DRSS,"MULTIPOLYGON (((-46245 2498205, -46245 2498175..."
16589,MN,0.0,87281,1.800000e+03,4.320000e+04,DRSS,"MULTIPOLYGON (((-45585 2498205, -45585 2498145..."
16590,MN,0.0,87281,9.600000e+02,1.980000e+04,DRSS,"MULTIPOLYGON (((-46095 2498265, -46095 2498235..."


In [58]:
len(SAPOLYGON_gdf['LKEY'].unique())

3187

In [59]:
# Get the unique values from both columns
sapolygon_lkey_unique = set(SAPOLYGON_gdf['LKEY'].unique())
mapunit_lkey_unique = set(mapunit_df['lkey'].unique())

# Find common keys
common_keys = sapolygon_lkey_unique.intersection(mapunit_lkey_unique)
print(f"Number of common keys: {len(common_keys)}")

# Find keys in SAPOLYGON_df but not in mapunit_df
sapolygon_not_in_mapunit = sapolygon_lkey_unique.difference(mapunit_lkey_unique)
print(f"Number of keys in SAPOLYGON but not in mapunit: {len(sapolygon_not_in_mapunit)}")

# Find keys in mapunit_df but not in SAPOLYGON_df
mapunit_not_in_sapolygon = mapunit_lkey_unique.difference(sapolygon_lkey_unique)
print(f"Number of keys in mapunit but not in SAPOLYGON: {len(mapunit_not_in_sapolygon)}")

# Optionally, print some of these keys for inspection
print("Some keys in SAPOLYGON but not in mapunit:", list(sapolygon_not_in_mapunit)[:10])
print("Some keys in mapunit but not in SAPOLYGON:", list(mapunit_not_in_sapolygon)[:10])

Number of common keys: 3187
Number of keys in SAPOLYGON but not in mapunit: 0
Number of keys in mapunit but not in SAPOLYGON: 33
Some keys in SAPOLYGON but not in mapunit: []
Some keys in mapunit but not in SAPOLYGON: ['10388', '10321', '14434', '10322', '10298', '14121', '14175', '10285', '11288', '14388']


In [61]:
# list(gdf_soil.columns)
gdf_small = gdf_soil[pH_columns_to_keep]
gdf_small

Unnamed: 0,hzname,desgndisc,desgnmaster,ph1to1h2o_l,ph1to1h2o_r,ph1to1h2o_h,ph01mcacl2_l,ph01mcacl2_r,ph01mcacl2_h,cokey,chkey
0,C,,C,5.6,6.7,7.8,,,,24629637,73203533
1,A,,A,4.5,5.5,6.5,,,,24629637,73203532
2,C1...C5,,C,5.6,7.0,8.4,,,,24630059,73204462
3,Ap,,A,5.6,6.7,7.8,,,,24630059,73204461
4,Bt,,B,6.6,7.2,7.8,,,,24630061,73204464
...,...,...,...,...,...,...,...,...,...,...,...
3563603,2A,2.0,A,5.1,6.0,6.5,,,,3314515:2808111,3314515:11333113
3563604,2Bt,2.0,B,5.1,6.5,7.3,,,,3314515:2808111,3314515:11333112
3563605,Oi,,O,,,,3.5,4.0,4.5,3314515:2808112,3314515:11333115
3563606,Oa,,O,,,,3.5,4.0,4.5,3314515:2808112,3314515:11333116


In [62]:
gdf_small.columns

Index(['hzname', 'desgndisc', 'desgnmaster', 'ph1to1h2o_l', 'ph1to1h2o_r',
       'ph1to1h2o_h', 'ph01mcacl2_l', 'ph01mcacl2_r', 'ph01mcacl2_h', 'cokey',
       'chkey'],
      dtype='object')

In [63]:
mapunit_df.columns

Index(['musym', 'muname', 'mukind', 'mustatus', 'muacres', 'mapunitlfw_l',
       'mapunitlfw_r', 'mapunitlfw_h', 'mapunitpfa_l', 'mapunitpfa_r',
       'mapunitpfa_h', 'farmlndcl', 'muhelcl', 'muwathelcl', 'muwndhelcl',
       'interpfocus', 'invesintens', 'iacornsr', 'nhiforsoigrp', 'nhspiagr',
       'vtsepticsyscl', 'mucertstat', 'lkey', 'mukey'],
      dtype='object')

In [64]:
component_df = gpd.read_file(gdb_path, layer='component')

In [65]:
list(component_df.columns)

['comppct_l',
 'comppct_r',
 'comppct_h',
 'compname',
 'compkind',
 'majcompflag',
 'otherph',
 'localphase',
 'slope_l',
 'slope_r',
 'slope_h',
 'slopelenusle_l',
 'slopelenusle_r',
 'slopelenusle_h',
 'runoff',
 'tfact',
 'wei',
 'weg',
 'erocl',
 'earthcovkind1',
 'earthcovkind2',
 'hydricon',
 'hydricrating',
 'drainagecl',
 'elev_l',
 'elev_r',
 'elev_h',
 'aspectccwise',
 'aspectrep',
 'aspectcwise',
 'geomdesc',
 'albedodry_l',
 'albedodry_r',
 'albedodry_h',
 'airtempa_l',
 'airtempa_r',
 'airtempa_h',
 'map_l',
 'map_r',
 'map_h',
 'reannualprecip_l',
 'reannualprecip_r',
 'reannualprecip_h',
 'ffd_l',
 'ffd_r',
 'ffd_h',
 'nirrcapcl',
 'nirrcapscl',
 'nirrcapunit',
 'irrcapcl',
 'irrcapscl',
 'irrcapunit',
 'cropprodindex',
 'constreeshrubgrp',
 'wndbrksuitgrp',
 'rsprod_l',
 'rsprod_r',
 'rsprod_h',
 'foragesuitgrpid',
 'wlgrain',
 'wlgrass',
 'wlherbaceous',
 'wlshrub',
 'wlconiferous',
 'wlhardwood',
 'wlwetplant',
 'wlshallowwat',
 'wlrangeland',
 'wlopenland',
 'wlwood

In [67]:
component_columns_to_keep = [
    # 'comppct_l',
    # 'comppct_r',
    # 'comppct_h',
    # 'compname',
    # 'compkind',
    # 'majcompflag',
    # 'otherph',
    # 'localphase',
    # 'slope_l',
    # 'slope_r',
    # 'slope_h',
    # 'slopelenusle_l',
    # 'slopelenusle_r',
    # 'slopelenusle_h',
    # 'runoff',
    # 'tfact',
    # 'wei',
    # 'weg',
    # 'erocl',
    # 'earthcovkind1',
    # 'earthcovkind2',
    # 'hydricon',
    # 'hydricrating',
    # 'drainagecl',
    # 'elev_l',
    # 'elev_r',
    # 'elev_h',
    # 'aspectccwise',
    # 'aspectrep',
    # 'aspectcwise',
    # 'geomdesc',
    # 'albedodry_l',
    # 'albedodry_r',
    # 'albedodry_h',
    # 'airtempa_l',
    # 'airtempa_r',
    # 'airtempa_h',
    'map_l',
    'map_r',
    'map_h',
    'reannualprecip_l',
    'reannualprecip_r',
    'reannualprecip_h',
    # 'ffd_l',
    # 'ffd_r',
    # 'ffd_h',
    # 'nirrcapcl',
    # 'nirrcapscl',
    # 'nirrcapunit',
    # 'irrcapcl',
    # 'irrcapscl',
    # 'irrcapunit',
    # 'cropprodindex',
    # 'constreeshrubgrp',
    # 'wndbrksuitgrp',
    # 'rsprod_l',
    # 'rsprod_r',
    # 'rsprod_h',
    # 'foragesuitgrpid',
    # 'wlgrain',
    # 'wlgrass',
    # 'wlherbaceous',
    # 'wlshrub',
    # 'wlconiferous',
    # 'wlhardwood',
    # 'wlwetplant',
    # 'wlshallowwat',
    # 'wlrangeland',
    # 'wlopenland',
    # 'wlwoodland',
    # 'wlwetland',
    # 'soilslippot',
    # 'frostact',
    # 'initsub_l',
    # 'initsub_r',
    # 'initsub_h',
    # 'totalsub_l',
    # 'totalsub_r',
    # 'totalsub_h',
    # 'hydgrp',
    # 'corcon',
    # 'corsteel',
    # 'taxclname',
    # 'taxorder',
    # 'taxsuborder',
    # 'taxgrtgroup',
    # 'taxsubgrp',
    # 'taxpartsize',
    # 'taxpartsizemod',
    # 'taxceactcl',
    # 'taxreaction',
    # 'taxtempcl',
    # 'taxmoistscl',
    # 'taxtempregime',
    # 'soiltaxedition',
    # 'castorieindex',
    # 'flecolcomnum',
    # 'flhe',
    # 'flphe',
    # 'flsoilleachpot',
    # 'flsoirunoffpot',
    # 'fltemik2use',
    # 'fltriumph2use',
    # 'indraingrp',
    # 'innitrateleachi',
    # 'misoimgmtgrp',
    # 'vasoimgtgrp',
    'mukey',
    'cokey'
]

In [68]:
# component_small = component_df[['mukey','cokey']]
component_small = component_df[component_columns_to_keep]
component_small[['reannualprecip_l', 'reannualprecip_r', 'reannualprecip_h']]
component_small

Unnamed: 0,map_l,map_r,map_h,reannualprecip_l,reannualprecip_r,reannualprecip_h,mukey,cokey
0,711.0,864.0,1016.0,,,,398852,24629854
1,,,,,,,398852,24629855
2,559.0,699.0,838.0,,,,398853,24629637
3,,,,,,,398853,24629638
4,,,,,,,398853,24629639
...,...,...,...,...,...,...,...,...
1131657,690.0,785.0,910.0,,,,3314514,3314514:2808079
1131658,690.0,785.0,910.0,,,,3314514,3314514:2808146
1131659,690.0,785.0,910.0,,,,3314515,3314515:2808110
1131660,690.0,785.0,910.0,,,,3314515,3314515:2808111


In [69]:
component_small['reannualprecip_l'].unique()

array([  nan,  710.,  686.,  560.,  457.,  559.,  610.,  510.,  390.,
        410.,  380.,  350.,  370.,  340.,  500.,  635.,  950.,  900.,
       1000., 1070., 1270., 1275., 1475.,  614.,  539.,  564.,  689.,
        700.,  400., 1050., 1250., 2025.,  600.,  850., 1100.,  625.,
        725.,  575.,  356.,  512.,  889.,  762., 1016., 1143.,  860.,
        694.,  711., 1397.,  668., 1168.,  660., 1600.,  508.,  584.,
       1300., 1950., 1150., 1350., 1450., 1650.,  650.,  875.,  750.,
        640.,  775., 1200.,  825.,  800., 1280.,  925.,  645.,  760.,
       1145.,  685.,  533., 1015.,  890.,  480.,  815.,  642.,  647.,
        630.,  675.,  460.,  430.,  535.,  585.,  381.,  432.,  406.,
        483.,  485.,  355.,  450.,  254.,  405.,  360.,  507.,  735.,
        357.,  634.,  684.,  425.,  279.,  975.,  281.,  837.,  306.,
       1345.,  530.,  375.,  550.,  525., 1075., 1175., 1225., 1125.,
        665., 1025., 1775., 1975., 1500.,  475.,  385.,  712.,  304.,
        930.,  305.,

In [70]:
component_small['reannualprecip_r'].unique()

array([  nan,  801.,  762.,  751.,  533.,  610.,  660.,  559.,  420.,
        480.,  430.,  530.,  470.,  390.,  400.,  380.,  600.,  825.,
       1150., 1100., 1500., 1260., 1460., 1400., 1600.,  670.,  595.,
        745.,  800.,  525.,  625., 1175., 1375., 2275., 1625.,  725.,
        975., 1225.,  750.,  850.,  775.,  650.,  394.,  661.,  686.,
        737., 1080., 1016., 1143., 1334., 1397.,  991.,  953.,  767.,
       1524., 1207.,  734., 1270., 1219.,  711., 1850.,  889.,  673.,
        699.,  965., 1125., 1350., 1250., 1900., 1700., 2150., 1550.,
       1650., 1750., 1950.,  700., 1050., 1000., 1825.,  950., 1075.,
       1450.,  900., 1200., 1275., 1280.,  890.,  761.,  546., 1015.,
       1145.,  550.,  760.,  685.,  710.,  580.,  815.,  915.,  560.,
        813.,  703.,  715.,  646.,  696.,  643.,  925.,  535.,  735.,
        865.,  508.,  406.,  381.,  432.,  483.,  457.,  405.,  570.,
        545.,  635.,  460.,  585.,  510.,  500.,  369.,  490.,  740.,
        840., 1295.,

In [71]:
component_small['reannualprecip_h'].unique()

array([  nan, 1091.,  838.,  941.,  610.,  711.,  762.,  490.,  560.,
        660.,  700.,  690.,  430.,  420.,  390.,  460.,  800.,  530.,
        510., 1016., 1650., 1600., 2000., 1450., 1525., 1725.,  850.,
        875.,  725.,  900.,  650.,  750., 1300., 1500., 2550., 1750.,
       1100., 1350.,  975.,  950.,  825.,  432., 1000.,  864., 1270.,
       1651., 1397., 1230., 1143.,  840.,  889.,  799., 1463., 1524.,
       2100.,  635.,  737., 1067., 1550., 2200., 2050., 1400., 2300.,
       2500., 2150., 2250., 2650., 1850., 1280., 1200., 1900., 1870.,
       1015.,  837.,  686., 1145.,  620.,  890.,  760.,  915.,  815.,
       1118.,  706.,  724.,  648.,  707., 1800., 1475., 1825., 1775.,
        765.,  865., 1020.,  559.,  406.,  483.,  508.,  457.,  485.,
        482.,  480.,  550.,  813.,  940.,  710., 1115., 1120.,  887.,
        619.,  786.,  836.,  475.,  625.,  675.,  575.,  408.,  687.,
       1141., 1599., 1345., 2107.,  584.,  433.,  535.,  964.,  775.,
        880., 1150.,

In [72]:
component_small['map_l'].unique()

array([ 711.,   nan,  559., ..., 1918., 1680., 1901.])

In [73]:
component_small['map_r'].unique()

array([ 864.,   nan,  699., ..., 1893., 1889., 1919.])

In [74]:
component_small['map_h'].unique()

array([1016.,   nan,  838., ..., 2279., 2029., 2228.])

In [75]:
mapunit_small = mapunit_df[['lkey', 'mukey']]
mapunit_small

Unnamed: 0,lkey,mukey
0,13785,398852
1,13785,398853
2,13785,398854
3,13785,398855
4,13785,398856
...,...,...
317725,114876,3314511
317726,114876,3314512
317727,114876,3314513
317728,114876,3314514


In [76]:
ph_small = gdf_soil[pH_columns_to_keep]
ph_small

Unnamed: 0,hzname,desgndisc,desgnmaster,ph1to1h2o_l,ph1to1h2o_r,ph1to1h2o_h,ph01mcacl2_l,ph01mcacl2_r,ph01mcacl2_h,cokey,chkey
0,C,,C,5.6,6.7,7.8,,,,24629637,73203533
1,A,,A,4.5,5.5,6.5,,,,24629637,73203532
2,C1...C5,,C,5.6,7.0,8.4,,,,24630059,73204462
3,Ap,,A,5.6,6.7,7.8,,,,24630059,73204461
4,Bt,,B,6.6,7.2,7.8,,,,24630061,73204464
...,...,...,...,...,...,...,...,...,...,...,...
3563603,2A,2.0,A,5.1,6.0,6.5,,,,3314515:2808111,3314515:11333113
3563604,2Bt,2.0,B,5.1,6.5,7.3,,,,3314515:2808111,3314515:11333112
3563605,Oi,,O,,,,3.5,4.0,4.5,3314515:2808112,3314515:11333115
3563606,Oa,,O,,,,3.5,4.0,4.5,3314515:2808112,3314515:11333116


## joins 

In [77]:
polygon_mapunit_merge = SAPOLYGON_gdf.merge(mapunit_small, left_on='LKEY', right_on='lkey', how='left')
polygon_mapunit_merge

Unnamed: 0,AREASYMBOL,SPATIALVER,LKEY,Shape_Length,Shape_Area,SOURCE,geometry,lkey,mukey
0,CO636,3.0,14371,1.485387e+05,3.062635e+08,SSURGO,"MULTIPOLYGON (((-778418.1 1630725.1, -778446.3...",14371,509804
1,CO636,3.0,14371,1.485387e+05,3.062635e+08,SSURGO,"MULTIPOLYGON (((-778418.1 1630725.1, -778446.3...",14371,509806
2,CO636,3.0,14371,1.485387e+05,3.062635e+08,SSURGO,"MULTIPOLYGON (((-778418.1 1630725.1, -778446.3...",14371,509808
3,CO636,3.0,14371,1.485387e+05,3.062635e+08,SSURGO,"MULTIPOLYGON (((-778418.1 1630725.1, -778446.3...",14371,509809
4,CO636,3.0,14371,1.485387e+05,3.062635e+08,SSURGO,"MULTIPOLYGON (((-778418.1 1630725.1, -778446.3...",14371,509811
...,...,...,...,...,...,...,...,...,...
4754244,MN,0.0,87281,4.572480e+06,2.435075e+09,DRSS,"MULTIPOLYGON (((-3675 2585325, -3525 2585325, ...",87281,3314488
4754245,MN,0.0,87281,4.572480e+06,2.435075e+09,DRSS,"MULTIPOLYGON (((-3675 2585325, -3525 2585325, ...",87281,3314489
4754246,MN,0.0,87281,4.572480e+06,2.435075e+09,DRSS,"MULTIPOLYGON (((-3675 2585325, -3525 2585325, ...",87281,3314490
4754247,MN,0.0,87281,4.572480e+06,2.435075e+09,DRSS,"MULTIPOLYGON (((-3675 2585325, -3525 2585325, ...",87281,3314491


In [78]:
# Check data types
print(f"mukey type in polygon_mapunit_merge: {polygon_mapunit_merge['mukey'].dtype}")
print(f"cokey type in component_small: {component_small['cokey'].dtype}")

# Inspect a few sample keys from both DataFrames
print("Sample mukey values from polygon_mapunit_merge:")
print(polygon_mapunit_merge['mukey'].head())

print("Sample cokey values from component_small:")
print(component_small['cokey'].head())

# Check for leading/trailing spaces or format issues
print("Checking for leading/trailing spaces or format issues:")
print(polygon_mapunit_merge['mukey'].str.strip().head())  # Remove spaces if any
print(component_small['cokey'].str.strip().head())  # Remove spaces if any

# Check the number of unique keys in both DataFrames
print(f"Unique mukey values in polygon_mapunit_merge: {polygon_mapunit_merge['mukey'].nunique()}")
print(f"Unique cokey values in component_small: {component_small['cokey'].nunique()}")

mukey type in polygon_mapunit_merge: object
cokey type in component_small: object
Sample mukey values from polygon_mapunit_merge:
0    509804
1    509806
2    509808
3    509809
4    509811
Name: mukey, dtype: object
Sample cokey values from component_small:
0    24629854
1    24629855
2    24629637
3    24629638
4    24629639
Name: cokey, dtype: object
Checking for leading/trailing spaces or format issues:
0    509804
1    509806
2    509808
3    509809
4    509811
Name: mukey, dtype: object
0    24629854
1    24629855
2    24629637
3    24629638
4    24629639
Name: cokey, dtype: object
Unique mukey values in polygon_mapunit_merge: 317697
Unique cokey values in component_small: 1131662


In [79]:
# Perform the join between the merged data and the component data
# polygon_mapunit_component_merge = polygon_mapunit_merge.merge(component_small, left_on='mukey', right_on='cokey', how='left')
polygon_mapunit_component_merge = polygon_mapunit_merge.merge(component_small, on='mukey', how='left')

In [80]:
polygon_mapunit_component_merge.columns

Index(['AREASYMBOL', 'SPATIALVER', 'LKEY', 'Shape_Length', 'Shape_Area',
       'SOURCE', 'geometry', 'lkey', 'mukey', 'map_l', 'map_r', 'map_h',
       'reannualprecip_l', 'reannualprecip_r', 'reannualprecip_h', 'cokey'],
      dtype='object')

In [81]:
polygon_mapunit_component_merge

Unnamed: 0,AREASYMBOL,SPATIALVER,LKEY,Shape_Length,Shape_Area,SOURCE,geometry,lkey,mukey,map_l,map_r,map_h,reannualprecip_l,reannualprecip_r,reannualprecip_h,cokey
0,CO636,3.0,14371,1.485387e+05,3.062635e+08,SSURGO,"MULTIPOLYGON (((-778418.1 1630725.1, -778446.3...",14371,509804,,,,,,,23663415
1,CO636,3.0,14371,1.485387e+05,3.062635e+08,SSURGO,"MULTIPOLYGON (((-778418.1 1630725.1, -778446.3...",14371,509804,,,,,,,23663416
2,CO636,3.0,14371,1.485387e+05,3.062635e+08,SSURGO,"MULTIPOLYGON (((-778418.1 1630725.1, -778446.3...",14371,509804,635.0,813.0,1016.0,,,,23663417
3,CO636,3.0,14371,1.485387e+05,3.062635e+08,SSURGO,"MULTIPOLYGON (((-778418.1 1630725.1, -778446.3...",14371,509804,635.0,813.0,1016.0,,,,23663418
4,CO636,3.0,14371,1.485387e+05,3.062635e+08,SSURGO,"MULTIPOLYGON (((-778418.1 1630725.1, -778446.3...",14371,509806,,,,,,,23663603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39466973,MN,0.0,87281,4.572480e+06,2.435075e+09,DRSS,"MULTIPOLYGON (((-3675 2585325, -3525 2585325, ...",87281,3314490,580.0,670.0,770.0,,,,3314490:2795959
39466974,MN,0.0,87281,4.572480e+06,2.435075e+09,DRSS,"MULTIPOLYGON (((-3675 2585325, -3525 2585325, ...",87281,3314491,580.0,670.0,770.0,,,,3314491:2795955
39466975,MN,0.0,87281,4.572480e+06,2.435075e+09,DRSS,"MULTIPOLYGON (((-3675 2585325, -3525 2585325, ...",87281,3314491,580.0,670.0,770.0,,,,3314491:2795956
39466976,MN,0.0,87281,4.572480e+06,2.435075e+09,DRSS,"MULTIPOLYGON (((-3675 2585325, -3525 2585325, ...",87281,3314492,580.0,670.0,770.0,,,,3314492:2795970


In [82]:
ph_small.columns

Index(['hzname', 'desgndisc', 'desgnmaster', 'ph1to1h2o_l', 'ph1to1h2o_r',
       'ph1to1h2o_h', 'ph01mcacl2_l', 'ph01mcacl2_r', 'ph01mcacl2_h', 'cokey',
       'chkey'],
      dtype='object')

In [None]:
polygon_mapunit_component_merge.to_pickle("../data/public/polygon_mapunit_component_merge.pickle")

In [None]:
ph_small.to_pickle("../data/public/ph_small.pickle")

In [None]:
soil_ph_gdf = polygon_mapunit_component_merge.merge(ph_small, on='cokey', how='left')
soil_ph_gdf

In [19]:
gdf_soil.to_parquet("../data/soil/gNATSGO_CONUS/chorizon_gdb.parquet")
gdf_soil.to_file("../data/soil/gNATSGO_CONUS/chorizon_gdb.geojson", driver="GeoJSON")

AttributeError: 'DataFrame' object has no attribute 'to_file'

In [20]:
gpd.read_parquet("../data/soil/gNATSGO_CONUS/chorizon_gdb.parquet")

ValueError: Missing geo metadata in Parquet/Feather file.
            Use pandas.read_parquet/read_feather() instead.

In [172]:
# Convert to GeoDataFrame with NAD83 CRS
gdf = gpd.GeoDataFrame(
    geo_balanced_csb_samples,
    geometry=gpd.points_from_xy(geo_balanced_csb_samples.Longitude, geo_balanced_csb_samples.Latitude),
    crs='EPSG:4269'  # NAD83
)

latitudes = gdf['Latitude']
longitudes = gdf['Longitude']

In [18]:
# Function to extract data variable names from a NetCDF file
def extract_data_variables(file_path):
    ds = xr.open_dataset(file_path)
    print(list(ds.data_vars))
    return list(ds.data_vars)

# Function to get climate values for specific locations
def get_climate_values_for_locations(latitudes, longitudes, file_path):
    ds = xr.open_dataset(file_path)

    variable_name = list(ds.data_vars)[0]
    print(f"Variable name: {variable_name}")

    # Check if the dataset has a time dimension and select the first time slice if present
    if 'time' in ds.dims:
        ds = ds.isel(time=0)

    # Ensure the variable name exists in the dataset
    if variable_name not in ds.data_vars:
        raise ValueError(f"Variable {variable_name} does not exist in the dataset.")

    # Check if longitudes in the dataset are in the range [0, 360]
    lon_min = ds.lon.min().values
    lon_max = ds.lon.max().values

    if lon_max > 180:
        # Adjust input longitudes to match the NetCDF file's 0-360 range
        longitudes = (longitudes + 360) % 360

    # Create a DataArray for the interpolation points
    points = xr.DataArray(
        np.array([latitudes, longitudes]).T,
        dims=["points", "coords"],
        coords={"points": range(len(latitudes)), "coords": ["lat", "lon"]}
    )

    # Interpolate the data
    climate_values = ds[variable_name].interp(lat=points[:, 0], lon=points[:, 1], method='linear')
    print(climate_values.values[:10])
    # Return the interpolated values for the specified variable
    return climate_values.values

In [19]:
# Time the step of adding hardiness data to the dataframe

latitudes = gdf['Latitude']
longitudes = gdf['Longitude']

start_time = time.time()
for scenario_name, file_path in hardiness_scenarios_data.items():
    # Extract column name from the scenario name
    column_name = scenario_name
    print(f"Column name: {column_name}")
    
    if column_name:
        interpolated_values = get_climate_values_for_locations(
            latitudes,
            longitudes,
            file_path,
        )
        # Ensure the length of interpolated values matches the DataFrame's index length
        if len(interpolated_values) == len(gdf):
            # print(interpolated_values[:10])
            gdf[column_name] = interpolated_values
        else:
            raise ValueError(f"Length of interpolated values ({len(interpolated_values)}) does not match the length of the DataFrame's index ({len(gdf)})")

end_time = time.time()

# Print the execution time
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

Column name: usda_plant_hardiness_1971_2000_historical
Variable name: cold_hardiness_zone
[6.         7.5        7.         6.         5.5        5.5
 7.         6.         6.11417072 6.15530776]
Column name: usda_plant_hardiness_2040_2069_rcp45
Variable name: cold_hardiness_zone
[6.5        8.         7.5        6.5        6.         6.5
 7.5        6.88226846 7.         7.        ]
Column name: usda_plant_hardiness_2040_2069_rcp85
Variable name: cold_hardiness_zone
[6.5        8.         7.56270243 6.69060233 6.5        6.5
 8.         7.         7.37645844 7.15530776]
Execution time: 0.6591417789459229 seconds


In [20]:
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 1200 entries, 478801 to 308295
Data columns (total 29 columns):
 #   Column                                     Non-Null Count  Dtype   
---  ------                                     --------------  -----   
 0   CSBID                                      1200 non-null   object  
 1   CSBYEARS                                   1200 non-null   object  
 2   CSBACRES                                   1200 non-null   float64 
 3   CDL2016                                    1200 non-null   int64   
 4   CDL2017                                    1200 non-null   int64   
 5   CDL2018                                    1200 non-null   int64   
 6   CDL2019                                    1200 non-null   int64   
 7   CDL2020                                    1200 non-null   int64   
 8   CDL2021                                    1200 non-null   int64   
 9   CDL2022                                    1200 non-null   int64   
 10  CD

In [30]:
gdf

Unnamed: 0,CSBID,CSBYEARS,CSBACRES,CDL2016,CDL2017,CDL2018,CDL2019,CDL2020,CDL2021,CDL2022,...,Shp_Area,geometry,Longitude,Latitude,Elevation,color,Crop,usda_plant_hardiness_1971_2000_historical,usda_plant_hardiness_2040_2069_rcp45,usda_plant_hardiness_2040_2069_rcp85
478801,081623012787392,1623,3.791540,4,24,61,61,24,24,61,...,15343.877410,POINT (-102.96208 38.17692),-102.962080,38.176922,1231,#bfbf7a,Fallow/Idle Cropland,6.0,6.500000,6.500000
107446,351623001627247,1623,4.325446,225,1,225,152,152,152,36,...,17504.529797,POINT (-104.60067 33.08984),-104.600667,33.089835,1138,#ffa8e3,Alfalfa,7.5,8.000000,8.000000
121162,351623002777545,1623,3.222010,61,176,1,176,36,36,1,...,13039.063523,POINT (-108.30975 36.72811),-108.309751,36.728108,1676,#ffd400,Corn,7.0,7.500000,7.562702
88813,351623000046581,1623,10.204036,36,36,36,36,152,37,37,...,41294.433943,POINT (-107.11218 36.89108),-107.112180,36.891083,2252,#a5f58d,Other Hay/Non Alfalfa,6.0,6.500000,6.690602
516266,081623013377366,1623,2.918481,24,29,29,24,1,61,61,...,11810.721459,POINT (-103.41289 40.72386),-103.412893,40.723858,1330,#bfbf7a,Fallow/Idle Cropland,5.5,6.000000,6.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119071,351623002568175,1623,2.957066,176,176,176,61,24,24,24,...,11966.871324,POINT (-106.08262 34.75611),-106.082615,34.756107,1877,#e9ffbe,Grass/Pasture,6.5,7.000000,7.500000
540278,081623014401456,1623,9.374885,61,24,61,1,1,1,1,...,37938.965442,POINT (-104.17285 40.16143),-104.172849,40.161427,1417,#a5f58d,Other Hay/Non Alfalfa,5.5,6.000000,6.102624
591949,041623013556134,1623,3.770585,72,72,72,190,190,190,190,...,15259.076277,POINT (-114.50854 33.02805),-114.508535,33.028054,101,#80b3b3,Woody Wetlands,10.0,10.000000,10.500000
217842,081623005639708,1623,4.307316,61,24,1,61,24,29,61,...,17431.160158,POINT (-103.09239 39.90366),-103.092389,39.903658,1396,#a87000,Winter Wheat,5.5,6.008018,6.500000


In [22]:
fields = gdf.copy()

In [23]:
for column in hardiness_scenarios:
    print(f'{column:>60} {gdf[column].min():>8.2f} (Min),   {gdf[column].max():>8.1f} (Max)')

                   usda_plant_hardiness_1971_2000_historical     4.50 (Min),       10.0 (Max)
                        usda_plant_hardiness_2040_2069_rcp45     5.00 (Min),       10.0 (Max)
                        usda_plant_hardiness_2040_2069_rcp85     5.06 (Min),       10.5 (Max)


In [24]:
crops.columns

Index(['Crop_Code', 'Scientific_Name', 'Genus', 'Species',
       'USDA_Hardiness_Zone_Min', 'USDA_Hardiness_Zone_Max'],
      dtype='object')

In [25]:
crop_hardiness_columns = ['USDA_Hardiness_Zone_Min', 'USDA_Hardiness_Zone_Max']

for column in crop_hardiness_columns:
    print(f'{column:>60} {crops[column].min():>8.2f} (Min),   {crops[column].max():>8.1f} (Max)')

                                     USDA_Hardiness_Zone_Min     1.00 (Min),       11.0 (Max)
                                     USDA_Hardiness_Zone_Max     5.00 (Min),       12.0 (Max)


In [28]:
scenario_names = [
    'historical', 
    'mid_century_medium_CO2', 
    'mid_century_high_CO2'
]

scenario_columns = [
    'usda_plant_hardiness_1971_2000_historical',
    'usda_plant_hardiness_2040_2069_rcp45',
    'usda_plant_hardiness_2040_2069_rcp85',
]

In [31]:
crops.columns

Index(['Crop_Code', 'Scientific_Name', 'Genus', 'Species',
       'USDA_Hardiness_Zone_Min', 'USDA_Hardiness_Zone_Max'],
      dtype='object')

In [42]:
# Example initialization of data (replace with actual data)
num_fields = len(fields)
num_crops = len(crops)
num_scenarios = len(scenario_names)
hardiness_scores = np.zeros((num_fields, num_crops, num_scenarios))

# Extract crop-specific data
crop_hardiness_min = crops['USDA_Hardiness_Zone_Min'].values[np.newaxis, :].to_numpy()  # Shape: (1, num_crops)
crop_hardiness_max = crops['USDA_Hardiness_Zone_Max'].values[np.newaxis, :].to_numpy()  # Shape: (1, num_crops)

# Iterate over scenarios
for scenario_idx, (scenario_name, scenario_column) in enumerate(zip(scenario_names, scenario_columns)):
    start_time = time.time()
    
    # Extract the relevant average daily precipitation data for the scenario
    field_zone = fields[scenario_column].values[:, np.newaxis]  # Shape: (num_fields, 1)
    
    # Apply vectorized comparisons
    within_range = (field_zone >= crop_hardiness_min) & (field_zone <= crop_hardiness_max)
    # missing_data = (crop_hardiness_min == None) | (crop_hardiness_max == None) | (field_zone == None)
    missing_data = np.isnan(crop_hardiness_min) | np.isnan(crop_hardiness_max) | np.isnan(field_zone)
    
    # Initialize scores
    scores = np.zeros((num_fields, num_crops), dtype=int)
    
    # Assign scores
    scores[within_range] = 1
    scores[missing_data] = 0
    scores[~within_range & ~missing_data] = -1
    
    # Store the scores in the matrix
    hardiness_scores[:, :, scenario_idx] = scores

    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution time: {execution_time:.6f} seconds for scenario {scenario_name}")


Execution time: 0.059423 seconds for scenario historical
Execution time: 0.040687 seconds for scenario mid_century_medium_CO2
Execution time: 0.017730 seconds for scenario mid_century_high_CO2


In [44]:
np.unique(hardiness_scores, return_counts=True)

(array([-1.,  1.]), array([6835776, 2409024]))

In [63]:
scenarios = ['historical', 'mid_century_medium_CO2', 'mid_century_high_CO2']

# Convert the results matrix to an xarray DataArray
results_da = xr.DataArray(
    hardiness_scores,
    dims=['fields', 'crops', 'scenarios'],
    coords={
        'fields': fields['CSBID'],
        'crops': crops['Scientific_Name'],
        'scenarios': scenarios,
    },
    name='crop_suitability_usda_hardiness'
)

In [64]:
# Save the DataArray to a NetCDF file
results_da.to_netcdf('../data/scores/score_matrix_hardiness.nc')

## save field usda hardiness data

In [65]:
fields.to_crs(crs='EPSG:4269', inplace=True)

In [66]:
fields.to_parquet('../data/public/csb_sample_with_hardiness_data.parquet')

In [67]:
fields['CSBID'].info()

<class 'pandas.core.series.Series'>
Index: 1200 entries, 478801 to 308295
Series name: CSBID
Non-Null Count  Dtype 
--------------  ----- 
1200 non-null   object
dtypes: object(1)
memory usage: 18.8+ KB


In [68]:
hardiness_scores.shape

(1200, 2568, 3)

In [69]:
## analyze temperature recommendations

In [70]:
def get_crops_with_score_1(scores_matrix, crop_data):
    suitable_crops = {}
    for field_idx in range(scores_matrix.shape[0]):
        suitable_crops[field_idx] = [crop_data['Scientific_Name'][crop_idx] for crop_idx in range(scores_matrix.shape[1]) if scores_matrix[field_idx, crop_idx] == 1]
    return suitable_crops

In [71]:
hardiness_scores

array([[[ 1.,  1.,  1.],
        [-1., -1., -1.],
        [-1., -1., -1.],
        ...,
        [-1., -1., -1.],
        [-1., -1., -1.],
        [ 1.,  1.,  1.]],

       [[ 1.,  1.,  1.],
        [-1.,  1.,  1.],
        [-1.,  1.,  1.],
        ...,
        [-1., -1., -1.],
        [-1., -1., -1.],
        [ 1.,  1.,  1.]],

       [[ 1.,  1.,  1.],
        [-1., -1., -1.],
        [-1., -1., -1.],
        ...,
        [-1., -1., -1.],
        [-1., -1., -1.],
        [ 1.,  1.,  1.]],

       ...,

       [[ 1.,  1.,  1.],
        [ 1.,  1.,  1.],
        [ 1.,  1.,  1.],
        ...,
        [ 1.,  1.,  1.],
        [ 1.,  1.,  1.],
        [-1., -1., -1.]],

       [[ 1.,  1.,  1.],
        [-1., -1., -1.],
        [-1., -1., -1.],
        ...,
        [-1., -1., -1.],
        [-1., -1., -1.],
        [-1.,  1.,  1.]],

       [[ 1.,  1.,  1.],
        [-1., -1., -1.],
        [-1., -1., -1.],
        ...,
        [-1., -1., -1.],
        [-1., -1., -1.],
        [-1.,  1.,  1.]]

In [72]:
fields = gdf.copy()

In [73]:
# Initialize the DataFrame to store the results using CSBID from fields
crops_with_score_1_df = pd.DataFrame({'CSBID': range(hardiness_scores.shape[0])})

# Process each scenario and add the results to the DataFrame
for scenario_idx, scenario in enumerate(scenario_names):
    # Extract crops with a score of 1 for this scenario
    crops_with_score_1 = get_crops_with_score_1(hardiness_scores[:, :, scenario_idx], crops)
    
    # Convert the results to a DataFrame for easier visualization
    scenario_column = f'Suitable_Crops_{scenario}'
    number_column = f'Number_of_Suitable_Crops_{scenario}'
    temp_df = pd.DataFrame(list(crops_with_score_1.items()), columns=['CSBID', scenario_column])
    temp_df[number_column] = temp_df[scenario_column].apply(len)

    # Merge with the main DataFrame
    crops_with_score_1_df = crops_with_score_1_df.merge(temp_df, on='CSBID', how='outer')

crops_with_score_1_df['CSBID'] = list(fields['CSBID'])

# Output the resulting DataFrame
# print(crops_with_score_1_df)

In [74]:
crops_with_score_1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 7 columns):
 #   Column                                           Non-Null Count  Dtype 
---  ------                                           --------------  ----- 
 0   CSBID                                            1200 non-null   object
 1   Suitable_Crops_historical                        1200 non-null   object
 2   Number_of_Suitable_Crops_historical              1200 non-null   int64 
 3   Suitable_Crops_mid_century_medium_CO2            1200 non-null   object
 4   Number_of_Suitable_Crops_mid_century_medium_CO2  1200 non-null   int64 
 5   Suitable_Crops_mid_century_high_CO2              1200 non-null   object
 6   Number_of_Suitable_Crops_mid_century_high_CO2    1200 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 65.8+ KB


In [75]:
data_for_plot = crops_with_score_1_df[['CSBID', 'Number_of_Suitable_Crops_historical', 'Number_of_Suitable_Crops_mid_century_medium_CO2', 'Number_of_Suitable_Crops_mid_century_high_CO2',]]
data_for_plot
# data_for_plot.info()

Unnamed: 0,CSBID,Number_of_Suitable_Crops_historical,Number_of_Suitable_Crops_mid_century_medium_CO2,Number_of_Suitable_Crops_mid_century_high_CO2
0,081623012787392,639,631,631
1,351623001627247,661,816,816
2,351623002777545,742,661,661
3,351623000046581,639,631,631
4,081623013377366,531,639,631
...,...,...,...,...
1195,351623002568175,631,661,661
1196,081623014401456,531,639,631
1197,041623013556134,2035,2035,1868
1198,081623005639708,531,631,631


In [76]:
crop_suitability_columns = [
    'Number_of_Suitable_Crops_historical', 
    'Number_of_Suitable_Crops_mid_century_medium_CO2', 
    'Number_of_Suitable_Crops_mid_century_high_CO2', 
]

for column in crop_suitability_columns:
    print(f'{column:>50}  {crops_with_score_1_df[column].min():>8.2f} (Min),  {crops_with_score_1_df[column].max():>8.1f} (Max),  {crops_with_score_1_df[column].mean():>8.1f} (Avg)')

               Number_of_Suitable_Crops_historical    394.00 (Min),    2035.0 (Max),     622.5 (Avg)
   Number_of_Suitable_Crops_mid_century_medium_CO2    531.00 (Min),    2035.0 (Max),     676.8 (Avg)
     Number_of_Suitable_Crops_mid_century_high_CO2    531.00 (Min),    2035.0 (Max),     708.1 (Avg)


In [78]:
# Set bin size
bin_size = 20  # You can adjust the bin size as needed
color='cornflowerblue'
opacity=0.6

# Create Altair charts for each scenario
chart_h = alt.Chart(data_for_plot).mark_bar(color=color, opacity=opacity).encode(
    x=alt.X('Number_of_Suitable_Crops_historical:Q', bin=alt.Bin(step=bin_size), title='Number of Suitable Crops (Historical 1970-2000)', axis=alt.Axis(offset=10)),
    y=alt.Y('count()', title='Frequency', axis=alt.Axis(offset=10)),
).properties(
    title='Historical Scenario (1970-2000)',
    width=600,
    height=200
)

chart_45 = alt.Chart(data_for_plot).mark_bar(color=color, opacity=opacity).encode(
    x=alt.X('Number_of_Suitable_Crops_mid_century_medium_CO2:Q', bin=alt.Bin(step=bin_size), title='Number of Suitable Crops (RCP 4.5, 2036-2065)', axis=alt.Axis(offset=10)),
    y=alt.Y('count()', title='Frequency', axis=alt.Axis(offset=10)),
).properties(
    title='RCP 4.5 Scenario (2036-2065)',
    width=600,
    height=200
)

chart_85 = alt.Chart(data_for_plot).mark_bar(color=color, opacity=opacity).encode(
    x=alt.X('Number_of_Suitable_Crops_mid_century_high_CO2:Q', bin=alt.Bin(step=bin_size), title='Number of Suitable Crops (RCP 8.5, 2036-2065)', axis=alt.Axis(offset=10)),
    y=alt.Y('count()', title='Frequency', axis=alt.Axis(offset=10)),
).properties(
    title='RCP 8.5 Scenario (2036-2065)',
    width=600,
    height=200
)

# Combine the charts with shared axis domains
combined_chart = alt.vconcat(
    chart_h, 
    chart_45, 
    chart_85
).resolve_scale(
    x='shared',
    y='shared'
)

combined_chart