In [7]:
from pygbif import occurrences
import pandas as pd

data = occurrences.search(
    taxon_key = 8577467,
    geometry="POLYGON((-91.513 36.970, -87.495 36.970, -87.495 42.508, -91.513 42.508, -91.513 36.970))",
    hasCoordinate=True,
    limit=1000,
    basisOfRecord="PRESERVED_SPECIMEN",
    year="2000,2023", # range of years, can narrow down
)

df = pd.DataFrame(data['results'])
df = df[['decimalLongitude', 'decimalLatitude', 'eventDate', 'basisOfRecord']]

df.to_csv('amaranthus_occurrences.csv', index=False)
df.head()


Unnamed: 0,decimalLongitude,decimalLatitude,eventDate,basisOfRecord
0,-90.375983,38.623817,2023-08-29,PRESERVED_SPECIMEN
1,-90.898206,38.431925,2022-09-16,PRESERVED_SPECIMEN
2,-90.898206,38.431925,2022-09-16,PRESERVED_SPECIMEN
3,-90.636111,38.5325,2021-08-21,PRESERVED_SPECIMEN
4,-90.635833,38.532408,2020-09-01,PRESERVED_SPECIMEN


In [None]:
import geopandas as gpd
from shapely.geometry import Point, Polygon

geometry = [Point(lon, lat) for lon, lat in zip(df['decimalLongitude'], df['decimalLatitude'])]
gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

gdf_utm = gdf.to_crs("EPSG:32616")
gdf_utm['buffer'] = gdf_utm.geometry.buffer(1000)
gdf_utm.head()

Unnamed: 0,decimalLongitude,decimalLatitude,eventDate,basisOfRecord,geometry,buffer
0,-90.375983,38.623817,2023-08-29,PRESERVED_SPECIMEN,POINT (206088.199 4280440.95),"POLYGON ((207088.199 4280440.95, 207083.384 42..."
1,-90.898206,38.431925,2022-09-16,PRESERVED_SPECIMEN,POINT (159704.858 4260941.616),"POLYGON ((160704.858 4260941.616, 160700.043 4..."
2,-90.898206,38.431925,2022-09-16,PRESERVED_SPECIMEN,POINT (159704.858 4260941.616),"POLYGON ((160704.858 4260941.616, 160700.043 4..."
3,-90.636111,38.5325,2021-08-21,PRESERVED_SPECIMEN,POINT (183033.318 4271169.926),"POLYGON ((184033.318 4271169.926, 184028.502 4..."
4,-90.635833,38.532408,2020-09-01,PRESERVED_SPECIMEN,POINT (183057.154 4271158.753),"POLYGON ((184057.154 4271158.753, 184052.339 4..."


In [13]:
import numpy as np

def create_control_buffers():
    bbox = ((-91.513, 36.970), (-87.495, 36.970), (-87.495, 42.508), (-91.513, 42.508), (-91.513, 36.970))
    illinois_bbox = Polygon(bbox)
    min_x, min_y, max_x, max_y = illinois_bbox.bounds


    n_controls = len(gdf_utm)
    random_points = [
        Point(np.random.uniform(min_x, max_x),
            np.random.uniform(min_y, max_y))
        for _ in range(n_controls)
    ]
    gdf_controls = gpd.GeoDataFrame(geometry=random_points, crs="EPSG:4326")
    gdf_controlsutm = gdf_controls.to_crs("EPSG:32616").buffer(1000)

    return gdf_controlsutm

gdf_controls_utm = create_control_buffers()
gdf_controls_utm.head()

0    POLYGON ((177658.027 4478105.512, 177653.212 4...
1    POLYGON ((453867.607 4695674.084, 453862.792 4...
2    POLYGON ((217838.417 4397387.644, 217833.602 4...
3    POLYGON ((226791.451 4482239.129, 226786.636 4...
4    POLYGON ((353846.093 4139716.607, 353841.278 4...
dtype: geometry

In [22]:
import zipfile
import os

zip_file_path = "../_NLCD_519W897hevlAKeQOwMVE.zip"
extract_dir = "./nlcd_data"
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

In [26]:
import rasterio
from rasterio.warp import calculate_default_transform, reproject

dst_crs = "EPSG:4326"
output_tif = os.path.join(extract_dir, "IL_NLCD_2023_WGS84.tif")

with rasterio.open('nlcd_data\\Annual_NLCD_LndCov_2023_CU_C1V0_519W897hevlAKeQOwMVE.tiff') as src:
    transform, width, height = calculate_default_transform(
        src.crs, dst_crs, src.width, src.height, *src.bounds
    )
    kwargs = src.meta.copy()
    kwargs.update({
        'crs': dst_crs,
        'transform': transform,
        'width': width,
        'height': height
    })

    with rasterio.open(output_tif, 'w', **kwargs) as dst:
        reproject(
            source=rasterio.band(src, 1),
            destination=rasterio.band(dst, 1),
            src_transform=src.transform,
            src_crs=src.crs,
            dst_transform=transform,
            dst_crs=dst_crs,
            resampling=rasterio.enums.Resampling.nearest
        )
    

In [53]:
from rasterstats import zonal_stats

nlcd_classes = {
    11: "Water",
    12: "Perennial Ice/Snow",
    21: "Developed, Open Space",
    22: "Developed, Low Intensity",
    23: "Developed, Medium Intensity",
    24: "Developed, High Intensity",
    31: "Barren Land",
    41: "Deciduous Forest",
    42: "Evergreen Forest",
    43: "Mixed Forest",
    51: "Shrub/Scrub",
    52: "Grassland/Herbaceous",
    71: "Pasture/Hay",
    81: "Cultivated Crops",
    82: "Woody Wetlands",
    90: "Emergent Herbaceous Wetlands",
    95: "Emergent Herbaceous Wetlands"
}

stats = zonal_stats(
    gdf_utm['buffer'].to_crs(dst_crs), 
    'nlcd_data\\IL_NLCD_2023_WGS84.tif', 
    categorical=True,
    category_map=nlcd_classes,
    geojson_out=True,
    all_touched = False
)

control_stats = zonal_stats(
    gdf_controls_utm.to_crs(dst_crs), 
    'nlcd_data\\IL_NLCD_2023_WGS84.tif', 
    categorical=True,
    category_map=nlcd_classes,
    geojson_out=True,
    all_touched = False
)


results = []
control_results = []
for zone in stats:
    props = zone['properties']
    total = sum(props.values())  #total pixels in buffer
    percentages = {k: (v / total) * 100 for k, v in props.items() if k != 'id'}
    percentages['id'] = zone['id']
    results.append(percentages)

for zone in control_stats:
    props = zone['properties']
    total = sum(props.values())  #total pixels in buffer
    percentages = {k: (v / total) * 100 for k, v in props.items() if k != 'id'}
    percentages['id'] = zone['id']
    control_results.append(percentages)

def mean_clean_df(input_results):
    data_frame = pd.DataFrame(input_results).fillna(0)
    if 'id' in data_frame.columns:
        data_frame = data_frame.drop(columns=['id'])
    data_frame = data_frame.apply(pd.to_numeric, errors='coerce')
    return data_frame.mean()

print(mean_clean_df(results))
print(mean_clean_df(control_results))



Developed, Open Space            6.543377
Developed, Low Intensity         6.013073
Developed, Medium Intensity      5.230188
Developed, High Intensity        2.861358
Deciduous Forest                18.682292
Mixed Forest                     1.343899
Cultivated Crops                 6.782508
Emergent Herbaceous Wetlands     1.723993
Water                            8.847485
Barren Land                      0.530819
Evergreen Forest                 0.314213
Grassland/Herbaceous             0.054477
Pasture/Hay                      0.576442
Woody Wetlands                  24.495876
dtype: float64
Water                            1.476312
Developed, Open Space            3.715776
Developed, Low Intensity         4.670958
Deciduous Forest                12.151639
Cultivated Crops                 8.171710
Woody Wetlands                  50.143385
Developed, Medium Intensity      1.857717
Barren Land                      0.181876
Pasture/Hay                      0.221079
Emergent Herbaceous