In [2]:
from pygbif import occurrences
import pandas as pd

def get_amaranthus_occurences(bounding_box) :
    data = occurrences.search(
    taxon_key = 8577467,
    geometry=bounding_box,
    hasCoordinate=True,
    limit=1000,
    basisOfRecord="PRESERVED_SPECIMEN",
    year="2000,2023", # range of years, can narrow down
)

    df = pd.DataFrame(data['results'])
    df = df[['decimalLongitude', 'decimalLatitude', 'eventDate', 'basisOfRecord']]

    df.to_csv('amaranthus_occurrences.csv', index=False)
    return df

il_bbox = "POLYGON((-91.513 36.970, -87.495 36.970, -87.495 42.508, -91.513 42.508, -91.513 36.970))"
all_bbox = "POLYGON(())"
df = get_amaranthus_occurences(il_bbox)
df.head()


Unnamed: 0,decimalLongitude,decimalLatitude,eventDate,basisOfRecord
0,-90.375983,38.623817,2023-08-29,PRESERVED_SPECIMEN
1,-90.898206,38.431925,2022-09-16,PRESERVED_SPECIMEN
2,-90.898206,38.431925,2022-09-16,PRESERVED_SPECIMEN
3,-90.636111,38.5325,2021-08-21,PRESERVED_SPECIMEN
4,-90.635833,38.532408,2020-09-01,PRESERVED_SPECIMEN


In [3]:
#inaturalist observations
from pyinaturalist import get_observations, Observation

def get_inaturalist_observations(geometry) -> pd.DataFrame:
    response = get_observations(
    taxon_id=75400,       # species ID for Amaranthus tuberculatus
    geoprivacy='open',    # only include observations with open location data
    geoframe=geometry,  
    page='all'
)
    observations = Observation.from_json_list(response)

    obs_data = []
    for obs in observations:
        obs_data.append({
        'date': obs.observed_on,
        'location': obs.location,
    })

    obs_df = pd.DataFrame(obs_data)
    obs_df.to_csv('amaranthus_tuberculatus_observations.csv', index=False)
    return obs_df

us_bbox = (24.6, -124.8, 49.0, -66.9)
inat_obs_df = get_inaturalist_observations(us_bbox)
print(len(inat_obs_df))

INFO:Request:
GET https://api.inaturalist.org/v1/observations?taxon_id=75400&geoprivacy=open&geoframe=24.6%2C-124.8%2C49.0%2C-66.9&per_page=200&order_by=id&order=asc
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate
Accept: application/json
Connection: keep-alive

INFO:This query will fetch 1050 results in 6 requests. Estimated total request time: 5 seconds
INFO:Request:
GET https://api.inaturalist.org/v1/observations?taxon_id=75400&geoprivacy=open&geoframe=24.6%2C-124.8%2C49.0%2C-66.9&id_above=91636608&per_page=200&order_by=id&order=asc
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate
Accept: application/json
Connection: keep-alive

INFO:Request:
GET https://api.inaturalist.org/v1/observations?taxon_id=75400&geoprivacy=open&geoframe=24.6%2C-124.8%2C49.0%2C-66.9&id_above=137472843&per_page=200&order_by=id&order=asc
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate
Accept: app

1050


In [4]:
import geopandas as gpd
from shapely.geometry import Point, Polygon, LineString

def create_buffers():
    geometry = [Point(lon, lat) for lon, lat in zip(df['decimalLongitude'], df['decimalLatitude'])]
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

    gdf_utm = gdf.to_crs("EPSG:32616")
    gdf_utm['buffer'] = gdf_utm.geometry.buffer(1000)
    return gdf_utm

gdf_utm = create_buffers()
gdf_utm.head()

Unnamed: 0,decimalLongitude,decimalLatitude,eventDate,basisOfRecord,geometry,buffer
0,-90.375983,38.623817,2023-08-29,PRESERVED_SPECIMEN,POINT (206088.199 4280440.95),"POLYGON ((207088.199 4280440.95, 207083.384 42..."
1,-90.898206,38.431925,2022-09-16,PRESERVED_SPECIMEN,POINT (159704.858 4260941.616),"POLYGON ((160704.858 4260941.616, 160700.043 4..."
2,-90.898206,38.431925,2022-09-16,PRESERVED_SPECIMEN,POINT (159704.858 4260941.616),"POLYGON ((160704.858 4260941.616, 160700.043 4..."
3,-90.636111,38.5325,2021-08-21,PRESERVED_SPECIMEN,POINT (183033.318 4271169.926),"POLYGON ((184033.318 4271169.926, 184028.502 4..."
4,-90.635833,38.532408,2020-09-01,PRESERVED_SPECIMEN,POINT (183057.154 4271158.753),"POLYGON ((184057.154 4271158.753, 184052.339 4..."


In [5]:
import numpy as np

def create_control_buffers():
    bbox = ((-91.513, 36.970), (-87.495, 36.970), (-87.495, 42.508), (-91.513, 42.508), (-91.513, 36.970))
    illinois_bbox = Polygon(bbox)
    min_x, min_y, max_x, max_y = illinois_bbox.bounds


    n_controls = len(gdf_utm)
    random_points = [
        Point(np.random.uniform(min_x, max_x),
            np.random.uniform(min_y, max_y))
        for _ in range(n_controls)
    ]
    gdf_controls = gpd.GeoDataFrame(geometry=random_points, crs="EPSG:4326")
    gdf_controlsutm = gdf_controls.to_crs("EPSG:32616").buffer(1000)

    return gdf_controlsutm

gdf_controls_utm = create_control_buffers()
gdf_controls_utm.head()


[1;36m0[0m    POLYGON [1m([0m[1m([0m[1;36m386736.781[0m [1;36m4498973.044[0m, [1;36m386731.965[0m [1;36m4[0m[33m...[0m
[1;36m1[0m    POLYGON [1m([0m[1m([0m[1;36m258044.031[0m [1;36m4449628.64[0m, [1;36m258039.216[0m [1;36m44[0m[33m...[0m
[1;36m2[0m    POLYGON [1m([0m[1m([0m[1;36m346955.7[0m [1;36m4160952.901[0m, [1;36m346950.885[0m [1;36m416[0m[33m...[0m
[1;36m3[0m    POLYGON [1m([0m[1m([0m[1;36m264205.79[0m [1;36m4609424.585[0m, [1;36m264200.974[0m [1;36m46[0m[33m...[0m
[1;36m4[0m    POLYGON [1m([0m[1m([0m[1;36m362420.569[0m [1;36m4547571.378[0m, [1;36m362415.754[0m [1;36m4[0m[33m...[0m
dtype: geometry

In [6]:
import zipfile
import os

zip_file_path = "../_NLCD_519W897hevlAKeQOwMVE.zip"
extract_dir = "./nlcd_data"
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)


In [7]:
extract_dir = "us_il_shapefiles"
zip_file_path = os.path.join(extract_dir, "IL_BNDY_State.zip")

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

shapefile_path = os.path.join(extract_dir, "IL_BNDY_State_Ln.shp")
il_gdf = gpd.read_file(shapefile_path)

il_coordinates = [
    (-91.5136, 37.3957),
    (-87.5, 37.3957), 
    (-87.5, 42.5),
    (-89.5, 42.5),
    (-90.5, 42.3),
    (-91.5, 42.3),
    (-91.5136, 37.3957)   
]

il_polygon = Polygon(il_coordinates)
il_gdf = gpd.GeoDataFrame({'geometry': [il_polygon]})
il_gdf = il_gdf.set_crs("EPSG:4326")

In [8]:
import rasterio
from rasterio.warp import calculate_default_transform, reproject

dst_crs = "EPSG:4326"

def reproject_CRS(input_path, output_path) -> None:
    with rasterio.open(input_path) as src:
        transform, width, height = calculate_default_transform(
            src.crs, dst_crs, src.width, src.height, *src.bounds
        )
        kwargs = src.meta.copy()
        kwargs.update({
            'crs': dst_crs,
            'transform': transform,
            'width': width,
            'height': height
        })

        with rasterio.open(output_path, 'w', **kwargs) as dst:
            reproject(
                source=rasterio.band(src, 1),
                destination=rasterio.band(dst, 1),
                src_transform=src.transform,
                src_crs=src.crs,
                dst_transform=transform,
                dst_crs=dst_crs,
                resampling=rasterio.enums.Resampling.nearest
            )
 
in_tiff_file_path = 'nlcd_data\\Annual_NLCD_LndCov_2023_CU_C1V0_519W897hevlAKeQOwMVE.tiff'
out_tif_file_path = os.path.join(extract_dir, "IL_NLCD_2023_WGS84.tif")
reproject_CRS(in_tiff_file_path, out_tif_file_path)
        

In [9]:
nlcd_classes = {
    11: "Water",
    12: "Perennial Ice/Snow",
    21: "Developed, Open Space",
    22: "Developed, Low Intensity",
    23: "Developed, Medium Intensity",
    24: "Developed, High Intensity",
    31: "Barren Land",
    41: "Deciduous Forest",
    42: "Evergreen Forest",
    43: "Mixed Forest",
    52: "Shrub/Scrub",
    71: "Grassland/Herbaceous",
    81: "Pasture/Hay",
    82: "Cultivated Crops",
    90: "Woody Wetlands",
    95: "Emergent Herbaceous Wetlands"
}

In [None]:
from rasterstats import zonal_stats
from diversity_indices import calculate_shannon_index

def il_nlcd_zonal_stats(geometry: gpd.GeoSeries|gpd.GeoDataFrame) -> list[dict[str, float|str]]:   
    result = [] 
    stats = zonal_stats(
        geometry, 
        'nlcd_data\\IL_NLCD_2023_WGS84.tif', 
        categorical=True,
        category_map=nlcd_classes,
        geojson_out=True,
        all_touched = False
    )

    for zone in stats:
        props = zone['properties']
        total = sum(props.values())  #total pixels in buffer
        percentages = {k: (v / total) * 100 for k, v in props.items() if k != 'id'}
        percentages['id'] = zone['id']
        result.append(percentages)

        proportions = np.array(list(percentages.values()))
        shannon_index = calculate_shannon_index(proportions)

        percentages['shannon_index'] = shannon_index
    return result

def mean_clean_df(input_results: list[dict[str, float|str]]) -> pd.Series:
    data_frame = pd.DataFrame(input_results).fillna(0)
    if 'id' in data_frame.columns:
        data_frame = data_frame.drop(columns=['id'])
    data_frame = data_frame.apply(pd.to_numeric, errors='coerce')
    
    # get the mean shannon index over all the buffers, then drop that column
    mean_shannon_index = data_frame['shannon_index'].mean()
    data_frame = data_frame.drop(columns=['shannon_index'])
    
    # sort alphabetically by land use class
    data_frame = data_frame.sort_index(axis=1)
    return data_frame.mean(), mean_shannon_index

def out_df(geometry: gpd.GeoSeries | gpd.GeoDataFrame) -> pd.DataFrame:
    vector = geometry.to_crs(dst_crs)
    result = il_nlcd_zonal_stats(vector)
    result_series, mean_shannon_index = mean_clean_df(result)
    result_df = pd.DataFrame({'lu_class':result_series.index, 'proportions':result_series.values})
    return result_df, mean_shannon_index

results_df, waterhemp_shannon_idx = out_df(gdf_utm['buffer'])
control_results_df, control_shannon_idx = out_df(gdf_controls_utm)
il_results_df, _ = out_df(il_gdf)
print(results_df)
print(f"Shannon diversity index of land within 1000m of waterhemp occurences: {waterhemp_shannon_idx},\n\
Control Shannon diversity index: {control_shannon_idx}")

results_df.to_csv("amaranthus_results.csv", index=False)
control_results_df.to_csv("control_results.csv", index=False)
il_results_df.to_csv("il_results.csv", index=False)


                        lu_class  proportions
0                    Barren Land     0.485015
1               Cultivated Crops    22.610664
2               Deciduous Forest    17.174381
3      Developed, High Intensity     2.755278
4       Developed, Low Intensity     5.800443
5    Developed, Medium Intensity     5.070751
6          Developed, Open Space     6.347548
7   Emergent Herbaceous Wetlands     1.229355
8               Evergreen Forest     0.312665
9           Grassland/Herbaceous     0.375094
10                  Mixed Forest     1.316239
11                   Pasture/Hay     6.522418
12                   Shrub/Scrub     0.036560
13                         Water     7.881152
14                Woody Wetlands     6.082436
Shannon diversity index of land within 1000m of waterhemp occurences:1.3918815320376716,
Control Shannon diversity index: 0.8653418087507354
