# Bayesian biodiversity: Geodata processing


- Sources: population density raster (.tif), road network (.shp) and site coordinates (.shp)
- Target output: Population and road density at 1, 10, 50 and 100km
- Target output should be appended to the site coordinates geodataframe 
- Find the values: rasterio.sample
- Aggregate: use the pygeoprocessing function. Simple scaling of the pixel_size
- Use shapely to buffer a pixel around each site point coordinate
- Then intersect this with the road density shapefile

In [32]:
import pygeoprocessing as pgp
import geopandas as gpd
import pandas as pd
import rasterio
from rasterio.enums import Resampling
import shapely
import numpy as np
import time
from rasterstats import zonal_stats
import json
from pyproj import Transformer

In [4]:
# Enable automatic black formatting of the notebook cells
import jupyter_black

jupyter_black.load()

# Let pandas show up to 100 rows and 100 columns before truncating the output
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 100)

## Load coordinate data from PREDICTS

In [69]:
# Read the PREDICTS site coordinates (one point geometry per site) into a GeoDataFrame
gdf_sites = gpd.read_file("../../data/PREDICTS/site_coord.shp")
# Preview the first rows
gdf_sites.head()

Unnamed: 0,SSS,geometry
0,AD1_2001__Liow 1 1,POINT (103.77861 1.35194)
1,AD1_2001__Liow 1 2,POINT (103.80806 1.35472)
2,AD1_2001__Liow 1 3,POINT (103.81167 1.39472)
3,AD1_2001__Liow 1 4,POINT (103.78722 1.32694)
4,AD1_2001__Liow 1 5,POINT (103.80361 1.28278)


In [6]:
# Collect the (x, y) pair of every site point, in row order
site_geometry = gdf_sites["geometry"]
coord_list = list(zip(site_geometry.x, site_geometry.y))

## Gridded Population of the World (GPW), v4

https://sedac.ciesin.columbia.edu/data/set/gpw-v4-population-density-adjusted-to-2015-unwpp-country-totals-rev11

In [7]:
# Path to the GPW v4 population density raster for the year 2000
# (30 arc-second grid, i.e. pixels of roughly 0.00833 degrees)
pop_density_path_2000 = "../../data/GPW/gpw_v4_2000_30_sec.tif"
# Rasters for the other GPW years; kept as placeholders, not used yet
# pop_density_path_2005 = ("../../data/GPW/gpw_v4_2005_30_sec.tif")
# pop_density_path_2010 = ("../../data/GPW/gpw_v4_2010_30_sec.tif")
# pop_density_path_2015 = ("../../data/GPW/gpw_v4_2015_30_sec.tif")
# pop_density_path_2020 = ("../../data/GPW/gpw_v4_2020_30_sec.tif")

In [46]:
# Inspect the raster metadata (projection, pixel size, raster size, nodata value)
pgp.get_raster_info(pop_density_path_2000)

{'file_list': ['../../data/GPW/gpw_v4_2000_30_sec.tif'],
 'projection_wkt': 'GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]',
 'geotransform': (-180.0,
  0.00833333333333387,
  0.0,
  90.00000000001157,
  0.0,
  -0.00833333333333387),
 'pixel_size': (0.00833333333333387, -0.00833333333333387),
 'raster_size': (43200, 21600),
 'n_bands': 1,
 'nodata': [-3.4028230607370965e+38],
 'overviews': [],
 'block_size': [43200, 1],
 'bounding_box': [-180.0, -90.0, 180.00000000002314, 90.00000000001157],
 'datatype': 6,
 'numpy_type': numpy.float32}

### Method 1: Rasterio sample

Sample the value of the grid cell in which each site coordinate falls. This works well for high-resolution data, but may be inaccurate once the population density raster is downsampled, because each site is then matched to a single coarse pixel that is not necessarily centred on the site.

In [14]:
start = time.time()

# Open the raster inside a context manager so that the file handle is
# released as soon as the values have been sampled
with rasterio.open("../../data/GPW/gpw_v4_2000_30_sec.tif") as raster_data:

    # Find the value used for nodata
    nodata_value = raster_data.nodatavals[0]

    # Extract the population density of the grid cell under each coordinate;
    # sample() yields one array per coordinate, flattened here to 1-D
    pop_density = np.array(list(raster_data.sample(coord_list))).flatten()

# Set density to np.nan if it seems to be a nodata pixel
if nodata_value is not None:
    pop_density[np.isclose(pop_density, nodata_value, equal_nan=True)] = np.nan

# Append new column to a copy, so that the original dataframe stays unchanged
gdf_sites_point = gdf_sites.copy()
gdf_sites_point["Pop_density_2000_1km"] = pop_density

end = time.time()
print(end - start)

2.707219123840332


In [15]:
# Show the sites together with the sampled 1km population density
gdf_sites_point

Unnamed: 0,SSS,geometry,Pop_density_2000_1km
0,AD1_2001__Liow 1 1,POINT (103.77861 1.35194),12423.108398
1,AD1_2001__Liow 1 2,POINT (103.80806 1.35472),0.000000
2,AD1_2001__Liow 1 3,POINT (103.81167 1.39472),1935.903442
3,AD1_2001__Liow 1 4,POINT (103.78722 1.32694),4328.725586
4,AD1_2001__Liow 1 5,POINT (103.80361 1.28278),10090.784180
...,...,...,...
35731,YY1_2018__Guillemot 1 66,POINT (75.48022 12.19213),590.265381
35732,YY1_2018__Guillemot 1 67,POINT (75.48012 12.21315),590.265320
35733,YY1_2018__Guillemot 1 7,POINT (75.52462 12.26442),115.935951
35734,YY1_2018__Guillemot 1 8,POINT (75.52417 12.26392),115.935951


### Method 2: Rasterstats zonal statistics

Buffer a square around each coordinate and compute the mean density of the raster cells under each square, using rasterstats.

In [70]:
start = time.time()

# Half the side of one raster pixel, in degrees: buffering a point by this
# distance with square caps gives a polygon the size of a single pixel
buffer_dist_degrees = 0.00833333333333387 / 2

# Turn every site point into a square polygon centred on the site
square_buffers = gdf_sites["geometry"].buffer(buffer_dist_degrees, cap_style="square")

# Store the squares in a copy so that the original point geometries are kept
gdf_sites_buff = gdf_sites.copy()
gdf_sites_buff["geometry"] = square_buffers

# Write the buffered polygons to disk for the zonal statistics step
gdf_sites_buff.to_file("../../data/PREDICTS/site_coord_buffered.shp")

end = time.time()
print(end - start)


  gdf_sites_buff["geometry"] = gdf_sites_buff["geometry"].buffer(


2.553041934967041


In [71]:
start = time.time()

# Mean of all raster cells touched by each buffered polygon
stats = zonal_stats(
    vectors="../../data/PREDICTS/site_coord_buffered.shp",
    raster="../../data/GPW/gpw_v4_2000_30_sec.tif",
    stats="mean",
    all_touched=True,
)

# Pull the mean out of each per-polygon result and attach it as a new
# column on a copy of the buffered dataframe
pop_density = [zone_result["mean"] for zone_result in stats]
gdf_sites_buff = gdf_sites_buff.assign(**{"Pop_density_2000_1km": pop_density})

end = time.time()
print(end - start)

70.3704149723053


In [72]:
# Show the buffered sites together with the 1km zonal mean density
gdf_sites_buff

Unnamed: 0,SSS,geometry,Pop_density_2000_1km
0,AD1_2001__Liow 1 1,"POLYGON ((103.78278 1.35611, 103.78278 1.34778...",11020.768555
1,AD1_2001__Liow 1 2,"POLYGON ((103.81222 1.35889, 103.81222 1.35056...",0.000000
2,AD1_2001__Liow 1 3,"POLYGON ((103.81583 1.39889, 103.81583 1.39056...",483.975861
3,AD1_2001__Liow 1 4,"POLYGON ((103.79139 1.33111, 103.79139 1.32278...",4338.080566
4,AD1_2001__Liow 1 5,"POLYGON ((103.80778 1.28694, 103.80778 1.27861...",7937.915039
...,...,...,...
35731,YY1_2018__Guillemot 1 66,"POLYGON ((75.48439 12.19630, 75.48439 12.18796...",590.265381
35732,YY1_2018__Guillemot 1 67,"POLYGON ((75.48429 12.21732, 75.48429 12.20898...",590.265320
35733,YY1_2018__Guillemot 1 7,"POLYGON ((75.52879 12.26859, 75.52879 12.26025...",110.101952
35734,YY1_2018__Guillemot 1 8,"POLYGON ((75.52834 12.26809, 75.52834 12.25975...",110.101952


### Downsample population data

In [73]:
downscale_factor = 10  # Go from 1km to 10km

with rasterio.open("../../data/GPW/gpw_v4_2000_30_sec.tif") as dataset:

    # Shape (bands, rows, columns) of the coarser raster
    target_shape = (
        dataset.count,
        int(dataset.height / downscale_factor),
        int(dataset.width / downscale_factor),
    )

    # Read at the coarser resolution; each output pixel is the average
    # population density of the source pixels it aggregates
    data = dataset.read(out_shape=target_shape, resampling=Resampling.average)

    # Stretch the pixel size of the affine transform by the ratio between
    # the original and the downsampled dimensions
    x_ratio = dataset.width / data.shape[-1]
    y_ratio = dataset.height / data.shape[-2]
    transform = dataset.transform * dataset.transform.scale(x_ratio, y_ratio)

    # Metadata of the downsampled raster: same as the source apart from
    # its size and transform
    out_meta = dataset.meta.copy()
    out_meta.update(
        driver="GTiff",
        height=data.shape[1],
        width=data.shape[2],
        transform=transform,
    )

    # Save the downsampled raster as a new GeoTIFF
    with rasterio.open("../../data/GPW/gpw_v4_2000_10_km.tif", "w", **out_meta) as dest:
        dest.write(data)

### Use Method 1 on downsampled data

In [74]:
start = time.time()

# Open the downsampled raster inside a context manager so that the file
# handle is released as soon as the values have been sampled
with rasterio.open("../../data/GPW/gpw_v4_2000_10_km.tif") as raster_data:

    # Find the value used for nodata
    nodata_value = raster_data.nodatavals[0]

    # Extract the population density of the grid cell under each coordinate;
    # sample() yields one array per coordinate, flattened here to 1-D
    pop_density = np.array(list(raster_data.sample(coord_list))).flatten()

# Set density to np.nan if it seems to be a nodata pixel
if nodata_value is not None:
    pop_density[np.isclose(pop_density, nodata_value, equal_nan=True)] = np.nan

# Append new column to a copy of the dataframe that holds the 1km values
gdf_sites_point = gdf_sites_point.copy()
gdf_sites_point["Pop_density_2000_10km"] = pop_density

end = time.time()
print(end - start)

0.9055449962615967


In [75]:
# Show the sites with both the 1km and the 10km sampled population density
gdf_sites_point

Unnamed: 0,SSS,geometry,Pop_density_2000_1km,Pop_density_2000_10km
0,AD1_2001__Liow 1 1,POINT (103.77861 1.35194),12423.108398,5424.973145
1,AD1_2001__Liow 1 2,POINT (103.80806 1.35472),0.000000,5424.973145
2,AD1_2001__Liow 1 3,POINT (103.81167 1.39472),1935.903442,5424.973145
3,AD1_2001__Liow 1 4,POINT (103.78722 1.32694),4328.725586,6793.770996
4,AD1_2001__Liow 1 5,POINT (103.80361 1.28278),10090.784180,6793.770996
...,...,...,...,...
35731,YY1_2018__Guillemot 1 66,POINT (75.48022 12.19213),590.265381,590.265320
35732,YY1_2018__Guillemot 1 67,POINT (75.48012 12.21315),590.265320,590.265320
35733,YY1_2018__Guillemot 1 7,POINT (75.52462 12.26442),115.935951,129.538223
35734,YY1_2018__Guillemot 1 8,POINT (75.52417 12.26392),115.935951,129.538223


### Use Method 2 on downsampled data

In [79]:
start = time.time()

# Half the side of a block of 10 x 10 original pixels, in degrees, i.e.
# half a pixel of the downsampled (roughly 10km) raster
half_pixel_degrees = 0.00833333333333387 / 2
buffer_dist_degrees = half_pixel_degrees * 10

# Turn every site point into a square polygon centred on the site
square_buffers = gdf_sites["geometry"].buffer(buffer_dist_degrees, cap_style="square")

# Store the squares in a copy so that the original point geometries are kept
gdf_sites_buff = gdf_sites.copy()
gdf_sites_buff["geometry"] = square_buffers

# Write the buffered polygons to disk for the zonal statistics step
# NOTE: this overwrites the shapefile written by the 1km buffering step
gdf_sites_buff.to_file("../../data/PREDICTS/site_coord_buffered.shp")

end = time.time()
print(end - start)


  gdf_sites_buff["geometry"] = gdf_sites_buff["geometry"].buffer(


2.557081699371338


In [80]:
start = time.time()

# Mean of all downsampled raster cells touched by each buffered polygon
stats = zonal_stats(
    vectors="../../data/PREDICTS/site_coord_buffered.shp",
    raster="../../data/GPW/gpw_v4_2000_10_km.tif",
    stats="mean",
    all_touched=True,
)

# Pull the mean out of each per-polygon result and attach it as a new
# column on a copy of the buffered dataframe
pop_density = [zone_result["mean"] for zone_result in stats]
gdf_sites_buff = gdf_sites_buff.assign(**{"Pop_density_2000_10km": pop_density})

end = time.time()
print(end - start)

33.16411209106445


In [81]:
# Show the buffered sites together with the 10km zonal mean density
gdf_sites_buff

Unnamed: 0,SSS,geometry,Pop_density_2000_10km
0,AD1_2001__Liow 1 1,"POLYGON ((103.82028 1.39361, 103.82028 1.31028...",4751.166992
1,AD1_2001__Liow 1 2,"POLYGON ((103.84972 1.39639, 103.84972 1.31306...",8288.630859
2,AD1_2001__Liow 1 3,"POLYGON ((103.85333 1.43639, 103.85333 1.35306...",5845.929688
3,AD1_2001__Liow 1 4,"POLYGON ((103.82889 1.36861, 103.82889 1.28528...",4751.166992
4,AD1_2001__Liow 1 5,"POLYGON ((103.84528 1.32444, 103.84528 1.24111...",4157.579590
...,...,...,...
35731,YY1_2018__Guillemot 1 66,"POLYGON ((75.52189 12.23380, 75.52189 12.15046...",531.914062
35732,YY1_2018__Guillemot 1 67,"POLYGON ((75.52179 12.25482, 75.52179 12.17148...",359.694519
35733,YY1_2018__Guillemot 1 7,"POLYGON ((75.56629 12.30609, 75.56629 12.22275...",359.694519
35734,YY1_2018__Guillemot 1 8,"POLYGON ((75.56584 12.30559, 75.56584 12.22225...",359.694519


### Method 3: Something more complicated with projections...

In [37]:
def project_to_utm(point):
    """Project a longitude/latitude point (EPSG:4326) into its UTM zone.

    Parameters
    ----------
    point
        Point geometry whose ``x`` is the longitude and ``y`` the latitude,
        both in degrees.

    Returns
    -------
    shapely.geometry.Point
        The same location expressed in the WGS 84 / UTM zone it falls in
        (EPSG:326xx on the northern, EPSG:327xx on the southern hemisphere).
    """

    # Get the coordinate point values
    long, lat = point.x, point.y

    # Determine the UTM zone: zones are 6 degrees wide and numbered from 1
    # starting at 180 degrees west. Clamp to 1..60, because a longitude of
    # exactly 180 would otherwise give the non-existent zone 61.
    zone_number = min(max(int((long + 180) // 6) + 1, 1), 60)

    # Pick the hemisphere-specific EPSG series
    epsg_base = 32700 if lat < 0 else 32600
    epsg_code = f"EPSG:{epsg_base + zone_number}"

    # Initialize the Transformer object (always_xy keeps the lon, lat order)
    transformer = Transformer.from_crs("EPSG:4326", epsg_code, always_xy=True)

    # Transform the latitude and longitude to the UTM coordinates
    utm_x, utm_y = transformer.transform(long, lat)

    return shapely.geometry.Point(utm_x, utm_y)

In [47]:
# Make a copy of the original dataframe
gdf_sites_buff = gdf_sites.copy()

# Project each point using the function
# NOTE: every site is projected into its own UTM zone, so the resulting
# "utm_coords" column mixes coordinates from different coordinate systems
gdf_sites_buff["utm_coords"] = gdf_sites_buff.apply(
    lambda row: project_to_utm(row["geometry"]), axis=1
)

In [None]:
# Imported locally: this cell is the only user of shapely.ops.transform
from shapely.ops import transform as shapely_transform

# Set buffer radius, in the units of the UTM coordinates
buffer_radius = 1000  # 1km

# Buffer points into circle polygons
gdf_sites_buff["buffered_geometry"] = gdf_sites_buff["utm_coords"].apply(
    lambda row: row.buffer(buffer_radius, cap_style="round")
)

# Coordinate reference system of the population raster
raster_crs = "EPSG:4326"

# One inverse transformer per UTM zone, created on first use
_utm_to_raster_crs = {}


def _circle_to_raster_crs(site_point, utm_circle):
    """Reproject a buffered UTM polygon back to ``raster_crs``.

    The UTM zone is not stored with the projected coordinates, so it is
    derived again from the longitude/latitude of the site, with the same
    zone rule that was used for the forward projection.
    """
    zone_number = min(max(int((site_point.x + 180) // 6) + 1, 1), 60)
    epsg_code = f"EPSG:{(32700 if site_point.y < 0 else 32600) + zone_number}"
    if epsg_code not in _utm_to_raster_crs:
        _utm_to_raster_crs[epsg_code] = Transformer.from_crs(
            epsg_code, raster_crs, always_xy=True
        )
    return shapely_transform(_utm_to_raster_crs[epsg_code].transform, utm_circle)


# Reproject every circle from its own UTM zone. A single to_crs() call
# cannot do this: the circles live in many different UTM zones while the
# dataframe can only carry one CRS.
buffered_lonlat = [
    _circle_to_raster_crs(site_point, utm_circle)
    for site_point, utm_circle in zip(
        gdf_sites_buff["geometry"], gdf_sites_buff["buffered_geometry"]
    )
]

# Drop intermediary columns
gdf_sites_buff = gdf_sites_buff.drop(["geometry", "utm_coords"], axis="columns").rename(
    columns={"buffered_geometry": "geometry"}
)

# Build the reprojected dataframe: site attributes plus the circles
# expressed in the CRS of the raster
gdf_sites_buff_reproj = gpd.GeoDataFrame(
    pd.DataFrame(gdf_sites_buff).drop(columns="geometry"),
    geometry=buffered_lonlat,
    crs=raster_crs,
)

## Global Roads Open Access Data Set (gROADS), v1

https://sedac.ciesin.columbia.edu/data/set/groads-global-roads-open-access-v1

One data set available for each continent.

### Metadata description

- `OBJECTID`: Object ID
- `SourceID`: Source ID
- `Picture`: Picture
- `Exs`: Existence Category. Options include 1=Definite, 2=Doubtful, 0=Unspecified
- `Notes`: Notes
- `RoadID`: Road ID
- `ONme`: Official Road Name
- `RteNme`: Route Name
- `NtlClass`: National Inventory Road Class
- `FClass`: Functional Class with options 1=Highway, 2=Primary, 3=Secondary, 4=Tertiary, 5=Local/ Urban, 6=Trail, 7=Private, 0=Unspecified
- `Crgway`: Carriageways. Options include 1=Single, 2=Dual, 0=Unspecified
- `NumLanes`: Number of lanes
- `LneWidthM`: Lane Width in meters
- `RdWidthM`: Road Width in meters
- `AxleLoadMT`: Maximum Axle Loading in MT
- `TotLoadMT`: Maximum Total Loading in MT
- `SrfTpe`: Surface Type with options 1=Paved, 2=Gravel, 3=Dirt/Sand, 4=Steel, 5=Wood, 6=Grass, 0=Unspecified
- `SrfCond`: Surface Condition with options 1=Rough (<40kph), 2=Smooth (>40kph), 3=Snow/Ice, 4=Mud, 0=Unspecified
- `SrfPrep`: Surface Preparation with options 1=Natural Compaction, 2=Traffic Compaction, 3=Engineered Compaction, 4=Uncompacted, 0=Unspecified
- `IsSeasonal`: Affected by Season. Options include 1=Yes, 2=No, 0=Unspecified
- `CurntPrac`: Current Road Practicability with options 1=Non-motorized, 2=Motorbike, 3=4WD <3.5MT, 4=Light Truck <10MT, 5=Heavy Truck <20MT, 6=Truck + Trailer >20MT, 0=Unspecified
- `GdWthrPrac`: Good Weather Road Practicability with options 1=Non-motorized, 2=Motorbike, 3=4WD <3.5MT, 4=Light Truck <10MT, 5=Heavy Truck <20MT, 6=Truck + Trailer >20MT, 0=Unspecified
- `BdWthrPrac`: Bad Weather Road Practicability. Options include 1=Non-motorized, 2=Motorbike, 3=4WD <3.5MT, 4=Light Truck <10MT, 5=Heavy Truck <20MT, 6=Truck + Trailer >20MT, 0=Unspecified
- `SpeedLimit`: Speed Limit in Km/hr
- `CurntSpeed`: Current Average Speed
- `GnralSpeed`: General Average Speed
- `IsUndrCstr`: Is under Construction / Repairs. Options include 1=Yes, 2=No, 0=Unspecified
- `CstWrkETC`: Construction Work Estimated Completion Date
- `GradDeg`: Gradient in degrees
- `Sec`: Road Security Category. Options include 1=Category A (low risk), 2=Category B (low to medium risk), 3=Category C (medium to high risk), 4=Category D (high risk), 5=Category E (critical risk), 0=Unspecified
- `HasShouldr`: Has Shoulder. Options include 1=Yes, 2=No, 0=Unspecified
- `HasSidewalk`: Has Sidewalk. Options include 1=Yes, 2=No, 0=Unspecified
- `DrivSide`: Driving Side. Options include 1=Left, 2=Right, 0=Unspecified
- `IsElevated`: Is elevated / suspended above ground/water. Options include 1=Yes, 2=No, 0=Unspecified
- `HasMedian`: Has Median. Options include 1=Yes, 2=No, 0=Unspecified
- `OpStatus`: Operational Status. Options include 1=Open, 2=Restricted, 3=Closed, 4=Abandoned/Disused, 0=Unspecified
- `Shape_Length`: Length of segment
- `Length_KM`: Length of segment in kilometers



In [9]:
# Path to the gROADS shapefile for the Americas
roads_americas = "../../data/gROADS/groads-v1-americas-shp/groads-v1-americas.shp"
# Placeholders for the shapefiles of the other regions; not filled in yet
# groads_africa =
# groads_asia =
# groads_europe =
# groads_oceania_east =
# groads_oceania_west =

In [11]:
# Read the road network of the Americas into a GeoDataFrame
gdf_roads = gpd.read_file(roads_americas)
# Preview the first rows
gdf_roads.head()

Unnamed: 0,SOURCEID,EXS,NOTES,ROADID,ONME,RTENME,NTLCLASS,FCLASS,CRGWAY,NUMLANES,LNEWIDTHM,RDWIDTHM,AXLELOADMT,TOTLOADMT,SRFTPE,SRFCOND,SRFPREP,ISSEASONAL,CURNTPRAC,GDWTHRPRAC,BDWTHRPRAC,SPEEDLIMIT,CURNTSPEED,GNRALSPEED,ISUNDRCSTR,CSTWRKETC,GRADDEG,SEC,HASSHOULDR,HASSIDEWLK,DRIVSIDE,ISELEVATED,HASMEDIAN,OPSTATUS,LENGTH_KM,Shape_Leng,geometry
0,s034_0001,0.0,,ITOS156229,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.086124,0.216327,"LINESTRING (-46.64422 -22.52663, -46.63456 -22..."
1,s034_0001,0.0,,ITOS156230,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.525002,0.338829,"LINESTRING (-49.61825 -23.06787, -49.54568 -23..."
2,s034_0001,0.0,,ITOS156231,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.546781,0.268044,"LINESTRING (-49.23048 -24.08635, -49.20698 -24..."
3,s034_0001,0.0,,ITOS156232,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.047282,0.214377,"LINESTRING (-52.35325 -16.15168, -52.37517 -16..."
4,s034_0001,0.0,,ITOS156233,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.087837,0.065068,"LINESTRING (-52.50087 -17.59160, -52.52445 -17..."


## WorldClim elevation data

https://worldclim.org/data/worldclim21.html#google_vignette

## WorldClim bioclimatic data

https://worldclim.org/data/bioclim.html