# Bayesian biodiversity: Experimental model pipeline

In [97]:
import polars as pl
import numpy as np
import pandas as pd
import datetime as dt

In [36]:
# Enable automatic black formatting of code cells
import jupyter_black

jupyter_black.load()

# Polars display settings: render up to 100 rows and 50 columns per table
pl.Config(tbl_rows=100, tbl_cols=50)

<polars.config.Config at 0x29653fbc0>

## Basic data processing

In [109]:
# Load merged PREDICTS data
# NOTE: the path is relative, so it resolves against the kernel's working
# directory.
df_predicts = pl.read_csv("../../data/PREDICTS/merged_data.csv")

In [4]:
# Sanity check: (rows, columns) of the raw PREDICTS table before any column pruning
df_predicts.shape

(4318808, 68)

In [5]:
# Load population density data
# One row per SSBS with one column per resolution/year combination
# (named Pop_density_<resolution>_<year>).
df_pop_density = pl.read_parquet("../../data/GPW/output/pop_density.parquet")

In [7]:
# Sanity check: (rows, columns) of the population density table
df_pop_density.shape

(35736, 16)

In [6]:
# Load road density data
# One row per SSBS with a Road_density_<resolution> column per resolution.
df_road_density = pl.read_parquet("../../data/gROADS/output/road_density.parquet")

In [8]:
# Sanity check: (rows, columns) of the road density table
df_road_density.shape

(35736, 4)

### Clean up PREDICTS data

Remove columns that are not needed for analysis or modelling.

In [110]:
# Columns retained for analysis and modelling; every other PREDICTS column is
# dropped by the select() below.
predicts_cols = [
    # Identifiers (SSBS is the key used for the joins further down;
    # presumably study / block / site levels -- confirm against PREDICTS docs)
    "SS",
    "SSB",
    "SSBS",
    # Location and biogeographic context
    "Longitude",
    "Latitude",
    "Country",
    "UN_region",
    "UN_subregion",
    "Realm",
    "Biome",
    "Ecoregion",
    "Wilderness_area",
    "Hotspot",
    # Sampling design
    "Sample_midpoint",
    "Sampling_method",
    "Sampling_effort",
    "Rescaled_sampling_effort",
    "Max_linear_extent_metres",
    # Taxonomy
    "Taxon_name_entered",
    "COL_ID",
    "Kingdom",
    "Phylum",
    "Class",
    "Order",
    "Family",
    "Genus",
    "Species",
    # Response measurements
    "Diversity_metric_type",
    "Measurement",
    "Effort_corrected_measurement",
    # Land use and habitat
    "Predominant_land_use",
    "Use_intensity",
    "Habitat_patch_area_square_metres",
    "Km_to_nearest_edge_of_habitat",
    "Years_since_fragmentation_or_conversion",
]

# NOTE: df_predicts is rebound to the reduced frame; the full-width table is
# only recoverable by re-running the load cell.
df_predicts = df_predicts.select(predicts_cols)

In [111]:
# Sanity check: row count should be unchanged, column count should equal len(predicts_cols)
df_predicts.shape

(4318808, 35)

### Join PREDICTS and road density data

In [15]:
# Preview the road density table (join key plus one density column per resolution)
df_road_density.head()

SSBS,Road_density_1km,Road_density_10km,Road_density_50km
str,f64,f64,f64
"""AD1_2005__Blan…",2081.750094,190364.497423,2084100.0
"""AD1_2005__Blan…",1119.81094,187993.994423,2072400.0
"""AD1_2005__Blan…",0.0,98624.640142,1935900.0
"""AD1_2005__Blan…",1186.374931,86832.033723,1826100.0
"""AD1_2005__Blan…",0.0,68115.435147,1757900.0


In [112]:
# Attach the road densities to every PREDICTS record via the SSBS key.
# how="left" keeps all PREDICTS rows even when no road density is available;
# validate="m:1" makes polars raise if SSBS is not unique in df_road_density,
# which guards against silently duplicating PREDICTS rows.
df_predicts_roads = df_predicts.join(
    df_road_density, on="SSBS", how="left", validate="m:1"
)

In [113]:
# Sanity check: same number of rows as df_predicts, plus the road density columns
df_predicts_roads.shape

(4318808, 38)

In [114]:
# Preview the joined frame; the Road_density_* columns are appended on the right
df_predicts_roads.head()

SS,SSB,SSBS,Longitude,Latitude,Country,UN_region,UN_subregion,Realm,Biome,Ecoregion,Wilderness_area,Hotspot,Sample_midpoint,Sampling_method,Sampling_effort,Rescaled_sampling_effort,Max_linear_extent_metres,Taxon_name_entered,COL_ID,Kingdom,Phylum,Class,Order,Family,Genus,Species,Diversity_metric_type,Measurement,Effort_corrected_measurement,Predominant_land_use,Use_intensity,Habitat_patch_area_square_metres,Km_to_nearest_edge_of_habitat,Years_since_fragmentation_or_conversion,Road_density_1km,Road_density_10km,Road_density_50km
str,str,str,f64,f64,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,f64,str,str,str,str,str,str,str,str,f64,f64,str,str,str,str,f64,f64,f64,f64
"""AD1_2008__Bill…","""AD1_2008__Bill…","""AD1_2008__Bill…",-1.590365,48.472153,"""France""","""Europe""","""Western Europe…","""Palearctic""","""Temperate Broa…","""Atlantic Mixed…",,,"""2002-07-02""","""flight trap""",5.0,1.0,1414.214,"""Lasioglossum m…",6967008.0,"""Animalia""","""Arthropoda""","""Insecta""","""Hymenoptera""","""Halictidae""","""Lasioglossum""","""morio""","""Abundance""",0.0,0.0,"""Cropland""","""Minimal use""",,,13.5,419.89974,437683.782299,10687000.0
"""AD1_2008__Bill…","""AD1_2008__Bill…","""AD1_2008__Bill…",-1.590365,48.472153,"""France""","""Europe""","""Western Europe…","""Palearctic""","""Temperate Broa…","""Atlantic Mixed…",,,"""2002-07-02""","""flight trap""",5.0,1.0,1414.214,"""Lasioglossum p…",6967187.0,"""Animalia""","""Arthropoda""","""Insecta""","""Hymenoptera""","""Halictidae""","""Lasioglossum""","""pauxillum""","""Abundance""",0.0,0.0,"""Cropland""","""Minimal use""",,,13.5,419.89974,437683.782299,10687000.0
"""AD1_2008__Bill…","""AD1_2008__Bill…","""AD1_2008__Bill…",-1.610663,48.540593,"""France""","""Europe""","""Western Europe…","""Palearctic""","""Temperate Broa…","""Atlantic Mixed…",,,"""2002-07-02""","""flight trap""",5.0,1.0,1414.214,"""Andrena helvol…",6960605.0,"""Animalia""","""Arthropoda""","""Insecta""","""Hymenoptera""","""Andrenidae""","""Andrena""","""helvola""","""Abundance""",0.0,0.0,"""Cropland""","""Light use""",,,63.5,810.058171,436846.596911,10873000.0
"""AD1_2008__Bill…","""AD1_2008__Bill…","""AD1_2008__Bill…",-1.610663,48.540593,"""France""","""Europe""","""Western Europe…","""Palearctic""","""Temperate Broa…","""Atlantic Mixed…",,,"""2002-07-02""","""flight trap""",5.0,1.0,1414.214,"""Andrena ovatul…",6960904.0,"""Animalia""","""Arthropoda""","""Insecta""","""Hymenoptera""","""Andrenidae""","""Andrena""","""ovatula""","""Abundance""",0.0,0.0,"""Cropland""","""Light use""",,,63.5,810.058171,436846.596911,10873000.0
"""AD1_2008__Bill…","""AD1_2008__Bill…","""AD1_2008__Bill…",-1.610663,48.540593,"""France""","""Europe""","""Western Europe…","""Palearctic""","""Temperate Broa…","""Atlantic Mixed…",,,"""2002-07-02""","""flight trap""",5.0,1.0,1414.214,"""Andrena spinig…",13049592.0,"""Animalia""","""Arthropoda""","""Insecta""","""Hymenoptera""","""Andrenidae""","""Andrena""",,"""Abundance""",0.0,0.0,"""Cropland""","""Light use""",,,63.5,810.058171,436846.596911,10873000.0


### Interpolation of population density data to join with PREDICTS

In [118]:
# Preview the wide population density table (one column per resolution and year)
df_pop_density.head()

SSBS,Pop_density_1km_2000,Pop_density_1km_2005,Pop_density_1km_2010,Pop_density_1km_2015,Pop_density_1km_2020,Pop_density_10km_2000,Pop_density_10km_2005,Pop_density_10km_2010,Pop_density_10km_2015,Pop_density_10km_2020,Pop_density_50km_2000,Pop_density_50km_2005,Pop_density_50km_2010,Pop_density_50km_2015,Pop_density_50km_2020
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""AD1_2001__Liow…",8665.053819,10686.046875,12856.830729,14912.848958,16510.614583,6093.886228,6958.095808,7791.201098,8467.353293,8861.665669,933.654075,1093.091219,1268.813392,1454.161213,1641.065066
"""AD1_2001__Liow…",1131.090088,1245.123128,1336.200765,1381.482259,1362.475993,7262.850962,8273.240919,9271.036325,10134.848291,10751.494658,940.373195,1101.212981,1278.487799,1465.453558,1653.969848
"""AD1_2001__Liow…",4979.729167,5457.96875,5831.039931,6000.927951,5890.359375,6354.087041,7373.591876,8452.087041,9503.366538,10438.625725,898.962319,1050.302434,1216.123064,1389.497541,1562.316572
"""AD1_2001__Liow…",4332.883681,4888.942708,5376.256944,5694.303819,5751.630208,6440.715232,7292.890728,8099.86755,8738.147903,9092.586093,951.244901,1114.757922,1295.39474,1486.577678,1680.202951
"""AD1_2001__Liow…",8465.158203,9318.025391,9998.738932,10336.432943,10192.90625,6748.165722,7515.062323,8172.850567,8578.686969,8606.26983,979.929484,1149.967869,1338.408612,1538.764551,1742.833948


In [133]:
# Split the dataframe to one for each resolution
def _split_pop_density(df, resolution):
    """Return the SSBS key plus the yearly columns of a single resolution.

    Parameters
    ----------
    df : pl.DataFrame
        Wide population density table with an "SSBS" column and value columns
        named ``Pop_density_<resolution>_<YYYY>``.
    resolution : str
        Resolution label as it appears in the column names, e.g. "1km".

    Returns
    -------
    pl.DataFrame
        "SSBS" followed by the matching columns, renamed to their four-digit
        year (``Pop_density_1km_2000`` -> ``2000``).
    """
    # The pattern is anchored on the full column name so that only
    # "<resolution>_<year>" columns match; polars treats a pl.col() string
    # starting with ^ and ending with $ as a regular expression.
    year_columns = pl.col(rf"^Pop_density_{resolution}_\d{{4}}$")
    return df.select([pl.col("SSBS"), year_columns]).rename(
        lambda col: col if col == "SSBS" else col[-4:]
    )


df_pop_1km = _split_pop_density(df_pop_density, "1km")
df_pop_10km = _split_pop_density(df_pop_density, "10km")
df_pop_50km = _split_pop_density(df_pop_density, "50km")

In [120]:
# Preview the 1 km table: SSBS plus one column per census year
df_pop_1km.head()

SSBS,2000,2005,2010,2015,2020
str,f64,f64,f64,f64,f64
"""AD1_2001__Liow…",8665.053819,10686.046875,12856.830729,14912.848958,16510.614583
"""AD1_2001__Liow…",1131.090088,1245.123128,1336.200765,1381.482259,1362.475993
"""AD1_2001__Liow…",4979.729167,5457.96875,5831.039931,6000.927951,5890.359375
"""AD1_2001__Liow…",4332.883681,4888.942708,5376.256944,5694.303819,5751.630208
"""AD1_2001__Liow…",8465.158203,9318.025391,9998.738932,10336.432943,10192.90625


In [134]:
def population_density_interpolation(df, resolution):
    """Expand 5-yearly population densities into annual values for 1984-2020.

    Within each interval the density is assumed to change at a constant
    exponential rate, ``r = ln(end / start) / (end_year - start_year)``, so the
    value ``k`` years after the interval start is ``start * exp(r * k)``.
    Years before 2000 are extrapolated backwards with the 2000-2005 rate.

    Parameters
    ----------
    df : pl.DataFrame
        Wide frame with an "SSBS" column and the value columns "2000", "2005",
        "2010", "2015" and "2020".
    resolution : str
        Label used to name the value column of the result
        (``Pop_density_<resolution>``).

    Returns
    -------
    pl.DataFrame
        Long frame with the columns "SSBS", "Year" (Int32) and
        ``Pop_density_<resolution>``, sorted by SSBS and Year.

    Notes
    -----
    The observed anchor years are never recomputed. Where a growth rate is not
    finite (for example when a density is zero or missing) only the
    interpolated years of that interval are affected; they may come out as
    NaN/null.
    """
    # Consecutive (start, end) anchor years. 1984 is not observed; it is
    # synthesised below so that the first interval can be filled like the rest.
    intervals = [(1984, 2000), (2000, 2005), (2005, 2010), (2010, 2015), (2015, 2020)]

    def _calculate_growth_rate(frame, start_year, end_year):
        """Continuous annual growth rate between two year columns of `frame`."""
        return np.log(frame[str(end_year)] / frame[str(start_year)]) / (
            end_year - start_year
        )

    # Extrapolate back to 1984 using the growth rate from 2000 to 2005
    r_2000_2005 = _calculate_growth_rate(df, 2000, 2005)
    df = df.with_columns(
        (df["2000"] * np.exp(r_2000_2005 * (1984 - 2000))).alias("1984")
    )

    # Fill in the years strictly between the anchors of every interval.
    for start_year, end_year in intervals:
        r = _calculate_growth_rate(df, start_year, end_year)
        # Column names are strings, so the membership test must use str(year);
        # comparing the int would always report the column as missing and
        # overwrite observed anchor values with recomputed ones.
        interpolated = [
            (df[str(start_year)] * np.exp(r * (year - start_year))).alias(str(year))
            for year in range(start_year, end_year + 1)
            if str(year) not in df.columns
        ]
        if interpolated:
            df = df.with_columns(interpolated)

    # Reorder the columns to have them in chronological order
    year_columns = sorted((col for col in df.columns if col != "SSBS"), key=int)
    df = df.select(["SSBS"] + year_columns)

    # Melt dataframe to go from wide to long format.
    # NOTE(review): newer polars releases rename melt() to unpivot(); melt() is
    # kept here to match the API already used by this notebook.
    df = df.melt(
        id_vars=["SSBS"],
        value_vars=year_columns,
        variable_name="Year",
        value_name=f"Pop_density_{resolution}",
    )

    # The year labels are strings after melting; store them as Int32 so they
    # can be matched against an integer sample year.
    df = df.with_columns(pl.col("Year").cast(pl.Int32)).sort(["SSBS", "Year"])

    return df

In [136]:
# Turn each wide, 5-yearly table into a long table with one row per SSBS and year.
# NOTE: the names are rebound from the wide input to the long output, so this
# cell cannot be re-run on its own; re-run the cell that splits df_pop_density
# by resolution first.
df_pop_1km = population_density_interpolation(df_pop_1km, resolution="1km")
df_pop_10km = population_density_interpolation(df_pop_10km, resolution="10km")
df_pop_50km = population_density_interpolation(df_pop_50km, resolution="50km")
df_pop_1km.head()

SSBS,Year,Pop_density_1km
str,i32,f64
"""AD1_2001__Liow…",1984,4430.213653
"""AD1_2001__Liow…",1985,4619.913384
"""AD1_2001__Liow…",1986,4817.73597
"""AD1_2001__Liow…",1987,5024.02923
"""AD1_2001__Liow…",1988,5239.155872


In [115]:
# Parse the sampling midpoint strings ("YYYY-MM-DD") into datetimes, then
# derive the calendar year of each sample as a separate "Sample_year" column.
df_predicts_roads = df_predicts_roads.with_columns(
    pl.col("Sample_midpoint").str.to_datetime("%Y-%m-%d")
).with_columns(pl.col("Sample_midpoint").dt.year().alias("Sample_year"))

In [137]:
# Join the population densities of the year matching the sample year.
# join() returns a new frame, so df_predicts_roads itself is left untouched.
df_all = df_predicts_roads
for df_pop in [df_pop_1km, df_pop_10km, df_pop_50km]:
    df_all = df_all.join(
        df_pop,
        how="left",
        left_on=["SSBS", "Sample_year"],
        right_on=["SSBS", "Year"],
    )

# A left join only adds rows when a (SSBS, Year) key is duplicated on the right;
# fail loudly here rather than silently duplicating PREDICTS records.
assert (
    df_all.height == df_predicts_roads.height
), "Population density join changed the number of PREDICTS rows"

In [138]:
# Preview the fully joined frame; Sample_year and the Pop_density_* columns are on the right
df_all.head()

SS,SSB,SSBS,Longitude,Latitude,Country,UN_region,UN_subregion,Realm,Biome,Ecoregion,Wilderness_area,Hotspot,Sample_midpoint,Sampling_method,Sampling_effort,Rescaled_sampling_effort,Max_linear_extent_metres,Taxon_name_entered,COL_ID,Kingdom,Phylum,Class,Order,Family,Genus,Species,Diversity_metric_type,Measurement,Effort_corrected_measurement,Predominant_land_use,Use_intensity,Habitat_patch_area_square_metres,Km_to_nearest_edge_of_habitat,Years_since_fragmentation_or_conversion,Road_density_1km,Road_density_10km,Road_density_50km,Sample_year,Pop_density_1km,Pop_density_10km,Pop_density_50km
str,str,str,f64,f64,str,str,str,str,str,str,str,str,datetime[μs],str,f64,f64,f64,str,f64,str,str,str,str,str,str,str,str,f64,f64,str,str,str,str,f64,f64,f64,f64,i32,f64,f64,f64
"""AD1_2008__Bill…","""AD1_2008__Bill…","""AD1_2008__Bill…",-1.590365,48.472153,"""France""","""Europe""","""Western Europe…","""Palearctic""","""Temperate Broa…","""Atlantic Mixed…",,,2002-07-02 00:00:00,"""flight trap""",5.0,1.0,1414.214,"""Lasioglossum m…",6967008.0,"""Animalia""","""Arthropoda""","""Insecta""","""Hymenoptera""","""Halictidae""","""Lasioglossum""","""morio""","""Abundance""",0.0,0.0,"""Cropland""","""Minimal use""",,,13.5,419.89974,437683.782299,10687000.0,2002,34.898771,47.860182,125.423294
"""AD1_2008__Bill…","""AD1_2008__Bill…","""AD1_2008__Bill…",-1.590365,48.472153,"""France""","""Europe""","""Western Europe…","""Palearctic""","""Temperate Broa…","""Atlantic Mixed…",,,2002-07-02 00:00:00,"""flight trap""",5.0,1.0,1414.214,"""Lasioglossum p…",6967187.0,"""Animalia""","""Arthropoda""","""Insecta""","""Hymenoptera""","""Halictidae""","""Lasioglossum""","""pauxillum""","""Abundance""",0.0,0.0,"""Cropland""","""Minimal use""",,,13.5,419.89974,437683.782299,10687000.0,2002,34.898771,47.860182,125.423294
"""AD1_2008__Bill…","""AD1_2008__Bill…","""AD1_2008__Bill…",-1.610663,48.540593,"""France""","""Europe""","""Western Europe…","""Palearctic""","""Temperate Broa…","""Atlantic Mixed…",,,2002-07-02 00:00:00,"""flight trap""",5.0,1.0,1414.214,"""Andrena helvol…",6960605.0,"""Animalia""","""Arthropoda""","""Insecta""","""Hymenoptera""","""Andrenidae""","""Andrena""","""helvola""","""Abundance""",0.0,0.0,"""Cropland""","""Light use""",,,63.5,810.058171,436846.596911,10873000.0,2002,52.808231,54.903219,120.921166
"""AD1_2008__Bill…","""AD1_2008__Bill…","""AD1_2008__Bill…",-1.610663,48.540593,"""France""","""Europe""","""Western Europe…","""Palearctic""","""Temperate Broa…","""Atlantic Mixed…",,,2002-07-02 00:00:00,"""flight trap""",5.0,1.0,1414.214,"""Andrena ovatul…",6960904.0,"""Animalia""","""Arthropoda""","""Insecta""","""Hymenoptera""","""Andrenidae""","""Andrena""","""ovatula""","""Abundance""",0.0,0.0,"""Cropland""","""Light use""",,,63.5,810.058171,436846.596911,10873000.0,2002,52.808231,54.903219,120.921166
"""AD1_2008__Bill…","""AD1_2008__Bill…","""AD1_2008__Bill…",-1.610663,48.540593,"""France""","""Europe""","""Western Europe…","""Palearctic""","""Temperate Broa…","""Atlantic Mixed…",,,2002-07-02 00:00:00,"""flight trap""",5.0,1.0,1414.214,"""Andrena spinig…",13049592.0,"""Animalia""","""Arthropoda""","""Insecta""","""Hymenoptera""","""Andrenidae""","""Andrena""",,"""Abundance""",0.0,0.0,"""Cropland""","""Light use""",,,63.5,810.058171,436846.596911,10873000.0,2002,52.808231,54.903219,120.921166


In [139]:
# Sanity check: the row count must still match df_predicts_roads after the left joins
df_all.shape

(4318808, 42)

In [13]:
# Filter out observations that are not abundances.
# The filter is applied to the fully joined frame (df_all) so that the road and
# population density covariates are carried into df_abundance; filtering
# df_predicts would silently drop them.
df_abundance = df_all.filter(pl.col("Diversity_metric_type") == "Abundance")