# Bayesian biodiversity: PREDICTS data processing

In [1]:
import pandas as pd
import geopandas as geopd
from shapely.geometry import Point

In [2]:
# Load black for formatting
import jupyter_black
jupyter_black.load()

# Adjust display settings for pandas
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

## Load and merge the two releases of the database

### 2016 release

https://data.nhm.ac.uk/dataset/the-2016-release-of-the-predicts-database-v1-1

**Summary**
- 3,278,056 measurements
- 26,194 sampling locations
- 94 countries
- 47,089 species
- Based on 480 studies

In [3]:
# Load the original predicts data
df_predicts_orig = pd.read_csv("../../data/PREDICTS/PREDICTS_2016/data.csv")

  df_predicts_orig = pd.read_csv("../../data/PREDICTS/PREDICTS_2016/data.csv")


In [6]:
df_predicts_orig.shape

(3278056, 68)

### 2022 release of additional data

https://data.nhm.ac.uk/dataset/release-of-data-added-to-the-predicts-database-november-2022

**Summary**
- 1,040,752 measurements
- 9,544 sampling locations
- 46 countries
- 10,635 species
- Based on 115 studies

In [4]:
# Load the new 2022 predicts data
df_predicts_new = pd.read_csv("../../data/PREDICTS/PREDICTS_2022/data.csv")

  df_predicts_new = pd.read_csv("../../data/PREDICTS/PREDICTS_2022/data.csv")


In [5]:
df_predicts_new.shape

(1040752, 72)

### Merge 2016 and 2022 data

In [7]:
# Find out if there are any columns that are not overlapping
unique_2016 = list(set(df_predicts_orig.columns) - set(df_predicts_new.columns))
unique_2022 = list(set(df_predicts_new.columns) - set(df_predicts_orig.columns))
print(unique_2016)
print(unique_2022)

[]
['Max_linear_extent', 'Source_for_predominant_habitat', 'Predominant_habitat', 'Eco_region_distance_metres']


In [8]:
# Drop non-overlapping columns from 2022 dataframe
df_predicts_new = df_predicts_new.drop(
    [
        "Max_linear_extent",
        "Eco_region_distance_metres",
        "Predominant_habitat",
        "Source_for_predominant_habitat",
    ],
    axis="columns",
)

# Make sure we have the same column order
df_predicts_new = df_predicts_new[df_predicts_orig.columns]

# Append new data to old with matching column order
df_predicts = pd.concat([df_predicts_orig, df_predicts_new], ignore_index=True)

In [9]:
# Reorganize the columns in the df to a logical structure
# See the metadata description in data exploration notebook for details
col_order = [
    "_id",
    "Source_ID",
    "Reference",
    "Study_number",
    "Study_name",
    "Block",
    "Site_number",
    "Site_name",
    "SS",
    "SSS",
    "SSB",
    "SSBS",
    "Longitude",
    "Latitude",
    "Coordinates_method",
    "Country",
    "Country_distance_metres",
    "UN_region",
    "UN_subregion",
    "Realm",
    "Biome",
    "Ecoregion",
    "Ecoregion_distance_metres",
    "Wilderness_area",
    "Hotspot",
    "Study_common_taxon",
    "Rank_of_study_common_taxon",
    "Sample_start_earliest",
    "Sample_end_latest",
    "Sample_midpoint",
    "Sample_date_resolution",
    "Sampling_method",
    "Sampling_effort",
    "Rescaled_sampling_effort",
    "Sampling_effort_unit",
    "Max_linear_extent_metres",
    "Transect_details",
    "Taxon",
    "Taxon_number",
    "Taxon_name_entered",
    "Parsed_name",
    "Best_guess_binomial",
    "COL_ID",
    "Kingdom",
    "Phylum",
    "Class",
    "Order",
    "Family",
    "Genus",
    "Species",
    "Higher_taxon",
    "Indication",
    "Name_status",
    "Rank",
    "Diversity_metric_type",
    "Diversity_metric",
    "Diversity_metric_is_effort_sensitive",
    "Diversity_metric_is_suitable_for_Chao",
    "Diversity_metric_unit",
    "Measurement",
    "Effort_corrected_measurement",
    "Predominant_land_use",
    "Source_for_predominant_land_use",
    "Use_intensity",
    "Habitat_as_described",
    "Habitat_patch_area_square_metres",
    "Km_to_nearest_edge_of_habitat",
    "Years_since_fragmentation_or_conversion",
]

df_predicts = df_predicts[col_order]
df_predicts.head()

Unnamed: 0,_id,Source_ID,Reference,Study_number,Study_name,Block,Site_number,Site_name,SS,SSS,SSB,SSBS,Longitude,Latitude,Coordinates_method,Country,Country_distance_metres,UN_region,UN_subregion,Realm,Biome,Ecoregion,Ecoregion_distance_metres,Wilderness_area,Hotspot,Study_common_taxon,Rank_of_study_common_taxon,Sample_start_earliest,Sample_end_latest,Sample_midpoint,Sample_date_resolution,Sampling_method,Sampling_effort,Rescaled_sampling_effort,Sampling_effort_unit,Max_linear_extent_metres,Transect_details,Taxon,Taxon_number,Taxon_name_entered,Parsed_name,Best_guess_binomial,COL_ID,Kingdom,Phylum,Class,Order,Family,Genus,Species,Higher_taxon,Indication,Name_status,Rank,Diversity_metric_type,Diversity_metric,Diversity_metric_is_effort_sensitive,Diversity_metric_is_suitable_for_Chao,Diversity_metric_unit,Measurement,Effort_corrected_measurement,Predominant_land_use,Source_for_predominant_land_use,Use_intensity,Habitat_as_described,Habitat_patch_area_square_metres,Km_to_nearest_edge_of_habitat,Years_since_fragmentation_or_conversion
0,26004,AD1_2008__Billeter,Billeter et al. 2008,8,Greenveins2001_France02,F2,32,F2.P,AD1_2008__Billeter 8,AD1_2008__Billeter 8 32,AD1_2008__Billeter 8 F2,AD1_2008__Billeter 8 F2 32,-1.590365,48.472153,Direct from publication / author,France,0.0,Europe,Western Europe,Palearctic,Temperate Broadleaf & Mixed Forests,Atlantic Mixed Forests,0.0,,,Hymenoptera,Order,2002-01-01,2002-12-31,2002-07-02,year,flight trap,5.0,1.0,week,1414.214,Ecotone between a Green-veins habitat and an a...,Lasioglossum morio,49,Lasioglossum morio,Lasioglossum morio,Lasioglossum morio,6967008.0,Animalia,Arthropoda,Insecta,Hymenoptera,Halictidae,Lasioglossum,morio,Hymenoptera,Hymenoptera: Apidae sensu lato,accepted name,Species,Abundance,abundance,True,True,individuals,0.0,0.0,Cropland,Direct from publication / author,Minimal use,,,,13.5
1,26006,AD1_2008__Billeter,Billeter et al. 2008,8,Greenveins2001_France02,F2,32,F2.P,AD1_2008__Billeter 8,AD1_2008__Billeter 8 32,AD1_2008__Billeter 8 F2,AD1_2008__Billeter 8 F2 32,-1.590365,48.472153,Direct from publication / author,France,0.0,Europe,Western Europe,Palearctic,Temperate Broadleaf & Mixed Forests,Atlantic Mixed Forests,0.0,,,Hymenoptera,Order,2002-01-01,2002-12-31,2002-07-02,year,flight trap,5.0,1.0,week,1414.214,Ecotone between a Green-veins habitat and an a...,Lasioglossum pauxillum,51,Lasioglossum pauxillum,Lasioglossum pauxillum,Lasioglossum pauxillum,6967187.0,Animalia,Arthropoda,Insecta,Hymenoptera,Halictidae,Lasioglossum,pauxillum,Hymenoptera,Hymenoptera: Apidae sensu lato,accepted name,Species,Abundance,abundance,True,True,individuals,0.0,0.0,Cropland,Direct from publication / author,Minimal use,,,,13.5
2,26024,AD1_2008__Billeter,Billeter et al. 2008,8,Greenveins2001_France02,F3,33,F3.A,AD1_2008__Billeter 8,AD1_2008__Billeter 8 33,AD1_2008__Billeter 8 F3,AD1_2008__Billeter 8 F3 33,-1.610663,48.540593,Direct from publication / author,France,0.0,Europe,Western Europe,Palearctic,Temperate Broadleaf & Mixed Forests,Atlantic Mixed Forests,0.0,,,Hymenoptera,Order,2002-01-01,2002-12-31,2002-07-02,year,flight trap,5.0,1.0,week,1414.214,Ecotone between a Green-veins habitat and an a...,Andrena helvola,11,Andrena helvola,Andrena helvola,Andrena helvola,6960605.0,Animalia,Arthropoda,Insecta,Hymenoptera,Andrenidae,Andrena,helvola,Hymenoptera,Hymenoptera: Apidae sensu lato,accepted name,Species,Abundance,abundance,True,True,individuals,0.0,0.0,Cropland,Direct from publication / author,Light use,,,,63.5
3,26031,AD1_2008__Billeter,Billeter et al. 2008,8,Greenveins2001_France02,F3,33,F3.A,AD1_2008__Billeter 8,AD1_2008__Billeter 8 33,AD1_2008__Billeter 8 F3,AD1_2008__Billeter 8 F3 33,-1.610663,48.540593,Direct from publication / author,France,0.0,Europe,Western Europe,Palearctic,Temperate Broadleaf & Mixed Forests,Atlantic Mixed Forests,0.0,,,Hymenoptera,Order,2002-01-01,2002-12-31,2002-07-02,year,flight trap,5.0,1.0,week,1414.214,Ecotone between a Green-veins habitat and an a...,Andrena ovatula,18,Andrena ovatula,Andrena ovatula,Andrena ovatula,6960904.0,Animalia,Arthropoda,Insecta,Hymenoptera,Andrenidae,Andrena,ovatula,Hymenoptera,Hymenoptera: Apidae sensu lato,accepted name,Species,Abundance,abundance,True,True,individuals,0.0,0.0,Cropland,Direct from publication / author,Light use,,,,63.5
4,26032,AD1_2008__Billeter,Billeter et al. 2008,8,Greenveins2001_France02,F3,33,F3.A,AD1_2008__Billeter 8,AD1_2008__Billeter 8 33,AD1_2008__Billeter 8 F3,AD1_2008__Billeter 8 F3 33,-1.610663,48.540593,Direct from publication / author,France,0.0,Europe,Western Europe,Palearctic,Temperate Broadleaf & Mixed Forests,Atlantic Mixed Forests,0.0,,,Hymenoptera,Order,2002-01-01,2002-12-31,2002-07-02,year,flight trap,5.0,1.0,week,1414.214,Ecotone between a Green-veins habitat and an a...,Andrena,19,Andrena spinigera,Andrena spinigera,Andrena spinigera,13049592.0,Animalia,Arthropoda,Insecta,Hymenoptera,Andrenidae,Andrena,,Hymenoptera,Hymenoptera: Apidae sensu lato,accepted name,Genus,Abundance,abundance,True,True,individuals,0.0,0.0,Cropland,Direct from publication / author,Light use,,,,63.5


In [10]:
# Save the merged dataframe as a csv file
df_predicts.to_csv("../../data/PREDICTS/merged_data.csv")

## Extract coordinate data to use with raster data 

In [25]:
# Get the coordinates for each site
df_site_long_lat = pd.DataFrame(
    df_predicts.groupby("SSS")[["Longitude", "Latitude"]].min()
).reset_index()

# Generate coordinate tuples from dataframe
coordinates = zip(
    df_site_long_lat["Longitude"].tolist(), df_site_long_lat["Latitude"].tolist()
)

# Create point geometries
geometry = [Point(x, y) for x, y in coordinates]

# Create a geodataframe
gdf_sites = geopd.GeoDataFrame({"SSS": df_site_long_lat["SSS"], "geometry": geometry})
gdf_sites.crs = "EPSG:4326"

# Save as shapefile
gdf_sites.to_file("../../data/PREDICTS/site_coord.shp")