# Biodiversity notebook


In [None]:
!pip install ecopy
!pip install plotly
!pip install scikit-fda
!pip install scipy
!pip install gdown
!pip install h3
!pip install geojson

In [None]:
# Download two files from Google Drive (addressed by their file ids) with the
# `gdown` command-line tool installed above.
!gdown https://drive.google.com/uc?id=1QoLyZGY2DNMO_gsvg3OULfddJSLtp3Qn
!gdown https://drive.google.com/uc?id=1oUoQD_jdq9n8-nIrmhEcP7yu0luQ_hnb

## Trees sample

In [None]:
import requests
import json
import pandas as pd
import numpy as np
from scipy.special import comb
import scipy.misc
scipy.misc.comb = comb
from ecopy import diversity

In [None]:
import os

# API endpoint
URL = 'https://api.laji.fi/v0'

# Credentials must never be committed with the notebook, so they are read
# from the environment. Set LAJI_EMAIL and LAJI_ACCESS_TOKEN before starting
# the kernel. An empty string means "not configured".
# Your email (used to request a token)
EMAIL = os.environ.get('LAJI_EMAIL', '')
# Token ID (sent to you by email after the registration request)
TOKEN = os.environ.get('LAJI_ACCESS_TOKEN', '')

In [None]:
# Requesting a token: POST your email address to the /api-users endpoint.
# The token itself arrives in an email sent by the service.
registration_form = {'email': EMAIL}
response = requests.post(url=URL + '/api-users', data=registration_form)

# Decode the JSON reply to check whether the address has been registered
raw_reply = response.content.decode('utf-8')
responseContent = json.loads(raw_reply)
responseContent

In [None]:
# Initial request (page 1); the decoded payload is kept in `results`
first_page_query = {
    'access_token': TOKEN,
    'page': 1,
    'pageSize': 10000,  # Limit 10,000 per page
    # {Vascular plants: {Plant life forms: {Trees; Evergreen trees} } }
    'informalTaxonGroupId': 'MVL.1083',
    # Default sorting
    'orderBy': ['gathering.eventDate.begin DESC', 'document.loadDate DESC', 'unit.taxonVerbatim ASC'],
}
response = requests.get(url=URL + '/warehouse/query/unit/list', params=first_page_query)

responseContent = response.content
results = json.loads(responseContent.decode('utf-8'))

In [None]:
lastPage = results['lastPage']


def _dig(record, *keys):
    """Follow `keys` through nested dicts; return None as soon as one is missing."""
    for key in keys:
        if not isinstance(record, dict) or key not in record:
            return None
        record = record[key]
    return record


# Lists of variables (one entry per observation, None when a field is absent)
date = []
latitude = []
longitude = []
municipality = []
taxonId = []
science = []
verbatim = []

for pageIndex in range(1, lastPage + 1):
    # Request data from API and go through all the pages in the loop (approximately 8m with 12 pages and 115739 rows)
    response = requests.get(
        url = URL + '/warehouse/query/unit/list',
        params={
            'access_token': TOKEN,
            'page': pageIndex,
            'pageSize': 10000, # Limit 10,000 per page
            'informalTaxonGroupId': 'MVL.1083', # {Vascular plants: {Plant life forms: {Trees; Evergreen trees} } }
            'orderBy': ['gathering.eventDate.begin DESC', 'document.loadDate DESC', 'unit.taxonVerbatim ASC'] # Default sorting
    })
    # Fail with the HTTP error itself rather than a KeyError on 'results'
    response.raise_for_status()

    responseContent = response.content
    dataset = json.loads(responseContent.decode('utf-8'))['results']

    # Extract relevant information. Any level of the nested record may be
    # missing, so every field is looked up defensively; a partially filled
    # record yields None instead of aborting the whole download.
    for x in dataset:
        # Date of collection (a single date or a date range as display text)
        date.append(_dig(x, 'gathering', 'displayDateTime'))

        # Latitude and Longitude
        latitude.append(_dig(x, 'gathering', 'conversions', 'wgs84CenterPoint', 'lat'))
        longitude.append(_dig(x, 'gathering', 'conversions', 'wgs84CenterPoint', 'lon'))

        # Municipality
        municipality.append(_dig(x, 'gathering', 'interpretations', 'municipalityDisplayname'))

        # Taxon ID and Scientific Name
        taxonId.append(_dig(x, 'unit', 'linkings', 'taxon', 'id'))
        science.append(_dig(x, 'unit', 'linkings', 'taxon', 'scientificName'))

        # Taxon Verbatim
        verbatim.append(_dig(x, 'unit', 'taxonVerbatim'))


In [None]:
# Combine the collected lists into one dataframe. The API reports 115739
# observations in total, but the dataframe only has 115722 rows. This could be
# due to the variable selection, e.g. some observations have none of the
# listed variables (no name, no time, no place).
column_names = ["date", "lat", "lon", "municipality", "taxonId", "scientificName", "verbatim"]
column_values = [date, latitude, longitude, municipality, taxonId, science, verbatim]
data = dict(zip(column_names, column_values))

df = pd.DataFrame(data=data)
print(df)
df.to_csv("../data/samples/tree_data_sample.csv", index=False)

# Data loading/cleaning

In [None]:
# The "tree_data_sample.csv" file has been uploaded to this session.
# Another option is to download the citable file through this link:
# https://laji.fi/en/citation/HBF.78994?locale=en
# That table would have the full set of variables, but somehow it only has 115295 rows.
sample_csv_path = "../data/samples/tree_data_sample.csv"
df = pd.read_csv(sample_csv_path)
df

In [None]:
# Drop observations from the 1800s and 1900s, i.e. rows whose date text starts
# with "18" or "19"; rows without a date are kept (na=False)
is_old_observation = df['date'].str.startswith(('18', '19'), na=False)
df = df[~is_old_observation]

## Smoothing


In [None]:
# Round latitude and longitude to one decimal to define approximate fields.
# `assign` builds a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning.
df = df.assign(lat=df['lat'].round(1), lon=df['lon'].round(1))

# In every location, group by lon, lat, and name to see how many of each tree type are present
groupedDf = df.groupby(['lon', 'lat', 'scientificName'])['scientificName'].count()
groupedDf = groupedDf.rename("count").reset_index()

# One frame per tree type so that each type is smoothed separately; rows are
# sorted by lon and lat so that nearby points are adjacent in the window.
multiple_lists = [
    species_counts.sort_values(['lon', 'lat'])
    for _, species_counts in groupedDf.groupby('scientificName')
]

# Iterate over every species actually present instead of a hard-coded 11,
# which raised IndexError with fewer species and ignored any beyond the 11th.
rolling_window = []
for species_counts in multiple_lists:
    # Keep only the numeric column so the rolling mean works
    # (a list selector is required; indexing with a set fails in pandas >= 2).
    smoothed = species_counts[['count']].rolling(window=5, closed='both').mean()

    # Adding lon, lat, and name back (aligned on the original row index)
    smoothed['lon'] = species_counts['lon']
    smoothed['lat'] = species_counts['lat']
    smoothed['scientificName'] = species_counts['scientificName']

    # Round to whole numbers because there can't be decimal observations
    smoothed['count'] = smoothed['count'].round(0)
    rolling_window.append(smoothed)

# Build the final frame in one concat (DataFrame.append was removed in pandas 2)
if rolling_window:
    final_df = pd.concat(rolling_window)
else:
    final_df = pd.DataFrame(columns=['count', 'lon', 'lat', 'scientificName'])

final_df

In [None]:
# NOT FINISHED: only the per-species ratios are computed so far.
# TODO: combine the ratios into Simpson's Index per location.

# Calculating biodiversity indexes

# Simpson's Index

# Group by lon and lat, and sum count
total_tree_df = final_df[['lon', 'lat', 'count']].groupby(['lon', 'lat'])['count'].sum().reset_index()

# Attach each location's total to every (location, species) row and compute
# the species' share. A merge replaces the previous row loop, which iterated
# over column names and referenced an undefined `ratio`.
ratio_df = final_df.merge(
    total_tree_df.rename(columns={'count': 'total'}),
    on=['lon', 'lat'],
    how='left',
)
# Locations whose total is 0 get NaN instead of a division by zero
ratio_df['ratio'] = ratio_df['count'] / ratio_df['total'].where(ratio_df['total'] > 0)
ratio_df = ratio_df[['lon', 'lat', 'scientificName', 'ratio']]

total_tree_df


## Without smoothing

In [None]:
# Start again from the unsmoothed sample file
unsmoothed_csv_path = "../data/samples/tree_data_sample.csv"
df = pd.read_csv(unsmoothed_csv_path)
df.head()

In [None]:
import os

root_url = "https://api.laji.fi/v0"
# Credentials are read from the environment so they are never stored in the
# notebook. Set LAJI_EMAIL and LAJI_ACCESS_TOKEN before starting the kernel.
# An empty string means "not configured".
email = os.environ.get("LAJI_EMAIL", "")
access_token = os.environ.get("LAJI_ACCESS_TOKEN", "")

def extract_last_page (root_url, token, page_size):
    """Return the number of the last result page for the tree query.

    Args:
        root_url: Base URL of the API; '/warehouse/query/unit/list' is appended.
        token: Access token sent as the 'access_token' query parameter.
        page_size: Rows per page; the page count depends on it, so use the
            same value as for the actual download.

    Returns:
        The 'lastPage' value of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status (for
            example an invalid token), instead of a KeyError on 'lastPage'.
    """
    response = requests.get(
        url = root_url + '/warehouse/query/unit/list',
        params = {
            'access_token': token,
            'page': 1,
            'pageSize': page_size, # Limit 10,000 per page
            'informalTaxonGroupId': 'MVL.1083', # {Vascular plants: {Plant life forms: {Trees; Evergreen trees} } }
            'orderBy': ['gathering.eventDate.begin DESC', 'document.loadDate DESC', 'unit.taxonVerbatim ASC'] # Default sorting
        })
    response.raise_for_status()
    return response.json()['lastPage']

# Number of pages to download when requesting 10000 rows per page
lastPage = extract_last_page(root_url, access_token, 10000)

# Dotted field paths requested from the API (passed as the 'selected' query
# parameter by extract_dataset)
features = [
    "gathering.displayDateTime",
    "gathering.conversions.wgs84CenterPoint.lat",
    "gathering.conversions.wgs84CenterPoint.lon",
    "unit.linkings.taxon.id",
    "unit.linkings.taxon.scientificName",
    "gathering.interpretations.municipalityDisplayname",
    "gathering.interpretations.finnishMunicipality"
]

def extract_dataset (root_url, token, page_size, lastPage, features = None):
    """Download every result page of the tree query into one DataFrame.

    Args:
        root_url: Base URL of the API; '/warehouse/query/unit/list' is appended.
        token: Access token sent as the 'access_token' query parameter.
        page_size: Rows requested per page.
        lastPage: Number of the last page; pages 1..lastPage are fetched.
        features: Dotted field paths sent as the 'selected' parameter. They
            become the leading columns of the result. Defaults to no selection.

    Returns:
        A DataFrame with one row per record and a unique 0..n-1 index. The
        columns in `features` come first (filled with NaN where a record lacks
        the field), followed by any other columns found in the response.

    Raises:
        requests.HTTPError: If a page request answers with an error status.
    """
    # Avoid a shared mutable default argument
    features = list(features) if features is not None else []

    # Collect the pages and concatenate once; concatenating inside the loop
    # copies all previously downloaded rows on every iteration.
    pages = []
    for pageIndex in range(1, lastPage + 1):
        # Request data from API and go through all the pages in the loop (approximately 8m with 12 pages and 115739 rows)
        response = requests.get(
            url = root_url + '/warehouse/query/unit/list',
            params={
                'access_token': token,
                'page': pageIndex,
                'pageSize': page_size, # Limit 10,000 per page
                'informalTaxonGroupId': 'MVL.1083', # {Vascular plants: {Plant life forms: {Trees; Evergreen trees} } }
                'selected': features,
                'orderBy': ['gathering.eventDate.begin DESC', 'document.loadDate DESC', 'unit.taxonVerbatim ASC'] # Default sorting
        })
        response.raise_for_status()
        page_frame = pd.json_normalize(response.json()['results'])
        if not page_frame.empty:
            pages.append(page_frame)

    if not pages:
        return pd.DataFrame({ feature: [] for feature in features })

    # ignore_index gives every row a unique label; without it each page
    # restarts at 0 and the index cannot serve as a row identifier.
    df = pd.concat(pages, ignore_index=True)
    extra_columns = [name for name in df.columns if name not in features]
    return df.reindex(columns=features + extra_columns)

# Download every page, then replace the dotted API field paths with short
# column names
short_column_names = {
    "gathering.displayDateTime": "datetime",
    "gathering.conversions.wgs84CenterPoint.lat": "lat",
    "gathering.conversions.wgs84CenterPoint.lon": "lon",
    "gathering.interpretations.municipalityDisplayname": "municipality",
    "gathering.interpretations.finnishMunicipality": "municipalityId",
    "unit.linkings.taxon.id": "taxonId",
    "unit.linkings.taxon.scientificName": "scientificName",
}
df = extract_dataset(root_url, access_token, 10000, lastPage, features)
df = df.rename(columns=short_column_names)
df

In [None]:
def group_observations (dataframe, columns):
    """Count the observations (rows) for every combination of `columns`.

    Args:
        dataframe: One row per observation.
        columns: Names of the key columns to group by.

    Returns:
        A DataFrame with the key columns followed by "nof_obs", the number of
        rows in each group. Rows with a missing key are not counted, which is
        the default behaviour of `groupby`.
    """
    columns = list(columns)
    # `size()` counts rows. The previous `count()` counted the non-null values
    # of whichever column happened to come first, so observations with a
    # missing value there were not counted, and a frame holding only the key
    # columns raised a length-mismatch error.
    return dataframe.groupby(columns).size().reset_index(name = "nof_obs")

# Count observations per municipality and species.
# Either scientificName or taxonId works as the species identifier.
grouped_df = group_observations(df, ['municipalityId', 'scientificName'])
# Optional export, currently disabled:
#grouped_df.to_csv('grouped_municipality.csv')

def pivot_obs_dataframe (dataframe, species_identifier, area_id):
    """Reshape long observation counts into an area-by-species table.

    Rows are the values of `area_id`, columns the values of
    `species_identifier`, and cells the matching "nof_obs". Combinations that
    do not occur in `dataframe` are filled with 0.0.
    """
    return (
        dataframe
        .pivot(index = area_id, columns = species_identifier, values = "nof_obs")
        .fillna(0.0)
    )


# One row per municipality, one column per species, cells = observation counts
pivoted_df = pivot_obs_dataframe(grouped_df, "scientificName", "municipalityId")
# Optional export, currently disabled:
#pivoted_df.to_csv('pivoted_municipality.csv')
pivoted_df

In [None]:
def shannon_entropy (area_row):
    """Shannon entropy (base 2) of one row of observation counts.

    The counts are normalised to proportions, zero proportions are dropped,
    and -sum(p * log2(p)) is returned.
    """
    counts = area_row.to_numpy()
    shares = counts / np.sum(counts)
    present = shares[shares > 0]
    return -(present * np.log2(present)).sum()

# Apply shannon_entropy to every row of pivoted_df (axis=1)
shannon_entropies = pivoted_df.apply(shannon_entropy, axis=1)
shannon_entropies

In [None]:
def simpson_index (area_row):
    """Simpson's index of diversity, 1 - sum(n_i (n_i - 1)) / (N (N - 1)).

    Args:
        area_row: Series of observation counts, one entry per species.

    Returns:
        The index for the row, or NaN when N (N - 1) is zero, i.e. when the
        row holds no observation or exactly one. The formula is undefined
        there; NaN is returned explicitly so the 0/0 division and its
        RuntimeWarning never happen.
    """
    numpy_row = area_row.to_numpy()
    observed_species = numpy_row[numpy_row > 0.0]
    total_observations = np.sum(observed_species)
    total_index = total_observations * (total_observations - 1)
    if total_index == 0:
        return float('nan')
    species_index = observed_species * (observed_species - 1)
    return 1 - (np.sum(species_index) / total_index)

# Apply simpson_index to every row of pivoted_df (axis=1)
simpson_indices = pivoted_df.apply(simpson_index, axis=1)
simpson_indices

In [None]:
import os

root_url = "https://api.laji.fi/v0"
# Credentials are read from the environment so they are never stored in the
# notebook. Set LAJI_EMAIL and LAJI_ACCESS_TOKEN before starting the kernel.
# An empty string means "not configured".
email = os.environ.get("LAJI_EMAIL", "")
access_token = os.environ.get("LAJI_ACCESS_TOKEN", "")

def request_areas_info (municipality_ids, root_url, token):
    """Fetch area records from the '/areas' endpoint for the given ids.

    Args:
        municipality_ids: Non-empty sequence of area ids; missing values
            (NaN/None) and duplicates are ignored.
        root_url: Base URL of the API; '/areas' is appended.
        token: Access token sent as the 'access_token' query parameter.

    Returns:
        A DataFrame built from the 'results' of all responses, with the 'id'
        column renamed to 'municipalityId'.

    Raises:
        AssertionError: If `municipality_ids` is empty.
        requests.HTTPError: If a request answers with an error status.
    """
    assert len(municipality_ids) > 0
    unique_ids = sorted({area_id for area_id in municipality_ids if pd.notna(area_id)})
    area_endpoint_url = root_url + "/areas"
    batch_size = 10  # ids per request; also used as the page size
    dataset = []
    for start in range(0, len(unique_ids), batch_size):
        ids_string = ",".join(str(area_id) for area_id in unique_ids[start : start + batch_size])
        response = requests.get(
            url = area_endpoint_url,
            params={
                'access_token': token,
                'page': 1,
                'pageSize': batch_size,
                'idIn' : ids_string,
        })
        response.raise_for_status()
        dataset.extend(response.json()['results'])
    df = pd.json_normalize(dataset)
    # `columns=` is required: a bare mapping renames index labels, so the
    # 'id' column was previously left untouched.
    return df.rename(columns = {'id' : 'municipalityId'})
# Look up area information for the municipality ids present in df
municipality_info_df = request_areas_info (
    list(df["municipalityId"]), root_url, access_token
)
municipality_info_df

# Visualization

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
import geopandas as gpd
from shapely.geometry import Polygon
import h3
import s2
from geojson import Feature, Point, FeatureCollection
import numpy as np

In [None]:
# Independent copy of df restricted to rows that have both coordinates
df_vis = df.dropna(subset=['lat', 'lon']).copy()
df_vis.head()

In [None]:
# Number of rows per municipality value
df_vis['municipality'].value_counts()

In [None]:
def geo_to_h3(row, resolution=5):
  """Return the H3 cell id containing the point (row['lat'], row['lon']).

  Works with both h3 API generations: h3 4.x renamed `geo_to_h3` to
  `latlng_to_cell`, and an unpinned `pip install h3` installs 4.x.

  Args:
    row: Mapping or Series with 'lat' and 'lon' entries.
    resolution: H3 resolution of the returned cell (default 5).
  """
  if hasattr(h3, 'latlng_to_cell'):
    return h3.latlng_to_cell(row['lat'], row['lon'], resolution)
  return h3.geo_to_h3(row['lat'], row['lon'], resolution)

# Assign an H3 cell to every row of df_vis
df_vis['h3_cell'] = df_vis.apply(geo_to_h3, axis=1)
# reset_index(drop=False) keeps the previous index as a regular column
df_vis_ag = df_vis.reset_index(drop=False)
df_vis_ag

In [None]:
# For every H3 cell, collect the values of the 'index' column into a list
# ("ids") and record how many there are ("count")
rows_by_cell = df_vis_ag.groupby('h3_cell')
ids_per_cell = rows_by_cell['index'].agg(list)
df_vis_agg = ids_per_cell.to_frame("ids").reset_index()

df_vis_agg['count'] = df_vis_agg['ids'].map(len)
df_vis_agg

In [None]:
def add_geometry(row):
  """Return the boundary of row['h3_cell'] as a Polygon in (lng, lat) order.

  Works with both h3 API generations. h3 4.x replaces `h3_to_geo_boundary`
  with `cell_to_boundary`, which yields (lat, lng) pairs, so the pairs are
  swapped to keep the GeoJSON-style (lng, lat) order of the 3.x call.
  """
  if hasattr(h3, 'cell_to_boundary'):
    points = [(lng, lat) for lat, lng in h3.cell_to_boundary(row['h3_cell'])]
  else:
    points = h3.h3_to_geo_boundary(row['h3_cell'], True)
  return Polygon(points)

# Attach the cell boundary polygon to every aggregated row
df_vis_agg['geometry'] = df_vis_agg.apply(add_geometry,axis=1)
df_vis_agg

In [None]:
def hex_to_geojson(df_hex, hex_id_field, geometry_field, value_field):
  """Convert a frame of hexagon rows into a GeoJSON FeatureCollection.

  Args:
    df_hex: One row per hexagon.
    hex_id_field: Column whose value becomes the feature id.
    geometry_field: Column holding the feature geometry.
    value_field: Column stored as the 'value' property of the feature.

  Returns:
    A FeatureCollection with one Feature per row; it is empty when `df_hex`
    has no rows (previously that case raised UnboundLocalError).
  """
  list_features = [
    Feature(geometry=row[geometry_field],
            id=row[hex_id_field],
            properties={'value': row[value_field]})
    for _, row in df_hex.iterrows()
  ]
  # Build the collection once, after the loop, instead of on every row
  return FeatureCollection(list_features)

# GeoJSON features keyed by H3 cell id, carrying the per-cell count as 'value'
geojson_obj = hex_to_geojson(df_vis_agg, hex_id_field='h3_cell', value_field='count', geometry_field='geometry')

In [None]:
def plot_observation_count(data: pd.DataFrame, category: str = 'all', date_column: str = None):
  """Plot the number of observations per month as a line chart.

  Args:
    data: Observations with a 'scientificName' column and a date column.
    category: Scientific name to restrict the plot to, or 'all'.
    date_column: Column holding the date text. When omitted, 'date' is used
      if present, otherwise 'datetime'. The CSV sample names the column
      'date' while the frame downloaded from the API names it 'datetime'.
  """
  if category != 'all':
    data = data[data['scientificName'] == category]

  if date_column is None:
    date_column = 'date' if 'date' in data.columns else 'datetime'

  # The first 7 characters of the date text are used as the month key;
  # non-string entries are ignored.
  month_keys = data[date_column].apply(lambda value: value[:7] if isinstance(value, str) else None)
  month_count = month_keys.value_counts().sort_index()
  fig = px.line(x=month_count.index, y=month_count.values)
  fig.update_layout(title=f'Observations count for {category}',
                    xaxis_title='Month',
                    yaxis_title='Observations')
  fig.show()

# Monthly observation counts for all species
plot_observation_count(df_vis)

In [None]:
def plot_observation_municipality(data: pd.DataFrame, category: str = 'all'):
  """Bar chart of the ten municipalities with the most observations.

  `category` restricts the data to one scientific name; 'all' keeps every row.
  """
  selected = data if category == 'all' else data[data['scientificName'] == category]

  top_municipalities = selected['municipality'].value_counts().head(10)
  fig = px.bar(x=top_municipalities.index, y=top_municipalities.values)
  layout_options = dict(title=f'Top 10 municipality for {category}',
                        xaxis_title='Municipality',
                        yaxis_title='Observations')
  fig.update_layout(**layout_options)
  fig.show()

# Top municipalities across all species
plot_observation_municipality(df_vis)

In [None]:
def plot_observation_comp(data: pd.DataFrame, min_count: int = 250):
  """Pie chart of the species composition of the observations.

  Args:
    data: Observations with a 'scientificName' column.
    min_count: Species with fewer observations than this are merged into a
      single 'Others' slice (default 250).
  """
  species_counts = data['scientificName'].value_counts()
  # Built explicitly so the column names do not depend on the pandas version
  obs_count = pd.DataFrame({'scientificName': species_counts.index,
                            'count': species_counts.values})

  is_small = obs_count['count'] < min_count
  # Sum only the numeric column; summing the whole frame also concatenated
  # every species name and left the counts with object dtype.
  obs_count_small = pd.DataFrame({'scientificName': ['Others'],
                                  'count': [obs_count.loc[is_small, 'count'].sum()]})
  obs_count_agg = pd.concat([obs_count_small, obs_count[~is_small]])
  fig = px.pie(obs_count_agg, values='count', names='scientificName')
  fig.update_layout(title='Distributions of composing species')
  fig.show()

# Species composition of the observations in df_vis
plot_observation_comp(df_vis)

In [None]:
def discrete_colorscale(bvals, colors):
    """
    Build a non-uniform discrete colorscale.

    bvals - boundary values of the intervals of interest (sorted internally)
    colors - rgb or hex color codes, one per interval [bvals[k], bvals[k+1]],
             so len(bvals) must equal len(colors)+1
    returns a list of [normalized position, color] pairs in which every color
    appears twice, at the start and at the end of its interval
    """
    if len(colors) + 1 != len(bvals):
        raise ValueError('len(boundary values) should be equal to  len(colors)+1')

    ordered = sorted(bvals)
    lowest = ordered[0]
    span = ordered[-1] - ordered[0]
    positions = [(bound - lowest) / span for bound in ordered]  # normalized to [0, 1]

    scale = []
    for color, start, stop in zip(colors, positions, positions[1:]):
        scale.append([start, color])
        scale.append([stop, color])
    return scale

# Interval boundaries for the observation counts and one color per interval
bvals = [1, 10, 100, 1000]
colors = ["#e62bf0", "#2026e3", "#26de26"]
discrete_nonuniform = discrete_colorscale(bvals, colors)

# Colorbar ticks: one tick at the midpoint of each interval, labelled with
# the interval's lower bound
bvals = np.array(bvals)
tickvals = [np.mean(bounds) for bounds in zip(bvals[:-1], bvals[1:])]
ticktext = [f'{lower}-' for lower in bvals[:-1]]

map_options = dict(
    geojson=geojson_obj,
    color_continuous_scale=discrete_nonuniform,
    locations='h3_cell',
    color='count',
    center=dict(lat=65, lon=24),
    zoom=4.3,
    width=600,
    height=650,
    opacity=0.3,
    labels={'count': 'observations'},
    mapbox_style="open-street-map",
)
fig = px.choropleth_mapbox(df_vis_agg, **map_options)

fig.update_geos(projection_type='foucaut')

colorbar_options = dict(colorbar_thickness=25, colorbar_ticktext=ticktext, colorbar_tickvals=tickvals)
fig.update_layout(
    autosize=False,
    margin=dict(l=0, r=0, b=0, t=0, pad=4, autoexpand=True),
    coloraxis=colorbar_options,
)
fig.show()

### Gridding (& smoothing?)

See here https://colab.research.google.com/drive/1scpAh0uaBH99KI2Q2yxG4D7pU3Tbg0F-#scrollTo=yJcezMMwzWlH