In [None]:
import json
import os

import numpy as np
import pandas as pd
import requests

In [None]:
# Load the sample CSV of tree observations and preview its first rows.
sample_csv_path = "../../data/samples/tree_data_sample.csv"
df = pd.read_csv(sample_csv_path)
df.head()

In [None]:
# Connection settings for the api.laji.fi requests made by the cells below.
root_url = "https://api.laji.fi/v0"
email = "nghi.vodong942003@gmail.com"
# The access token is a credential, so it is read from the environment instead
# of being stored in the notebook. Export LAJI_ACCESS_TOKEN before starting the
# kernel; when it is unset the token is an empty string and the requests are
# sent without a valid token.
access_token = os.environ.get("LAJI_ACCESS_TOKEN", "")

def extract_last_page(root_url, token, page_size):
    """Return the index of the last result page of the tree unit query.

    Sends a single GET request for page 1 of ``/warehouse/query/unit/list``
    and reads the ``lastPage`` field of the JSON response.

    Parameters
    ----------
    root_url : str
        Base URL of the API; the endpoint path is appended to it as-is.
    token : str
        Value sent as the ``access_token`` query parameter.
    page_size : int
        Value sent as ``pageSize``.

    Returns
    -------
    The value stored under ``lastPage`` in the response body.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    KeyError
        If the response body has no ``lastPage`` field.
    """
    response = requests.get(
        url=root_url + '/warehouse/query/unit/list',
        params={
            'access_token': token,
            'page': 1,
            'pageSize': page_size,  # Limit 10,000 per page
            'informalTaxonGroupId': 'MVL.1083',  # {Vascular plants: {Plant life forms: {Trees; Evergreen trees} } }
            'orderBy': ['gathering.eventDate.begin DESC', 'document.loadDate DESC', 'unit.taxonVerbatim ASC'],  # Default sorting
        },
    )
    # Surface a rejected request as an HTTP error right away; otherwise the
    # error body would only show up as a KeyError on 'lastPage' below.
    response.raise_for_status()
    return response.json()['lastPage']

# Fields requested for every observation record.
features = [
    "gathering.displayDateTime",
    "gathering.conversions.wgs84CenterPoint.lat",
    "gathering.conversions.wgs84CenterPoint.lon",
    "unit.linkings.taxon.id",
    "unit.linkings.taxon.scientificName",
    "gathering.interpretations.municipalityDisplayname",
    "gathering.interpretations.finnishMunicipality",
]

# Number of result pages when asking for 10,000 records per page.
lastPage = extract_last_page(root_url, access_token, 10000)

def extract_dataset(root_url, token, page_size, lastPage, features=None):
    """Download pages 1..lastPage of the tree unit query into one DataFrame.

    Each page is requested from ``/warehouse/query/unit/list`` and its
    ``results`` list is flattened with ``pd.json_normalize``.

    Parameters
    ----------
    root_url : str
        Base URL of the API; the endpoint path is appended to it as-is.
    token : str
        Value sent as the ``access_token`` query parameter.
    page_size : int
        Value sent as ``pageSize``.
    lastPage : int
        Index of the last page to fetch (inclusive); 0 fetches nothing.
    features : list of str, optional
        Field names sent as ``selected``. They also become the leading
        columns of the result. Defaults to no explicit selection.

    Returns
    -------
    pandas.DataFrame
        All pages stacked in page order. Row labels restart at 0 on every
        page because the index is not reset.

    Raises
    ------
    requests.HTTPError
        If the API answers any page with an error status code.
    KeyError
        If a response body has no ``results`` field.
    """
    # None instead of a shared mutable [] default.
    features = list(features) if features is not None else []

    # The empty frame pins the requested columns (and their order) in the
    # result, including the case where no page is fetched at all.
    frames = [pd.DataFrame({feature: [] for feature in features})]

    # Request data from API and go through all the pages in the loop
    # (approximately 8m with 12 pages and 115739 rows)
    for pageIndex in range(1, lastPage + 1):
        response = requests.get(
            url=root_url + '/warehouse/query/unit/list',
            params={
                'access_token': token,
                'page': pageIndex,
                'pageSize': page_size,  # Limit 10,000 per page
                'informalTaxonGroupId': 'MVL.1083',  # {Vascular plants: {Plant life forms: {Trees; Evergreen trees} } }
                'selected': features,
                'orderBy': ['gathering.eventDate.begin DESC', 'document.loadDate DESC', 'unit.taxonVerbatim ASC'],  # Default sorting
            },
        )
        # Stop on a rejected request instead of failing later on a missing
        # 'results' key.
        response.raise_for_status()
        frames.append(pd.json_normalize(response.json()['results']))

    # Concatenate once at the end; concatenating inside the loop re-copies
    # every previously downloaded row on each page.
    return pd.concat(frames)

# Short column names that replace the dotted API field names.
column_names = {
    "gathering.displayDateTime": "datetime",
    "gathering.conversions.wgs84CenterPoint.lat": "lat",
    "gathering.conversions.wgs84CenterPoint.lon": "lon",
    "gathering.interpretations.municipalityDisplayname": "municipality",
    "gathering.interpretations.finnishMunicipality": "municipalityId",
    "unit.linkings.taxon.id": "taxonId",
    "unit.linkings.taxon.scientificName": "scientificName",
}

# Download every page (10,000 records each), then apply the short names.
df = extract_dataset(root_url, access_token, 10000, lastPage, features).rename(columns=column_names)
df

In [None]:
def group_observations(dataframe, columns):
    """Count the observations for each unique combination of ``columns``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One row per observation.
    columns : str or list of str
        Column name(s) to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group, holding the grouping columns followed by
        ``nof_obs``, the number of rows in that group. Rows whose grouping key
        is missing are left out (pandas ``groupby`` default).
    """
    # size() counts rows per group. count() would count the non-null values of
    # each remaining column, so the result would depend on which column comes
    # first and would undercount groups with missing values in it.
    return dataframe.groupby(columns).size().reset_index(name="nof_obs")

# Species can be identified by either scientificName or taxonId.
observation_keys = ['municipalityId', 'scientificName']
grouped_df = group_observations(df, observation_keys)
# Optional export:
#grouped_df.to_csv('grouped_municipality.csv')

def pivot_obs_dataframe(dataframe, species_identifier, area_id):
    """Reshape long-format observation counts into an area-by-species table.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Long-format counts with a ``nof_obs`` column.
    species_identifier : str
        Column whose values become the columns of the result.
    area_id : str
        Column whose values become the index of the result.

    Returns
    -------
    pandas.DataFrame
        ``nof_obs`` per (area, species); combinations absent from the input
        are filled with 0.0.
    """
    wide_counts = dataframe.pivot(
        index=area_id,
        columns=species_identifier,
        values="nof_obs",
    )
    return wide_counts.fillna(0.0)


# Municipality-by-species table of observation counts.
pivoted_df = pivot_obs_dataframe(
    grouped_df,
    species_identifier="scientificName",
    area_id="municipalityId",
)
# Optional export:
#pivoted_df.to_csv('pivoted_municipality.csv')
pivoted_df

In [None]:
def shannon_entropy(area_row):
    """Compute the base-2 Shannon entropy of one row of observation counts.

    H = -sum(p_i * log2(p_i)), where p_i is the share of each entry in the row
    total and only entries with p_i > 0 contribute.

    Parameters
    ----------
    area_row : pandas.Series or array-like of numbers
        Observation counts, one entry per species.

    Returns
    -------
    numpy.float64
        The entropy in bits; 0.0 for a row that sums to zero (including an
        empty row) and for a row with a single non-zero entry.
    """
    counts = np.asarray(area_row, dtype=float)
    total = counts.sum()
    if total == 0:
        # Dividing by a zero total would only produce NaNs and a
        # RuntimeWarning; such a row carries no information.
        return np.float64(0.0)
    probabilities = counts / total
    probabilities = probabilities[probabilities > 0]
    # Adding 0.0 turns the negative zero obtained for single-species rows
    # (-(1 * log2(1))) into a plain 0.0 so it does not display as "-0.0".
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0

# One Shannon entropy value per row of the pivot table.
shannon_entropies = pivoted_df.apply(shannon_entropy, axis="columns")
shannon_entropies

In [None]:
def simpson_index(area_row):
    """Compute 1 - D for one row of counts, D being Simpson's index.

    D = sum(n_i * (n_i - 1)) / (N * (N - 1)), where n_i are the positive
    entries of the row and N is their sum.

    Parameters
    ----------
    area_row : pandas.Series or array-like of numbers
        Observation counts, one entry per species.

    Returns
    -------
    float
        1 - D, or NaN when N * (N - 1) is zero (no observations, or exactly
        one), because the index is undefined there.
    """
    counts = np.asarray(area_row, dtype=float)
    observed_species = counts[counts > 0.0]
    total_observations = observed_species.sum()
    total_index = total_observations * (total_observations - 1)
    if total_index == 0:
        # 0/0 would yield the same NaN, but with a RuntimeWarning; return it
        # explicitly so the undefined case is deliberate and quiet.
        return np.nan
    species_index = observed_species * (observed_species - 1)
    return 1 - species_index.sum() / total_index

# One Simpson index value per row of the pivot table.
simpson_indices = pivoted_df.apply(simpson_index, axis="columns")
simpson_indices

In [None]:
# Connection settings for the api.laji.fi request made in this cell.
root_url = "https://api.laji.fi/v0"
email = "nghi.vodong942003@gmail.com"
# The access token is a credential, so it is read from the environment instead
# of being stored in the notebook. Export LAJI_ACCESS_TOKEN before starting the
# kernel; when it is unset the token is an empty string and the requests are
# sent without a valid token.
access_token = os.environ.get("LAJI_ACCESS_TOKEN", "")

def request_areas_info(municipality_ids, root_url, token):
    """Fetch the area records for the given ids from the ``/areas`` endpoint.

    The ids are cleaned of missing values, de-duplicated, sorted, and then
    requested in batches of 10 through the ``idIn`` query parameter.

    Parameters
    ----------
    municipality_ids : sequence
        Area ids; may contain duplicates and missing values. Must not be
        empty.
    root_url : str
        Base URL of the API; ``/areas`` is appended to it as-is.
    token : str
        Value sent as the ``access_token`` query parameter.

    Returns
    -------
    pandas.DataFrame
        The flattened ``results`` of all batches, with the ``id`` column
        renamed to ``municipalityId``.

    Raises
    ------
    AssertionError
        If ``municipality_ids`` is empty.
    requests.HTTPError
        If the API answers any batch with an error status code.
    """
    assert len(municipality_ids) > 0

    # Drop missing values (NaN/None and the string "nan"), then de-duplicate
    # and sort so the batches are deterministic.
    unique_ids = sorted({
        str(municipality_id)
        for municipality_id in municipality_ids
        if pd.notna(municipality_id) and str(municipality_id) != "nan"
    })

    area_endpoint_url = root_url + "/areas"
    batch_size = 10  # ids per request; also sent as pageSize
    dataset = []
    for start in range(0, len(unique_ids), batch_size):
        ids_slice = unique_ids[start:start + batch_size]
        response = requests.get(
            url=area_endpoint_url,
            params={
                'access_token': token,
                'page': 1,
                'pageSize': batch_size,
                'idIn': ",".join(ids_slice),
            },
        )
        # Stop on a rejected request instead of failing on a missing
        # 'results' key.
        response.raise_for_status()
        dataset.extend(response.json()['results'])

    # `columns=` is required here: a bare mapping makes rename() relabel the
    # index and leaves the `id` column untouched.
    return pd.json_normalize(dataset).rename(columns={'id': 'municipalityId'})
# Look up the area record of every municipality id found in the observations.
observed_municipality_ids = list(df["municipalityId"])
municipality_info_df = request_areas_info(observed_municipality_ids, root_url, access_token)
municipality_info_df