In [6]:
import xarray as xr
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt
import numpy as np
from pyproj import Transformer
from urllib.parse import quote
from tqdm import tqdm
import os

In [None]:
# Export switch: when True, the last cell writes the processed observations to CSV.
save = False

# Load observation data (Artportalen Excel exports)

In [60]:
# Directory that holds the Artportalen Excel exports (one or more .xlsx files).
directory = './data/beetle/artportalen'

# Build the list of workbooks to read.
# - The listing is sorted because os.listdir() returns entries in arbitrary
#   order; the concatenation order fixes the row index of `obs_raw`, so an
#   unsorted listing makes the result differ between machines/runs.
# - Names starting with '~$' are lock files that Excel creates while a workbook
#   is open; they match the '.xlsx' suffix but are not readable workbooks.
xlsx_files = [
    os.path.join(directory, file)
    for file in sorted(os.listdir(directory))
    if file.endswith(".xlsx") and not file.startswith("~$")
]
if not xlsx_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f'No .xlsx files found in {directory!r}')

# header=2: row index 2 (0-based) of each sheet is used for the column names.
pd_list = [
    pd.read_excel(filename, engine="openpyxl", header=2)
    for filename in xlsx_files
]

# Stack all exports into one frame with a fresh 0..n-1 index.
obs_raw = pd.concat(pd_list, axis=0, ignore_index=True)
print(f'Dataset contains {len(obs_raw)} observations')
obs_raw.head(5)

Dataset contains 5022 observations


Unnamed: 0,Skyddat fynd,Id,Taxonsorteringsordning,Valideringsstatus,Rödlistade,TaxonId,Artnamn,Vetenskapligt namn,Auktor,Antal,...,Samlingsbeskrivning,Artbestämd av,Bestämningsår,Bekräftad av,Bekräftelseår,Redigeringsansvarig,Rapportör,Observatörer,Länk till BOLD/GenBank,Projektnamn
0,,92216808,8440,Ej granskad,,106554,Björksplintborre,Scolytus ratzeburgii,"Janson, 1856",noterad,...,,,,,,Marika Sjödin,Marika Sjödin,"David Ek, Marika Sjödin",,
1,,92219092,8440,Ej granskad,,106554,Björksplintborre,Scolytus ratzeburgii,"Janson, 1856",noterad,...,,,,,,Marika Sjödin,Marika Sjödin,"David Ek, Marika Sjödin",,
2,,92219749,8440,Ej granskad,,106554,Björksplintborre,Scolytus ratzeburgii,"Janson, 1856",noterad,...,,,,,,Marika Sjödin,Marika Sjödin,"David Ek, Marika Sjödin",,
3,,92219881,8440,Ej granskad,,106554,Björksplintborre,Scolytus ratzeburgii,"Janson, 1856",noterad,...,,,,,,Marika Sjödin,Marika Sjödin,"David Ek, Marika Sjödin",,
4,,93318230,8440,Ej granskad,,106554,Björksplintborre,Scolytus ratzeburgii,"Janson, 1856",noterad,...,,,,,,Marika Sjödin,Marika Sjödin,"Eva Siljeholm, Marika Sjödin",,


NOTES:
- Noggrannhet is the accuracy (in metres) of the observation's location. Most values are under 2 km, which is much smaller than the spatial resolution of the weather data, so the location accuracy is not tracked.
- Enhet encodes whether the amount (Antal) is given in individual specimens or in colonies. 99.4% of values are not colonies, so empty values and colony observations are mapped to single-specimen observations (it is nearly impossible to accurately convert colonies to specimens).
- Antal encodes the number of [enhet] observed. 38.1% of values are 'noterad', meaning that an observation was made but the number of specimens was not registered. These values were mapped to 1.

In [61]:
# Columns carried over from the raw export.
cols = ['Startdatum','Slutdatum', 'Antal', 'Kommun', 'Län', 'Vetenskapligt namn']

# Work on an independent copy so the raw frame stays untouched.
obs = obs_raw.loc[:, cols].copy()

# Parse both observation-period bounds as datetimes.
for date_col in ('Startdatum', 'Slutdatum'):
    obs[date_col] = pd.to_datetime(obs[date_col])

# Length of the observation period; 'Date' is the midpoint of that period.
period = obs['Slutdatum'] - obs['Startdatum']
obs['Duration'] = period.dt.days
obs['Date'] = obs['Startdatum'] + period / 2

# 'noterad' marks an observation without a registered count: treat it as 1,
# then convert the whole column to a numeric dtype.
counts = obs['Antal'].mask(obs['Antal'] == 'noterad', '1')
obs['Antal'] = pd.to_numeric(counts)

# Keep the original row position as an explicit identifier column.
obs["row_id"] = obs.index.astype(int)

obs.head(5)


Unnamed: 0,Startdatum,Slutdatum,Antal,Kommun,Län,Vetenskapligt namn,Duration,Date,row_id
0,2021-04-06,2021-04-06,1,Finspång,Östergötland,Scolytus ratzeburgii,0,2021-04-06,0
1,2021-04-06,2021-04-06,1,Finspång,Östergötland,Scolytus ratzeburgii,0,2021-04-06,1
2,2021-04-06,2021-04-06,1,Finspång,Östergötland,Scolytus ratzeburgii,0,2021-04-06,2
3,2021-04-06,2021-04-06,1,Finspång,Östergötland,Scolytus ratzeburgii,0,2021-04-06,3
4,2021-05-12,2021-05-12,1,Finspång,Östergötland,Scolytus ratzeburgii,0,2021-05-12,4


In [62]:
# Binary label: 1 for rows whose scientific name is 'Ips typographus', else 0.
# The column name 'Pressence' is kept exactly as spelled because it is selected
# again below and ends up in the exported table.
is_target_species = obs["Vetenskapligt namn"] == 'Ips typographus'
obs["Pressence"] = is_target_species.astype('int64')

In [63]:
## Convert the RT90 grid coordinates (Ost/Nord columns) to WGS84 lon/lat
# always_xy=True fixes the axis order to (easting, northing) in and (lon, lat) out.
transformer = Transformer.from_crs("EPSG:3021", "EPSG:4326", always_xy=True)

lon, lat = transformer.transform(
    obs_raw["Ost"].values,
    obs_raw["Nord"].values,
)

# Swedish -> English column names.
english_names = {
    'Startdatum': 'StartDate',
    'Slutdatum': 'EndDate',
    'Antal': 'Quantity',
    'Län': 'Lan',
}
# Columns (and their order) of the final table.
final_columns = ['row_id','Lat', 'Lon', 'Date','Kommun', 'Lan', 'Quantity', 'Pressence']

obs = (
    obs
    .assign(Lon=lon, Lat=lat)
    .rename(columns=english_names)
    [final_columns]
)
obs.head(5)

Unnamed: 0,row_id,Lat,Lon,Date,Kommun,Lan,Quantity,Pressence
0,0,58.788614,15.821428,2021-04-06,Finspång,Östergötland,1,0
1,1,58.788632,15.817157,2021-04-06,Finspång,Östergötland,1,0
2,2,58.78698,15.816672,2021-04-06,Finspång,Östergötland,1,0
3,3,58.786361,15.815842,2021-04-06,Finspång,Östergötland,1,0
4,4,58.835413,15.509221,2021-05-12,Finspång,Östergötland,1,0


In [64]:
# Write the final table to disk only when the `save` switch is enabled.
if save:
    obs.to_csv(
        './data/beetle/artportalen/artportalen_final.csv',
        index=False,
    )