In [None]:
import xarray as xr
import pandas as pd
from pathlib import Path
import os
import cdsapi
from matplotlib import pyplot as plt
import numpy as np
from pyproj import Transformer

import netCDF4 as nc
import zipfile

from urllib.parse import quote
from tqdm import tqdm

In [3]:
# Read the beetle observations and parse 'Date' into datetimes in one chained
# expression; later cells rely on the .dt accessor of this column.
obs = pd.read_csv('./data/beetle/artportalen_final.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
obs

Unnamed: 0.1,Unnamed: 0,Lat,Lon,Kommun,Lan,Quanity,Date
0,0,64.024023,20.650910,Robertsfors,Västerbotten,1,2018-09-14
1,1,56.729677,15.956413,Nybro,Kalmar,1,2018-09-14
2,2,55.614954,14.276141,Simrishamn,Skåne,10,2018-09-16
3,3,61.714395,17.372628,Hudiksvall,Gävleborg,1,2018-06-25
4,4,56.730931,15.906116,Nybro,Kalmar,1,2018-09-19
...,...,...,...,...,...,...,...
1017,1017,61.006044,15.190126,Rättvik,Dalarna,1,2024-10-26
1018,1018,57.858032,15.090049,Ydre,Östergötland,1,2025-04-30
1019,1019,59.993022,18.877981,Norrtälje,Stockholm,5,2025-11-25
1020,1020,58.913920,14.528770,Laxå,Örebro,1,2024-09-05


# Download weather data

### 1. We don't need weather data for all days and all coordinates. We can specify the time and area windows of interest.

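For the time window, take the dates of all observations and build a 30-day window around each one. (The cell below currently approximates this by requesting monthly means for every month of each year that contains an observation.)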

In [4]:
# Load the cached ERA5-Land monthly means if they are already on disk;
# otherwise download them from the Copernicus CDS, unpack the archive and
# cache the NetCDF file as weather.nc so later runs skip the download.
weather_dir = Path('./data/weather_monthly')
weather_path = weather_dir / 'weather.nc'
# NOTE: the path must not contain literal quote characters, otherwise the
# download targets a non-existent '".' directory.
zip_path = weather_dir / 'era5_sweden_monthly.zip'

try:
    weather = xr.open_dataset(weather_path, engine='netcdf4')
except OSError:
    # OSError covers a missing cache file (FileNotFoundError is a subclass) and,
    # presumably, a file the netCDF4 backend cannot read. Any other exception
    # (e.g. KeyboardInterrupt) propagates instead of silently starting a download.
    weather_dir.mkdir(parents=True, exist_ok=True)

    c = cdsapi.Client()

    c.retrieve(
        "reanalysis-era5-land-monthly-means",
        {
            "product_type": "monthly_averaged_reanalysis",
            "variable": [
                "2m_temperature",
                "total_precipitation",
                "volumetric_soil_water_layer_1",
                "volumetric_soil_water_layer_2",
                "surface_solar_radiation_downwards"
            ],
            # Only request the years in which at least one observation was made.
            "year": [str(i) for i in obs['Date'].dt.year.unique()],
            "month": [
                "01","02","03","04","05","06",
                "07","08","09","10","11","12"
            ],
            "time": "00:00",
            "area": [
                69.5, 10.5, # Box for Sweden
                55.0, 24.5
            ],
            "format": "netcdf",
        },
        str(zip_path)
    )

    # The download is handled as a zip archive: unpack it next to the cache.
    with zipfile.ZipFile(zip_path, "r") as z:
        extracted_files = z.namelist()
        z.extractall(weather_dir)

    # Only the first archive member is kept and renamed to the cache file name.
    original_path = weather_dir / extracted_files[0]
    new_path = weather_path
    # os.replace overwrites an existing (e.g. unreadable) weather.nc on every
    # platform, whereas os.rename fails on Windows if the destination exists.
    os.replace(original_path, new_path)

    weather = xr.open_dataset(new_path, engine='netcdf4')

weather

### 2. Open weather data and map it to the beetle data

In [11]:
def sample_era5_to_points(obs_df: pd.DataFrame, ds: xr.Dataset, 
        lat_col="Lat", lon_col="Lon", date_col="Date", method="nearest"):
    """
    Sample ERA5-Land monthly variables at observation points.

    Each observation is matched to the first day of its calendar month and
    looked up point-wise in ``ds`` along ``latitude``, ``longitude`` and
    ``valid_time``.

    Parameters
    ----------
    obs_df : pd.DataFrame
        Observations; must contain ``lat_col``, ``lon_col`` and ``date_col``.
        It is not modified.
    ds : xr.Dataset
        Dataset indexed by ``latitude``, ``longitude`` and ``valid_time``.
    lat_col, lon_col, date_col : str
        Names of the latitude, longitude and date columns in ``obs_df``.
        ``date_col`` may hold datetimes or anything ``pd.to_datetime`` parses.
    method : str
        Passed to ``Dataset.sel`` and applied to all three indexers, so with
        "nearest" the time lookup is nearest-match as well.

    Returns
    -------
    pd.DataFrame
        One row per observation, in input order, with a fresh 0..n-1 index and
        the columns ``lat_col``, ``lon_col``, ``time`` (month start) followed
        by every data variable of ``ds``.
    """

    # Month-start timestamp of every observation, used to match the time axis.
    dates = pd.to_datetime(obs_df[date_col])
    month_start = pd.to_datetime(
        dict(year=dates.dt.year, month=dates.dt.month, day=1)
    )

    # Positional (0..n-1) table of the query points; built from the
    # caller-supplied column names rather than hard-coded 'Lat'/'Lon'.
    points_df = pd.DataFrame(
        {
            lat_col: obs_df[lat_col].to_numpy(),
            lon_col: obs_df[lon_col].to_numpy(),
            "time": month_start.to_numpy(),
        }
    )

    # Wrap the points in an xarray Dataset whose variables share the "points"
    # dimension, so that .sel() does a point-wise lookup (one value per
    # observation) instead of selecting the outer product of the indexers.
    xr_points = xr.Dataset(
        {
            "latitude": ("points", points_df[lat_col].to_numpy()),
            "longitude": ("points", points_df[lon_col].to_numpy()),
            "valid_time": ("points", points_df["time"].to_numpy()),
        }
    )
    sampled = ds.sel(
        latitude=xr_points["latitude"],
        longitude=xr_points["longitude"],
        valid_time=xr_points["valid_time"],
        method=method
    )

    # Convert back to a DataFrame and keep only the data variables; the
    # coordinate columns are dropped here and taken from points_df instead.
    weather_df = sampled.to_dataframe().reset_index()
    weather_df = weather_df[list(ds.data_vars)]

    # Both frames are in observation order, so align them by position.
    return pd.concat([points_df, weather_df.reset_index(drop=True)], axis=1)

In [12]:
# Attach the monthly weather values to every observation, drop rows that
# contain any missing value, and report the share of rows that remains.
sampled_all = sample_era5_to_points(obs_df=obs, ds=weather)
result_df = sampled_all.dropna()
l1, l2 = len(sampled_all), len(result_df)
print(f"{int(l2/l1*100)}% of the data was kept")
result_df

97% of the data was kept


Unnamed: 0,Lat,Lon,time,t2m,tp,swvl1,swvl2,ssrd
0,64.024023,20.650910,2018-09-01,284.201416,0.001569,0.215851,0.208481,9420792.0
1,56.729677,15.956413,2018-09-01,287.385010,0.000978,0.212662,0.211121,10295400.0
3,61.714395,17.372628,2018-06-01,287.197998,0.001399,0.103455,0.144211,22301380.0
4,56.730931,15.906116,2018-09-01,287.154541,0.001042,0.216354,0.214478,10131648.0
5,64.025160,20.649693,2018-09-01,284.092041,0.001598,0.217621,0.210419,9403106.0
...,...,...,...,...,...,...,...,...
1016,59.282157,18.096434,2025-06-01,288.434814,0.002281,0.324600,0.313660,20982180.0
1017,61.006044,15.190126,2024-10-01,279.260254,0.001599,0.251587,0.255066,3809643.0
1018,57.858032,15.090049,2025-04-01,281.406494,0.000681,0.197144,0.200638,15125244.0
1020,58.913920,14.528770,2024-09-01,287.219482,0.002503,0.336945,0.334595,10466504.0


In [13]:
# Persist the sampled weather table. The index is written as well (pandas
# default), which keeps the original row positions of the kept observations.
weather_csv_path = './data/weather_monthly/weather_final.csv'
result_df.to_csv(weather_csv_path)