In [17]:
import xarray as xr
import pandas as pd
from pathlib import Path
import os
import cdsapi
from matplotlib import pyplot as plt
import numpy as np
from pyproj import Transformer

import netCDF4 as nc
import zipfile

from urllib.parse import quote
from tqdm import tqdm

In [18]:
save = False

# Open obs data

In [19]:
obs = pd.read_csv('../data/beetle/artportalen/artportalen_final.csv')
obs['Date'] = pd.to_datetime(obs['Date'])
obs["Month"] = pd.to_datetime(obs['Date']).dt.to_period("M").dt.to_timestamp()
obs

Unnamed: 0,row_id,Lat,Lon,Date,Kommun,Lan,Quantity,Pressence,Month
0,0,58.788614,15.821428,2021-04-06,Finspång,Östergötland,1,0,2021-04-01
1,1,58.788632,15.817157,2021-04-06,Finspång,Östergötland,1,0,2021-04-01
2,2,58.786980,15.816672,2021-04-06,Finspång,Östergötland,1,0,2021-04-01
3,3,58.786361,15.815842,2021-04-06,Finspång,Östergötland,1,0,2021-04-01
4,4,58.835413,15.509221,2021-05-12,Finspång,Östergötland,1,0,2021-05-01
...,...,...,...,...,...,...,...,...,...
5017,5017,59.276611,17.976767,2024-10-24,Stockholm,Stockholm,1,0,2024-10-01
5018,5018,63.828780,20.293920,2024-10-24,Umeå,Västerbotten,1,0,2024-10-01
5019,5019,57.668491,12.210855,2024-09-14,Härryda,Västra Götaland,1,0,2024-09-01
5020,5020,60.596110,17.358899,2024-09-22,Älvkarleby,Uppsala,1,0,2024-09-01


# Download weather data

### 1. We don't need weather data for all days and all coordinates. We can specify the time and area windows of interest.

For time, take all observation 'dates' and create a window of 30 days around each value.

In [20]:
try:
    weather = xr.open_dataset('../data/weather_monthly/weather.nc', engine='netcdf4')
except:
    c = cdsapi.Client()
    zip_path = "../data/weather_monthly/era5_sweden_monthly.zip"

    c.retrieve(
        "reanalysis-era5-land-monthly-means",
        {
            "product_type": "monthly_averaged_reanalysis",
            "variable": [
                "2m_temperature",
                "total_precipitation",
                "volumetric_soil_water_layer_1",
                "volumetric_soil_water_layer_2",
                "surface_solar_radiation_downwards"
            ],
            "year": [str(i) for i in obs['Date'].dt.year.unique()],
            "month": [
                "01","02","03","04","05","06",
                "07","08","09","10","11","12"
            ],
            "time": "00:00",
            "area": [
                69.5, 10.5, # Box for Sweden
                55.0, 24.5
            ],
            "format": "netcdf",
        },
        zip_path
    )

    with zipfile.ZipFile(zip_path, "r") as z:
        extracted_files = z.namelist()
        z.extractall('../data/weather_monthly/')

    original_path = os.path.join('../data/weather_monthly/', extracted_files[0])
    new_path = os.path.join('../data/weather_monthly/', 'weather.nc')
    os.rename(original_path, new_path)

    weather = xr.open_dataset(new_path, engine='netcdf4')

weather

### 2. Open weather data and map it to the beetle data

In [50]:
def sample_era5_to_points(obs_df: pd.DataFrame, ds: xr.Dataset, 
        lat_col="Lat", lon_col="Lon", date_col="Date", method="nearest", max_lag=3):
    """
    Sample ERA5-Land monthly variables at observation points.
    Returns a new DataFrame with climate variables appended.
    """

    lag_df = {"ssrd":2, "swvl1":2, "swvl2":2, "t2m":3, "tp":3}

    df = obs_df.copy()
    df["Month_number"] = df[date_col].dt.month

    # Get weather variables
    weather_vars = [v for v in ds.data_vars]
    
    # Create a multi-index for all observations x all lags
    n_obs = len(df)
    n_lags = max_lag + 1
    
    # Repeat lat/lon for each lag
    lats = np.repeat(df[lat_col].values, n_lags)
    lons = np.repeat(df[lon_col].values, n_lags)
    
    # Create all lagged times at once
    times = []
    for lag in range(n_lags):
        lagged_month = df["Month"] - pd.DateOffset(months=lag)
        times.append(lagged_month.values)
    times = np.concatenate(times)
    
    # Single selection for all points and all times
    xr_points = xr.Dataset({
        "latitude": ("points", lats),
        "longitude": ("points", lons),
        "valid_time": ("points", times),
    })
    
    sampled = ds.sel(
        latitude=xr_points["latitude"],
        longitude=xr_points["longitude"],
        valid_time=xr_points["valid_time"],
        method=method
    )
    
    # Convert to DataFrame
    weather_df = sampled.to_dataframe().reset_index()
    weather_df = weather_df[weather_vars]
    
    # Reshape: split by lag and rename columns
    all_results = []
    for lag in range(n_lags):
        start_idx = lag * n_obs
        end_idx = (lag + 1) * n_obs
        lag_df = weather_df.iloc[start_idx:end_idx].reset_index(drop=True)
        
        if lag > 0:
            lag_df = lag_df.rename(columns={v: f"{v}_lag{lag}" for v in weather_vars})
        
        all_results.append(lag_df)
    
    # Concatenate horizontally
    combined_weather = pd.concat(all_results, axis=1)
    
    # Join back to original df
    df = pd.concat([df[['Lat','Lon','Month','row_id']].reset_index(drop=True), combined_weather], axis=1)

    return df

In [51]:
weather_final = sample_era5_to_points(obs_df=obs,ds=weather)
l1 = len(weather_final)
weather_final.dropna(inplace=True)
l2 = len(weather_final)
print(f"{int(l2/l1*100)}% of the data was kept")
weather_final = weather_final[sorted(list(weather_final.columns))]
weather_final.head(5)

87% of the data was kept


Unnamed: 0,Lat,Lon,Month,row_id,ssrd,ssrd_lag1,ssrd_lag2,ssrd_lag3,swvl1,swvl1_lag1,...,swvl2_lag2,swvl2_lag3,t2m,t2m_lag1,t2m_lag2,t2m_lag3,tp,tp_lag1,tp_lag2,tp_lag3
22,58.210249,15.000075,2022-11-01,22,806360.0,2261682.0,4920779.5,16562478.0,0.25116,0.360443,...,0.21489,0.388428,277.905029,281.278076,284.283691,292.22583,0.000781,0.001028,0.001129,0.00219
23,58.283784,16.66071,2022-11-01,23,806360.0,2261682.0,4920779.5,16562478.0,0.25116,0.360443,...,0.21489,0.388428,277.905029,281.278076,284.283691,292.22583,0.000781,0.001028,0.001129,0.00219
24,58.829263,15.890906,2023-02-01,24,2205158.0,650332.0,374830.0,887550.0,0.24231,0.383041,...,0.241821,0.466431,273.578369,271.992432,270.299072,278.326904,0.000537,0.001789,0.000751,0.000811
25,58.788818,16.195337,2023-02-01,25,2205158.0,650332.0,374830.0,887550.0,0.24231,0.383041,...,0.241821,0.466431,273.578369,271.992432,270.299072,278.326904,0.000537,0.001789,0.000751,0.000811
26,58.788594,16.195127,2023-02-01,26,2205158.0,650332.0,374830.0,209686.0,0.24231,0.383041,...,0.241821,0.249435,273.578369,271.992432,270.299072,267.951904,0.000537,0.001789,0.000751,0.000577


In [53]:
if save: weather_final.to_csv('../data/weather_monthly/weather_final.csv', index=False)