In [1]:
import xarray as xr
import pandas as pd
from pathlib import Path
import os
import cdsapi
from matplotlib import pyplot as plt
import numpy as np
from pyproj import Transformer

import zipfile

from urllib.parse import quote
from tqdm import tqdm

import hopsworks

from download_utils import sample_era5_to_points, download_monthly_weather_from_obs

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Notebook-level switch: the sampled weather table is written to CSV further down only when this is True.
save = False

# Open observation data

In [None]:
# Load the beetle observations, parse `Date`, and add a `Month` column holding
# the timestamp at the start of each observation's calendar month.
obs = (
    pd.read_csv('../data/beetle/artportalen/artportalen_final.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .assign(Month=lambda frame: frame['Date'].dt.to_period("M").dt.to_timestamp())
)
obs

# Download weather data

### 1. We don't need weather data for all days and all coordinates. We can specify the time and area windows of interest.

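For the time window, take every observation date and create a 30-day window around it. The cell below first tries to open a previously downloaded weather file and only downloads the data when that file cannot be opened.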

In [None]:
# Reuse the locally cached NetCDF file when it can be opened; otherwise fall
# back to fetching the monthly weather data for the observations.
try:
    weather = xr.open_dataset('../data/weather_monthly/weather.nc', engine='netcdf4')
except OSError as err:
    # Only a missing or unreadable cache file (FileNotFoundError is an OSError)
    # triggers the download. A bare `except` would also swallow
    # KeyboardInterrupt and unrelated errors such as a NameError.
    print(f"Could not open cached weather file ({err}); downloading instead.")
    weather = download_monthly_weather_from_obs(obs)

weather

### 2. Open weather data and map it to the beetle data

In [None]:
# Sample the weather dataset at the observations, then keep only complete rows.
weather_final = sample_era5_to_points(obs_df=obs, ds=weather)
n_sampled = len(weather_final)

weather_final = weather_final.dropna()
n_complete = len(weather_final)

# Report how many rows survived the NaN filter.
print(f"Datapoints: {n_complete}")
print(f"{int(n_complete/n_sampled*100)}% of the data was kept")

# Order the columns alphabetically and preview the first rows.
weather_final = weather_final[sorted(weather_final.columns)]
weather_final.head(5)

In [None]:
# Write the sampled table to disk only when the notebook-level `save` flag is on.
if save:
    weather_final.to_csv('../data/weather_monthly/weather_final.csv', index=False)

# Save to Hopsworks


In [3]:
# Reload the saved table, make `Month` a timezone-naive datetime, drop the row
# id and the listed weather columns, and lowercase every column name.
weather_final = (
    pd.read_csv('../data/weather_monthly/weather_final.csv')
    .assign(Month=lambda frame: pd.to_datetime(frame['Month']).dt.tz_localize(None))
    .drop(columns=['row_id', 'ssrd', 'swvl1', 'swvl2', 't2m', 'tp'])
    .rename(columns=str.lower)
)
weather_final


Unnamed: 0,lat,lon,month,ssrd_lag1,ssrd_lag2,ssrd_lag3,swvl1_lag1,swvl1_lag2,swvl1_lag3,swvl2_lag1,swvl2_lag2,swvl2_lag3,t2m_lag1,t2m_lag2,t2m_lag3,tp_lag1,tp_lag2,tp_lag3
0,58.210249,15.000075,2022-11-01,2261682.0,4920779.5,16562478.0,0.360443,0.224594,0.381378,0.360733,0.214890,0.388428,281.27808,284.28370,292.22583,0.001028,0.001129,0.002190
1,58.283784,16.660710,2022-11-01,2261682.0,4920779.5,16562478.0,0.360443,0.224594,0.381378,0.360733,0.214890,0.388428,281.27808,284.28370,292.22583,0.001028,0.001129,0.002190
2,58.829263,15.890906,2023-02-01,650332.0,374830.0,887550.0,0.383041,0.246750,0.471695,0.379745,0.241821,0.466431,271.99243,270.29907,278.32690,0.001789,0.000751,0.000811
3,58.788818,16.195337,2023-02-01,650332.0,374830.0,887550.0,0.383041,0.246750,0.471695,0.379745,0.241821,0.466431,271.99243,270.29907,278.32690,0.001789,0.000751,0.000811
4,58.788594,16.195127,2023-02-01,650332.0,374830.0,209686.0,0.383041,0.246750,0.254562,0.379745,0.241821,0.249435,271.99243,270.29907,267.95190,0.001789,0.000751,0.000577
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,60.452108,17.513017,2024-05-01,10555996.0,6092584.0,1695332.0,0.258118,0.287598,0.268723,0.260162,0.285065,0.267624,277.09230,272.86133,271.13892,0.001892,0.001878,0.001030
4406,59.276186,17.977477,2024-06-01,22673676.0,12157532.0,6689808.0,0.185318,0.269867,0.277710,0.198425,0.273224,0.279846,286.55542,275.86182,273.48633,0.000593,0.001816,0.001505
4407,59.276611,17.976767,2024-10-01,10203214.0,14195300.0,16541118.0,0.215347,0.226532,0.224701,0.199615,0.226257,0.206985,286.89136,288.63477,290.74023,0.002375,0.001779,0.003611
4408,63.828780,20.293920,2024-10-01,10203214.0,14195300.0,16950942.0,0.215347,0.226532,0.253296,0.199615,0.226257,0.239532,286.89136,288.63477,290.53516,0.002375,0.001779,0.003995


In [4]:
# Log in to Hopsworks with the API key taken from the environment, then grab
# the project's feature store handle used by the cells below.
project = hopsworks.login(
    api_key_value=os.environ.get('HOPSWORKS_API_KEY'),
)
fs = project.get_feature_store()

2026-01-05 18:55:37,744 INFO: Initializing external client
2026-01-05 18:55:37,745 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-05 18:55:37,745 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-05 18:55:39,199 INFO: Python Engine initialized.
2026-01-05 18:55:39,199 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1286351

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1286351


In [5]:
from hsfs.feature import Feature

# Remove timezone info from month column before inserting
weather_final['month'] = weather_final['month'].dt.tz_localize(None)

# Explicitly set the feature schema to avoid dictionary serialization issues:
# `month` is declared as a timestamp and every other column as a double.
features = [Feature("month", "timestamp")]
features.extend(
    Feature(column, "double")
    for column in weather_final.columns
    if column != 'month'
)

# Fetch the feature group if it already exists, otherwise create it.
weather_fg = fs.get_or_create_feature_group(
    name='weather',
    version=2,
    description='Weather variables at each coordinate for a given month',
    primary_key=['lat', 'lon', 'month'],
    event_time="month",
    online_enabled=False,
    features=features,
)

# Upload the rows; `wait_for_job` is set so the call waits on the ingestion job.
weather_fg.insert(weather_final, write_options={"wait_for_job": True})

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1286351/fs/1273971/fg/1911127


Uploading Dataframe: 100.00% |██████████| Rows 4410/4410 | Elapsed Time: 00:03 | Remaining Time: 00:00



Launching job: weather_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286351/jobs/named/weather_2_offline_fg_materialization/executions
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286351/jobs/named/weather_2_offline_fg_materialization/executions
2026-01-05 18:56:18,436 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2026-01-05 18:56:18,436 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2026-01-05 18:56:21,642 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2026-01-05 18:56:21,642 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2026-01-05 18:56:24,829 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2026-01-05 18:56:24,829 INFO: Waiting for execution to finish. 

(Job('weather_2_offline_fg_materialization', 'SPARK'), None)