In [13]:
import pandas as pd
import ee
from tqdm import tqdm
import os

from download_utils import download_monthly_ndvi

import hopsworks

ee.Authenticate()
ee.Initialize()

In [14]:
save = False


# Open obs data

In [18]:
obs = pd.read_csv('../data/beetle/artportalen/artportalen_final.csv')
obs["month"] = pd.to_datetime(obs['date']).dt.to_period("M").dt.to_timestamp()

obs.drop(columns=['kommun', 'lan','quantity','date','pressence'], inplace=True)

print(f"Obs has {len(obs)} datapoints")
obs.head(5)

Obs has 5022 datapoints


Unnamed: 0,row_id,lat,lon,month
0,0,58.788614,15.821428,2021-04-01
1,1,58.788632,15.817157,2021-04-01
2,2,58.78698,15.816672,2021-04-01
3,3,58.786361,15.815842,2021-04-01
4,4,58.835413,15.509221,2021-05-01


# Download NDVI data
We will use Google Earth Engine to get this data

In [25]:
# MODIS is a dataset containing a time series of satellite images 
try:
    ndvi_df = pd.read_csv('../data/ndvi/backfill_ndvi_raw.csv')
    ndvi_df['Month'] = pd.to_datetime(ndvi_df['Month'])

except:
    modis = ee.ImageCollection("MODIS/061/MOD13Q1").select("NDVI")
    ndvi_df = download_monthly_ndvi(
        dataset=modis, points_df=obs[["row_id", "Lat", "Lon"]], months=obs["Month"].unique()
        )
    
ndvi_df

Unnamed: 0,row_id,Month,NDVI,Lon,Lat
0,0,2018-02-01,0.29380,15.820455,58.789121
1,1,2018-02-01,0.27730,15.818209,58.789121
2,2,2018-02-01,0.18505,15.815963,58.786875
3,3,2018-02-01,0.18505,15.815963,58.786875
4,4,2018-02-01,0.23960,15.508290,58.836282
...,...,...,...,...,...
454502,5016,2025-12-01,0.00750,17.976412,59.276457
454503,5017,2025-12-01,0.00750,17.976412,59.276457
454504,5019,2025-12-01,0.11590,12.211473,57.668473
454505,5020,2025-12-01,-0.00850,17.358820,60.596980


In [None]:
if save: ndvi_df.to_csv('../data/ndvi/backfill_ndvi_raw.csv', index=False)


## 2. Create features

### 2.1 Anomalies: compare NDVI for a month vs. climatology (average of that month for previous years)

In [None]:
ndvi_df["Month_num"] = ndvi_df["Month"].dt.month

ndvi_climatology = (
    ndvi_df
    .groupby(["Lat", "Lon", "Month_num"], as_index=False)
    .agg(NDVI_clim=("NDVI", "mean"))
)
ndvi_climatology

In [None]:
ndvi_features = ndvi_df.merge(ndvi_climatology, on=["Lat", "Lon", "Month_num"],how="left")
ndvi_features

In [None]:
ndvi_features["NDVI_anom"] = ndvi_features["NDVI"] - ndvi_features["NDVI_clim"]
ndvi_features

### 2.2 Lagged features

In [None]:
ndvi_features = ndvi_features.sort_values(["Lat", "Lon", "Month"])

MAX_LAG = 2

for lag in range(1, MAX_LAG + 1):
    ndvi_features[f"NDVI_lag{lag}"] = (
        ndvi_features
        .groupby(["Lat", "Lon"])["NDVI"]
        .shift(lag)
    )

    ndvi_features[f"NDVI_anom_lag{lag}"] = (
        ndvi_features
        .groupby(["Lat", "Lon"])["NDVI_anom"]
        .shift(lag)
    )

ndvi_features


In [None]:
if save: ndvi_features.to_csv('../data/ndvi/backfill_ndvi_features.csv', index=False)

## Map back to beetle dataset schema (1 row = 1 observation, 1 coordinate, 1 day)

In [29]:
ndvi_features = pd.read_csv('../data/ndvi/backfill_ndvi_features.csv')
ndvi_features["Month"] = pd.to_datetime(ndvi_features['Month']).dt.tz_localize(None)
ndvi_features.columns = [c.lower() for c in ndvi_features.columns]
ndvi_features

Unnamed: 0,row_id,month,ndvi,lon,lat,month_num,ndvi_clim,ndvi_anom,ndvi_lag1,ndvi_anom_lag1,ndvi_lag2,ndvi_anom_lag2
0,2550,2018-02-01,0.73495,13.320893,55.541711,2,0.645775,0.089175,,,,
1,2550,2018-03-01,0.42985,13.320893,55.541711,3,0.554850,-0.125000,0.73495,0.089175,,
2,2550,2018-04-01,0.75935,13.320893,55.541711,4,0.621519,0.137831,0.42985,-0.125000,0.73495,0.089175
3,2550,2018-05-01,0.86870,13.320893,55.541711,5,0.757869,0.110831,0.75935,0.137831,0.42985,-0.125000
4,2550,2018-06-01,0.87220,13.320893,55.541711,6,0.803556,0.068644,0.86870,0.110831,0.75935,0.137831
...,...,...,...,...,...,...,...,...,...,...,...,...
454502,2124,2025-07-01,0.82175,20.740977,67.891300,7,0.786956,0.034794,0.74490,-0.038844,0.64265,0.196006
454503,2124,2025-08-01,0.80555,20.740977,67.891300,8,0.769225,0.036325,0.82175,0.034794,0.74490,-0.038844
454504,2124,2025-09-01,0.74025,20.740977,67.891300,9,0.673638,0.066613,0.80555,0.036325,0.82175,0.034794
454505,2124,2025-10-01,0.59580,20.740977,67.891300,10,0.363193,0.232607,0.74025,0.066613,0.80555,0.036325


In [33]:
ndvi_final = obs.merge(ndvi_features.drop(columns=["lat", "lon"]), on=["row_id", "month"], how="left")
ndvi_final.drop(columns=['month_num'], inplace=True)
ndvi_final['month'] = ndvi_final['month'].dt.tz_localize(None)
ndvi_final

Unnamed: 0,row_id,lat,lon,month,ndvi,ndvi_clim,ndvi_anom,ndvi_lag1,ndvi_anom_lag1,ndvi_lag2,ndvi_anom_lag2
0,0,58.788614,15.821428,2021-04-01,0.67775,0.712356,-0.034606,0.71975,0.115750,0.45875,-0.177544
1,1,58.788632,15.817157,2021-04-01,0.69475,0.690738,0.004012,0.71860,0.103387,0.45815,-0.176888
2,2,58.786980,15.816672,2021-04-01,0.64620,0.652888,-0.006687,0.66210,0.093669,0.66210,0.093669
3,3,58.786361,15.815842,2021-04-01,0.64620,0.652888,-0.006687,0.64620,-0.006687,0.66210,0.093669
4,4,58.835413,15.509221,2021-05-01,0.64880,0.678756,-0.029956,0.51110,-0.088069,0.61880,0.030450
...,...,...,...,...,...,...,...,...,...,...,...
5017,5017,59.276611,17.976767,2024-10-01,0.55590,0.536143,0.019757,0.55590,0.019757,0.60900,-0.020437
5018,5018,63.828780,20.293920,2024-10-01,0.47930,0.440079,0.039221,0.61105,0.037662,0.68690,0.034394
5019,5019,57.668491,12.210855,2024-09-01,0.78610,0.767144,0.018956,0.80535,0.035537,0.80110,0.064681
5020,5020,60.596110,17.358899,2024-09-01,0.81350,0.794363,0.019137,0.84515,0.005481,0.83440,-0.020612


In [None]:
if save: ndvi_final.to_csv('../data/ndvi/backfill_ndvi_final.csv', index=False)

# Save to Hopsworks

In [35]:
ndvi_final = pd.read_csv('../data/ndvi/backfill_ndvi_final.csv')
ndvi_final['month'] = pd.to_datetime(ndvi_final['month']).dt.tz_localize(None)
ndvi_final.drop(columns=["ndvi_clim", "ndvi", "row_id","ndvi_anom", "ndvi_anom_lag1", "ndvi_anom_lag2"], inplace=True)
ndvi_final.columns = [c.lower() for c in ndvi_final.columns]
ndvi_final

Unnamed: 0,lat,lon,month,ndvi_lag1,ndvi_lag2
0,58.788614,15.821428,2021-04-01,0.71975,0.45875
1,58.788632,15.817157,2021-04-01,0.71860,0.45815
2,58.786980,15.816672,2021-04-01,0.66210,0.66210
3,58.786361,15.815842,2021-04-01,0.64620,0.66210
4,58.835413,15.509221,2021-05-01,0.51110,0.61880
...,...,...,...,...,...
5017,59.276611,17.976767,2024-10-01,0.55590,0.60900
5018,63.828780,20.293920,2024-10-01,0.61105,0.68690
5019,57.668491,12.210855,2024-09-01,0.80535,0.80110
5020,60.596110,17.358899,2024-09-01,0.84515,0.83440


In [36]:
project = hopsworks.login(api_key_value=os.getenv('HOPSWORKS_API_KEY'))
fs = project.get_feature_store() 

2026-01-05 18:53:22,272 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-01-05 18:53:22,274 INFO: Initializing external client
2026-01-05 18:53:22,274 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-05 18:53:23,436 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1286351


In [37]:
ndvi_final['month'] = ndvi_final['month'].dt.tz_localize(None)

ndvi_fg = fs.get_or_create_feature_group(
    name='ndvi',
    description='NDVI values for different months at specific coordinates.',
    version=2,
    primary_key=['lat', 'lon','month'],
    event_time="month"
)

ndvi_fg.insert(ndvi_final)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1286351/fs/1273971/fg/1893811


Uploading Dataframe: 100.00% |██████████| Rows 5022/5022 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: ndvi_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286351/jobs/named/ndvi_2_offline_fg_materialization/executions


(Job('ndvi_2_offline_fg_materialization', 'SPARK'), None)