In [34]:
import pandas as pd
import ee

# Run the Earth Engine authentication step, then initialise the client library.
# Both calls must complete before any other `ee.*` call in this notebook.
# NOTE(review): no arguments are passed to either call, so whatever account or
# project is used comes from the environment's defaults — confirm.
ee.Authenticate()
ee.Initialize()

In [35]:
# Load the observation table used for NDVI sampling.
# The monthly weather table is preferred; the beetle (artportalen) table is the
# fallback, used ONLY when the weather file does not exist. Any other failure
# (missing column, unparsable dates, ...) is raised instead of silently
# switching to a different dataset.
try:
    obs = pd.read_csv('./data/weather_monthly/weather_final.csv')
    # The weather table stores its timestamp in `time`; normalise it to `Date`.
    obs['Date'] = pd.to_datetime(obs['time'])
    obs = obs.drop(columns=['Unnamed: 0', 'time'])
except FileNotFoundError:
    obs = pd.read_csv('./data/beetle/artportalen_final.csv')
    # The beetle table already has a `Date` column; only its dtype is converted.
    obs['Date'] = pd.to_datetime(obs['Date'])
    obs = obs.drop(columns=['Unnamed: 0'])

obs.head(5)

Unnamed: 0,Lat,Lon,t2m,tp,swvl1,swvl2,ssrd,Date
0,64.024023,20.65091,284.20142,0.001569,0.215851,0.208481,9420792.0,2018-09-01
1,56.729677,15.956413,287.385,0.000978,0.212662,0.211121,10295400.0,2018-09-01
2,61.714395,17.372628,287.198,0.001399,0.103455,0.144211,22301380.0,2018-06-01
3,56.730931,15.906116,287.15454,0.001042,0.216354,0.214478,10131648.0,2018-09-01
4,64.02516,20.649693,284.09204,0.001598,0.217621,0.210419,9403106.0,2018-09-01


# Download NDVI data
We use Google Earth Engine to download the MODIS NDVI values for the observation points.

In [None]:
def df_to_ee_points(df):
    """
    Build an Earth Engine FeatureCollection holding one point feature per row of `df`.

    Each feature is placed at (row["Lon"], row["Lat"]) and carries two properties:
    "row_id" (the row's index label in `df`) and "month" (row["Date"] formatted
    as "%Y-%m-%d").
    """
    def to_feature(label, record):
        # Earth Engine points take longitude first, then latitude.
        location = ee.Geometry.Point(record["Lon"], record["Lat"])
        properties = {
            "row_id": label,
            "month": record["Date"].strftime("%Y-%m-%d"),
        }
        return ee.Feature(location, properties)

    return ee.FeatureCollection(
        [to_feature(label, record) for label, record in df.iterrows()]
    )

# Convert every row of `obs` into an Earth Engine point feature; the resulting
# collection is reused for each monthly NDVI sample further down.
points_fc = df_to_ee_points(obs)


In [None]:
def monthly_ndvi_image(dataset, year, month):
    """
    Build the mean NDVI image of `dataset` for one calendar month.

    The collection is filtered to [first day of `month`, first day of the next
    month). When that window contains images, the result is their mean,
    multiplied by 0.0001, with its band renamed to "NDVI" and tagged with
    "year" and "month" properties. Otherwise an `ee.Image()` renamed to "NDVI"
    is returned in its place.
    """
    month_start = ee.Date.fromYMD(year, month, 1)
    in_month = dataset.filterDate(month_start, month_start.advance(1, "month"))

    # Mean composite of the month, rescaled and labelled.
    composite = (
        in_month.mean()
        .multiply(0.0001)
        .rename("NDVI")
        .set("year", year)
        .set("month", month)
    )

    # Stand-in used when the month has no images at all.
    placeholder = ee.Image().rename("NDVI")

    has_images = in_month.size().gt(0)
    return ee.Image(ee.Algorithms.If(has_images, composite, placeholder))

def extract_monthly_ndvi(dataset, df, points_fc, scale=250):
    """
    Sample the monthly mean NDVI at every point, for every calendar month found in `df`.

    Parameters
    ----------
    dataset : Earth Engine image collection, forwarded to `monthly_ndvi_image`.
    df : pandas.DataFrame with a datetime "Date" column. Only the distinct
        (year, month) pairs are used; missing dates (NaT) are ignored.
    points_fc : Earth Engine feature collection whose features carry a
        "row_id" property (see `df_to_ee_points`).
    scale : value passed as `scale` to `sampleRegions`. Defaults to 250, the
        value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame with columns "row_id", "month", "ndvi", "lon", "lat":
    one row per feature returned by Earth Engine for each month. The columns
    are present even when nothing was sampled, so downstream sorting/grouping
    on them does not fail on an empty result.
    """
    columns = ["row_id", "month", "ndvi", "lon", "lat"]
    results = []

    # Distinct months, each represented by the timestamp of its first day.
    months = (
        df["Date"]
        .dropna()
        .dt.to_period("M")
        .dt.to_timestamp()
        .drop_duplicates()
    )

    # Iterate through all months. For each month, sample data for all coordinates.
    for m in months:
        # Mean NDVI image for this month.
        img = monthly_ndvi_image(dataset, m.year, m.month)

        # Read the image at every point; geometries=True keeps coordinates in the output.
        sampled = img.sampleRegions(
            collection=points_fc,
            scale=scale,
            geometries=True
        )

        # Blocking round trip to Earth Engine: one request per month.
        info = sampled.getInfo()

        for f in info["features"]:
            props = f["properties"]
            geom = f["geometry"]["coordinates"]

            results.append({
                "row_id": props["row_id"],
                "month": m,
                # .get() leaves None when a feature has no NDVI property.
                "ndvi": props.get("NDVI"),
                "lon": geom[0],
                "lat": geom[1],
            })

    return pd.DataFrame(results, columns=columns)


In [57]:
# MODIS is a dataset containing a time series of satellite images.
# Only the "NDVI" band of the MOD13Q1 collection is kept.
modis = ee.ImageCollection("MODIS/061/MOD13Q1").select("NDVI")
# Sample the monthly NDVI for every point in `points_fc`; this issues one
# Earth Engine request per distinct month in `obs`.
ndvi_df = extract_monthly_ndvi(modis, obs, points_fc)
ndvi_df

In [65]:
# Order the rows by location and then by month, so each point's time series is
# contiguous. The original index labels are kept (no reset_index).
ndvi_df = ndvi_df.sort_values(['lat', 'lon', 'month'])
ndvi_df

Unnamed: 0,row_id,month,ndvi,lon,lat
8371,541,2018-02-01,0.59925,13.320893,55.541711
2517,541,2018-03-01,0.87220,13.320893,55.541711
6474,541,2018-04-01,0.86870,13.320893,55.541711
5487,541,2018-05-01,0.84585,13.320893,55.541711
1528,541,2018-06-01,0.69680,13.320893,55.541711
...,...,...,...,...,...
80388,120,2025-08-01,0.74490,20.740977,67.891300
81378,120,2025-09-01,0.80555,20.740977,67.891300
82369,120,2025-10-01,0.59580,20.740977,67.891300
84345,120,2025-11-01,0.82175,20.740977,67.891300


## 2. Create features
- Anomalies: compare NDVI for a month vs. climatology (average of that month for previous years)
- Lagged features

In [71]:
# `month_num` (calendar month number) is the grouping key for the climatology.
# Derive it from `month` when it is not there yet, so this cell also works on a
# fresh kernel instead of relying on a column created in an earlier session.
if "month_num" not in ndvi_df.columns:
    ndvi_df = ndvi_df.assign(month_num=pd.to_datetime(ndvi_df["month"]).dt.month)

# Climatology: mean NDVI per location (lat, lon) and calendar month.
# NOTE(review): the mean is taken over every row in `ndvi_df` for that month,
# i.e. all years including the one later compared against it — not only the
# previous years. Confirm this is the intended definition.
ndvi_climatology = climatology = (
    ndvi_df
    .groupby(["lat", "lon", "month_num"], as_index=False)
    .agg(NDVI_clim=("ndvi", "mean"))
)
ndvi_climatology

Unnamed: 0,lat,lon,month_num,NDVI_clim
0,55.541711,13.320893,1,0.574180
1,55.541711,13.320893,2,0.502721
2,55.541711,13.320893,3,0.625488
3,55.541711,13.320893,4,0.636381
4,55.541711,13.320893,5,0.636600
...,...,...,...,...
9232,67.891300,20.740977,8,0.506644
9233,67.891300,20.740977,9,0.707043
9234,67.891300,20.740977,10,0.569900
9235,67.891300,20.740977,11,0.449963


In [72]:
# Attach the climatological mean (NDVI_clim) to every monthly NDVI row.
# A NDVI_clim column left over from a previous run of this cell is dropped
# first; otherwise re-running the cell would produce suffixed duplicates
# (NDVI_clim_x / NDVI_clim_y) instead of a single column.
ndvi_df = (
    ndvi_df
    .drop(columns=["NDVI_clim"], errors="ignore")
    .merge(
        climatology,
        on=["lat", "lon", "month_num"],
        how="left",
        # Each (lat, lon, month_num) key must match at most one climatology row.
        validate="many_to_one",
    )
)
ndvi_df

Unnamed: 0,row_id,month,ndvi_x,lon,lat,month_num,ndvi_y,ndvi,NDVI_clim
0,541,2018-02-01,0.59925,13.320893,55.541711,2,0.59925,0.59925,0.502721
1,541,2018-03-01,0.87220,13.320893,55.541711,3,0.87220,0.87220,0.625488
2,541,2018-04-01,0.86870,13.320893,55.541711,4,0.86870,0.86870,0.636381
3,541,2018-05-01,0.84585,13.320893,55.541711,5,0.84585,0.84585,0.636600
4,541,2018-06-01,0.69680,13.320893,55.541711,6,0.69680,0.69680,0.742688
...,...,...,...,...,...,...,...,...,...
86199,120,2025-08-01,0.74490,20.740977,67.891300,8,0.74490,0.74490,0.506644
86200,120,2025-09-01,0.80555,20.740977,67.891300,9,0.80555,0.80555,0.707043
86201,120,2025-10-01,0.59580,20.740977,67.891300,10,0.59580,0.59580,0.569900
86202,120,2025-11-01,0.82175,20.740977,67.891300,11,0.82175,0.82175,0.449963
