# Feature Backfill

1. Get historical avalanche warnings for the resort locations
2. Get historical weather data + terrain data
3. Store the data in three Hopsworks feature groups: `avalanche_warning`, `weather_sensor`, and `terrain_data`

## Imports

In [4]:
import hopsworks
import sys
from pathlib import Path
import warnings
from dotenv import load_dotenv
import os
from util import *
# The imports below stay after the star-import on purpose: a star-import can
# rebind names, so reordering could change which binding wins.
import datetime
import time
import numpy as np
import pandas as pd
import rasterio
from rasterio.mask import mask
from shapely.geometry import Point, box
from pyproj import Transformer
from tqdm import tqdm
from locations import resort_locations
from dateutil.relativedelta import relativedelta
import great_expectations as ge
warnings.filterwarnings("ignore", module="IPython")

## Connect to Hopsworks

In [5]:
# Load variables from a local .env file (if one exists) into the process
# environment. Keep the .env file out of version control; in CI, provide
# HOPSWORKS_API_KEY as an environment variable / secret instead.
load_dotenv()

# os.getenv returns None when the variable is unset; in that case
# hopsworks.login() is left to resolve credentials by its own defaults.
api_key_value = os.getenv("HOPSWORKS_API_KEY")

project = hopsworks.login(
    host="eu-west.cloud.hopsworks.ai",             # DNS of your Hopsworks instance
    project="ID2223_Project",
    api_key_value=api_key_value,                   # use the key that was just read
)

2025-12-28 18:22:07,103 INFO: Initializing external client
2025-12-28 18:22:07,104 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2025-12-28 18:22:08,085 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/2173


## Historical warnings data

In [4]:
# Backfill window: the last five years up to today, as 'YYYY-MM-DD' strings.
os.makedirs("historical data", exist_ok=True)
today = datetime.datetime.now()
start_date = (today - relativedelta(years=5)).strftime('%Y-%m-%d')
end_date = today.strftime('%Y-%m-%d')

# Local cache of the fetched warnings, so re-running the notebook does not
# have to call the warnings service again.
warnings_cache = "historical data/warnings.pkl"
warning_columns = ["location", "latitude", "longitude", "date", "warning_level"]

if not os.path.exists(warnings_cache):
    rows = []

    for location, (lat, lon) in resort_locations.items():
        print(f"Fetching {location}")

        # The history is requested in 60-day chunks (presumably to stay within
        # the service's per-request limits — confirm against util.get_warning_data).
        for chunk_start, chunk_end in date_chunks(start_date, end_date, chunk_days=60):
            # Deliberately not named `warnings`: that would rebind the stdlib
            # `warnings` module imported at the top of the notebook.
            chunk_warnings = get_warning_data(chunk_start, chunk_end, lat, lon)

            for w in chunk_warnings:
                rows.append({
                    "location": location,
                    "latitude": lat,
                    "longitude": lon,
                    "date": w.get("ValidFrom"),
                    "warning_level": w.get("DangerLevel")
                })

            # Short pause between consecutive requests.
            time.sleep(0.2)

    # Passing the column list keeps the frame well-formed even when nothing
    # was returned (otherwise the "date" lookup below raises KeyError).
    warning_data_df = pd.DataFrame(rows, columns=warning_columns)
    warning_data_df["date"] = pd.to_datetime(warning_data_df["date"]).dt.date

    if warning_data_df.empty:
        # Do not cache an empty result: it would hide the problem on the next run.
        print("No warnings were returned; not writing the cache file.")
    else:
        warning_data_df.to_pickle(warnings_cache)
else:
    # Pickle files can execute code when loaded; only this notebook's own
    # cache file should ever be read here.
    warning_data_df = pd.read_pickle(warnings_cache)

In [4]:
# Spot-check the warning history of a single resort.
warning_data_df.loc[warning_data_df["location"] == "Strandafjellet Skisenter"]


Unnamed: 0,location,latitude,longitude,date,warning_level
1827,Strandafjellet Skisenter,62.399663,6.899585,2020-12-23,2
1828,Strandafjellet Skisenter,62.399663,6.899585,2020-12-24,2
1829,Strandafjellet Skisenter,62.399663,6.899585,2020-12-25,3
1830,Strandafjellet Skisenter,62.399663,6.899585,2020-12-26,2
1831,Strandafjellet Skisenter,62.399663,6.899585,2020-12-27,3
...,...,...,...,...,...
3649,Strandafjellet Skisenter,62.399663,6.899585,2025-12-19,2
3650,Strandafjellet Skisenter,62.399663,6.899585,2025-12-20,1
3651,Strandafjellet Skisenter,62.399663,6.899585,2025-12-21,1
3652,Strandafjellet Skisenter,62.399663,6.899585,2025-12-22,1


In [5]:
# Total number of warning rows across all resorts.
warning_data_df.shape[0]

36540

## Historical weather data

In [11]:
# Fetch the daily weather history of every resort and stack it into one frame.
dfs = []

# The read-back check at the end of the notebook prints 2020-12-20 as the
# earliest date in the avalanche_warning feature group; the weather history
# starts on the same day.
start_date = '2020-12-20'
end_date = datetime.datetime.now().strftime('%Y-%m-%d')
print(start_date, end_date)

batch_size = 5       # number of requests sent before pausing
batch_pause_s = 60   # length of the pause between batches, in seconds

resort_items = list(resort_locations.items())
for i, (loc, (lat, lon)) in enumerate(resort_items, start=1):
    # NOTE(review): coordinates are passed as (lon, lat) — confirm this matches
    # the parameter order of get_historical_weather.
    dfs.append(get_historical_weather(loc, start_date, end_date, lon, lat))

    # Pause after every full batch, but not after the very last request,
    # where waiting would only delay the cell.
    if i % batch_size == 0 and i < len(resort_items):
        time.sleep(batch_pause_s)


weather_df = pd.concat(dfs, ignore_index=True)

2020-12-20 2025-12-28


In [12]:
# Preview the combined weather frame (the notebook shows a truncated view).
weather_df

Unnamed: 0,date,temperature_2m_mean,precipitation_sum,rain_sum,snowfall_sum,wind_speed_10m_max,wind_direction_10m_dominant,location
0,2020-12-20,2.687583,4.500000,3.5,0.70,10.966713,74.822220,Narvik Ski Resort
1,2020-12-21,4.916750,2.000000,1.4,0.42,8.913181,162.180984,Narvik Ski Resort
2,2020-12-22,-2.962416,0.000000,0.0,0.00,13.896187,67.628601,Narvik Ski Resort
3,2020-12-23,-1.927000,0.200000,0.0,0.14,12.849528,73.340157,Narvik Ski Resort
4,2020-12-24,-0.122833,0.900000,0.0,0.63,8.707238,79.654022,Narvik Ski Resort
...,...,...,...,...,...,...,...,...
36695,2025-12-24,-9.267750,0.000000,0.0,0.00,5.991594,53.972527,Bjorli Ski
36696,2025-12-25,-2.288583,0.000000,0.0,0.00,9.499158,310.079987,Bjorli Ski
36697,2025-12-26,-4.601083,0.100000,0.0,0.07,12.549757,21.219721,Bjorli Ski
36698,2025-12-27,-4.432333,10.400001,0.3,7.07,20.693735,298.031555,Bjorli Ski


## Terrain data

In [7]:
# --- Terrain extraction settings ---
dtm_dir = "DTM10_UTM33_20251219"      # folder that holds the elevation .tif tiles
buffer_m = 5000                       # radius of the area sampled around each resort
output_csv = "terrain_features.csv"   # where the per-resort summary is written

# Converts EPSG:4326 coordinates into EPSG:25833; with always_xy=True the
# transformer takes its inputs in (x, y) = (lon, lat) order.
transformer = Transformer.from_crs("EPSG:4326", "EPSG:25833", always_xy=True)

# Paths of all .tif files located directly inside dtm_dir.
tiles = []
for entry in os.listdir(dtm_dir):
    if entry.endswith(".tif"):
        tiles.append(os.path.join(dtm_dir, entry))

def get_tile_bounds(tile_path):
    """Return the bounding rectangle of a raster file as a shapely polygon.

    The file is opened only long enough to read its ``bounds``; the four
    values are passed to ``shapely.geometry.box`` in the order they are stored.
    """
    with rasterio.open(tile_path) as src:
        first, second, third, fourth = src.bounds
    return box(first, second, third, fourth)

# Read every tile's bounding polygon once up front, keyed by tile path.
tile_boxes = {}
for tile_path in tiles:
    tile_boxes[tile_path] = get_tile_bounds(tile_path)

def compute_slope(dem, pixel_size=10):
    """Return the slope angle, in degrees, at every cell of an elevation grid.

    Cells whose value is -9999 or lower are replaced by NaN before the
    gradient is taken, so any slope cell whose gradient uses such a value is
    NaN as well. The input array itself is left unchanged.

    Parameters
    ----------
    dem : numpy.ndarray
        Two-dimensional elevation grid.
    pixel_size : number, default 10
        Spacing between neighbouring cells, used along both axes.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``dem``.
    """
    elevation = dem.astype(float)
    elevation = np.where(elevation <= -9999, np.nan, elevation)

    # np.gradient returns the derivative along rows first, then along columns.
    d_row, d_col = np.gradient(elevation, pixel_size, pixel_size)
    steepness = np.sqrt(d_col**2 + d_row**2)
    return np.rad2deg(np.arctan(steepness))

rows = []

# Elevations at or below this value are treated as "no data". This is the same
# threshold compute_slope() applies before taking the gradient.
nodata_threshold = -9999

for location, (lat, lon) in tqdm(resort_locations.items(), desc="Processing resorts"):
    # Project the resort position and build the circular sampling area around it.
    x, y = transformer.transform(lon, lat)
    geom = Point(x, y).buffer(buffer_m)

    elevations = []
    slopes = []

    for tile, bounds in tile_boxes.items():
        if not bounds.intersects(geom):
            continue

        with rasterio.open(tile) as src:
            try:
                out_img, _ = mask(src, [geom], crop=True)

                # First band, with no-data sentinels turned into NaN. Without
                # this, a sentinel whose neighbours are valid has a finite
                # slope and would be counted in the elevation statistics.
                dem_tile = out_img[0].astype(float)
                dem_tile[dem_tile <= nodata_threshold] = np.nan

                slope_tile = compute_slope(dem_tile)
            except ValueError:
                # Best effort: skip tiles that cannot contribute. rasterio's
                # mask() raises ValueError when the geometry does not overlap
                # the raster, and np.gradient raises it for crops that are too
                # small to differentiate.
                continue

            elevations.append(dem_tile.ravel())
            slopes.append(slope_tile.ravel())

    if not elevations:
        print(f"No terrain data for {location}")
        continue

    dem = np.concatenate(elevations)
    slope = np.concatenate(slopes)

    # Keep only the pixels where both elevation and slope are defined.
    valid = ~np.isnan(dem) & ~np.isnan(slope)
    dem = dem[valid]
    slope = slope[valid]

    if dem.size == 0:
        # np.min / np.max raise on an empty array, so skip the resort instead.
        print(f"No valid terrain pixels for {location}")
        continue

    rows.append({
        "location": location,
        "latitude": lat,
        "longitude": lon,
        "buffer_m": buffer_m,
        "mean_elevation": float(np.mean(dem)),
        "std_elevation": float(np.std(dem)),
        "min_elevation": float(np.min(dem)),
        "max_elevation": float(np.max(dem)),
        "mean_slope": float(np.mean(slope)),
        "std_slope": float(np.std(slope)),
        "steep_fraction_30deg": float(np.mean(slope > 30)),
        "steep_fraction_35deg": float(np.mean(slope > 35)),
    })

# Gather the per-resort records into one DataFrame, save it as CSV without
# the index column, and print it for inspection.
terrain_df = pd.DataFrame(rows)
terrain_df.to_csv(output_csv, index=False)

print(terrain_df)

Processing resorts: 100%|██████████| 20/20 [00:06<00:00,  3.31it/s]

                          location   latitude  longitude  buffer_m  \
0                Narvik Ski Resort  68.473753  17.429654      5000   
1         Strandafjellet Skisenter  62.399663   6.899585      5000   
2                     Skimore Oslo  60.163288  10.743012      5000   
3                        Norefjell  60.225045   9.552882      5000   
4                          Hafjell  61.298052  10.403024      5000   
5             Kvitfjell ski resort  61.510161  10.087528      5000   
6               Drammen ski center  59.823446  10.130625      5000   
7          Voss Resort Fjellheisar  60.666797   6.416377      5000   
8            Myrkdalen Fjellandsby  60.916092   6.531733      5000   
9    Nedre fjellheisstasjon Narvik  68.480971  17.404409      5000   
10               Skimore Kongsberg  59.726222   9.644142      5000   
11         Eikedalen Ski Center AS  60.485358   5.921808      5000   
12              Hemsedal Skisenter  60.910916   8.494637      5000   
13               Rau




## Add to Hopsworks

### Warnings

In [None]:
# Show column dtypes and non-null counts before writing to the feature store.
warning_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36540 entries, 0 to 36539
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   location       36540 non-null  object        
 1   latitude       36540 non-null  float32       
 2   longitude      36540 non-null  float32       
 3   date           36540 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float32(2), object(2)
memory usage: 1.1+ MB


In [None]:
# Cast every column to the dtype it should have in the feature group.
for coord_col in ("latitude", "longitude"):
    warning_data_df[coord_col] = warning_data_df[coord_col].astype("float32")
warning_data_df["warning_level"] = warning_data_df["warning_level"].astype("int32")
warning_data_df["date"] = pd.to_datetime(warning_data_df["date"], format="%Y-%m-%d")

# One row per (location, date); `date` doubles as the event time.
fs = project.get_feature_store()
warning_fg_settings = {
    "name": "avalanche_warning",
    "description": "Warnings for each day for the different resorts in Norway.",
    "version": 3,
    "primary_key": ["location", "date"],
    "event_time": "date",
}
warning_fg = fs.get_or_create_feature_group(**warning_fg_settings)

warning_fg.insert(warning_data_df)

# Attach a human-readable description to each feature.
warning_feature_descriptions = {
    "date": "Date of level of warning of avalanches.",
    "location": "Name of resort.",
    "latitude": "Latitude of resort.",
    "longitude": "Longitude of resort.",
    "warning_level": "Warning level.",
}
for feature_name, feature_text in warning_feature_descriptions.items():
    warning_fg.update_feature_description(feature_name, feature_text)


### Weather

In [13]:
# Validation suite that is attached to the weather feature group below.
weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)

def expect_greater_than_zero(col):
    """Register a lower-bound check for ``col`` on ``weather_expectation_suite``.

    The added expectation requires the column's minimum value to be strictly
    greater than -0.1 and no larger than 1000.0.

    Parameters
    ----------
    col : str
        Name of the column the expectation applies to.
    """
    weather_expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_min_to_be_between",
            kwargs={
                "column":col,
                "min_value":-0.1,
                "max_value":1000.0,
                "strict_min":True
            }
        )
    )

for checked_col in ("precipitation_sum", "rain_sum", "snowfall_sum", "wind_speed_10m_max"):
    expect_greater_than_zero(checked_col)

# Fetch the feature store handle here so this cell does not rely on `fs`
# having been defined by a different cell.
fs = project.get_feature_store()
weather_fg = fs.get_or_create_feature_group(
    name='weather_sensor',
    description='Weather characteristics of each day',
    version=2,
    primary_key=['location', 'date'],
    event_time="date",
    expectation_suite=weather_expectation_suite
)

# wait=True blocks until the insert has completed.
weather_fg.insert(weather_df, wait=True)

# NOTE(review): the column names look like Open-Meteo daily variables, whose
# wind speed is km/h unless another unit is requested — confirm that the
# fetch helper really asks for m/s before relying on the unit stated below.
weather_feature_descriptions = {
    "date": "Date of the daily weather measurement",
    "temperature_2m_mean": "Mean temperature at 2 meters above ground (°C)",
    "precipitation_sum": "Total daily precipitation including rain and snow (mm)",
    "rain_sum": "Total daily rainfall (mm)",
    "snowfall_sum": "Total daily snowfall (cm)",
    "wind_speed_10m_max": "Maximum wind speed at 10 meters above ground (m/s)",
    "wind_direction_10m_dominant": "Dominant wind direction during the day (degrees)",
    "location": "Location (resort) where weather measurements were collected",
}
for feature_name, feature_text in weather_feature_descriptions.items():
    weather_fg.update_feature_description(feature_name, feature_text)

Feature Group created successfully, explore it at 
https://eu-west.cloud.hopsworks.ai:443/p/2173/fs/2122/fg/2202
2025-12-28 18:30:57,291 INFO: 	4 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://eu-west.cloud.hopsworks.ai:443/p/2173/fs/2122/fg/2202


Uploading Dataframe: 100.00% |██████████| Rows 36700/36700 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_sensor_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://eu-west.cloud.hopsworks.ai:443/p/2173/jobs/named/weather_sensor_2_offline_fg_materialization/executions
2025-12-28 18:31:11,515 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-12-28 18:31:14,696 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-12-28 18:33:36,201 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-12-28 18:33:36,334 INFO: Waiting for log aggregation to finish.
2025-12-28 18:33:45,211 INFO: Execution finished successfully.


<hsfs.feature_group.FeatureGroup at 0x1e307a2b700>

### Terrain

In [None]:
# Terrain is static per resort, so `location` alone is the primary key.
fs = project.get_feature_store()
terrain_fg_settings = {
    "name": "terrain_data",
    "description": "Terrain characteristics of each resort location.",
    "version": 1,
    "primary_key": ['location'],
}
terrain_fg = fs.get_or_create_feature_group(**terrain_fg_settings)

terrain_fg.insert(terrain_df)

# Attach a human-readable description to each feature.
terrain_feature_descriptions = {
    "location": "Name of the ski resort.",
    "latitude": "Latitude of the ski resort.",
    "longitude": "Longitude of the ski resort.",
    "buffer_m": "Radius in meters used for terrain feature extraction.",
    "mean_elevation": "Mean terrain elevation within the buffer (meters above sea level).",
    "std_elevation": "Standard deviation of terrain elevation within the buffer (meters).",
    "min_elevation": "Minimum terrain elevation within the buffer (meters above sea level).",
    "max_elevation": "Maximum terrain elevation within the buffer (meters above sea level).",
    "mean_slope": "Mean terrain slope angle within the buffer (degrees).",
    "std_slope": "Standard deviation of terrain slope angle (degrees).",
    "steep_fraction_30deg": "Fraction of terrain area with slope angles > 30 degrees.",
    "steep_fraction_35deg": "Fraction of terrain area with slope angles > 35 degrees.",
}
for feature_name, feature_text in terrain_feature_descriptions.items():
    terrain_fg.update_feature_description(feature_name, feature_text)

Feature Group created successfully, explore it at 
https://eu-west.cloud.hopsworks.ai:443/p/2173/fs/2122/fg/2184


Uploading Dataframe: 100.00% |██████████| Rows 20/20 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: terrain_data_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://eu-west.cloud.hopsworks.ai:443/p/2173/jobs/named/terrain_data_1_offline_fg_materialization/executions


<hsfs.feature_group.FeatureGroup at 0x270161e5de0>

### Check: earliest date stored in `avalanche_warning`

In [7]:
# Read the stored warnings back and report the earliest date they cover.
fs = project.get_feature_store()
warning_fg = fs.get_feature_group("avalanche_warning", version=3)

df = warning_fg.read()
first_date = df["date"].min()
print(first_date)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.45s) 
2020-12-20 00:00:00+00:00
