# Feature Backfill

1. Get historical avalanche warnings for the resort locations
2. Get historical weather data + terrain data
3. Store the data in Hopsworks feature groups: `avalanche_warning`, `weather_sensor`, and `terrain_data`

#### Imports

In [1]:
import hopsworks
import sys
from pathlib import Path
import warnings
from dotenv import load_dotenv
import os
from util import *
import datetime
from locations import resort_locations
from dateutil.relativedelta import relativedelta
warnings.filterwarnings("ignore", module="IPython")

## Connect to Hopsworks

In [2]:
# Load HOPSWORKS_API_KEY from a local .env file when one is present.
# TODO: confirm this also works outside a local checkout (e.g. in CI), where
# no .env file exists and the key has to come from the environment instead.
load_dotenv()

api_key_value = os.getenv("HOPSWORKS_API_KEY")

project = hopsworks.login(
    host="eu-west.cloud.hopsworks.ai",             # DNS of your Hopsworks instance
    project="ID2223_Project",
    # Pass the key explicitly: it used to be read above but never used.
    # When it is None, login falls back to its own key lookup as before.
    api_key_value=api_key_value,
)

2025-12-23 14:25:48,039 INFO: Initializing external client
2025-12-23 14:25:48,040 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2025-12-23 14:25:49,557 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/2173


## Historical warnings data

In [3]:
# Backfill window: the five years up to today, as 'YYYY-MM-DD' strings.
# (For a quick smoke test, shrink the window, e.g. to the last day only.)
# NOTE(review): `time` and `pd` are not imported explicitly in this notebook;
# presumably they arrive through `from util import *` -- confirm.
WARNINGS_CACHE = "historical data/warnings.pkl"
WARNING_COLUMNS = ["location", "latitude", "longitude", "date", "warning_level"]

os.makedirs("historical data", exist_ok=True)
today = datetime.datetime.now()
start_date = (today - relativedelta(years=5)).strftime('%Y-%m-%d')
end_date = today.strftime('%Y-%m-%d')

if not os.path.exists(WARNINGS_CACHE):
    rows = []

    for location, (lat, lon) in resort_locations.items():
        print(f"Fetching {location}")

        # The window is requested in 60-day chunks rather than in one call.
        for chunk_start, chunk_end in date_chunks(start_date, end_date, chunk_days=60):
            # Named `chunk_warnings` so it does not shadow the `warnings`
            # module imported at the top of the notebook.
            chunk_warnings = get_warning_data(chunk_start, chunk_end, lat, lon)

            for w in chunk_warnings:
                rows.append({
                    "location": location,
                    "latitude": lat,
                    "longitude": lon,
                    "date": w.get("ValidFrom"),
                    "warning_level": w.get("DangerLevel")
                })

            # Short pause between requests (presumably to be gentle on the API).
            time.sleep(0.2)

    # Passing the column list keeps the frame well-formed even when nothing
    # was fetched; without it an empty result raised KeyError on "date".
    warning_data_df = pd.DataFrame(rows, columns=WARNING_COLUMNS)
    warning_data_df["date"] = pd.to_datetime(warning_data_df["date"]).dt.date

    # Only cache real data so that an empty fetch is retried on the next run.
    if not warning_data_df.empty:
        warning_data_df.to_pickle(WARNINGS_CACHE)
else:
    # Re-use the locally cached result from an earlier run of this cell.
    warning_data_df = pd.read_pickle(WARNINGS_CACHE)

Fetching Narvik Ski Resort
2020-12-23 2021-02-21
2021-02-22 2021-04-23
2021-04-24 2021-06-23
2021-06-24 2021-08-23
2021-08-24 2021-10-23
2021-10-24 2021-12-23
2021-12-24 2022-02-22
2022-02-23 2022-04-24
2022-04-25 2022-06-24
2022-06-25 2022-08-24
2022-08-25 2022-10-24
2022-10-25 2022-12-24
2022-12-25 2023-02-23
2023-02-24 2023-04-25
2023-04-26 2023-06-25
2023-06-26 2023-08-25
2023-08-26 2023-10-25
2023-10-26 2023-12-25
2023-12-26 2024-02-24
2024-02-25 2024-04-25
2024-04-26 2024-06-25
2024-06-26 2024-08-25
2024-08-26 2024-10-25
2024-10-26 2024-12-25
2024-12-26 2025-02-24
2025-02-25 2025-04-26
2025-04-27 2025-06-26
2025-06-27 2025-08-26
2025-08-27 2025-10-26
2025-10-27 2025-12-23
Fetching Strandafjellet Skisenter
2020-12-23 2021-02-21
2021-02-22 2021-04-23
2021-04-24 2021-06-23
2021-06-24 2021-08-23
2021-08-24 2021-10-23
2021-10-24 2021-12-23
2021-12-24 2022-02-22
2022-02-23 2022-04-24
2022-04-25 2022-06-24
2022-06-25 2022-08-24
2022-08-25 2022-10-24
2022-10-25 2022-12-24
2022-12-25 2023

In [4]:
# Spot-check the warning rows of a single resort.
warning_data_df.loc[warning_data_df["location"].eq("Strandafjellet Skisenter")]


Unnamed: 0,location,latitude,longitude,date,warning_level
1827,Strandafjellet Skisenter,62.399663,6.899585,2020-12-23,2
1828,Strandafjellet Skisenter,62.399663,6.899585,2020-12-24,2
1829,Strandafjellet Skisenter,62.399663,6.899585,2020-12-25,3
1830,Strandafjellet Skisenter,62.399663,6.899585,2020-12-26,2
1831,Strandafjellet Skisenter,62.399663,6.899585,2020-12-27,3
...,...,...,...,...,...
3649,Strandafjellet Skisenter,62.399663,6.899585,2025-12-19,2
3650,Strandafjellet Skisenter,62.399663,6.899585,2025-12-20,1
3651,Strandafjellet Skisenter,62.399663,6.899585,2025-12-21,1
3652,Strandafjellet Skisenter,62.399663,6.899585,2025-12-22,1


In [5]:
# Total number of warning rows across all resorts.
warning_data_df.shape[0]

36540

## Historical weather data

In [8]:
# Fetch the historical weather for every resort and stack the per-resort
# frames into one DataFrame.
# After every WEATHER_BATCH_SIZE requests the loop pauses for
# WEATHER_PAUSE_SECONDS (presumably to respect the weather API's rate limit).
WEATHER_BATCH_SIZE = 5
WEATHER_PAUSE_SECONDS = 60

locations = list(resort_locations.items())
dfs = []
for i, (loc, (lat, lon)) in enumerate(locations, start=1):
    # NOTE(review): coordinates are passed as (lon, lat) here -- confirm this
    # matches the parameter order of get_historical_weather.
    dfs.append(get_historical_weather(loc, start_date, end_date, lon, lat))
    # Skip the pause after the final request: nothing follows it, so the
    # original loop wasted a full minute whenever the number of locations
    # was a multiple of the batch size.
    if i % WEATHER_BATCH_SIZE == 0 and i < len(locations):
        time.sleep(WEATHER_PAUSE_SECONDS)


weather_df = pd.concat(dfs, ignore_index=True)

In [9]:
# Preview the combined weather frame (one row per resort and day).
weather_df

Unnamed: 0,date,temperature_2m_mean,precipitation_sum,rain_sum,snowfall_sum,wind_speed_10m_max,wind_direction_10m_dominant,location
0,2020-12-23,-1.927000,0.2,0.0,0.140000,12.849528,73.340157,Narvik Ski Resort
1,2020-12-24,-0.122833,0.9,0.0,0.630000,8.707238,79.654022,Narvik Ski Resort
2,2020-12-25,-6.577000,0.0,0.0,0.000000,14.799459,87.712997,Narvik Ski Resort
3,2020-12-26,-8.716582,2.1,0.1,1.400000,19.995399,80.722618,Narvik Ski Resort
4,2020-12-27,-11.747834,2.4,0.0,1.680001,31.427404,77.309097,Narvik Ski Resort
...,...,...,...,...,...,...,...,...
36535,2025-12-19,-2.390667,4.2,0.0,2.940000,16.024044,125.022659,Bjorli Ski
36536,2025-12-20,-1.653166,2.3,0.3,1.400000,8.654986,273.057678,Bjorli Ski
36537,2025-12-21,-9.340668,0.0,0.0,0.000000,7.380000,83.316940,Bjorli Ski
36538,2025-12-22,-11.559417,0.0,0.0,0.000000,8.473393,92.319527,Bjorli Ski


## Terrain data

(5.821807746740899, 59.26593432520996, 17.529653561603485, 68.58097085721195)


## Add to Hopsworks

### Warnings

In [None]:
# Inspect column dtypes and null counts before uploading.
# NOTE(review): the saved output shows float32 / datetime64[ns] columns, which
# are only produced by the conversion cell below -- it looks like this cell was
# last run out of order; re-run the notebook top to bottom to refresh it.
warning_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36540 entries, 0 to 36539
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   location       36540 non-null  object        
 1   latitude       36540 non-null  float32       
 2   longitude      36540 non-null  float32       
 3   date           36540 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float32(2), object(2)
memory usage: 1.1+ MB


In [7]:
# Cast each column to the dtype it is uploaded with.
for column_name, target_dtype in (
    ("latitude", "float32"),
    ("longitude", "float32"),
    ("warning_level", "int32"),
):
    warning_data_df[column_name] = warning_data_df[column_name].astype(target_dtype)
warning_data_df["date"] = pd.to_datetime(warning_data_df["date"], format="%Y-%m-%d")

# Fetch (or create) version 2 of the avalanche-warning feature group.
fs = project.get_feature_store()
warning_fg = fs.get_or_create_feature_group(
    name="avalanche_warning",
    description="Warnings for each day for the different resorts in Norway.",
    version=2,
    primary_key=["location"],
    event_time="date",
)

warning_fg.insert(warning_data_df)

# Attach a human-readable description to every feature, in the same order
# as before.
warning_feature_descriptions = {
    "date": "Date of level of warning of avalanches.",
    "location": "Name of resort.",
    "latitude": "Latitude of resort.",
    "longitude": "Longitude of resort.",
    "warning_level": "Warning level.",
}
for feature_name, feature_text in warning_feature_descriptions.items():
    updated_warning_fg = warning_fg.update_feature_description(feature_name, feature_text)

# Keep the result of the last update as the cell's displayed value.
updated_warning_fg


Uploading Dataframe: 100.00% |██████████| Rows 36540/36540 | Elapsed Time: 00:01 | Remaining Time: 00:00


Job started successfully, you can follow the progress at 


<hsfs.feature_group.FeatureGroup at 0x190b6aa7c70>

### Weather

In [10]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top so all dependencies are visible in one place.
import great_expectations as ge
# Empty suite that collects the validation rules for the weather data; the
# expectations are added further down in this cell and the suite is passed to
# the weather feature group on creation.
weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)

def expect_greater_than_zero(col):
    """Register a lower-bound check for ``col`` on ``weather_expectation_suite``.

    Adds an ``expect_column_min_to_be_between`` expectation with
    ``min_value=-0.1`` (strict) and ``max_value=1000.0``. Because the bound is
    -0.1 rather than 0, a column whose minimum is exactly zero still passes,
    despite the function's name.

    Args:
        col: Name of the weather column to validate.
    """
    bounds = {
        "column": col,
        "min_value": -0.1,
        "max_value": 1000.0,
        "strict_min": True,
    }
    expectation = ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs=bounds,
    )
    weather_expectation_suite.add_expectation(expectation)
# Register the same lower-bound expectation for each of these columns.
for weather_column in (
    "precipitation_sum",
    "rain_sum",
    "snowfall_sum",
    "wind_speed_10m_max",
):
    expect_greater_than_zero(weather_column)

# Fetch (or create) version 1 of the weather feature group, validated by the
# expectation suite on insert.
weather_fg = fs.get_or_create_feature_group(
    name="weather_sensor",
    description="Weather characteristics of each day",
    version=1,
    primary_key=["location"],
    event_time="date",
    expectation_suite=weather_expectation_suite,
)

# wait=True is passed through unchanged from the original insert call.
weather_fg.insert(weather_df, wait=True)

# Attach a human-readable description to every feature, in the same order
# as before.
weather_feature_descriptions = {
    "date": "Date of the daily weather measurement",
    "temperature_2m_mean": "Mean temperature at 2 meters above ground (°C)",
    "precipitation_sum": "Total daily precipitation including rain and snow (mm)",
    "rain_sum": "Total daily rainfall (mm)",
    "snowfall_sum": "Total daily snowfall (cm)",
    "wind_speed_10m_max": "Maximum wind speed at 10 meters above ground (m/s)",
    "wind_direction_10m_dominant": "Dominant wind direction during the day (degrees)",
    "location": "Location (resort) where weather measurements were collected",
}
for feature_name, feature_text in weather_feature_descriptions.items():
    updated_weather_fg = weather_fg.update_feature_description(feature_name, feature_text)

# Keep the result of the last update as the cell's displayed value.
updated_weather_fg

2025-12-23 15:00:53,851 INFO: 	4 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://eu-west.cloud.hopsworks.ai:443/p/2173/fs/2122/fg/3170


Uploading Dataframe: 100.00% |██████████| Rows 36540/36540 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: weather_sensor_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://eu-west.cloud.hopsworks.ai:443/p/2173/jobs/named/weather_sensor_1_offline_fg_materialization/executions
2025-12-23 15:01:07,038 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-12-23 15:01:53,697 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-12-23 15:03:55,379 INFO: Waiting for execution to finish. Current state: SUCCEEDING. Final status: UNDEFINED
2025-12-23 15:03:58,500 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-12-23 15:03:58,585 INFO: Waiting for log aggregation to finish.
2025-12-23 15:04:06,970 INFO: Execution finished successfully.


<hsfs.feature_group.FeatureGroup at 0x190b7e65b40>

### Terrain