In [1]:
import sys
from pathlib import Path
import os

def is_google_colab() -> bool:
    """Report whether the active IPython shell belongs to Google Colab.

    Returns:
        True when the string form of the current IPython shell object
        contains "google.colab", False otherwise.
    """
    return "google.colab" in str(get_ipython())

def clone_repository() -> None:
    """Clone the mlfs-book repository and change the working directory into it.

    Relies on IPython shell (`!`) and line (`%`) magics, so it can only be
    called from a notebook / IPython session.
    """
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    """Install the project's Python dependencies using uv.

    First upgrades `uv` through pip, then asks uv to install the requirements
    declared in `pyproject.toml` (read from the current working directory)
    with the `--all-extras` and `--system` flags.
    Relies on IPython shell (`!`) magics, so it can only be called from a
    notebook / IPython session.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

# Work out the repository root; how depends on where the notebook is running.
if is_google_colab():
    # Fresh Colab machine: fetch the repository (this also changes into it)
    # and install its dependencies before resolving the root directory.
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_dir = Path().absolute()
    # When the notebook was started from .../notebooks/airquality or from
    # .../notebooks, step back up so root_dir is the directory above `notebooks`.
    if root_dir.name == 'airquality':
        root_dir = root_dir.parent
    if root_dir.name == 'notebooks':
        root_dir = root_dir.parent
    root_dir = str(root_dir)
    print("Local environment")

# Put the root directory on sys.path so the `mlfs` package can be imported below.
if root_dir not in sys.path:
    sys.path.append(root_dir)
print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Build the settings object from <root_dir>/.env, but only when that file exists.
from mlfs import config
if os.path.exists(f"{root_dir}/.env"):
    settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Added the following directory to the PYTHONPATH: c:\Users\nives\AllFolders\SML\mlfs-book
HopsworksSettings initialized!


__This notebook should be scheduled to run daily__

The GitHub Action is stored here:
[.github/workflows/air-quality-daily.yml](https://github.com/featurestorebook/mlfs-book/blob/main/.github/workflows/air-quality-daily.yml)

### Imports

In [2]:
import datetime
import time
import requests
import pandas as pd
import hopsworks
from mlfs.airquality import util
from mlfs import config
import json
import warnings
warnings.filterwarnings("ignore")

## <span style='color:#ff5f27'> Get the Sensor URL, Country, City, Street names from Hopsworks </span>

In [3]:
# Connect to Hopsworks and grab handles to the feature store and the secrets API.
project = hopsworks.login()
fs = project.get_feature_store()
secrets = hopsworks.get_secrets_api()

# The AQICN API key and the sensor location (a JSON document) are stored as secrets.
AQICN_API_KEY = secrets.get_secret("AQICN_API_KEY").value
location_str = secrets.get_secret("SENSOR_LOCATION_BERLIN_JSON").value
location = json.loads(location_str)

# Unpack the fields of the location document that later cells use.
country, city = location['country'], location['city']
latitude, longitude = location['latitude'], location['longitude']

# Reference date for this run.
today = datetime.date.today()

# Show the raw location JSON as the cell output.
location_str

2025-11-16 19:28:06,088 INFO: Initializing external client
2025-11-16 19:28:06,088 INFO: Base URL: https://c.app.hopsworks.ai:443






2025-11-16 19:28:07,837 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1279175


'{"country": "Germany", "city": "Berlin", "latitude": 52.52, "longitude": 13.4}'

### <span style="color:#ff5f27;">Get references to the Feature Groups </span>

In [4]:
# Look up version 1 of the two feature groups this notebook reads from and writes to.
air_quality_fg = fs.get_feature_group(name='air_quality_berlin', version=1)  # PM2.5 readings
weather_fg = fs.get_feature_group(name='weather_berlin', version=1)  # weather observations / forecasts

### Get streets and urls

In [5]:
# One row per distinct (street, url) pair found in the air quality feature group.
metadata = (
    air_quality_fg.select(['street', 'url'])
    .read()[['street', 'url']]
    .drop_duplicates()
)
metadata

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.24s) 


Unnamed: 0,street,url
0,friedrichshagen,https://api.waqi.info/feed/@10034
1,neukolln-silbersteinstrasse,https://api.waqi.info/feed/@10036
2,karl-liebknecht-strasse,https://api.waqi.info/feed/@6132
3,wedding-amrumer-strasse,https://api.waqi.info/feed/@10030
4,mariendorf--mariendorfer-damm,https://api.waqi.info/feed/@10040
6,buch,https://api.waqi.info/feed/@10033
10,neukolln-nansenstrasse,https://api.waqi.info/feed/@10032
18,mitte--bruckenstrasse,https://api.waqi.info/feed/@10039
19,karl-marx-strasse,https://api.waqi.info/feed/@13851
29,leipziger-strasse,https://api.waqi.info/feed/@13852


## <span style='color:#ff5f27'> Retrieve today's air quality data (PM2.5) from the AQI API</span>


### Get recent PM2.5 values from Hopsworks

In [20]:
# Number of previous days whose PM2.5 readings are used as lag features.
lagged_days = 3
yesterday = today - datetime.timedelta(days=1)

# PM2.5 readings dated after (today - lagged_days - 1 days); no per-sensor filter
# is applied here, and repeated (pm25, date) pairs are collapsed.
pm25_recent = (
    air_quality_fg.select(['pm25', 'date'])
    .filter(air_quality_fg.date > today - datetime.timedelta(days=lagged_days + 1))
    .read()[['pm25', 'date']]
    .drop_duplicates()
)
pm25_recent

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.04s) 


Unnamed: 0,pm25,date
0,36.0,2025-11-13 00:00:00+00:00
1,34.0,2025-11-13 00:00:00+00:00
2,52.0,2025-11-14 00:00:00+00:00
3,46.0,2025-11-13 00:00:00+00:00
4,47.0,2025-11-13 00:00:00+00:00
5,53.0,2025-11-14 00:00:00+00:00
6,51.0,2025-11-14 00:00:00+00:00
7,19.0,2025-11-15 00:00:00+00:00
8,16.0,2025-11-15 00:00:00+00:00
9,40.0,2025-11-13 00:00:00+00:00


In [17]:
import requests
import pandas as pd

def _lagged_pm25(recent_df, today, lagged_days):
    """Return the PM2.5 readings for the `lagged_days` days before `today`.

    Args:
        recent_df: DataFrame with a 'date' and a 'pm25' column for ONE sensor.
        today: datetime.date the lags are counted back from.
        lagged_days: number of lag values to return.

    Returns:
        A list [pm25(today-1), ..., pm25(today-lagged_days)] of floats, with
        NaN for any day that has no reading in `recent_df`.
    """
    # Reduce every timestamp to a plain calendar date so that the lookup works
    # for both tz-aware and naive values.
    # NOTE(review): assumes stored dates are midnight UTC, as the displayed
    # feature group rows suggest - confirm.
    reading_dates = pd.to_datetime(recent_df['date'], utc=True).dt.date
    # If several rows share a date, the last one read wins.
    pm25_by_date = dict(zip(reading_dates, recent_df['pm25']))
    return [
        float(pm25_by_date.get(today - datetime.timedelta(days=lag), float('nan')))
        for lag in range(1, lagged_days + 1)
    ]


# Number of lag features per sensor, and the names of the resulting columns.
lagged_days = 3
lag_columns = [f'pm25_lagged_{lag}' for lag in range(1, lagged_days + 1)]
# Loop-invariant lower bound (exclusive) of the history window read per sensor.
lag_window_start = today - datetime.timedelta(days=lagged_days + 1)

aq_today_frames = []
for street, aqicn_url in metadata[['street', 'url']].itertuples(index=False, name=None):
    # Air quality reading requested for `today`.
    aq_today_i_df = util.get_pm25(aqicn_url, country, city, street, today, AQICN_API_KEY)

    # Recent history for this sensor only, used to derive the lag features.
    pm25_recent_i_df = (
        air_quality_fg.select(['pm25', 'date'])
        .filter(air_quality_fg.url == aqicn_url)
        .filter(air_quality_fg.date > lag_window_start)
        .read()
    )

    # Look lags up by calendar date instead of row position: a missing day, or
    # a row for today that is already stored (same-day re-run), must not shift
    # the lags or crash the run with an IndexError.
    lagged_values = _lagged_pm25(pm25_recent_i_df, today, lagged_days)
    if any(pd.isna(value) for value in lagged_values):
        print(f"Warning: missing lagged PM2.5 for {street}; affected lag features set to NaN")

    for column, value in zip(lag_columns, lagged_values):
        aq_today_i_df[column] = value

    aq_today_frames.append(aq_today_i_df)

# Concatenate once instead of growing the frame inside the loop.
aq_today_df = pd.concat(aq_today_frames, ignore_index=True) if aq_today_frames else pd.DataFrame()
if not aq_today_df.empty:
    # Keep the lag features as float32, as before.
    aq_today_df[lag_columns] = aq_today_df[lag_columns].astype('float32')

print("Air Quality Data (today):")
aq_today_df

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.65s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.59s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.89s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.58s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.70s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.57s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.69s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.64s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.67s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.58s) 
Air Quality Data (today):


Unnamed: 0,pm25,country,city,street,date,url,pm25_lagged_1,pm25_lagged_2,pm25_lagged_3
0,13.0,Germany,Berlin,friedrichshagen,2025-11-16,https://api.waqi.info/feed/@10034,16.0,54.0,46.0
1,30.0,Germany,Berlin,neukolln-silbersteinstrasse,2025-11-16,https://api.waqi.info/feed/@10036,16.0,52.0,42.0
2,17.0,Germany,Berlin,karl-liebknecht-strasse,2025-11-16,https://api.waqi.info/feed/@6132,15.0,55.0,47.0
3,17.0,Germany,Berlin,wedding-amrumer-strasse,2025-11-16,https://api.waqi.info/feed/@10030,13.0,47.0,34.0
4,21.0,Germany,Berlin,mariendorf--mariendorfer-damm,2025-11-16,https://api.waqi.info/feed/@10040,18.0,53.0,43.0
5,13.0,Germany,Berlin,buch,2025-11-16,https://api.waqi.info/feed/@10033,11.0,38.0,32.0
6,17.0,Germany,Berlin,neukolln-nansenstrasse,2025-11-16,https://api.waqi.info/feed/@10032,14.0,48.0,40.0
7,17.0,Germany,Berlin,mitte--bruckenstrasse,2025-11-16,https://api.waqi.info/feed/@10039,12.0,47.0,36.0
8,34.0,Germany,Berlin,karl-marx-strasse,2025-11-16,https://api.waqi.info/feed/@13851,19.0,52.0,40.0
9,17.0,Germany,Berlin,leipziger-strasse,2025-11-16,https://api.waqi.info/feed/@13852,17.0,51.0,40.0


In [18]:
# Print column dtypes and non-null counts of the assembled air quality frame.
aq_today_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   pm25           10 non-null     float32       
 1   country        10 non-null     object        
 2   city           10 non-null     object        
 3   street         10 non-null     object        
 4   date           10 non-null     datetime64[ns]
 5   url            10 non-null     object        
 6   pm25_lagged_1  10 non-null     float32       
 7   pm25_lagged_2  10 non-null     float32       
 8   pm25_lagged_3  10 non-null     float32       
dtypes: datetime64[ns](1), float32(4), object(4)
memory usage: 688.0+ bytes


## <span style='color:#ff5f27'> Get Weather Forecast data</span>

In [21]:
# Historical weather for a one-day window: yesterday only.
start_date = end_date = yesterday

weather_yesterday_df = util.get_historical_weather(
    city=city,
    start_date=start_date,
    end_date=end_date,
    latitude=latitude,
    longitude=longitude,
)
weather_yesterday_df

Coordinates 52.5483283996582°N 13.407821655273438°E
Elevation 30.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,city
0,2025-11-15,5.220917,12.5,19.146112,83.454689,Berlin


In [23]:
# Hourly weather forecast, indexed by timestamp so it can be sliced by time of day.
weather_hourly_df = util.get_hourly_weather_forecast(
    city=city, latitude=latitude, longitude=longitude
).set_index('date')

# Keep the single forecast row around 12:00 of each day, then truncate its
# timestamp to the calendar day (stored again as a datetime) and tag the city.
weather_daily_df = (
    weather_hourly_df.between_time('11:59', '12:01')
    .reset_index()
    .assign(
        date=lambda frame: pd.to_datetime(pd.to_datetime(frame['date']).dt.date),
        city=city,
    )
)
weather_daily_df

Coordinates 52.5°N 13.5°E
Elevation 30.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,city
0,2025-11-16,10.0,0.0,9.107359,251.564957,Berlin
1,2025-11-17,5.35,0.0,14.372974,292.067963,Berlin
2,2025-11-18,5.45,0.0,19.930477,249.928375,Berlin
3,2025-11-19,5.2,0.0,12.964998,181.59111,Berlin
4,2025-11-20,5.7,0.0,12.768586,220.425995,Berlin
5,2025-11-21,3.0,0.0,9.983106,295.640961,Berlin
6,2025-11-22,3.35,0.0,8.089993,69.145462,Berlin


## <span style="color:#ff5f27;">Uploading new data to the Feature Store</span>

In [24]:
# Write the new air quality rows to their feature group.
air_quality_fg.insert(aq_today_df)

# Write yesterday's observed weather first, then the daily forecast; each call
# is made with wait=True, exactly as before.
for weather_df in (weather_yesterday_df, weather_daily_df):
    weather_fg.insert(weather_df, wait=True)

print("Updated feature groups with yesterday's data and forecast.")


2025-11-16 19:51:13,066 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279175/fs/1265791/fg/1718740


Uploading Dataframe: 100.00% |██████████| Rows 10/10 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: air_quality_berlin_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279175/jobs/named/air_quality_berlin_1_offline_fg_materialization/executions
2025-11-16 19:51:27,877 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279175/fs/1265791/fg/1724775


Uploading Dataframe: 100.00% |██████████| Rows 1/1 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_berlin_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279175/jobs/named/weather_berlin_1_offline_fg_materialization/executions
2025-11-16 19:51:45,768 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-11-16 19:51:48,988 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-11-16 19:53:34,760 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-11-16 19:53:34,925 INFO: Waiting for log aggregation to finish.
2025-11-16 19:53:43,576 INFO: Execution finished successfully.
2025-11-16 19:53:43,784 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1279175/fs/1265791/fg/1724775


Uploading Dataframe: 100.00% |██████████| Rows 7/7 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: weather_berlin_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1279175/jobs/named/weather_berlin_1_offline_fg_materialization/executions
2025-11-16 19:54:01,116 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-11-16 19:54:04,304 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-11-16 19:55:49,892 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-11-16 19:55:50,065 INFO: Waiting for log aggregation to finish.
2025-11-16 19:55:58,819 INFO: Execution finished successfully.
Updated feature groups with yesterday's data and forecast.
