In [None]:
import datetime
import json
import os

import hopsworks
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

from helpers import util

In [None]:
# Reference timestamp for this run; the zero-day offset leaves "now" unchanged.
today = datetime.datetime.now() - datetime.timedelta(days=0)
# Exactly 24 hours after the reference timestamp.
tomorrow = today + datetime.timedelta(hours=24)
today

In [None]:
# Log in to Hopsworks with the Python engine and get a handle to the project's feature store.
project = hopsworks.login(engine="python")
fs = project.get_feature_store() 

# Read the SENSORS_JSON secret and parse its JSON payload.
# NOTE(review): sensors_data is not referenced by any later cell visible here — confirm it is still needed.
secrets = hopsworks.get_secrets_api()
sensors_str = secrets.get_secret("SENSORS_JSON").value
sensors_data = json.loads(sensors_str)

In [None]:
# Look up version 1 of the lagged-feature XGBoost model in the project's model registry.
mr = project.get_model_registry()
retrieved_model = mr.get_model(name="air_quality_xgboost_lagged_model", version=1)

# Feature view handle obtained from the registered model.
fv = retrieved_model.get_feature_view()

# Fetch the model artifacts; the returned value is used below as the local directory holding model.json.
saved_model_dir = retrieved_model.download()

In [None]:
# Start from an empty regressor, then restore the saved model from model.json in the downloaded directory.
retrieved_xgboost_model = XGBRegressor()
retrieved_xgboost_model.load_model(f"{saved_model_dir}/model.json")

# Show the restored estimator as the cell output.
retrieved_xgboost_model

## <span style="color:#ff5f27;">✨ Get Weather Forecast Features with Feature View</span>



In [None]:
# Handles to version 1 of the two feature groups used for inference.
weather_fg = fs.get_feature_group(name='weather', version=1)
air_quality_fg = fs.get_feature_group(name='air_quality', version=1)

# NOTE(review): only the date part of this timestamp is used in the filter below,
# so the 23:59:59 time-of-day has no effect there.
yesterday_2359 = (today - datetime.timedelta(days=1)).replace(
    hour=23, minute=59, second=59, microsecond=0
)

# Air-quality rows from yesterday onwards: the starting point for the lag features.
air_quality_data = (
    air_quality_fg
    .select(['pm25', 'lagged_1', 'lagged_2', 'lagged_3', 'city', 'date'])
    .filter(air_quality_fg.date >= yesterday_2359.date())
    .read()
)

# Weather rows dated at or after the run timestamp, ordered by date.
weather_data = (
    weather_fg
    .filter(weather_fg.date >= today)
    .read()
    .sort_values(by='date')
)
weather_data

### <span style="color:#ff5f27;">🤖 Making the predictions</span>

In [None]:
def prepare_prediction_row(row, air_quality_data, city_mapping):
    """
    Build the single-row feature frame passed to the XGBoost model for one forecast day.

    The lag features come from the air-quality row of the same city dated one day
    before ``row['date']``: that row's ``pm25`` becomes ``lagged_1``, its ``lagged_1``
    becomes ``lagged_2`` and its ``lagged_2`` becomes ``lagged_3``.

    Parameters:
    - row: Series, one row from weather_data with 'city', 'date' and the four weather
      columns listed in ``weather_cols`` below
    - air_quality_data: DataFrame with columns ['city', 'date', 'pm25', 'lagged_1', 'lagged_2'];
      it is only read, never modified
    - city_mapping: dict, mapping city names to integers used during training

    Returns:
    - prediction_df: 1-row DataFrame ready for model.predict()

    Raises:
    - ValueError: if air_quality_data does not contain exactly one row for the city
      on the day before ``row['date']``
    - KeyError: if the city is not a key of city_mapping
    """
    city = row['city']
    pred_date = pd.to_datetime(row['date'])
    prev_date = pred_date - pd.Timedelta(days=1)

    # Parse the dates into a local Series so the caller's DataFrame is left untouched.
    aq_dates = pd.to_datetime(air_quality_data['date'])

    # Previous day's air-quality row for this city
    last_days_aq = air_quality_data[
        (air_quality_data['city'] == city) &
        (aq_dates.dt.date == prev_date.date())
    ]
    if len(last_days_aq) != 1:
        # Fail with the city and date in the message instead of the opaque
        # "can only convert an array of size 1" error raised by Series.item().
        raise ValueError(
            f"Expected exactly one air quality row for city {city!r} on "
            f"{prev_date.date()}, found {len(last_days_aq)}"
        )
    previous_day = last_days_aq.iloc[0]

    # Shift the lags by one day: yesterday's observation becomes lag 1, and so on.
    features = {
        'lagged_1': previous_day['pm25'],
        'lagged_2': previous_day['lagged_1'],
        'lagged_3': previous_day['lagged_2'],
    }

    # Copy weather features
    weather_cols = ['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']
    for col in weather_cols:
        features[col] = row[col]

    # Encode city
    features['city'] = city_mapping[city]

    # Convert to single-row DataFrame
    prediction_df = pd.DataFrame([features])

    # Ensure numeric types for XGBoost; values that cannot be parsed become NaN
    numeric_cols = ['lagged_1', 'lagged_2', 'lagged_3'] + weather_cols
    prediction_df[numeric_cols] = prediction_df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    return prediction_df


In [None]:
def append_predicted_pm25(air_quality_data, prediction_row, predicted_pm25, city_mapping):
    """
    Append one air-quality row per prediction row, carrying the predicted PM2.5.
    Reverts the city encoding to original city names.

    Parameters:
    - air_quality_data: DataFrame with columns ['city', 'date', 'pm25', 'lagged_1', 'lagged_2', 'lagged_3'];
      it is not modified
    - prediction_row: DataFrame of the rows used for prediction; needs 'city' (encoded),
      'date' and 'lagged_1', and may carry 'lagged_2' / 'lagged_3'
    - predicted_pm25: float, NumPy scalar or array of predictions; a single value is
      applied to every row of prediction_row, otherwise one value per row is required
    - city_mapping: dict, category->code mapping used during training

    Returns:
    - updated_air_quality_data: DataFrame with the new row(s) appended

    Raises:
    - ValueError: if the number of predicted values is neither 1 nor the number of rows
    """
    # Accept a plain float as well as the array returned by model.predict():
    # the docstring always allowed a float, but calling .item() on one failed.
    predicted_values = np.asarray(predicted_pm25, dtype=float).ravel()
    n_rows = len(prediction_row)
    if predicted_values.size == 1:
        predicted_values = np.repeat(predicted_values, n_rows)
    elif predicted_values.size != n_rows:
        raise ValueError(
            f"Got {predicted_values.size} predicted values for {n_rows} prediction rows"
        )

    # Reverse mapping: code -> city name
    reverse_city_mapping = {v: k for k, v in city_mapping.items()}

    new_rows = []
    for (_, row), value in zip(prediction_row.iterrows(), predicted_values):
        city_code = row['city']
        new_rows.append({
            # fall back to the code itself when it has no known city name
            'city': reverse_city_mapping.get(city_code, city_code),
            'date': row['date'],
            'pm25': float(value),
            'lagged_1': row['lagged_1'],
            # optional lags: None when the prediction frame does not carry them
            'lagged_2': row.get('lagged_2'),
            'lagged_3': row.get('lagged_3'),
        })

    # Append new row(s) to a fresh DataFrame
    updated_air_quality_data = pd.concat([air_quality_data, pd.DataFrame(new_rows)], ignore_index=True)
    return updated_air_quality_data


In [None]:
# NOTE(review): the 'Ã¥rjÃ¤ng' key looks like mis-decoded UTF-8 for 'årjäng' — confirm it matches
# the city value stored in the weather feature group, otherwise the lookup raises KeyError.
city_mapping = {'skoghall': 0, 'Ã¥rjÃ¤ng': 1, 'jonsbyn': 2, 'nykroppa': 3}

# Feature columns in the order they are handed to the model.
feature_columns = [
    'lagged_1', 'lagged_2', 'lagged_3', 'city',
    'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant',
]

# Forecast one weather row at a time: each prediction is appended to air_quality_data
# so that it can serve as the lag input for the following day of the same city.
predicted_chunks = []
for _, prediction in weather_data.iterrows():
    pred_date = pd.to_datetime(prediction['date'])
    prediction_df = prepare_prediction_row(prediction, air_quality_data, city_mapping)
    prediction_df['date'] = pred_date
    predicted_pm25 = retrieved_xgboost_model.predict(prediction_df[feature_columns])
    predicted_chunks.append(predicted_pm25)
    air_quality_data = append_predicted_pm25(air_quality_data, prediction_df, predicted_pm25, city_mapping)

# The monitoring cells further down read `batch_data`, which was previously assigned only in
# commented-out code. Build it from the forecast rows plus one prediction per row (same row
# order as the loop), keeping the dtype returned by the model.
batch_data = weather_data.assign(
    predicted_pm25=np.concatenate(predicted_chunks) if predicted_chunks else np.array([], dtype=float)
)


In [None]:
# Air-quality rows read from the feature group plus the rows appended by the prediction loop.
air_quality_data

In [None]:
# Column/dtype/non-null summary of batch_data before the monitoring columns are added.
batch_data.info()

### <span style="color:#ff5f27;">🤖 Saving the predictions (for monitoring) to a Feature Group</span>

In [None]:
# NOTE(review): street, city and country are not assigned in any cell visible in this notebook —
# confirm where they are set before running this cell.
# Sort by forecast date first, so the counter below follows the calendar rather than
# whatever row order batch_data arrived in.
batch_data = batch_data.sort_values(by=['date'])
batch_data['street'] = street
batch_data['city'] = city
batch_data['country'] = country
# Number of days before the forecast day on which the forecast was made: 1 for the
# earliest date, increasing by one per row.
# NOTE(review): with several rows per date (e.g. one per city) the counter keeps increasing
# within a date — confirm this is intended.
batch_data['days_before_forecast_day'] = range(1, len(batch_data) + 1)
batch_data

In [None]:
# Column/dtype/non-null summary of batch_data after the monitoring columns were added.
batch_data.info()

### Create Forecast Graph
Draw a graph of the predictions against their dates, save it as a PNG in the GitHub repo,
and show it on GitHub Pages.

In [None]:

# NOTE(review): root_dir, city and street are not assigned in any cell visible in this notebook —
# confirm where they are set.
pred_file_path = f"{root_dir}/docs/air-quality/assets/img/pm25_forecast.png"
# The helper's return value is bound to `plt` and shown below; presumably it also writes the
# PNG to pred_file_path — confirm in helpers.util.
plt = util.plot_air_quality_forecast(city, street, batch_data, pred_file_path)

plt.show()

In [None]:
# Get or create feature group
monitor_fg = fs.get_or_create_feature_group(
    name='aq_predictions',
    description='Air Quality prediction monitoring',
    version=1,
    primary_key=['city','street','date','days_before_forecast_day'],
    event_time="date"
)

In [None]:
# Write the predictions to the monitoring feature group; wait=True is passed so the call
# presumably blocks until the insert has finished — confirm against the Hopsworks API.
monitor_fg.insert(batch_data, wait=True)

In [None]:
# We will create a hindcast chart for  only the forecasts made 1 day beforehand
monitoring_df = monitor_fg.filter(monitor_fg.days_before_forecast_day == 1).read()
monitoring_df

In [None]:
# Read the complete air_quality feature group (version 1); it supplies the pm25 outcomes
# that the predictions are compared against below.
air_quality_fg = fs.get_feature_group(name='air_quality', version=1)
air_quality_df = air_quality_fg.read()
air_quality_df

In [None]:
# Predictions logged for monitoring, and the pm25 values from the air_quality feature group.
preds_df = monitoring_df[['date', 'predicted_pm25']]
outcome_df = air_quality_df[['date', 'pm25']]

# The inner join on date keeps only days that have both a prediction and an outcome.
hindcast_df = preds_df.merge(outcome_df, on="date").sort_values(by=['date'])

# With nothing to compare yet, fall back to the helper that generates predictions/outcomes from existing data.
if hindcast_df.empty:
    hindcast_df = util.backfill_predictions_for_monitoring(weather_fg, air_quality_df, monitor_fg, retrieved_xgboost_model)
hindcast_df

### Plot the Hindcast comparing predicted with observed values (1-day prior forecast)

__This graph will be empty to begin with - this is normal.__

After a few days of predictions and observations, you will get data points in this graph.

In [None]:
# NOTE(review): root_dir, city and street are not assigned in any cell visible in this notebook —
# confirm where they are set.
hindcast_file_path = f"{root_dir}/docs/air-quality/assets/img/pm25_hindcast_1day.png"
# Same plotting helper as for the forecast chart, called with hindcast=True on the merged
# prediction/outcome frame.
plt = util.plot_air_quality_forecast(city, street, hindcast_df, hindcast_file_path, hindcast=True)
plt.show()

### Upload the prediction and hindcast dashboards (png files) to Hopsworks


In [None]:
dataset_api = project.get_dataset_api()
str_today = today.strftime("%Y-%m-%d")

# Create the target folder if it does not exist yet.
if not dataset_api.exists("Resources/airquality"):
    dataset_api.mkdir("Resources/airquality")

# Both charts are uploaded to the same per-city/street/day destination, replacing earlier uploads.
for local_png in (pred_file_path, hindcast_file_path):
    dataset_api.upload(local_png, f"Resources/airquality/{city}_{street}_{str_today}", overwrite=True)

proj_url = project.get_url()
print(f"See images in Hopsworks here: {proj_url}/settings/fb/path/Resources/airquality")

---