In [None]:
import os
from datetime import datetime, timedelta
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error, r2_score
import hopsworks
from helpers import util
import json

import warnings
# Install a global "ignore" filter: every Python warning raised after this point
# is suppressed for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Log in to Hopsworks and obtain a handle to the project's feature store
project = hopsworks.login(engine="python")
fs = project.get_feature_store()

# Read configuration from the Hopsworks secrets API.
# The AQICN_API_KEY lookup fails if that secret has not been registered in Hopsworks.
secrets = hopsworks.get_secrets_api()
api_key_secret = secrets.get_secret("AQICN_API_KEY")
AQICN_API_KEY = api_key_secret.value

# SENSORS_JSON holds a JSON document; it is parsed into Python objects here
sensors_secret = secrets.get_secret("SENSORS_JSON")
sensors_str = sensors_secret.value
sensors_data = json.loads(sensors_str)

In [None]:
# Look up version 1 of the two feature groups used to build the training data
air_quality_fg = fs.get_feature_group(name='air_quality', version=1)
weather_fg = fs.get_feature_group(name='weather', version=1)

In [None]:
# Build the training query: air-quality columns joined with the weather
# features on `city`.
# TODO: check whether `city` should be part of the selected features.
air_quality_query = air_quality_fg.select(['pm25', 'city', 'date'])
weather_query = weather_fg.select_features()
selected_features = air_quality_query.join(weather_query, on=['city'])

In [None]:
# Fetch the feature view if it already exists, otherwise create it from the
# query above with `pm25` as the label column
feature_view = fs.get_or_create_feature_view(
    name='air_quality_fv',
    version=1,
    description="weather features with air quality as the target",
    query=selected_features,
    labels=['pm25'],
)

In [None]:
# First day of the test period, as an ISO date string
start_date_test_data = "2025-05-01"
# Parse it into a naive datetime at midnight of that day
test_start = datetime.fromisoformat(start_date_test_data)

In [None]:
# Split the feature view's data into train and test sets, with the test set
# starting at `test_start`
split_frames = feature_view.train_test_split(test_start=test_start)
X_train, X_test, y_train, y_test = split_frames

In [None]:
# Show the training feature frame returned by the split as this cell's output
X_train

In [None]:
# Encode `city` as a pandas categorical so the XGBoost model
# (enable_categorical=True) can consume it directly.
#
# The category set is taken from the training data and reused for the test
# data. Calling .astype("category") on each frame independently assigns
# integer codes from that frame's own set of cities, so the same city can get
# a different code in train and test whenever the two splits do not contain
# exactly the same cities.
city_dtype = X_train["city"].astype("category").dtype

# Model inputs: everything except `date`, with `city` cast to the shared dtype.
# A city that appears only in the test split becomes NaN (missing) here.
# X_train / X_test are deliberately left unmodified, so this cell can be re-run
# safely and the raw `city` / `date` columns stay available to later cells.
X_features = X_train.drop(columns=['date']).assign(
    city=lambda frame: frame["city"].astype(city_dtype)
)
X_test_features = X_test.drop(columns=['date']).assign(
    city=lambda frame: frame["city"].astype(city_dtype)
)

In [None]:
# Build the XGBoost regressor with native categorical support and the
# histogram tree method
xgb_regressor = XGBRegressor(
    tree_method="hist",
    enable_categorical=True,
)

# Train on the prepared training features and labels
xgb_regressor.fit(X_features, y_train)


In [None]:
# Score the held-out test features with the trained model
y_pred = xgb_regressor.predict(X_test_features)

# The first column of y_test holds the true target values
y_true = y_test.iloc[:,0]

# Mean squared error between the true and predicted values
mse = mean_squared_error(y_true, y_pred)
print("MSE:", mse)

# Coefficient of determination for the same pair
r2 = r2_score(y_true, y_pred)
print("R squared:", r2)

In [None]:
# Assemble the frame used for the hindcast plots: the test labels plus the
# city, the date and the model prediction for each row.
#
# `assign` returns a new DataFrame. A plain `df = y_test` would only alias the
# label frame, so adding columns to `df` would also add them to `y_test` and
# leave it polluted for any cell that is re-run afterwards.
df = y_test.assign(
    city=X_test['city'],
    date=X_test['date'],
    predicted_pm25=y_pred,
)

In [None]:
# Create the local directory tree for the model artifacts: the model directory
# and its "images" subfolder.
# makedirs(exist_ok=True) creates any missing parent and does nothing when the
# directories already exist, which avoids the check-then-create race of
# `if not os.path.exists(...): os.mkdir(...)`.
model_dir = "air_quality_model"
images_dir = model_dir + "/images"
os.makedirs(images_dir, exist_ok=True)

In [None]:
# Produce one hindcast image per sensor location listed in `sensors_data`.
# Each entry is expected to provide 'city' and 'street' keys.
for location in sensors_data:
    city = location['city']
    street = location['street']

    # One image folder per city; created on demand, no error if it exists
    city_dir = f"{images_dir}/{city}"
    os.makedirs(city_dir, exist_ok=True)
    file_path = f"{city_dir}/pm25_hindcast.png"

    # Only the rows belonging to this city are passed to the plotting helper
    city_df = df[df['city'] == city]

    # The helper's return value is kept under its own name: rebinding `plt`
    # here would shadow the matplotlib.pyplot import that later cells call.
    hindcast_plot = util.plot_air_quality_forecast(city, street, city_df, file_path, hindcast=True)
    hindcast_plot.show()

In [None]:
# Draw the model's feature importances with XGBoost's plot_importance helper
importance_ax = plot_importance(xgb_regressor)

# Save the figure that owns those axes into the images directory, then show it
feature_importance_path = images_dir + "/feature_importance.png"
importance_ax.figure.savefig(feature_importance_path)
plt.show()

In [None]:
# Write the trained regressor to model.json inside the model directory
model_path = model_dir + "/model.json"
xgb_regressor.save_model(model_path)

In [None]:
# Evaluation metrics, stringified, keyed by display name
res_dict = {
    metric_name: str(metric_value)
    for metric_name, metric_value in (("MSE", mse), ("R squared", r2))
}

In [None]:
# Handle to the project's model registry
mr = project.get_model_registry()

# Create a Python model entry named 'air_quality_xgboost_model', passing the
# evaluation metrics and the feature view along with it
aq_model = mr.python.create_model(
    name="air_quality_xgboost_model",
    description="Air Quality (PM2.5) predictor",
    feature_view=feature_view,
    metrics=res_dict,
)

# Save the artifacts in the local 'air_quality_model' directory to the model registry
aq_model.save(model_dir)