# Trainning Pipeline

In this notebook we will create a training dataset from the feature groups:
1. Select the features we want to train our model on,
2. How the features should be preprocessed,
3. Create a Feature View.
4. Create a dataset split for training and validation data.

In [28]:
import os
import sys
from pathlib import Path
import time
from dotenv import load_dotenv
import hopsworks
from confluent_kafka import Producer
import pandas as pd
import logging





sys.path.insert(0, str(Path().resolve().parent / "src"))

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(module)s - %(message)s",
    level=logging.INFO
)

logger = logging.getLogger(__name__)

# load environment
load_dotenv(dotenv_path="/src/.env")


HOPSWORK_LOGIN_API_KEY = os.getenv("HOPSWORK_LOGIN_API_KEY")

In [29]:
# Login to the Hopsworks feature store
connection = hopsworks.login(
    host='c.app.hopsworks.ai',                 # DNS of your Feature Store instance
    port=443,  
    project='air_quality_project', 
    engine="python",
    api_key_value=HOPSWORK_LOGIN_API_KEY
)


2025-03-09 00:33:02,444 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-09 00:33:02,475 INFO: Initializing external client
2025-03-09 00:33:02,476 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-09 00:33:04,981 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214615


In [30]:
# connect to feature store
project_name = "air_quality_project"

try:
    feature_store = connection.get_feature_store(name=project_name)
    logger.info(f"✅ Successfully Connected to {feature_store.project_name}")
except Exception as err:
    logger.info(f"❌ Error accessing the Feature Store!")
    raise err

2025-03-09 00:33:11,467 INFO: ✅ Successfully Connected to air_quality_project


In [32]:
# access feature view
feature_view = feature_store.get_feature_view(name="air_quality_view", version=1)

In [33]:
# create training dataset

start_time = time.time()

feature_view.create_training_data(
    name="air_quality_historical_data_2020_to_2025",
    data_format="csv",
    description="Historical Data of Air Quality in Lagos",
    coalesce=False,
    label= ["aqi"], # label/feature of the training dataset
    test_size = 0.2
)

logger.info("✅ Done \n Upload time %s seconds ---" % (time.time() - start_time))

Training dataset job started successfully, you can follow the progress at 
http://c.app.hopsworks.ai/p/1214615/jobs/named/air_quality_view_1_create_fv_td_08032025233349/executions
2025-03-09 00:33:59,827 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-03-09 00:34:03,147 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-03-09 00:35:22,327 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-03-09 00:35:22,746 INFO: Waiting for log aggregation to finish.
2025-03-09 00:35:32,035 INFO: Execution finished successfully.
2025-03-09 00:35:32,048 INFO: ✅ Done 
 Upload time 113.85633587837219 seconds ---




## Training Dataset retreival

In [27]:
# get training data
features_df, labels_df = feature_view.get_training_data(training_dataset_version=1)

2025-03-08 22:42:42,769 INFO: Provenance cached data - overwriting last accessed/created training dataset from 2 to 1.


In [None]:
X_train, y_train, X_test, y_test = feature_view.get_train_test_split(
    training_dataset_version = 2
)

In [1]:
features_df.head()

NameError: name 'features_df' is not defined

In [110]:
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36311 entries, 0 to 36310
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   aqi                        36311 non-null  int64  
 1   co                         36311 non-null  float64
 2   no                         36311 non-null  float64
 3   no2                        36311 non-null  float64
 4   o3                         36311 non-null  float64
 5   so2                        36311 non-null  float64
 6   pm2_5                      36311 non-null  float64
 7   pm10                       36311 non-null  float64
 8   nh3                        36311 non-null  float64
 9   timestamp                  36311 non-null  object 
 10  aqi_mapping_category_aqi_  36311 non-null  object 
dtypes: float64(8), int64(1), object(2)
memory usage: 3.0+ MB


In [111]:
features_df.columns

Index(['aqi', 'co', 'no', 'no2', 'o3', 'so2', 'pm2_5', 'pm10', 'nh3',
       'timestamp', 'aqi_mapping_category_aqi_'],
      dtype='object')

## Building Machine Learning Model

In [129]:
# import libraries
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

from numpy import ravel                                  # For matrices
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold # Feature selector
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV      # For optimization


from sklearn.metrics import accuracy_score

In [114]:
X = features_df.drop(columns=["aqi", "aqi_mapping_category_aqi_", "timestamp"])
y = features_df["aqi_mapping_category_aqi_"]

In [116]:
# Encode the labels into unique integers
encoder = LabelEncoder()
y = encoder.fit_transform(ravel(y))

In [118]:
# train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [119]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

X_train shape: (29048, 8)
y_train shape: (29048,)


In [120]:
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_test shape: (7263, 8)
y_test shape: (7263,)


In [121]:
X_train.head()

Unnamed: 0,co,no,no2,o3,so2,pm2_5,pm10,nh3
19931,2937.32,13.3,13.02,0.02,9.06,156.93,226.14,17.48
12922,1602.17,0.0,24.33,28.97,8.94,83.76,128.97,27.61
7947,11215.21,77.78,97.33,8.05,35.76,704.06,856.8,76.0
19084,1094.82,0.5,11.65,44.7,6.68,27.66,39.47,13.3
6736,647.54,0.52,8.4,24.68,3.79,12.84,23.24,9.37


In [123]:
y_train

array([4, 4, 4, ..., 4, 4, 0])

In [125]:
# KNN

knn = KNeighborsClassifier().fit(X_train, y_train)
print('Training set score: ' + str(knn.score(X_train,y_train)))
print('Test set score: ' + str(knn.score(X_test,y_test)))

Training set score: 0.8768934177912421
Test set score: 0.8098581853228694


## Pipeline

In [128]:
pipe = Pipeline([
('scaler', StandardScaler()),
('selector', VarianceThreshold()),
('classifier', KNeighborsClassifier())
])

pipe.fit(X_train, y_train)

print('Training set score: ' + str(pipe.score(X_train,y_train)))
print('Test set score: ' + str(pipe.score(X_test,y_test)))

Training set score: 0.8906293032222529
Test set score: 0.8305108082059754


## Hyperparameter Optimization

In [130]:
parameters = {'scaler': [StandardScaler(), MinMaxScaler(),
	Normalizer(), MaxAbsScaler()],
	'selector__threshold': [0, 0.001, 0.01],
	'classifier__n_neighbors': [1, 3, 5, 7, 10],
	'classifier__p': [1, 2],
	'classifier__leaf_size': [1, 5, 10, 15]
}

grid = GridSearchCV(pipe, parameters, cv=2).fit(X_train, y_train)

print('Training set score: ' + str(grid.score(X_train, y_train)))
print('Test set score: ' + str(grid.score(X_test, y_test)))


KeyboardInterrupt: 

In [None]:
# Access the best set of parameters
best_params = grid.best_params_
print(best_params)

In [None]:
# Stores the optimum model in best_pipe
best_pipe = grid.best_estimator_
print(best_pipe)

In [None]:
result_df = pd.DataFrame.from_dict(grid.cv_results_, orient='columns')
print(result_df.columns)

In [None]:
sns.relplot(data=result_df,
	kind='line',
	x='param_classifier__n_neighbors',
	y='mean_test_score',
	hue='param_scaler',
	col='param_classifier__p')
plt.show()

In [None]:
sns.relplot(data=result_df,
            kind='line',
            x='param_classifier__n_neighbors',
            y='mean_test_score',
            hue='param_scaler',
            col='param_classifier__leaf_size')
plt.show()

In [101]:
X_train.columns.to_list()

['co', 'no', 'no2', 'o3', 'so2', 'pm2_5', 'pm10', 'nh3']

In [96]:
numeric_features = X_train.columns.to_list()

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features)
])

In [105]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [108]:
pipe.fit(X_train, y_train)

ValueError: could not convert string to float: 'Very Poor'

In [84]:
y_pred = pipe.predict(X_test)

In [87]:
accuracy_score(y_test, y_pred)

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets