In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the local `src` package are picked up without a restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import src.config as config

In [4]:
import hopsworks

# Log in to the Hopsworks project; project name and API key come from `config`
# so no credentials are written in the notebook.
project = hopsworks.login(
    api_key_value=config.HOPSWORKS_API_KEY,
    project=config.HOPSWORKS_PROJECT_NAME,
)

# Handle to the project's feature store.
feature_store = project.get_feature_store()

# Handle to the feature group (name and version are pinned in `config`).
feature_group = feature_store.get_feature_group(
    version=config.FEATURE_GROUP_VERSION,
    name=config.FEATURE_GROUP_NAME,
)

2025-03-30 00:53:46,038 INFO: Initializing external client
2025-03-30 00:53:46,038 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-30 00:53:47,321 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1220769


In [6]:
# Get the feature view, creating it first if it does not exist yet.
# This feature view only uses one feature group, so the query is trivial.
#
# `get_or_create_feature_view` replaces the previous bare `try/except` around
# `create_feature_view`, which reported *every* failure (bad credentials,
# network errors, even a keyboard interrupt) as "already existed" and hid the
# real cause until the follow-up lookup failed.
feature_view = feature_store.get_or_create_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION,
    query=feature_group.select_all(),
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220769/fs/1208402/fv/time_series_hourly_feature_view/version/1


In [7]:
# Read the feature view's data into memory. `training_data` returns a pair;
# only the first element is kept (the second is presumably the labels, which
# this feature view does not define -- TODO confirm).
ts_data, _ = feature_view.training_data(description='Time-series hourly taxi rides')

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.98s) 




In [8]:
# Drop the `pickup_ts` column and order rows by `pickup_location`, then
# `pickup_hour`.
#
# A new frame is built instead of mutating in place, and `errors='ignore'`
# turns the drop into a no-op when the column is already gone, so this cell can
# be re-run without raising a KeyError.
ts_data = (
    ts_data
    .drop(columns=['pickup_ts'], errors='ignore')
    .sort_values(by=['pickup_location', 'pickup_hour'])
)
ts_data

Unnamed: 0,pickup_hour,rides,pickup_longitude,pickup_latitude,pickup_location
554924,2024-01-01 00:00:00+00:00,2,-87.721559,41.968069,Albany Park
808550,2024-01-01 01:00:00+00:00,0,-87.721559,41.968069,Albany Park
352502,2024-01-01 02:00:00+00:00,0,-87.721559,41.968069,Albany Park
72574,2024-01-01 03:00:00+00:00,3,-87.721559,41.968069,Albany Park
42788,2024-01-01 04:00:00+00:00,0,-87.721559,41.968069,Albany Park
...,...,...,...,...,...
265928,2025-03-30 01:00:00+00:00,0,-87.656804,41.949140,Wrigleyville
268893,2025-03-30 02:00:00+00:00,0,-87.656804,41.949140,Wrigleyville
342626,2025-03-30 03:00:00+00:00,0,-87.656804,41.949140,Wrigleyville
272053,2025-03-30 04:00:00+00:00,0,-87.656804,41.949140,Wrigleyville


In [9]:
from src.data import transform_ts_data_into_features_and_target

# Convert the hourly series into a (features, targets) pair.
# input_seq_len = 24*28 = 672 past values per example; the meaning of
# step_size=23 is defined by the helper in `src.data` (presumably the stride
# between consecutive examples -- TODO confirm).
features, targets = transform_ts_data_into_features_and_target(
    ts_data, input_seq_len=24*28, step_size=23
)

# Attach the target as an extra column on a copy, leaving `features` untouched.
features_and_target = features.assign(target_rides_next_hour=targets)

print(f"{features_and_target.shape=}")

100%|██████████| 95/95 [00:12<00:00,  7.90it/s]


features_and_target.shape=(42275, 677)


In [10]:
from datetime import date, timedelta
from pytz import timezone
import pandas as pd
from src.data_split import train_test_split

# Split point: midnight UTC, 28 days before the current *UTC* date.
# The previous version took the machine's local date (`date.today()`) and then
# labelled it as UTC, so on a machine that is not on UTC the cutoff could land
# a day away from the intended boundary relative to the UTC-stamped
# `pickup_hour` values.
cutoff_date = pd.Timestamp.now(tz="UTC").normalize() - timedelta(days=28)

# `train_test_split` is the project helper from `src.data_split`; it returns
# the train and test features/targets separated around `cutoff_date`
# (presumably rows before the cutoff go to train -- verify in the helper).
X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date=cutoff_date,
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(39520, 676)
y_train.shape=(39520,)
X_test.shape=(2755, 676)
y_test.shape=(2755,)


In [11]:
import lightgbm as lgb
from sklearn.pipeline import Pipeline

# Train only on the columns whose names start with 'rides_'; every other
# column in X_train is left out of the model input.
past_rides_columns = [col for col in X_train.columns if col.startswith('rides_')]
X_train_only_numeric = X_train.loc[:, past_rides_columns]

# Single-step pipeline: only the model, no preprocessing transformations.
pipeline_steps = [("model", lgb.LGBMRegressor())]
model = Pipeline(pipeline_steps)
model.fit(X_train_only_numeric, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101075 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170924
[LightGBM] [Info] Number of data points in the train set: 39520, number of used features: 672
[LightGBM] [Info] Start training from score 7.650329


In [12]:
from sklearn.metrics import mean_absolute_error

# Predict on the test set using the same columns the model was fitted on.
X_test_only_numeric = X_test.loc[:, past_rides_columns]
predictions = model.predict(X_test_only_numeric)

# Mean absolute error between the true test targets and the predictions.
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=1.8346


In [13]:
import joblib
from src.paths import MODELS_DIR

# Serialize the fitted pipeline to `model.pkl` inside the project's models
# directory; `joblib.dump` returns the list of files it wrote.
model_file = MODELS_DIR / 'model.pkl'
joblib.dump(model, model_file)

['C:\\Users\\joral_08cedew\\chicago_taxi_demand_predictor\\models\\model.pkl']

In [14]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Derive the input/output schemas from the training features and targets, then
# bundle them into the schema object passed to the model registry.
input_schema = Schema(X_train_only_numeric)
output_schema = Schema(y_train)
model_schema = ModelSchema(
    input_schema=input_schema,
    output_schema=output_schema,
)

In [15]:
model_registry = project.get_model_registry()

# The registry entry gets its own name. Previously this cell rebound `model`
# (the fitted sklearn Pipeline) to the registry Model object, so re-running the
# prediction or `joblib.dump` cells afterwards operated on the wrong object.
registered_model = model_registry.sklearn.create_model(
    name="taxi_demand_predictor_next_hour",
    metrics={"test_mae": test_mae},
    description="LightGBM regressor",
    # Seeded so the stored input example is the same row on every run.
    input_example=X_train_only_numeric.sample(random_state=42),
    model_schema=model_schema,
)

# Upload the locally pickled pipeline as the artifact for this model version.
registered_model.save(str(MODELS_DIR / 'model.pkl'))

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/320294 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/3364 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/60583 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1220769/models/taxi_demand_predictor_next_hour/2


Model(name: 'taxi_demand_predictor_next_hour', version: 2)