In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import src.config as config
import pandas as pd

In [3]:
import hopsworks

# connect to the project
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

# connect to the feature store
feature_store = project.get_feature_store()

# connect to the feature group
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1007769
Connected. Call `.close()` to terminate connection gracefully.


In [4]:
# create feature view (if it doesn't exist yet)
# This feature view only uses on feature group, so the query is trivial
try:
    # create feature view if it doesn't exist yet
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already existed. Skip creation.')


# get feature view
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1007769/fs/999496/fv/time_series_hourly_feature_view/version/1


In [5]:
ts_data, _ = feature_view.training_data(
    description='Time-series hourly taxi rides',
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (37.01s) 



In [6]:
# drop `pickup_ts` column
ts_data.drop('pickup_ts', axis=1, inplace=True)

# sort by `pickup_location_id` and `pickup_hour`
ts_data.sort_values(by=['pickup_location_id', 'pickup_hour'], inplace=True)
ts_data["pickup_hour"] = pd.to_datetime(ts_data["pickup_hour"])
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
3418294,2022-01-01 00:00:00+00:00,0,1
3825532,2022-01-01 01:00:00+00:00,0,1
3337391,2022-01-01 02:00:00+00:00,0,1
4107592,2022-01-01 03:00:00+00:00,0,1
1079719,2022-01-01 04:00:00+00:00,1,1
...,...,...,...
706642,2024-09-05 07:00:00+00:00,3,265
650572,2024-09-05 08:00:00+00:00,6,265
731793,2024-09-05 09:00:00+00:00,4,265
769499,2024-09-05 10:00:00+00:00,5,265


In [None]:
# # from src.plot import plot_ts
# from typing import Optional, List
# import pandas as pd
# import plotly.express as px 

# def plot_ts(
#     ts_data: pd.DataFrame,
#     locations: Optional[List[int]] = None
#     ):
#     """
#     Plot time-series data
#     """
#     ts_data_to_plot = ts_data[ts_data.pickup_location_id.isin(locations)] if locations else ts_data

#     fig = px.line(
#         ts_data_to_plot,
#         x="pickup_hour",
#         y="rides",
#         color='pickup_location_id',
#         template='none',
#     )

#     fig.show()

# plot_ts(ts_data, locations=[43])

In [7]:
from src.data import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(
    ts_data,
    input_seq_len=24*28, # one month
    step_size=23,
)

features_and_target = features.copy()
features_and_target['target_rides_next_hour'] = targets

print(f'{features_and_target.shape=}')

 86%|████████▋ | 229/265 [01:26<00:16,  2.22it/s]




 87%|████████▋ | 230/265 [01:28<00:35,  1.01s/it]




 87%|████████▋ | 231/265 [01:28<00:26,  1.29it/s]




 88%|████████▊ | 232/265 [01:28<00:20,  1.62it/s]




 88%|████████▊ | 233/265 [01:29<00:15,  2.04it/s]




 88%|████████▊ | 234/265 [01:29<00:12,  2.45it/s]




 89%|████████▊ | 235/265 [01:29<00:10,  2.92it/s]




 89%|████████▉ | 236/265 [01:29<00:08,  3.34it/s]




 89%|████████▉ | 237/265 [01:29<00:07,  3.67it/s]




 90%|████████▉ | 238/265 [01:30<00:06,  3.98it/s]




 90%|█████████ | 239/265 [01:30<00:06,  4.21it/s]




 91%|█████████ | 240/265 [01:30<00:05,  4.24it/s]




 91%|█████████ | 241/265 [01:30<00:06,  3.94it/s]




 91%|█████████▏| 242/265 [01:31<00:05,  4.10it/s]




 92%|█████████▏| 243/265 [01:31<00:05,  4.27it/s]




 92%|█████████▏| 244/265 [01:31<00:04,  4.36it/s]




 92%|█████████▏| 245/265 [01:31<00:05,  3.98it/s]




 93%|█████████▎| 246/265 [01:32<00:04,  3.95it/s]




 93%|█████████▎| 247/265 [01:32<00:04,  4.20it/s]




 94%|█████████▎| 248/265 [01:32<00:04,  3.97it/s]




 94%|█████████▍| 249/265 [01:32<00:04,  3.57it/s]




 94%|█████████▍| 250/265 [01:33<00:03,  3.80it/s]




 95%|█████████▍| 251/265 [01:33<00:04,  3.38it/s]




 95%|█████████▌| 252/265 [01:33<00:03,  3.40it/s]




 95%|█████████▌| 253/265 [01:34<00:03,  3.53it/s]




 96%|█████████▌| 254/265 [01:34<00:02,  3.73it/s]




 96%|█████████▌| 255/265 [01:34<00:02,  4.03it/s]




 97%|█████████▋| 256/265 [01:34<00:02,  4.31it/s]




 97%|█████████▋| 257/265 [01:34<00:01,  4.39it/s]




 97%|█████████▋| 258/265 [01:35<00:01,  4.37it/s]




 98%|█████████▊| 259/265 [01:35<00:01,  4.43it/s]




 98%|█████████▊| 260/265 [01:35<00:01,  4.24it/s]




 98%|█████████▊| 261/265 [01:36<00:01,  3.73it/s]




 99%|█████████▉| 262/265 [01:36<00:00,  3.73it/s]




 99%|█████████▉| 263/265 [01:36<00:00,  3.74it/s]




100%|█████████▉| 264/265 [01:36<00:00,  3.55it/s]




100%|██████████| 265/265 [01:37<00:00,  2.73it/s]

features_and_target.shape=(217471, 675)





In [8]:
from datetime import date, timedelta
from pytz import timezone
from src.data_split import train_test_split

# training data -> from January 2022 up until 2 months ago
# test data -> last 2 months
cutoff_date = pd.to_datetime(date.today() - timedelta(days=28*1), utc=True)

print(f'{cutoff_date=}')

cutoff_date=Timestamp('2024-08-08 00:00:00+0000', tz='UTC')


In [9]:
features_and_target["pickup_hour"] = pd.to_datetime(features_and_target["pickup_hour"])

In [10]:
X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date,
    target_column_name='target_rides_next_hour'   
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(210830, 674)
y_train.shape=(210830,)
X_test.shape=(6641, 674)
y_test.shape=(6641,)


In [11]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),   
    }
    
    # sort X_train by `pikup_hour` inplace
    # so the TimeSeriesSplit will split the data in a consistent way
    X_train.sort_values('pickup_hour', inplace=True)

    tss = TimeSeriesSplit(n_splits=2)
    scores = []
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [12]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

[I 2024-09-05 15:03:08,152] A new study created in memory with name: no-name-c957862d-f938-4a5c-9c40-50e171fdec91


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

found 0 physical cores < 1



  File "c:\Users\josec\OneDrive\Escritorio\taxi_demand_predictor\.venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



[I 2024-09-05 15:04:20,218] Trial 0 finished with value: 23.873476072022086 and parameters: {'num_leaves': 243, 'feature_fraction': 0.4556473806906438, 'bagging_fraction': 0.5175708137953241, 'min_child_samples': 65}. Best is trial 0 with value: 23.873476072022086.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-09-05 15:04:50,249] Trial 1 finished with value: 23.163797940527246 and parameters: {'num_leaves': 23, 'feature_fraction': 0.2437599807754011, 'bagging_fraction': 0.5366255787817007, 'min_child_samples': 48}. Best is trial 1 with value: 23.163797940527246.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-09-05 15:05:41,873] Trial 2 finished with value: 23.807372173936958 and parameters: {'num_leaves': 204, 'feature_fraction': 0.3324046857636683, 'bagging_fraction': 0.8628336915271304, 'min_child_samples': 4}. Best is trial 1 with value: 23.163797940527246.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-09-05 15:06:39,436] Trial 3 finished with value: 23.335554038782593 and parameters: {'num_leaves': 61, 'feature_fraction': 0.5794608587312562, 'bagging_fraction': 0.7893500076862052, 'min_child_samples': 65}. Best is trial 1 with value: 23.163797940527246.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

[I 2024-09-05 15:07:18,997] Trial 4 finished with value: 23.436102824145742 and parameters: {'num_leaves': 94, 'feature_fraction': 0.2644695615402354, 'bagging_fraction': 0.8778120388775719, 'min_child_samples': 9}. Best is trial 1 with value: 23.163797940527246.


In [13]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 23, 'feature_fraction': 0.2437599807754011, 'bagging_fraction': 0.5366255787817007, 'min_child_samples': 48}


In [14]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

In [15]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=21.4926


In [16]:
import joblib
from src.paths import MODELS_DIR

joblib.dump(pipeline, MODELS_DIR / 'model.pkl')

['C:\\Users\\josec\\OneDrive\\Escritorio\\taxi_demand_predictor\\models\\model.pkl']

In [17]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [18]:
model_schema

ModelSchema(input: 'columnar', output: 'columnar')

In [19]:
model_registry = project.get_model_registry()

model = model_registry.sklearn.create_model(
    name="taxi_demand_predictor_next_hour",
    metrics={"test_mae": test_mae},
    description="LightGBM regressor with a bit of hyper-parameter tuning",
    input_example=X_train.sample(),
    model_schema=model_schema
)

Connected. Call `.close()` to terminate connection gracefully.


In [20]:
model.save(str(MODELS_DIR / 'model.pkl'))

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/279114 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/4445 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/60849 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1007769/models/taxi_demand_predictor_next_hour/2


Model(name: 'taxi_demand_predictor_next_hour', version: 2)