In [68]:
import pandas as pd
from paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location,pickup_longitude,pickup_latitude,target_rides_next_hour
0,2.0,0.0,0.0,3.0,0.0,0.0,2.0,1.0,3.0,1.0,...,0.0,3.0,1.0,1.0,0.0,2024-01-29,Albany Park,-87.721559,41.968069,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,...,1.0,3.0,1.0,2.0,1.0,2024-01-30,Albany Park,-87.721559,41.968069,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,4.0,0.0,1.0,0.0,...,3.0,0.0,2.0,0.0,0.0,2024-01-31,Albany Park,-87.721559,41.968069,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,1.0,...,0.0,2.0,1.0,0.0,0.0,2024-02-01,Albany Park,-87.721559,41.968069,0.0
4,0.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0,2.0,0.0,...,0.0,2.0,1.0,4.0,1.0,2024-02-02,Albany Park,-87.721559,41.968069,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2025-01-27,Andersonville,-87.671446,41.979796,0.0
35051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2025-01-28,Andersonville,-87.671446,41.979796,0.0
35052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2025-01-29,Andersonville,-87.671446,41.979796,0.0
35053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2025-01-30,Andersonville,-87.671446,41.979796,0.0


In [69]:
from datetime import datetime
from data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2024, 11, 1, 0, 0, 0),
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(26315, 676)
y_train.shape=(26315,)
X_test.shape=(8740, 676)
y_test.shape=(8740,)


In [70]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    """
    Adds one column with the average rides from
    - 7 days ago
    - 14 days ago
    - 21 days ago
    - 28 days ago
    """
    X['average_rides_last_4_weeks'] = 0.25*(
        X[f'rides_previous_{7*24}_hour'] + \
        X[f'rides_previous_{2*7*24}_hour'] + \
        X[f'rides_previous_{3*7*24}_hour'] + \
        X[f'rides_previous_{4*7*24}_hour']
    )
    return X

In [71]:
from sklearn.preprocessing import FunctionTransformer

add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False)

In [72]:
add_feature_average_rides_last_4_weeks.fit_transform(X_train)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location,pickup_longitude,pickup_latitude,average_rides_last_4_weeks
0,2.0,0.0,0.0,3.0,0.0,0.0,2.0,1.0,3.0,1.0,...,0.0,3.0,1.0,1.0,0.0,2024-01-29,Albany Park,-87.721559,41.968069,1.00
1,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,...,1.0,3.0,1.0,2.0,1.0,2024-01-30,Albany Park,-87.721559,41.968069,0.50
2,0.0,0.0,1.0,0.0,1.0,0.0,4.0,0.0,1.0,0.0,...,3.0,0.0,2.0,0.0,0.0,2024-01-31,Albany Park,-87.721559,41.968069,0.50
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,1.0,...,0.0,2.0,1.0,0.0,0.0,2024-02-01,Albany Park,-87.721559,41.968069,0.50
4,0.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0,2.0,0.0,...,0.0,2.0,1.0,4.0,1.0,2024-02-02,Albany Park,-87.721559,41.968069,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2024-10-27,Andersonville,-87.671446,41.979796,0.00
26311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2024-10-28,Andersonville,-87.671446,41.979796,0.00
26312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2024-10-29,Andersonville,-87.671446,41.979796,0.00
26313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2024-10-30,Andersonville,-87.671446,41.979796,0.00


In [73]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeaturesEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        X_ = X.copy()
        
        # Generate numeric columns from datetime
        X_["hour"] = X_['pickup_hour'].dt.hour
        X_["day_of_week"] = X_['pickup_hour'].dt.dayofweek
        
        return X_.drop(columns=['pickup_hour', 'pickup_location'])

In [74]:
add_temporal_features = TemporalFeaturesEngineer()
add_temporal_features.fit_transform(X_train)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_longitude,pickup_latitude,average_rides_last_4_weeks,hour,day_of_week
0,2.0,0.0,0.0,3.0,0.0,0.0,2.0,1.0,3.0,1.0,...,0.0,3.0,1.0,1.0,0.0,-87.721559,41.968069,1.00,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,...,1.0,3.0,1.0,2.0,1.0,-87.721559,41.968069,0.50,0,1
2,0.0,0.0,1.0,0.0,1.0,0.0,4.0,0.0,1.0,0.0,...,3.0,0.0,2.0,0.0,0.0,-87.721559,41.968069,0.50,0,2
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,1.0,...,0.0,2.0,1.0,0.0,0.0,-87.721559,41.968069,0.50,0,3
4,0.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0,2.0,0.0,...,0.0,2.0,1.0,4.0,1.0,-87.721559,41.968069,0.25,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,-87.671446,41.979796,0.00,0,6
26311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-87.671446,41.979796,0.00,0,0
26312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-87.671446,41.979796,0.00,0,1
26313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-87.671446,41.979796,0.00,0,2


In [75]:
X_train

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location,pickup_longitude,pickup_latitude,average_rides_last_4_weeks
0,2.0,0.0,0.0,3.0,0.0,0.0,2.0,1.0,3.0,1.0,...,0.0,3.0,1.0,1.0,0.0,2024-01-29,Albany Park,-87.721559,41.968069,1.00
1,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,...,1.0,3.0,1.0,2.0,1.0,2024-01-30,Albany Park,-87.721559,41.968069,0.50
2,0.0,0.0,1.0,0.0,1.0,0.0,4.0,0.0,1.0,0.0,...,3.0,0.0,2.0,0.0,0.0,2024-01-31,Albany Park,-87.721559,41.968069,0.50
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,1.0,...,0.0,2.0,1.0,0.0,0.0,2024-02-01,Albany Park,-87.721559,41.968069,0.50
4,0.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0,2.0,0.0,...,0.0,2.0,1.0,4.0,1.0,2024-02-02,Albany Park,-87.721559,41.968069,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2024-10-27,Andersonville,-87.671446,41.979796,0.00
26311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2024-10-28,Andersonville,-87.671446,41.979796,0.00
26312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2024-10-29,Andersonville,-87.671446,41.979796,0.00
26313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2024-10-30,Andersonville,-87.671446,41.979796,0.00


In [76]:
import lightgbm as lgb

In [77]:
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061358 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 124294
[LightGBM] [Info] Number of data points in the train set: 26315, number of used features: 676
[LightGBM] [Info] Start training from score 4.649933


In [79]:
predictions = pipeline.predict(X_test)

from sklearn.metrics import mean_absolute_error
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=1.3038
