In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

from datetime import datetime
from sklearn.metrics import mean_absolute_error

from src.paths import PROCESSED_DATA_DIR
from src.data_split  import trainTestSplit

df = pd.read_parquet(PROCESSED_DATA_DIR / "tabular_data.parquet")

In [2]:
X_train, y_train, X_test, y_test = trainTestSplit(
    df,
    cutoff_date=datetime(2022,8,1,0,0,0),
    tgt_col_name='tgt_rides_next_hr'
)

print(f"{X_train.shape=}")
print(f"{y_train.shape=}")
print(f"{X_test.shape=}")
print(f"{y_test.shape=}")

X_train.shape=(48208, 674)
y_train.shape=(48208,)
X_test.shape=(40086, 674)
y_test.shape=(40086,)


In [4]:
def avgRidesPerMonth(X: pd.DataFrame) -> pd.DataFrame:
    """Adds one column with average rides from weekly data over a
     month

    Args:
        X (pd.DataFrame): _description_

    Returns:
        pd.DataFrame: _description_
    """
    X["avg_rides_per_month"] = (X[f'rides_prev_{1*7*24}_hr'] + \
                                X[f'rides_prev_{2*7*24}_hr'] + \
                                X[f'rides_prev_{3*7*24}_hr'] + \
                                X[f'rides_prev_{4*7*24}_hr']) * 0.25

    return X

In [5]:
from sklearn.preprocessing import FunctionTransformer

add_feat_avg_rides_month = FunctionTransformer(
    avgRidesPerMonth, validate=False
)

In [6]:
add_feat_avg_rides_month.fit_transform(X_train)

Unnamed: 0,rides_prev_672_hr,rides_prev_671_hr,rides_prev_670_hr,rides_prev_669_hr,rides_prev_668_hr,rides_prev_667_hr,rides_prev_666_hr,rides_prev_665_hr,rides_prev_664_hr,rides_prev_663_hr,...,rides_prev_7_hr,rides_prev_6_hr,rides_prev_5_hr,rides_prev_4_hr,rides_prev_3_hr,rides_prev_2_hr,rides_prev_1_hr,pickup_hr,location_id,avg_rides_per_month
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,11.0,7.0,4.0,3.0,4.0,9.0,19.0,2022-01-29,4,20.25
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,5.0,4.0,10.0,7.0,5.0,9.0,10.0,2022-01-30,4,17.50
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,8.0,7.0,8.0,5.0,5.0,10.0,0.0,2022-01-31,4,0.25
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,3.0,16.0,7.0,1.0,0.0,1.0,3.0,2022-02-01,4,0.75
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,3.0,8.0,3.0,0.0,4.0,4.0,3.0,2022-02-02,4,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-07-27,199,0.00
48204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-07-28,199,0.00
48205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-07-29,199,0.00
48206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-07-30,199,0.00


In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

# Another way to add a transformation
class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_ = X.copy()
        
        # Add numerical columns from datetime column
        X_['hour'] = X_['pickup_hr'].dt.hour
        X_['day_of_week'] = X_['pickup_hr'].dt.weekday
           
        return X_.drop(columns=['pickup_hr'])
        

In [10]:
add_temporal_feats = TemporalFeatureEngineer()
add_temporal_feats.fit_transform(X_train)

Unnamed: 0,rides_prev_672_hr,rides_prev_671_hr,rides_prev_670_hr,rides_prev_669_hr,rides_prev_668_hr,rides_prev_667_hr,rides_prev_666_hr,rides_prev_665_hr,rides_prev_664_hr,rides_prev_663_hr,...,rides_prev_6_hr,rides_prev_5_hr,rides_prev_4_hr,rides_prev_3_hr,rides_prev_2_hr,rides_prev_1_hr,location_id,avg_rides_per_month,hour,day_of_week
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,7.0,4.0,3.0,4.0,9.0,19.0,4,20.25,0,5
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,4.0,10.0,7.0,5.0,9.0,10.0,4,17.50,0,6
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,7.0,8.0,5.0,5.0,10.0,0.0,4,0.25,0,0
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,16.0,7.0,1.0,0.0,1.0,3.0,4,0.75,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,8.0,3.0,0.0,4.0,4.0,3.0,4,0.75,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,199,0.00,0,2
48204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,199,0.00,0,3
48205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,199,0.00,0,4
48206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,199,0.00,0,5


In [11]:
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    add_feat_avg_rides_month,
    add_temporal_feats,
    lgb.LGBMRegressor()
)

pipeline.fit(X_train, y_train)

In [12]:
predictions = pipeline.predict(X_test)

test_mae = mean_absolute_error(y_test, predictions)

print(f"Test MAE: {test_mae:.4f}")

Test MAE: 2.5607
