In [15]:
import pandas as pd
import numpy as np

from datetime import datetime
from sklearn.metrics import mean_absolute_error

from src.paths import PROCESSED_DATA_DIR
from src.data_split  import trainTestSplit
# Load the processed tabular dataset from the project's processed-data directory.
df = pd.read_parquet(path=PROCESSED_DATA_DIR / "tabular_data.parquet")

In [16]:
# Split the table into train/test features and targets around a fixed cutoff
# date (midnight, 1 Aug 2022); the target column is the next-hour ride count.
X_train, y_train, X_test, y_test = trainTestSplit(
    df,
    cutoff_date=datetime(year=2022, month=8, day=1),
    tgt_col_name='tgt_rides_next_hr',
)

# Report the shape of each split, one per line.
print(
    f"{X_train.shape=}",
    f"{y_train.shape=}",
    f"{X_test.shape=}",
    f"{y_test.shape=}",
    sep="\n",
)

X_train.shape=(48208, 674)
y_train.shape=(48208,)
X_test.shape=(40086, 674)
y_test.shape=(40086,)


In [17]:
class BaselineModelPreviousHour:
    """
    Naive baseline: predict the demand observed in the previous hour.

    The prediction is read straight from the ``rides_prev_1_hr`` feature
    column, so the model has no parameters to learn.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "BaselineModelPreviousHour":
        """
        No-op, kept so the baseline exposes the same API as trainable models.

        Args:
            X_train: training features (unused).
            y_train: training targets (unused).

        Returns:
            The model itself, so calls can be chained as
            ``model.fit(X, y).predict(X)``.
        """
        return self  # nothing to fit

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the previous-hour demand as the prediction.

        Args:
            X_test: features; must contain a ``rides_prev_1_hr`` column.

        Returns:
            The ``rides_prev_1_hr`` column of ``X_test``, with its index.

        Raises:
            KeyError: if ``X_test`` has no ``rides_prev_1_hr`` column.
        """
        return X_test['rides_prev_1_hr']

In [19]:
# Score the previous-hour baseline on the test split using mean absolute error.
model = BaselineModelPreviousHour()
predictions = model.predict(X_test)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"{test_mae=:.4f}")

0         0.0
1         2.0
2         3.0
3         4.0
4        15.0
         ... 
40081     0.0
40082     0.0
40083     0.0
40084     0.0
40085     0.0
Name: rides_prev_1_hr, Length: 40086, dtype: float32

In [22]:
class BaselineModelPreviousWeek:
    """
    Naive baseline: predict the demand observed 7 * 24 = 168 hours earlier,
    i.e. at the same hour one week before.

    The prediction is read straight from the ``rides_prev_168_hr`` feature
    column, so the model has no parameters to learn.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "BaselineModelPreviousWeek":
        """
        No-op, kept so the baseline exposes the same API as trainable models.

        Args:
            X_train: training features (unused).
            y_train: training targets (unused).

        Returns:
            The model itself, so calls can be chained as
            ``model.fit(X, y).predict(X)``.
        """
        return self  # nothing to fit

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the demand from one week earlier as the prediction.

        Args:
            X_test: features; must contain a ``rides_prev_168_hr`` column.

        Returns:
            The ``rides_prev_168_hr`` column of ``X_test``, with its index.

        Raises:
            KeyError: if ``X_test`` has no ``rides_prev_168_hr`` column.
        """
        return X_test[f'rides_prev_{7*24}_hr']

In [23]:
# Score the previous-week baseline on the test split using mean absolute error.
model = BaselineModelPreviousWeek()
predictions = model.predict(X_test)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"{test_mae=:.4f}")

test_mae=3.8744


In [35]:
class BaselineModelPreviousMonth:
    """
    Naive baseline: predict the average of the demand observed at the same
    hour 1, 2, 3 and 4 weeks earlier.

    The prediction combines the ``rides_prev_168_hr``, ``rides_prev_336_hr``,
    ``rides_prev_504_hr`` and ``rides_prev_672_hr`` feature columns, so the
    model has no parameters to learn.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "BaselineModelPreviousMonth":
        """
        No-op, kept so the baseline exposes the same API as trainable models.

        Args:
            X_train: training features (unused).
            y_train: training targets (unused).

        Returns:
            The model itself, so calls can be chained as
            ``model.fit(X, y).predict(X)``.
        """
        return self  # nothing to fit

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the mean of the four weekly-lag columns as the prediction.

        Args:
            X_test: features; must contain the four weekly-lag columns
                ``rides_prev_{168,336,504,672}_hr``.

        Returns:
            Element-wise mean of the four lag columns. A missing value in
            any of the four lags propagates to the result.

        Raises:
            KeyError: if any of the four lag columns is absent.
        """
        # Plain sum * 0.25 (rather than DataFrame.mean) so NaNs are not
        # silently skipped and the averaging window stays exactly four weeks.
        return (
            X_test[f'rides_prev_{7*24}_hr']
            + X_test[f'rides_prev_{14*24}_hr']
            + X_test[f'rides_prev_{21*24}_hr']
            + X_test[f'rides_prev_{28*24}_hr']
        ) * 0.25

In [36]:
# Score the four-week-average baseline on the test split using mean absolute error.
model = BaselineModelPreviousMonth()
predictions = model.predict(X_test)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"{test_mae=:.4f}")

test_mae=3.3684
