In [136]:
# baseline model

# how to model a time series prediction?
# weekly seasonality?
# daily seasonality?
# monthly seasonality?
# yearly seasonality?
# trend?
# holiday effect?
# special events?
# weather effect?
# which features may I build to enrich the model?
# how to evaluate the model?
# how to improve the model?
# give me more features examples to improve the model


# 4 ways to improve the model

# 1. Increase training data size

# 2. Add more features to the training data

# 3. Try different algorithms

# 4. Tune the hyperparameters of the model

# should be better than 8.189822

# for tabular datasets, the following algorithms are recommended:
# Boosting trees: XGBoost, LightGBM, CatBoost


In [127]:
import pandas as pd

# Load the data
data = pd.read_parquet('../data/transformed/yellow_tripdata_features_target.parquet')

In [4]:
data.head()



Unnamed: 0,rides_previous_672,rides_previous_671,rides_previous_670,rides_previous_669,rides_previous_668,rides_previous_667,rides_previous_666,rides_previous_665,rides_previous_664,rides_previous_663,...,rides_previous_7,rides_previous_6,rides_previous_5,rides_previous_4,rides_previous_3,rides_previous_2,rides_previous_1,pickup_hour,PULocationID,target_rides_next_hour
0,49.0,94.0,84.0,85.0,43.0,22.0,14.0,24.0,19.0,9.0,...,4.0,14.0,19.0,22.0,10.0,25.0,19.0,2019-01-29,4,7.0
1,8.0,3.0,0.0,6.0,0.0,3.0,11.0,16.0,26.0,23.0,...,19.0,15.0,13.0,18.0,21.0,19.0,11.0,2019-01-30,4,15.0
2,10.0,5.0,1.0,1.0,4.0,4.0,7.0,24.0,32.0,31.0,...,23.0,20.0,25.0,19.0,26.0,26.0,17.0,2019-01-31,4,19.0
3,5.0,7.0,4.0,2.0,3.0,2.0,13.0,18.0,32.0,27.0,...,16.0,18.0,22.0,22.0,26.0,28.0,40.0,2019-02-01,4,20.0
4,70.0,68.0,39.0,33.0,18.0,5.0,8.0,16.0,18.0,23.0,...,28.0,24.0,28.0,42.0,43.0,30.0,68.0,2019-02-02,4,83.0


In [23]:
from datetime import datetime
from typing import Tuple
import pandas as pd

def create_training_sets(PATH, cutoff_date, target_column_name):
    """
    Build a separate train/test split for every pickup location.

    The parquet file at ``PATH`` is read once; the rows belonging to each
    distinct ``PULocationID`` are then split around ``cutoff_date`` by
    ``src.training.train_test_split``.

    :param PATH: Path to the parquet file holding the features and the target.
    :param cutoff_date: The cutoff date for splitting into train/test sets.
    :param target_column_name: The name of the target column.
    :return: A dictionary mapping each PULocationID to the 4-tuple produced by
        ``train_test_split``, unpacked here as (X_train, X_test, y_train, y_test).
    """
    # Imported inside the function so it does not depend on an import executed
    # in some other cell, and so it is not silently rebound to a different
    # `train_test_split` defined elsewhere in the notebook.
    # NOTE(review): the unpacking order below assumes that
    # src.training.train_test_split returns (X_train, X_test, y_train, y_test)
    # -- confirm against src/training.py.
    from src.training import train_test_split

    data = pd.read_parquet(PATH)

    # One entry per pickup location.
    training_sets = {}

    for pulocation_id in data['PULocationID'].unique():
        print(f"Processing PULocationID: {pulocation_id}")

        # Keep only this location's rows; the ID column is constant within the
        # subset, so it is removed. `drop` returns a new frame, leaving `data`
        # untouched.
        location_data = data[data['PULocationID'] == pulocation_id].drop(
            columns=['PULocationID']
        )

        X_train, X_test, y_train, y_test = train_test_split(
            location_data,
            cutoff_date,
            target_column_name
        )

        training_sets[pulocation_id] = (X_train, X_test, y_train, y_test)

    return training_sets


In [24]:
# Smoke-test the function. The target column in this dataset is named
# 'target_rides_next_hour' (see data.columns); there is no column called 'target'.
training_sets = create_training_sets(
    '../data/transformed/yellow_tripdata_features_target.parquet',
    datetime(2019, 6, 1),
    'target_rides_next_hour'
)

Processing PULocationID: 4
Processing PULocationID: 7
Processing PULocationID: 10
Processing PULocationID: 11
Processing PULocationID: 12
Processing PULocationID: 13
Processing PULocationID: 14
Processing PULocationID: 17
Processing PULocationID: 18
Processing PULocationID: 19
Processing PULocationID: 20
Processing PULocationID: 21
Processing PULocationID: 24
Processing PULocationID: 25
Processing PULocationID: 33
Processing PULocationID: 34
Processing PULocationID: 35
Processing PULocationID: 36
Processing PULocationID: 37
Processing PULocationID: 39
Processing PULocationID: 40
Processing PULocationID: 41
Processing PULocationID: 42
Processing PULocationID: 43
Processing PULocationID: 45
Processing PULocationID: 48
Processing PULocationID: 49
Processing PULocationID: 50
Processing PULocationID: 51
Processing PULocationID: 52
Processing PULocationID: 54
Processing PULocationID: 56
Processing PULocationID: 60
Processing PULocationID: 61
Processing PULocationID: 63
Processing PULocationI

In [10]:
data.columns

Index(['rides_previous_672', 'rides_previous_671', 'rides_previous_670',
       'rides_previous_669', 'rides_previous_668', 'rides_previous_667',
       'rides_previous_666', 'rides_previous_665', 'rides_previous_664',
       'rides_previous_663',
       ...
       'rides_previous_7', 'rides_previous_6', 'rides_previous_5',
       'rides_previous_4', 'rides_previous_3', 'rides_previous_2',
       'rides_previous_1', 'pickup_hour', 'PULocationID',
       'target_rides_next_hour'],
      dtype='object', length=675)

In [12]:
from src.training import train_test_split
from datetime import datetime
X_train, X_test, y_train, y_test = train_test_split(
            data,
            cutoff_date=datetime(2019, 6, 1),
            target_column_name='target_rides_next_hour'
        )

In [8]:
import numpy as np

In [9]:
np.__version__

'2.1.2'

In [55]:
# baseline model
import numpy as np
# import base model from sklearn
from sklearn.base import BaseEstimator

class BaselineModelPreviousHour:
    """
    Naive baseline that predicts the next hour's rides from the previous hour's rides.

    The prediction for each row of `X_test` is read directly from that row's
    `rides_previous_1` column. Nothing is learned from the training data; the
    model only provides a benchmark that more sophisticated models should beat.

    Methods:
    --------
    fit(X_train: pd.DataFrame, y_train: pd.Series) -> BaselineModelPreviousHour
        No-op kept so the class can be used like a regular estimator.
        Returns `self` so that `Model().fit(X, y).predict(X_test)` works.

    predict(X_test: pd.DataFrame) -> pd.Series
        Returns the `rides_previous_1` column of `X_test`.

        Parameters:
        -----------
        X_test : pd.DataFrame
            Feature set containing a `rides_previous_1` column.

        Returns:
        --------
        pd.Series
            The `rides_previous_1` column, sharing the index of `X_test`.

        Raises:
        -------
        KeyError
            If `X_test` has no `rides_previous_1` column.
    """

    # Column whose values are returned verbatim as the prediction.
    LAG_COLUMN = 'rides_previous_1'

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "BaselineModelPreviousHour":
        # Nothing to learn: the prediction is a plain column lookup.
        return self

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        # Return the previous hour's ride count for every row.
        return X_test[self.LAG_COLUMN]


In [56]:
model = BaselineModelPreviousHour()
predictions = model.predict(X_test)
predictions

0         37.0
1         42.0
2         10.0
3         17.0
4         14.0
          ... 
249475     0.0
249476     0.0
249477     0.0
249478     0.0
249479     0.0
Name: rides_previous_1, Length: 249480, dtype: float32

In [75]:
# Evaluate the previous-hour baseline on the test set.
from sklearn.metrics import root_mean_squared_error

y_pred = predictions
baseline_error = root_mean_squared_error(y_test, y_pred)
# Print the value computed just above; `test_rmae` is never assigned in this notebook.
print(f'Root Mean Squared Error: {baseline_error:.4f}')

Root Mean Squared Error: 35.2734


In [32]:
# Every column name of the frame, as a plain Python list.
columns_list = data.columns.tolist()

# The subset of names that refer to lagged hourly ride counts.
rides_previous_columns = [
    name
    for name in columns_list
    if 'rides_previous_' in name
]

In [33]:
rides_previous_columns

['rides_previous_672',
 'rides_previous_671',
 'rides_previous_670',
 'rides_previous_669',
 'rides_previous_668',
 'rides_previous_667',
 'rides_previous_666',
 'rides_previous_665',
 'rides_previous_664',
 'rides_previous_663',
 'rides_previous_662',
 'rides_previous_661',
 'rides_previous_660',
 'rides_previous_659',
 'rides_previous_658',
 'rides_previous_657',
 'rides_previous_656',
 'rides_previous_655',
 'rides_previous_654',
 'rides_previous_653',
 'rides_previous_652',
 'rides_previous_651',
 'rides_previous_650',
 'rides_previous_649',
 'rides_previous_648',
 'rides_previous_647',
 'rides_previous_646',
 'rides_previous_645',
 'rides_previous_644',
 'rides_previous_643',
 'rides_previous_642',
 'rides_previous_641',
 'rides_previous_640',
 'rides_previous_639',
 'rides_previous_638',
 'rides_previous_637',
 'rides_previous_636',
 'rides_previous_635',
 'rides_previous_634',
 'rides_previous_633',
 'rides_previous_632',
 'rides_previous_631',
 'rides_previous_630',
 'rides_pre

In [40]:
# Create one `day_{i}_rides_sum` column for each of the 28 days (4 weeks) covered
# by the hourly lag columns, initialised to 0; a later cell adds the hourly
# counts into these columns.
for i in range(0, 28):
    data[f'day_{i}_rides_sum'] = 0

In [35]:
data

Unnamed: 0,rides_previous_672,rides_previous_671,rides_previous_670,rides_previous_669,rides_previous_668,rides_previous_667,rides_previous_666,rides_previous_665,rides_previous_664,rides_previous_663,...,day_18_rides_sum,day_19_rides_sum,day_20_rides_sum,day_21_rides_sum,day_22_rides_sum,day_23_rides_sum,day_24_rides_sum,day_25_rides_sum,day_26_rides_sum,day_27_rides_sum
0,49.0,94.0,84.0,85.0,43.0,22.0,14.0,24.0,19.0,9.0,...,0,0,0,0,0,0,0,0,0,0
1,8.0,3.0,0.0,6.0,0.0,3.0,11.0,16.0,26.0,23.0,...,0,0,0,0,0,0,0,0,0,0
2,10.0,5.0,1.0,1.0,4.0,4.0,7.0,24.0,32.0,31.0,...,0,0,0,0,0,0,0,0,0,0
3,5.0,7.0,4.0,2.0,3.0,2.0,13.0,18.0,32.0,27.0,...,0,0,0,0,0,0,0,0,0,0
4,70.0,68.0,39.0,33.0,18.0,5.0,8.0,16.0,18.0,23.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
281948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
281949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
281950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Total rides per 24-hour block of the 672 hourly lag columns:
# block 0 sums `rides_previous_672` .. `rides_previous_649`,
# block 27 sums `rides_previous_24` .. `rides_previous_1`.
#
# Each column is assigned outright instead of being added to its previous
# value, so re-running this cell does not double the sums. DataFrame.sum
# replaces np.sum(<generator>), which NumPy has deprecated.
for i in range(0, 28):
    day_lag_columns = [f'rides_previous_{672-(24*i+j)}' for j in range(0, 24)]
    # skipna=False lets a missing hour propagate as NaN, as plain `+` would;
    # the cast keeps the float64 dtype these columns had before.
    data[f'day_{i}_rides_sum'] = (
        data[day_lag_columns].sum(axis=1, skipna=False).astype('float64')
    )

  data[f'day_{i}_rides_sum'] = data[f'day_{i}_rides_sum'] + np.sum(data[f'rides_previous_{672-(24*i+j)}'] for j in range(0, 24))


In [44]:
data['day_0_rides_sum']

0         583.0
1         283.0
2         358.0
3         427.0
4         716.0
          ...  
281947      0.0
281948      0.0
281949      0.0
281950      0.0
281951      0.0
Name: day_0_rides_sum, Length: 281952, dtype: float64

In [51]:
# Mean hourly rides for each of the 28 daily blocks: the block total divided by
# its 24 hours.
for day_index in range(28):
    daily_total = data[f'day_{day_index}_rides_sum']
    data[f'average_dayly_rides_{day_index}'] = daily_total / 24

In [53]:
root_mean_squared_error(data['target_rides_next_hour'], data['average_dayly_rides_27'])

np.float64(49.520196518120976)

# previous week model

In [67]:
7*24

168

In [71]:
# BaselineModelPreviousWeek

class BaselineModelPreviousWeek:
    """
    Naive baseline that predicts the next hour's rides from the 168-hour (one week) lag.

    The prediction for each row of `X_test` is read directly from that row's
    `rides_previous_168` column. Nothing is learned from the training data; the
    model only provides a benchmark that more sophisticated models should beat.

    Methods:
    --------
    fit(X_train: pd.DataFrame, y_train: pd.Series) -> BaselineModelPreviousWeek
        No-op kept so the class can be used like a regular estimator.
        Returns `self` so that `Model().fit(X, y).predict(X_test)` works.

    predict(X_test: pd.DataFrame) -> pd.Series
        Returns the `rides_previous_168` column of `X_test`.

        Parameters:
        -----------
        X_test : pd.DataFrame
            Feature set containing a `rides_previous_168` column.

        Returns:
        --------
        pd.Series
            The `rides_previous_168` column, sharing the index of `X_test`.

        Raises:
        -------
        KeyError
            If `X_test` has no `rides_previous_168` column.
    """

    # Lag, in hours, of the column used as the prediction (7 days * 24 hours).
    LAG_HOURS = 7 * 24

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "BaselineModelPreviousWeek":
        # Nothing to learn: the prediction is a plain column lookup.
        return self

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        # Return the ride count observed one week (168 hours) earlier for every row.
        return X_test[f'rides_previous_{self.LAG_HOURS}']

In [72]:
previous_week_model = BaselineModelPreviousWeek()
predictions = previous_week_model.predict(X_test)
predictions

0         36.0
1         37.0
2         17.0
3          6.0
4          4.0
          ... 
249475     0.0
249476     0.0
249477     0.0
249478     0.0
249479     0.0
Name: rides_previous_168, Length: 249480, dtype: float32

In [76]:
previous_week_error = root_mean_squared_error(y_test, predictions)
previous_week_error

np.float32(22.451237)

In [78]:
print(f'Previous Week Baseline RMSE: {previous_week_error:.4f}')
# `baseline_error` holds the previous-hour RMSE computed in the baseline
# evaluation cell; `test_rmae` is never assigned in this notebook.
print(f'Previous Hour Baseline RMSE: {baseline_error:.4f}')

Previous Week Baseline RMSE: 22.4512
Previous Hour Baseline RMSE: 35.2734


In [88]:
class PreviousWeeksModel:
    """
    Naive baseline that averages the weekly lags of the last `n_weeks` weeks.

    For each row of `X_test` the prediction is the mean of the columns
    `rides_previous_168`, `rides_previous_336`, ... (one per week, up to
    `168 * n_weeks`), rounded to the nearest integer value. Nothing is learned
    from the training data; the model only provides a benchmark that more
    sophisticated models should beat.

    Parameters:
    -----------
    n_weeks : int, default 4
        Number of weekly lag columns to average. The default reproduces the
        original four-week average (lags 168, 336, 504 and 672).

    Methods:
    --------
    fit(X_train: pd.DataFrame, y_train: pd.Series) -> PreviousWeeksModel
        No-op kept so the class can be used like a regular estimator.
        Returns `self` so that `Model().fit(X, y).predict(X_test)` works.

    predict(X_test: pd.DataFrame) -> pd.Series
        Returns the rounded mean of the weekly lag columns.

        Returns:
        --------
        pd.Series
            Unnamed float64 series sharing the index of `X_test`.

        Raises:
        -------
        KeyError
            If any of the required `rides_previous_*` columns is missing.
    """

    # Distance, in hours, between two consecutive weekly lags.
    HOURS_PER_WEEK = 7 * 24

    def __init__(self, n_weeks: int = 4):
        if n_weeks < 1:
            raise ValueError("n_weeks must be at least 1")
        self.n_weeks = n_weeks

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "PreviousWeeksModel":
        # Nothing to learn: the prediction is an average of existing columns.
        return self

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        lag_columns = [
            f'rides_previous_{self.HOURS_PER_WEEK * week}'
            for week in range(1, self.n_weeks + 1)
        ]
        # Accumulate in float64, and let a missing lag propagate as NaN
        # (skipna=False), exactly as summing the columns with `+` would.
        weekly_lags = X_test[lag_columns].astype('float64')
        weekly_mean = weekly_lags.sum(axis=1, skipna=False) / self.n_weeks
        # Ride counts are whole numbers, so round the average.
        return weekly_mean.round()

In [89]:
previous_weeks_model = PreviousWeeksModel()
predictions_previous_weeks = previous_weeks_model.predict(X_test)
predictions_previous_weeks

0         58.0
1         74.0
2         12.0
3          9.0
4         10.0
          ... 
249475     0.0
249476     0.0
249477     0.0
249478     0.0
249479     0.0
Length: 249480, dtype: float64

In [90]:
previous_weeks_error = root_mean_squared_error(y_test, predictions_previous_weeks)
previous_weeks_error

np.float64(21.595067437502273)

In [128]:
data_train = data.copy()

In [129]:
data_train['pickup_hour']

0        2019-01-29
1        2019-01-30
2        2019-01-31
3        2019-02-01
4        2019-02-02
            ...    
281947   2021-12-27
281948   2021-12-28
281949   2021-12-29
281950   2021-12-30
281951   2021-12-31
Name: pickup_hour, Length: 281952, dtype: datetime64[ns]

In [130]:
# Assuming the data_train DataFrame already exists with a 'pickup_hour' column in datetime format
data_train['pickup_hour'] = pd.to_datetime(data_train['pickup_hour'])

# Set last_date to the latest date in the 'pickup_hour' column
last_date = data_train['pickup_hour'].max()

# Compute the cutoff date, 6 months before last_date
cutoff_date = last_date - pd.DateOffset(months=6)

In [100]:
# Rows of pickup location 43 only, without the (now constant) location ID column.
X_43 = (
    data_train
    .loc[data_train['PULocationID'] == 43]
    .drop(columns=['PULocationID'])
)

In [113]:
def train_test_split(
        df: pd.DataFrame,
        cutoff_date: datetime,
        target_column_name: str):
    """
    Split a feature/target table chronologically around a cutoff date.

    Rows whose ``pickup_hour`` is strictly before ``cutoff_date`` form the
    training part; rows at or after it form the test part. Each part gets a
    fresh 0-based index and is then separated into features and target.

    :param df: the input DataFrame; must contain ``pickup_hour`` and the target column
    :param cutoff_date: first timestamp that belongs to the test part
    :param target_column_name: the name of the target column
    :return: a 4-tuple ``(X_train, y_train, X_test, y_test)``
    """
    def _features_and_target(part):
        # Renumber the rows, then peel the target column off the features.
        part = part.reset_index(drop=True)
        return part.drop(columns=[target_column_name]), part[target_column_name]

    pickup_hours = df['pickup_hour']

    X_train, y_train = _features_and_target(df[pickup_hours < cutoff_date])
    X_test, y_test = _features_and_target(df[pickup_hours >= cutoff_date])

    return X_train, y_train, X_test, y_test

In [131]:
X_train, y_train, X_test, y_test = train_test_split(
    data_train,
    cutoff_date=cutoff_date,
    target_column_name='target_rides_next_hour'
)


In [132]:
# Model inputs: every X_train column except the timestamp.
# NOTE(review): despite its name, this list is not limited to the
# `rides_previous_*` lags -- it keeps any other column present in X_train
# (presumably including 'PULocationID'); confirm that is intended.
# The 'target_rides_next_hour' exclusion is only a safeguard: the split above
# already removed the target from X_train.
past_rides_columns = [c for c in X_train.columns if c not in ('pickup_hour', 'target_rides_next_hour')]
X_train_only_numeric = X_train[past_rides_columns].copy()

In [103]:
import xgboost as xgb

In [122]:
y_train

0          7.0
1         15.0
2         19.0
3         20.0
4         83.0
          ... 
233107     0.0
233108     0.0
233109     0.0
233110     0.0
233111     0.0
Name: target_rides_next_hour, Length: 233112, dtype: float32

In [133]:
# XGBoost regressor created with no arguments (library defaults), fitted on the
# selected feature columns against the next-hour ride counts.
model = xgb.XGBRegressor()
model.fit(X_train_only_numeric, y_train)

In [134]:
xgb_predictions = model.predict(X_test[past_rides_columns])

In [135]:
xgb_error = root_mean_squared_error(y_test, xgb_predictions)
xgb_error

np.float32(8.283567)