In [1]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_next_27_hour,rides_next_28_hour,rides_next_29_hour,rides_next_30_hour,rides_next_31_hour,rides_next_32_hour,rides_next_33_hour,rides_next_34_hour,rides_next_35_hour,rides_next_36_hour
0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,10.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,4.0,0.0,0.0,0.0,1.0,2.0,5.0,2.0,2.0,2.0
2,0.0,1.0,2.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,2.0,0.0,5.0,4.0,1.0,8.0
3,4.0,2.0,2.0,1.0,0.0,0.0,0.0,2.0,1.0,4.0,...,2.0,1.0,1.0,0.0,1.0,1.0,7.0,2.0,2.0,3.0
4,0.0,0.0,1.0,0.0,0.0,0.0,3.0,1.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111211,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0
111212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2.0,3.0,0.0,3.0,0.0
111213,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,1.0
111214,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0


In [2]:
from datetime import datetime
from src.data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2022, 6, 1, 0, 0, 0),
    targets_columns_names=[c for c in df.columns if c.startswith('rides_next_')]
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(40713, 674)
y_train.shape=(40713, 36)
X_test.shape=(70503, 674)
y_test.shape=(70503, 36)


In [3]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    """
    Adds one column with the average rides from
    - 7 days ago
    - 14 days ago
    - 21 days ago
    - 28 days ago
    """
    X['average_rides_last_4_weeks'] = 0.25*(
        X[f'rides_previous_{7*24}_hour'] + \
        X[f'rides_previous_{2*7*24}_hour'] + \
        X[f'rides_previous_{3*7*24}_hour'] + \
        X[f'rides_previous_{4*7*24}_hour']
    )
    return X

In [4]:
from sklearn.preprocessing import FunctionTransformer

add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False)

In [5]:
add_feature_average_rides_last_4_weeks.fit_transform(X_train)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,average_rides_last_4_weeks
0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,2.0,8.0,9.0,3.0,3.0,4.0,2.0,2022-01-29,9,1.25
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,2.0,2.0,4.0,3.0,2.0,0.0,2022-01-30,9,0.50
2,0.0,1.0,2.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,...,6.0,6.0,3.0,3.0,4.0,3.0,2.0,2022-01-31,9,1.75
3,4.0,2.0,2.0,1.0,0.0,0.0,0.0,2.0,1.0,4.0,...,5.0,14.0,2.0,11.0,2.0,6.0,0.0,2022-02-01,9,1.50
4,0.0,0.0,1.0,0.0,0.0,0.0,3.0,1.0,3.0,2.0,...,4.0,6.0,8.0,8.0,7.0,6.0,3.0,2022-02-02,9,2.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-05-27,57,0.00
40709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-05-28,57,0.00
40710,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-05-29,57,0.00
40711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-05-30,57,0.00


In [6]:
import pandas as pd
from src.paths import RAW_DATA_DIR

def latitude_and_longitude(X: pd.DataFrame) -> pd.DataFrame:
    """
    Adds two columns with the latitude and longitude from pickup_location_id
    
    """
    raw_data_rides = pd.read_parquet(RAW_DATA_DIR / 'rides_2022.parquet')

    #Nos quedamos sólo con las columnas que nos interesan y las renombramos
    raw_data_rides = raw_data_rides[['id_estacion_origen', 'lat_estacion_origen', 'long_estacion_origen']]
    raw_data_rides['id_estacion_origen'] = raw_data_rides['id_estacion_origen'].str.replace('BAEcobici', '').astype(int)
    raw_data_rides = raw_data_rides.drop_duplicates().reset_index(drop=True)
    raw_data_rides.rename(columns={
    'id_estacion_origen': 'pickup_location_id',
    'lat_estacion_origen': 'latitude',
    'long_estacion_origen': 'longitude'
    }, inplace=True)

    # Combinar la información de latitud y longitud en X
    X = X.merge(raw_data_rides, on='pickup_location_id', how='left')

    # Eliminar la columna 'pickup_location_id'
    #X.drop('pickup_location_id', axis=1, inplace=True)

    return X

In [7]:
from sklearn.preprocessing import FunctionTransformer

add_feature_latitude_and_longitude = FunctionTransformer(
    latitude_and_longitude, validate=False)

In [8]:
add_feature_latitude_and_longitude.fit_transform(X_train)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,average_rides_last_4_weeks,latitude,longitude
0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,9.0,3.0,3.0,4.0,2.0,2022-01-29,9,1.25,-34.585443,-58.407741
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,4.0,3.0,2.0,0.0,2022-01-30,9,0.50,-34.585443,-58.407741
2,0.0,1.0,2.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,...,3.0,3.0,4.0,3.0,2.0,2022-01-31,9,1.75,-34.585443,-58.407741
3,4.0,2.0,2.0,1.0,0.0,0.0,0.0,2.0,1.0,4.0,...,2.0,11.0,2.0,6.0,0.0,2022-02-01,9,1.50,-34.585443,-58.407741
4,0.0,0.0,1.0,0.0,0.0,0.0,3.0,1.0,3.0,2.0,...,8.0,8.0,7.0,6.0,3.0,2022-02-02,9,2.75,-34.585443,-58.407741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2022-05-27,57,0.00,-34.612690,-58.371250
40709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2022-05-28,57,0.00,-34.612690,-58.371250
40710,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2022-05-29,57,0.00,-34.612690,-58.371250
40711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2022-05-30,57,0.00,-34.612690,-58.371250


In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeaturesEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        X_ = X.copy()
        
        # Generate numeric columns from datetime
        X_["hour"] = X_['pickup_hour'].dt.hour
        X_["day_of_week"] = X_['pickup_hour'].dt.dayofweek

                        

        return X_.drop(columns=['pickup_hour'])

In [10]:
add_temporal_features = TemporalFeaturesEngineer()
add_temporal_features.fit_transform(X_train)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_location_id,average_rides_last_4_weeks,hour,day_of_week
0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,8.0,9.0,3.0,3.0,4.0,2.0,9,1.25,0,5
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,2.0,4.0,3.0,2.0,0.0,9,0.50,0,6
2,0.0,1.0,2.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,...,6.0,3.0,3.0,4.0,3.0,2.0,9,1.75,0,0
3,4.0,2.0,2.0,1.0,0.0,0.0,0.0,2.0,1.0,4.0,...,14.0,2.0,11.0,2.0,6.0,0.0,9,1.50,0,1
4,0.0,0.0,1.0,0.0,0.0,0.0,3.0,1.0,3.0,2.0,...,6.0,8.0,8.0,7.0,6.0,3.0,9,2.75,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,57,0.00,0,4
40709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,57,0.00,0,5
40710,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,57,0.00,0,6
40711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,57,0.00,0,0


In [11]:
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor

In [12]:
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_feature_latitude_and_longitude,
    add_temporal_features,
    MultiOutputRegressor(lgb.LGBMRegressor(force_col_wise=True))
)
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Total Bins 12130
[LightGBM] [Info] Number of data points in the train set: 40713, number of used features: 677
[LightGBM] [Info] Start training from score 0.416796
[LightGBM] [Info] Total Bins 12130
[LightGBM] [Info] Number of data points in the train set: 40713, number of used features: 677
[LightGBM] [Info] Start training from score 0.287181
[LightGBM] [Info] Total Bins 12130
[LightGBM] [Info] Number of data points in the train set: 40713, number of used features: 677
[LightGBM] [Info] Start training from score 0.155110
[LightGBM] [Info] Total Bins 12130
[LightGBM] [Info] Number of data points in the train set: 40713, number of used features: 677
[LightGBM] [Info] Start training from score 0.090856
[LightGBM] [Info] Total Bins 12130
[LightGBM] [Info] Number of data points in the train set: 40713, number of used features: 677
[LightGBM] [Info] Start training from score 0.081473
[LightGBM] [Info] Total Bins 12130
[LightGBM] [Info] Number of data points in the train se

In [13]:
X_train

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,average_rides_last_4_weeks
0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,2.0,8.0,9.0,3.0,3.0,4.0,2.0,2022-01-29,9,1.25
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,2.0,2.0,4.0,3.0,2.0,0.0,2022-01-30,9,0.50
2,0.0,1.0,2.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,...,6.0,6.0,3.0,3.0,4.0,3.0,2.0,2022-01-31,9,1.75
3,4.0,2.0,2.0,1.0,0.0,0.0,0.0,2.0,1.0,4.0,...,5.0,14.0,2.0,11.0,2.0,6.0,0.0,2022-02-01,9,1.50
4,0.0,0.0,1.0,0.0,0.0,0.0,3.0,1.0,3.0,2.0,...,4.0,6.0,8.0,8.0,7.0,6.0,3.0,2022-02-02,9,2.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-05-27,57,0.00
40709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-05-28,57,0.00
40710,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-05-29,57,0.00
40711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-05-30,57,0.00


In [14]:
predictions = pipeline.predict(X_test)

from sklearn.metrics import mean_absolute_error
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=0.6749


In [15]:
# Como da algun valor negativo como resultado eliminamos las predicciones negativas y los intervalos negativos reemplazando por cero
import numpy as np

#Limitar los valores predichos a un mínimo de cero
predictions = np.clip(predictions, 0, None)

In [16]:
from sklearn.metrics import mean_absolute_error
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=0.6746
