In [1]:
import pandas as pd

ts_data = pd.read_parquet('../data/transformed/ts_data_2022.parquet')
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2022-01-01 00:00:00,1,9
1,2022-01-01 01:00:00,1,9
2,2022-01-01 02:00:00,1,9
3,2022-01-01 03:00:00,0,9
4,2022-01-01 04:00:00,1,9
...,...,...,...
2899555,2022-12-31 19:00:00,0,57
2899556,2022-12-31 20:00:00,0,57
2899557,2022-12-31 21:00:00,0,57
2899558,2022-12-31 22:00:00,0,57


In [2]:
ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == 175, :].reset_index(drop=True)
ts_data_one_location.head(15)

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2022-01-01 00:00:00,0,175
1,2022-01-01 01:00:00,0,175
2,2022-01-01 02:00:00,0,175
3,2022-01-01 03:00:00,0,175
4,2022-01-01 04:00:00,2,175
5,2022-01-01 05:00:00,1,175
6,2022-01-01 06:00:00,2,175
7,2022-01-01 07:00:00,0,175
8,2022-01-01 08:00:00,0,175
9,2022-01-01 09:00:00,0,175


In [3]:
def get_cutoff_indices(
    data: pd.DataFrame,
    n_features: int,
    step_size: int,
    output_seq_len: int #Lo que agregué nuevo
    ) -> list:

        stop_position = len(data) - 1
        
        # Start the first sub-sequence at index position 0
        subseq_first_idx = 0
        subseq_mid_idx = n_features
        subseq_last_idx = n_features + output_seq_len #le agrego "output_seq_len" para introducirlo como variable
        indices = []
        
        while subseq_last_idx <= stop_position:
            indices.append((subseq_first_idx, subseq_mid_idx, subseq_last_idx))
            
            subseq_first_idx += step_size
            subseq_mid_idx += step_size
            subseq_last_idx += step_size

        return indices

In [4]:
n_features = 24 
step_size = 1
output_seq_len = 36 #Agregué nuevo

indices = get_cutoff_indices(
    ts_data_one_location,
    n_features,
    step_size,
    output_seq_len #Agregué nuevo
)
indices[:5]

[(0, 24, 60), (1, 25, 61), (2, 26, 62), (3, 27, 63), (4, 28, 64)]

In [5]:
indices[-1]

(8699, 8723, 8759)

In [6]:
ts_data_one_location.shape

(8760, 3)

In [7]:
import numpy as np

n_examples = len(indices)
x = np.ndarray(shape=(n_examples, n_features), dtype=np.float32)
y = np.ndarray(shape=(n_examples, output_seq_len), dtype=np.float32) #Agrego (n_examples,output_seq_len) para que de como resultado las 36 horas a predecir
pickup_hours = []

for i, idx in enumerate(indices):
    x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values
    y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
    pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])

In [8]:
print(f'{x.shape=}')
print(f'{x=}')
print(f'{pickup_hours[:5]=}')

x.shape=(8700, 24)
x=array([[ 0.,  0.,  0., ...,  1.,  1.,  0.],
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  2., ...,  0.,  0.,  0.],
       ...,
       [ 8.,  6.,  2., ..., 14., 16., 14.],
       [ 6.,  2.,  9., ..., 16., 14.,  3.],
       [ 2.,  9.,  3., ..., 14.,  3.,  1.]], dtype=float32)
pickup_hours[:5]=[Timestamp('2022-01-02 00:00:00'), Timestamp('2022-01-02 01:00:00'), Timestamp('2022-01-02 02:00:00'), Timestamp('2022-01-02 03:00:00'), Timestamp('2022-01-02 04:00:00')]


In [9]:
features_one_location = pd.DataFrame(
    x,
    columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(n_features))]
)
features_one_location

Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_10_hour,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour
0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,0.0,0.0,...,0.0,2.0,4.0,2.0,2.0,2.0,1.0,1.0,1.0,0.0
1,0.0,0.0,0.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,...,2.0,4.0,2.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0
2,0.0,0.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,4.0,2.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0
3,0.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
4,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8695,27.0,19.0,8.0,6.0,2.0,9.0,3.0,0.0,9.0,4.0,...,5.0,5.0,6.0,0.0,3.0,0.0,0.0,0.0,8.0,14.0
8696,19.0,8.0,6.0,2.0,9.0,3.0,0.0,9.0,4.0,1.0,...,5.0,6.0,0.0,3.0,0.0,0.0,0.0,8.0,14.0,16.0
8697,8.0,6.0,2.0,9.0,3.0,0.0,9.0,4.0,1.0,11.0,...,6.0,0.0,3.0,0.0,0.0,0.0,8.0,14.0,16.0,14.0
8698,6.0,2.0,9.0,3.0,0.0,9.0,4.0,1.0,11.0,7.0,...,0.0,3.0,0.0,0.0,0.0,8.0,14.0,16.0,14.0,3.0


In [10]:
targets_one_location = pd.DataFrame(y, columns=[f'rides_next_{i+1}_hour' for i in range(36)]) #f'target_rides_next_hour'])
targets_one_location

Unnamed: 0,rides_next_1_hour,rides_next_2_hour,rides_next_3_hour,rides_next_4_hour,rides_next_5_hour,rides_next_6_hour,rides_next_7_hour,rides_next_8_hour,rides_next_9_hour,rides_next_10_hour,...,rides_next_27_hour,rides_next_28_hour,rides_next_29_hour,rides_next_30_hour,rides_next_31_hour,rides_next_32_hour,rides_next_33_hour,rides_next_34_hour,rides_next_35_hour,rides_next_36_hour
0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,1.0,...,1.0,0.0,2.0,9.0,1.0,2.0,5.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,...,0.0,2.0,9.0,1.0,2.0,5.0,0.0,0.0,1.0,3.0
2,0.0,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,...,2.0,9.0,1.0,2.0,5.0,0.0,0.0,1.0,3.0,1.0
3,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,...,9.0,1.0,2.0,5.0,0.0,0.0,1.0,3.0,1.0,4.0
4,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,...,1.0,2.0,5.0,0.0,0.0,1.0,3.0,1.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8695,16.0,14.0,3.0,1.0,8.0,7.0,8.0,1.0,3.0,0.0,...,2.0,1.0,2.0,2.0,3.0,0.0,1.0,2.0,7.0,4.0
8696,14.0,3.0,1.0,8.0,7.0,8.0,1.0,3.0,0.0,11.0,...,1.0,2.0,2.0,3.0,0.0,1.0,2.0,7.0,4.0,2.0
8697,3.0,1.0,8.0,7.0,8.0,1.0,3.0,0.0,11.0,10.0,...,2.0,2.0,3.0,0.0,1.0,2.0,7.0,4.0,2.0,3.0
8698,1.0,8.0,7.0,8.0,1.0,3.0,0.0,11.0,10.0,4.0,...,2.0,3.0,0.0,1.0,2.0,7.0,4.0,2.0,3.0,2.0


In [11]:
from tqdm import tqdm

def transform_ts_data_into_features_and_target(
    ts_data: pd.DataFrame,
    input_seq_len: int,
    step_size: int,
    output_seq_len: int #Lo que agregué nuevo
) -> pd.DataFrame:
    """
    Slices and transposes data from time-series format into a (features, target)
    format that we can use to train Supervised ML models
    """
    assert set(ts_data.columns) == {'pickup_hour', 'rides', 'pickup_location_id'}

    location_ids = ts_data['pickup_location_id'].unique()
    features = pd.DataFrame()
    targets = pd.DataFrame()
    
    for location_id in tqdm(location_ids):
        
        # keep only ts data for this `location_id`
        ts_data_one_location = ts_data.loc[
            ts_data.pickup_location_id == location_id, 
            ['pickup_hour', 'rides']
        ]

        # pre-compute cutoff indices to split dataframe rows
        indices = get_cutoff_indices(
            ts_data_one_location,
            input_seq_len,
            step_size,
            output_seq_len #Lo que agregué nuevo
        )

        # slice and transpose data into numpy arrays for features and targets
        n_examples = len(indices)
        x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32)
        y = np.ndarray(shape=(n_examples, output_seq_len), dtype=np.float32) #Agregué el (output_seq_len) porque quiero esa cantidad de horas
        pickup_hours = []
        for i, idx in enumerate(indices):
            x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values
            y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values
            pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])

        # numpy -> pandas
        features_one_location = pd.DataFrame(
            x,
            columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(input_seq_len))]
        )
        features_one_location['pickup_hour'] = pickup_hours
        features_one_location['pickup_location_id'] = location_id

        # numpy -> pandas
        targets_one_location = pd.DataFrame(y, columns=[f'rides_next_{i+1}_hour' for i in range(output_seq_len)])

        # concatenate results
        features = pd.concat([features, features_one_location])
        targets = pd.concat([targets, targets_one_location])

    features.reset_index(inplace=True, drop=True)
    targets.reset_index(inplace=True, drop=True)

    return features, targets #['target_rides_next_hour']

In [12]:
features, targets = transform_ts_data_into_features_and_target(
    ts_data,
    input_seq_len=24*7*1, # one week of history
    step_size=24,
    output_seq_len=36
)

print(f'{features.shape=}')
print(f'{targets.shape=}')

100%|██████████| 331/331 [00:36<00:00,  9.03it/s]

features.shape=(118167, 170)
targets.shape=(118167, 36)





In [13]:
#ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values

In [13]:
features.head()

Unnamed: 0,rides_previous_168_hour,rides_previous_167_hour,rides_previous_166_hour,rides_previous_165_hour,rides_previous_164_hour,rides_previous_163_hour,rides_previous_162_hour,rides_previous_161_hour,rides_previous_160_hour,rides_previous_159_hour,...,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id
0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,4.0,5.0,7.0,12.0,8.0,2.0,1.0,0.0,2022-01-08,9
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,9.0,0.0,1.0,1.0,3.0,1.0,1.0,0.0,2022-01-09,9
2,0.0,1.0,2.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,...,3.0,5.0,6.0,3.0,1.0,2.0,3.0,1.0,2022-01-10,9
3,4.0,2.0,2.0,1.0,0.0,0.0,0.0,2.0,1.0,4.0,...,4.0,6.0,5.0,11.0,3.0,7.0,1.0,2.0,2022-01-11,9
4,0.0,0.0,1.0,0.0,0.0,0.0,3.0,1.0,3.0,2.0,...,0.0,1.0,4.0,4.0,8.0,7.0,2.0,1.0,2022-01-12,9


In [14]:
features.pickup_location_id.unique().size

331

In [15]:
features.tail()

Unnamed: 0,rides_previous_168_hour,rides_previous_167_hour,rides_previous_166_hour,rides_previous_165_hour,rides_previous_164_hour,rides_previous_163_hour,rides_previous_162_hour,rides_previous_161_hour,rides_previous_160_hour,rides_previous_159_hour,...,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id
118162,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2022-12-26,57
118163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,2022-12-27,57
118164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,3.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,2022-12-28,57
118165,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,4.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2022-12-29,57
118166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,2.0,2.0,1.0,2.0,2.0,1.0,0.0,0.0,2022-12-30,57
