In [18]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Read the time-series parquet file into the frame used by the cells below.
ts_data = pd.read_parquet(
    path='../data/processed/validated_rides_ts_2022_01.parquet'
)

In [19]:
def getCutoffIndices(
    data: pd.DataFrame,
    n_features: int,
    step_size: int
) -> list:
    """Compute positional cut-off triples for sliding windows over ``data``.

    Each triple ``(first, mid, last)`` describes one window of
    ``n_features + 1`` consecutive rows, meant to be used with half-open
    slices: rows ``[first, mid)`` and rows ``[mid, last)`` (the latter is
    always exactly one row, since ``last == mid + 1``).

    Args:
        data: Any sized object; only ``len(data)`` is used.
        n_features: Number of rows in the ``[first, mid)`` part of a window.
        step_size: Number of rows the window advances between triples.

    Returns:
        List of ``(first, mid, last)`` integer tuples, in increasing order.
        Empty when ``data`` has fewer than ``n_features + 1`` rows.

    Raises:
        ValueError: If ``step_size`` is not positive (the window would never
            advance) or ``n_features`` is negative.
    """
    if step_size < 1:
        raise ValueError(f"step_size must be >= 1, got {step_size}")
    if n_features < 0:
        raise ValueError(f"n_features must be >= 0, got {n_features}")

    window_len = n_features + 1

    # `last` is an exclusive slice end, so a window may end exactly at
    # len(data); the largest admissible start is len(data) - window_len.
    last_start_exclusive = len(data) - window_len + 1

    return [
        (start, start + n_features, start + window_len)
        for start in range(0, last_start_exclusive, step_size)
    ]

In [24]:
def processData2FeatTgt(
    ts_data: pd.DataFrame,
    input_feat_len: int,
    step_size: int
) -> tuple:
    """Turn per-location ride time series into (features, target) tables.

    For every distinct ``pickup_location_id`` the rows of that location are
    cut into sliding windows (via ``getCutoffIndices``). Each window yields
    one example: ``input_feat_len`` consecutive ``rides`` values as features
    and the ``rides`` value of the row immediately after them as the target.

    Args:
        ts_data: Frame with columns ``pickup_location_id``, ``pickup_hour``
            and ``rides``. Rows of each location are used in the order they
            appear (assumed chronological; TODO confirm upstream sorting).
        input_feat_len: Number of lagged ``rides`` values per example.
        step_size: Number of rows the window advances between examples.

    Returns:
        ``(feat_df, tgt_df)`` with matching row order and a fresh 0..n-1 index.
        ``feat_df`` has float32 columns ``rides_prev_{k}_hr`` for
        ``k = input_feat_len .. 1`` plus ``pickup_hr`` (the ``pickup_hour``
        of the target row) and ``location_id``. ``tgt_df`` has the single
        float32 column ``tgt_rides_nxt_hr``. Both are empty frames when
        ``ts_data`` contains no locations.
    """
    feat_cols = [f"rides_prev_{i+1}_hr" for i in reversed(range(input_feat_len))]

    # Per-location frames are collected and concatenated once at the end;
    # concatenating inside the loop re-copies all previous rows every time.
    feat_frames = []
    tgt_frames = []

    loc_ids = ts_data['pickup_location_id'].unique()
    for loc_id in tqdm(loc_ids):
        ts_data_one_loc = ts_data.loc[
            ts_data.pickup_location_id == loc_id, :
        ].reset_index(drop=True)

        indices = getCutoffIndices(
            ts_data_one_loc,
            input_feat_len,
            step_size
        )
        n_examples = len(indices)

        # Pull the column out once so the inner loop slices a plain array
        # instead of building a new DataFrame slice per example.
        rides = ts_data_one_loc['rides'].to_numpy()

        x = np.empty((n_examples, input_feat_len), dtype=np.float32)
        y = np.empty(n_examples, dtype=np.float32)
        for i, (first, mid, _last) in enumerate(indices):
            x[i, :] = rides[first:mid]
            # The target window [mid, last) is a single row; index it as a
            # scalar rather than assigning a length-1 array to y[i], which
            # relies on array-to-scalar conversion deprecated in NumPy 1.25.
            y[i] = rides[mid]

        target_rows = [mid for _first, mid, _last in indices]

        feat_1_loc = pd.DataFrame(x, columns=feat_cols)
        feat_1_loc['pickup_hr'] = (
            ts_data_one_loc['pickup_hour'].iloc[target_rows].tolist()
        )
        feat_1_loc['location_id'] = loc_id

        feat_frames.append(feat_1_loc)
        tgt_frames.append(pd.DataFrame(y, columns=["tgt_rides_nxt_hr"]))

    # pd.concat raises on an empty list; keep returning two empty frames
    # when there is nothing to process.
    if not feat_frames:
        return pd.DataFrame(), pd.DataFrame()

    feat_df = pd.concat(feat_frames, ignore_index=True)
    tgt_df = pd.concat(tgt_frames, ignore_index=True)
    return feat_df, tgt_df
        

In [25]:
# Build the feature/target tables: 24*7*1 lagged values per example
# (presumably one week of hourly counts), sliding the window 24 rows at a time.
feats, tgts = processData2FeatTgt(
    ts_data,
    input_feat_len=24*7*1,
    step_size=24,
)

# Report the resulting table shapes, one per line.
print(
    f"feats size {feats.shape=}",
    f"tgts size {tgts.shape=}",
    sep="\n",
)


100%|██████████| 257/257 [00:01<00:00, 133.64it/s]

feats size feats.shape=(6168, 170)
tgts size tgts.shape=(6168, 1)



