## Steps
1. Download all raw data for a given year and month
1. For each raw data file, apply the filter before saving it
1. Transform the saved raw data into time-series (TS) data
1. Convert the TS data into features and targets
1. Save the transformed data


The main objective is to write utility functions that do all of these things so we can reuse them later.

In [1]:
# Enable autoreload so that edits to imported modules are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Make the parent of the notebook's working directory importable so that the
# `src` package resolves.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard the append: this cell is often re-run, and an unconditional append
# would add the same entry to sys.path again on every execution.
if project_root not in sys.path:
    sys.path.append(project_root)

# Now you can import from src
from src.data_utils import load_and_process_taxi_data

In [3]:
# Load the rides for 2023. The log printed below reports, month by month, the
# total records read, the valid records kept and the records dropped.
rides = load_and_process_taxi_data(year=2023)

File already exists for 2023-01.
Loading data for 2023-01...
Total records: 3,066,766
Valid records: 2,993,140
Records dropped: 73,626 (2.40%)
Successfully processed data for 2023-01.
File already exists for 2023-02.
Loading data for 2023-02...
Total records: 2,913,955
Valid records: 2,845,058
Records dropped: 68,897 (2.36%)
Successfully processed data for 2023-02.
File already exists for 2023-03.
Loading data for 2023-03...
Total records: 3,403,766
Valid records: 3,331,705
Records dropped: 72,061 (2.12%)
Successfully processed data for 2023-03.
File already exists for 2023-04.
Loading data for 2023-04...
Total records: 3,288,250
Valid records: 3,214,922
Records dropped: 73,328 (2.23%)
Successfully processed data for 2023-04.
File already exists for 2023-05.
Loading data for 2023-05...
Total records: 3,513,649
Valid records: 3,435,875
Records dropped: 77,774 (2.21%)
Successfully processed data for 2023-05.
File already exists for 2023-06.
Loading data for 2023-06...
Total records: 3,30

In [4]:
# Preview the combined table (columns: pickup_datetime, pickup_location_id);
# the display is truncated to the first and last rows.
rides

Unnamed: 0,pickup_datetime,pickup_location_id
0,2023-01-01 00:32:10,161
1,2023-01-01 00:55:08,43
2,2023-01-01 00:25:04,48
3,2023-01-01 00:03:48,138
4,2023-01-01 00:10:29,107
...,...,...
37463263,2023-12-31 23:04:34,233
37463264,2023-12-31 23:08:15,48
37463265,2023-12-31 23:16:15,196
37463266,2023-12-31 23:21:58,140


In [5]:
from src.data_utils import transform_raw_data_into_ts_data

# Turn the ride-level table into time-series form. Per the preview below, the
# result has one row per (pickup_hour, pickup_location_id) with a `rides`
# count, and hours with zero rides are present as explicit rows.
ts_data = transform_raw_data_into_ts_data(rides)
ts_data.head()

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2023-01-01 00:00:00,2,0
1,2023-01-01 01:00:00,2,0
2,2023-01-01 02:00:00,2,0
3,2023-01-01 03:00:00,2,0
4,2023-01-01 04:00:00,2,0


In [6]:
# Size check: the 2,277,600 rows shown below equal 8,760 hours in 2023 x 260
# (presumably the number of distinct pickup locations - TODO confirm).
ts_data.shape

(2277600, 3)

In [7]:
from src.data_utils import transform_ts_data_info_features_and_target_loop

# window_size = 24*28 = 672 hourly lags (28 days) per row; step_size = 24
# advances the window by one day between consecutive rows.
features, targets = transform_ts_data_info_features_and_target_loop(ts_data, window_size=24*28, step_size=24)


In [8]:
# One row per window: 672 lag columns (rides_t-672 ... rides_t-1) followed by
# the pickup_hour and pickup_location_id columns.
features

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,2023-01-29,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-01-30,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-01-31,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-02-01,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-02-02,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87615,25,14,5,3,7,16,53,133,126,136,...,73,62,62,58,50,48,42,37,2023-12-27,263
87616,30,7,9,6,5,23,58,123,136,108,...,83,64,79,65,71,72,75,35,2023-12-28,263
87617,50,26,17,9,8,11,43,116,137,132,...,71,81,78,60,85,63,62,37,2023-12-29,263
87618,117,88,39,19,14,12,27,37,70,97,...,78,84,75,100,98,88,77,69,2023-12-30,263


In [9]:
import numpy as np

# Toy sizes for illustrating how sliding-window indices are built.
window_size = 4
step_size = 1
num_windows = 4

# Column vector with the start offset of each window: [[0], [1], [2], [3]].
# Bound to a name because a bare expression in the middle of a cell is
# evaluated and discarded without being displayed.
window_starts = step_size * np.arange(num_windows)[:, None]

# Broadcasting the (1, window_size) row of in-window offsets against the
# (num_windows, 1) column of starts yields one row of positions per window.
indices = np.arange(window_size)[None, :] + window_starts

# Last expression of the cell, so the resulting index matrix is displayed.
indices


In [10]:
# Toy dimensions for the broadcasting walkthrough: window length, stride
# between windows, and number of windows.
window_size, step_size, num_windows = 4, 1, 4

In [11]:
import numpy as np
# The window numbers 0 .. num_windows-1 as a 1-D array.
np.arange(num_windows)

array([0, 1, 2, 3])

In [12]:
# Append an axis so the window numbers become a (num_windows, 1) column vector.
np.arange(num_windows)[:, np.newaxis]

array([[0],
       [1],
       [2],
       [3]])

In [13]:
# Scale the column of window numbers by the stride to get each window's start offset.
step_size * np.arange(num_windows)[:, np.newaxis]

array([[0],
       [1],
       [2],
       [3]])

In [14]:
# Row of in-window offsets (1, window_size) plus column of window starts
# (num_windows, 1): broadcasting yields one row of positions per window.
indices = (
    np.arange(window_size)[np.newaxis, :]
    + step_size * np.arange(num_windows)[:, np.newaxis]
)


In [15]:
# The in-window offsets as a (1, window_size) row vector.
np.arange(window_size)[np.newaxis, :]

array([[0, 1, 2, 3]])

In [16]:
# Row vector + column vector broadcasts to a (num_windows, window_size) grid
# in which row i holds the positions covered by window i.
(
    np.arange(window_size)[np.newaxis, :]
    + step_size * np.arange(num_windows)[:, np.newaxis]
)

array([[0, 1, 2, 3],
       [1, 2, 3, 4],
       [2, 3, 4, 5],
       [3, 4, 5, 6]])

In [17]:
# 2x2 block that becomes the left-hand columns of the stacked result
array1 = np.array([[1, 2], [3, 4]])

# Two 2x1 column vectors that are appended to the right of it
array2 = np.array([[5], [6]])
array3 = np.array([[7], [8]])

# hstack joins the inputs side by side; all of them have the same 2 rows
np.hstack((array1, array2, array3))

array([[1, 2, 5, 7],
       [3, 4, 6, 8]])

In [18]:
from src.data_utils import transform_ts_data_info_features_and_target_loop

# Rebuild `features`/`targets` with a 24*28*1 = 672-hour window and a 24-hour step.
# NOTE(review): these are the same arguments as the call in In [7], so this
# recomputation looks redundant - confirm before removing.
features, targets = transform_ts_data_info_features_and_target_loop(ts_data, window_size=24*28*1, step_size=24)


In [19]:
from src.config import TRANSFORMED_DATA_DIR

# Build the table as a NEW DataFrame. A plain `tabular_data = features` only
# aliases the same object, so adding the "target" column would silently add it
# to `features` as well, and any later cell using `features` as model input
# would then contain the target.
tabular_data = features.assign(target=targets)

# Persist features and target together as a single parquet file.
tabular_data.to_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet", engine="pyarrow")

In [20]:
# Small-scale check on a single location (id 43): 12 hourly lags per row,
# window advanced one hour at a time.
# NOTE(review): this rebinds `features`/`targets`, replacing the full-year
# result computed earlier in the notebook.
features, targets = transform_ts_data_info_features_and_target_loop(ts_data[ts_data["pickup_location_id"]==43], window_size=12, step_size=1)


In [21]:
# With step_size=1 each row is the previous row's window shifted by one hour
# (compare consecutive rows in the output below).
features

Unnamed: 0,rides_t-12,rides_t-11,rides_t-10,rides_t-9,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,92,81,29,15,4,4,3,12,12,23,37,41,2023-01-01 12:00:00,43
1,81,29,15,4,4,3,12,12,23,37,41,102,2023-01-01 13:00:00,43
2,29,15,4,4,3,12,12,23,37,41,102,97,2023-01-01 14:00:00,43
3,15,4,4,3,12,12,23,37,41,102,97,106,2023-01-01 15:00:00,43
4,4,4,3,12,12,23,37,41,102,97,106,119,2023-01-01 16:00:00,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8743,11,13,29,60,99,103,143,151,209,160,98,66,2023-12-31 19:00:00,43
8744,13,29,60,99,103,143,151,209,160,98,66,55,2023-12-31 20:00:00,43
8745,29,60,99,103,143,151,209,160,98,66,55,72,2023-12-31 21:00:00,43
8746,60,99,103,143,151,209,160,98,66,55,72,50,2023-12-31 22:00:00,43


In [22]:
# Raw hourly counts for location 43, for cross-checking the lag columns of the
# generated windows against the source series.
ts_data[ts_data["pickup_location_id"]==43].head(36)

Unnamed: 0,pickup_hour,pickup_location_id,rides
359160,2023-01-01 00:00:00,43,92
359161,2023-01-01 01:00:00,43,81
359162,2023-01-01 02:00:00,43,29
359163,2023-01-01 03:00:00,43,15
359164,2023-01-01 04:00:00,43,4
359165,2023-01-01 05:00:00,43,4
359166,2023-01-01 06:00:00,43,3
359167,2023-01-01 07:00:00,43,12
359168,2023-01-01 08:00:00,43,12
359169,2023-01-01 09:00:00,43,23


In [23]:
from src.data_utils import transform_ts_data_info_features_and_target

# Same single-location check, using the variant whose name has no `_loop`
# suffix, with a 4-hour window and a 1-hour step.
# NOTE(review): presumably this is the vectorized implementation - confirm in
# src/data_utils.
features, targets = transform_ts_data_info_features_and_target(ts_data[ts_data["pickup_location_id"]==43], window_size=4, step_size=1)


In [24]:
# Four lag columns (rides_t-4 ... rides_t-1) plus pickup_hour and
# pickup_location_id for location 43.
features

Unnamed: 0,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,92,81,29,15,2023-01-01 04:00:00,43
1,81,29,15,4,2023-01-01 05:00:00,43
2,29,15,4,4,2023-01-01 06:00:00,43
3,15,4,4,3,2023-01-01 07:00:00,43
4,4,4,3,12,2023-01-01 08:00:00,43
...,...,...,...,...,...,...
8751,209,160,98,66,2023-12-31 19:00:00,43
8752,160,98,66,55,2023-12-31 20:00:00,43
8753,98,66,55,72,2023-12-31 21:00:00,43
8754,66,55,72,50,2023-12-31 22:00:00,43


In [25]:
from src.data_utils import transform_ts_data_info_features_and_target

In [26]:
# NOTE(review): identical to the call in In [23] (same input slice, window_size
# and step_size); this cell looks like a leftover duplicate - confirm.
features, targets = transform_ts_data_info_features_and_target(ts_data[ts_data["pickup_location_id"]==43], window_size=4, step_size=1)


In [27]:
# Display the result of the repeated call; the output below matches the one
# shown for In [24].
features

Unnamed: 0,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,92,81,29,15,2023-01-01 04:00:00,43
1,81,29,15,4,2023-01-01 05:00:00,43
2,29,15,4,4,2023-01-01 06:00:00,43
3,15,4,4,3,2023-01-01 07:00:00,43
4,4,4,3,12,2023-01-01 08:00:00,43
...,...,...,...,...,...,...
8751,209,160,98,66,2023-12-31 19:00:00,43
8752,160,98,66,55,2023-12-31 20:00:00,43
8753,98,66,55,72,2023-12-31 21:00:00,43
8754,66,55,72,50,2023-12-31 22:00:00,43
