In [11]:
HOPSWORKS_PROJECT_NAME = "taxi_demand_ml"

In [12]:
import os
from dotenv import load_dotenv
from src.paths import PARENT_DIR

load_dotenv(PARENT_DIR / '.env')

HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']

In [13]:
from datetime import datetime
import pandas as pd
from src.data import load_raw_data

from_year = 2023
to_year = datetime.now().year
print(f'Downloading raw data from {from_year} to {to_year}')

rides = pd.DataFrame()
for year in range(from_year, to_year+1):

    rides_one_year = load_raw_data(year)

    rides = pd.concat([rides, rides_one_year])

Downloading raw data from 2023 to 2024
File 2023-01 was already in local storage
File 2023-02 was already in local storage
File 2023-03 was already in local storage
File 2023-04 was already in local storage
File 2023-05 was already in local storage
File 2023-06 was already in local storage
File 2023-07 was already in local storage
File 2023-08 was already in local storage
File 2023-09 was already in local storage
File 2023-10 was already in local storage
File 2023-11 was already in local storage
File 2023-12 was already in local storage
Downloading file 2024-01
2024-01 file is not available
Downloading file 2024-02
2024-02 file is not available
Downloading file 2024-03
2024-03 file is not available
Downloading file 2024-04
2024-04 file is not available
Downloading file 2024-05
2024-05 file is not available
Downloading file 2024-06
2024-06 file is not available
Downloading file 2024-07
2024-07 file is not available
Downloading file 2024-08
2024-08 file is not available
Downloading file 

In [14]:
print(f'{len(rides)=:,}')

len(rides)=38,309,496


In [15]:
rides.groupby('pickup_location_id').count()

Unnamed: 0_level_0,pickup_datetime
pickup_location_id,Unnamed: 1_level_1
1,5105
2,35
3,836
4,47518
5,387
...,...
261,198638
262,511237
263,726572
264,356722


In [16]:
from src.data import transform_raw_data_into_ts_data

ts_data = transform_raw_data_into_ts_data(rides)

100%|██████████| 263/263 [00:04<00:00, 63.33it/s]


In [17]:
ts_data.dtypes

pickup_hour           datetime64[ns]
rides                          int64
pickup_location_id             int64
dtype: object

In [18]:
# string to datetime
ts_data['pickup_hour'] = pd.to_datetime(ts_data['pickup_hour'], utc=True)

# add column with Unix epoch milliseconds
ts_data['pickup_ts'] = ts_data['pickup_hour'].apply(lambda x: x.timestamp()) // 10**6



In [19]:
ts_data.dtypes

pickup_hour           datetime64[ns, UTC]
rides                               int64
pickup_location_id                  int64
pickup_ts                         float64
dtype: object

**Login and get feature store**

In [20]:
import hopsworks

project = hopsworks.login(
    project=HOPSWORKS_PROJECT_NAME,
    api_key_value = HOPSWORKS_API_KEY
)

feature_store = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/423065
Connected. Call `.close()` to terminate connection gracefully.


In [21]:
FEATURE_GROUP_NAME = 'time_series_hourly_feature_group'
FEATURE_GROUP_VERSION = 2

In [22]:
feature_group = feature_store.get_or_create_feature_group(
    name=FEATURE_GROUP_NAME,
    version=FEATURE_GROUP_VERSION,
    description="time_series data at hourly frequency",
    primary_key= ['pickup_location_id','pickup_ts'],
    event_time='pickup_ts',
)

In [24]:
ts_data['pickup_ts'] = ts_data['pickup_ts'].astype('int64')
feature_group.insert(ts_data, write_options={"wait_for_job": False})

Uploading Dataframe: 0.00% |          | Rows 0/2303880 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: time_series_hourly_feature_group_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/423065/jobs/named/time_series_hourly_feature_group_2_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x21d2d2ea280>, None)

In [None]:
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id,pickup_ts
0,2023-01-01 00:00:00+00:00,19,4,1672531200000
1,2023-01-01 01:00:00+00:00,28,4,1672534800000
2,2023-01-01 02:00:00+00:00,43,4,1672538400000
3,2023-01-01 03:00:00+00:00,33,4,1672542000000
4,2023-01-01 04:00:00+00:00,12,4,1672545600000
...,...,...,...,...
2100187,2023-11-30 19:00:00+00:00,0,27,1701370800000
2100188,2023-11-30 20:00:00+00:00,0,27,1701374400000
2100189,2023-11-30 21:00:00+00:00,0,27,1701378000000
2100190,2023-11-30 22:00:00+00:00,0,27,1701381600000
