In [42]:
# Name of the Hopsworks project; passed as `project=` to `hopsworks.login` further down.
HOPSWORKS_PROJECT_NAME = "taxi_demand_ml"

In [43]:
import os
from dotenv import load_dotenv
from src.paths import PARENT_DIR

# Load environment variables from the `.env` file located in PARENT_DIR.
load_dotenv(PARENT_DIR / '.env')

# Read the API key from the environment rather than hard-coding it.
# Indexing os.environ raises KeyError if the variable is not set.
HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']

In [44]:
from datetime import datetime
import pandas as pd
from src.data import load_raw_data

# Year range to download: from 2023 up to and including the current year.
from_year = 2023
to_year = datetime.now().year
print(f'Downloading raw data from {from_year} to {to_year}')

# Load every year first and concatenate once. Growing `rides` with pd.concat
# inside the loop re-copies all previously accumulated rows on each iteration,
# and the first iteration concatenates onto an empty DataFrame.
rides_per_year = [load_raw_data(year) for year in range(from_year, to_year + 1)]

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when the year range is empty (this matches what the loop version produced).
rides = pd.concat(rides_per_year) if rides_per_year else pd.DataFrame()

# Drop the extra references so the per-year frames do not stay alive in the kernel.
del rides_per_year

Downloading raw data from 2023 to 2024
File 2023-01 was already in local storage
File 2023-02 was already in local storage
File 2023-03 was already in local storage
File 2023-04 was already in local storage
File 2023-05 was already in local storage
File 2023-06 was already in local storage
File 2023-07 was already in local storage
File 2023-08 was already in local storage
File 2023-09 was already in local storage
File 2023-10 was already in local storage
File 2023-11 was already in local storage
File 2023-12 was already in local storage
Downloading file 2024-01
2024-01 file is not available
Downloading file 2024-02
2024-02 file is not available
Downloading file 2024-03
2024-03 file is not available
Downloading file 2024-04
2024-04 file is not available
Downloading file 2024-05
2024-05 file is not available
Downloading file 2024-06
2024-06 file is not available
Downloading file 2024-07
2024-07 file is not available
Downloading file 2024-08
2024-08 file is not available
Downloading file 

In [45]:
# Report how many rows were loaded; `=` echoes the expression and `:,` adds thousands separators.
print(f'{len(rides)=:,}')

len(rides)=38,309,496


In [46]:
# Count non-null values of every remaining column per pickup location.
rides.groupby('pickup_location_id').count()

Unnamed: 0_level_0,pickup_datetime
pickup_location_id,Unnamed: 1_level_1
1,5105
2,35
3,836
4,47518
5,387
...,...
261,198638
262,511237
263,726572
264,356722


In [59]:
from src.data import transform_raw_data_into_ts_data

# Convert the raw rides into time-series form using the project helper.
# Later cells read `pickup_hour` and `pickup_location_id` from the result.
ts_data = transform_raw_data_into_ts_data(rides)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
100%|██████████| 263/263 [00:04<00:00, 61.43it/s]


In [61]:
# Spot-check a handful of locations: number of hourly entries recorded for each.
(
    ts_data
    .loc[ts_data['pickup_location_id'].isin([27, 84, 103, 104, 110, 111, 199, 251])]
    .groupby('pickup_location_id')['pickup_hour']
    .count()
)

pickup_location_id
27     8760
84     8760
110    8760
111    8760
199    8760
251    8760
Name: pickup_hour, dtype: int64

In [62]:
# Parse `pickup_hour` into timezone-aware (UTC) datetimes.
ts_data['pickup_hour'] = pd.to_datetime(ts_data['pickup_hour'], utc=True)

# Add a column with Unix epoch milliseconds.
# `Timestamp.timestamp()` returns *seconds*, so floor-dividing it by 10**6
# collapsed every hour of a year into a few dozen values instead of producing
# milliseconds. Subtracting the epoch and dividing by a 1 ms Timedelta gives
# exact integer milliseconds regardless of the column's datetime resolution.
ts_data['pickup_ts'] = (
    ts_data['pickup_hour'] - pd.Timestamp('1970-01-01', tz='UTC')
) // pd.Timedelta(milliseconds=1)



In [63]:
# Re-run the spot-check after the datetime conversion: hourly entries per selected location.
(
    ts_data
    .loc[ts_data['pickup_location_id'].isin([27, 84, 103, 104, 110, 111, 199, 251])]
    .groupby('pickup_location_id')['pickup_hour']
    .count()
)

pickup_location_id
27     8760
84     8760
110    8760
111    8760
199    8760
251    8760
Name: pickup_hour, dtype: int64

**Login and get feature store**

In [50]:
import hopsworks

# Log in to Hopsworks using the project name and API key defined in earlier cells.
project = hopsworks.login(
    project=HOPSWORKS_PROJECT_NAME,
    api_key_value = HOPSWORKS_API_KEY
)

# Handle to the project's feature store; used below to get or create the feature group.
feature_store = project.get_feature_store()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/423065
Connected. Call `.close()` to terminate connection gracefully.


In [51]:
# Name and version identifying the feature group that receives the hourly time-series data.
FEATURE_GROUP_NAME = 'time_series_hourly_feature_group'
FEATURE_GROUP_VERSION = 2

In [52]:
# Look up the feature group by name and version (the method name indicates it
# is created when missing). Rows are keyed by (pickup_location_id, pickup_ts),
# and pickup_ts is also declared as the event-time column.
feature_group = feature_store.get_or_create_feature_group(
    name=FEATURE_GROUP_NAME,
    version=FEATURE_GROUP_VERSION,
    description="time_series data at hourly frequency",
    primary_key= ['pickup_location_id','pickup_ts'],
    event_time='pickup_ts',
)

In [53]:
# Force pickup_ts to int64 before uploading.
# NOTE(review): presumably the feature group expects an integer key/event-time column — confirm.
ts_data['pickup_ts'] = ts_data['pickup_ts'].astype('int64')
# Upload the frame to the feature group.
# NOTE(review): wait_for_job=False presumably returns without waiting for the
# materialization job to finish — confirm against the Hopsworks docs.
feature_group.insert(ts_data, write_options={"wait_for_job": False})

Uploading Dataframe: 0.00% |          | Rows 0/2303880 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: time_series_hourly_feature_group_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/423065/jobs/named/time_series_hourly_feature_group_2_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x21d03341670>, None)

In [54]:
# Number of hourly entries per pickup location, shown as a one-column DataFrame.
ts_data.groupby('pickup_location_id')['pickup_hour'].count().to_frame()

Unnamed: 0_level_0,pickup_hour
pickup_location_id,Unnamed: 1_level_1
1,8760
2,8760
3,8760
4,8760
5,8760
...,...
261,8760
262,8760
263,8760
264,8760


In [55]:
# Sanity check: number of distinct hours and number of distinct pickup locations.
print(len(ts_data.pickup_hour.unique()))
print(len(ts_data.pickup_location_id.unique()))

8760
263


In [58]:
# List the distinct pickup location ids in ascending order, which makes gaps in the id sequence visible.
sorted(ts_data.pickup_location_id.unique())

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187