In [2]:
# Name of the Hopsworks project; passed as `project=` to hopsworks.login() later in the notebook.
HOPSWORKS_PROJECT_NAME = 'chicago_taxi_demand'

In [3]:
import os
from dotenv import load_dotenv
from src.paths import PARENT_DIR

# Load the variables declared in the .env file under PARENT_DIR, then read the
# Hopsworks API key from the process environment. The key is never hard-coded
# in the notebook; if it is missing, the lookup raises KeyError right here.
dotenv_path = PARENT_DIR / '.env'
load_dotenv(dotenv_path)
HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']

In [4]:
# NOTE(review): `datetime` is not referenced in the cells shown here — confirm it is still needed.
from datetime import datetime
import pandas as pd
from src.data import load_raw_data

# Load the raw ride records through the project helper and display the frame.
# NOTE(review): the cell output suggests the helper downloads a parquet file
# into data/raw before loading it — confirm in src.data.
rides = load_raw_data()
rides

Downloading...
From (original): https://drive.google.com/uc?id=1dj6DhNhELjnxjziXIGlo4FhPneXauFV8
From (redirected): https://drive.google.com/uc?id=1dj6DhNhELjnxjziXIGlo4FhPneXauFV8&confirm=t&uuid=bfe8b302-7ad6-4f06-8bed-8bc1f26af956
To: C:\Users\joral_08cedew\chicago_taxi_demand_predictor\data\raw\taxi_trips.parquet
100%|██████████| 437M/437M [00:11<00:00, 36.8MB/s] 


Taxi data downloaded: C:\Users\joral_08cedew\chicago_taxi_demand_predictor\data\raw\taxi_trips.parquet
Date range: 2024-01-01 00:00:00 to 2025-02-01 00:00:00


Unnamed: 0,pickup_datetime,pickup_latitude,pickup_longitude
0,2024-01-19 17:00:00,41.979071,-87.903040
1,2024-01-28 14:30:00,41.944227,-87.655998
2,2024-01-05 09:00:00,41.944227,-87.655998
3,2024-01-22 08:45:00,41.980264,-87.913625
4,2024-01-18 19:15:00,41.880994,-87.632746
...,...,...,...
6905283,2024-12-31 11:30:00,41.899602,-87.633308
6905284,2024-12-31 15:15:00,41.954028,-87.763399
6905285,2024-12-31 10:45:00,,
6905286,2024-12-31 14:00:00,41.979071,-87.903040


In [5]:
from src.data import merge_geo_and_ts_data
# Run the rides through the geo-merge helper and display the result; in the
# captured output the returned frame carries an extra `pickup_location` column.
# NOTE(review): `rides` is rebound to the helper's own output, so re-running
# this cell alone feeds already-merged data back in — confirm the helper
# tolerates that, or re-run from the load cell.
rides = merge_geo_and_ts_data(rides)
rides

Downloading...
From: https://drive.google.com/uc?id=1AqIi-XKEuLosLZbMYTGTRbWEOWotz_pZ
To: C:\Users\joral_08cedew\chicago_taxi_demand_predictor\data\raw\chicago_geo_data.parquet
100%|██████████| 1.30M/1.30M [00:00<00:00, 3.51MB/s]


Geo data downloaded: C:\Users\joral_08cedew\chicago_taxi_demand_predictor\data\raw\chicago_geo_data.parquet


Unnamed: 0,pickup_datetime,pickup_latitude,pickup_longitude,pickup_location
0,2024-01-19 17:00:00,41.979071,-87.903040,O'Hare
1,2024-01-28 14:30:00,41.944227,-87.655998,Lake View
2,2024-01-05 09:00:00,41.944227,-87.655998,Lake View
3,2024-01-22 08:45:00,41.980264,-87.913625,O'Hare
4,2024-01-18 19:15:00,41.880994,-87.632746,Loop
...,...,...,...,...
6905283,2024-12-31 11:30:00,41.899602,-87.633308,River North
6905284,2024-12-31 15:15:00,41.954028,-87.763399,Portage Park
6905285,2024-12-31 10:45:00,,,Outside Chicago
6905286,2024-12-31 14:00:00,41.979071,-87.903040,O'Hare


In [6]:
from src.data import transform_raw_data_into_ts_data

# Turn the per-ride records into the time-series frame and display it.
# NOTE(review): judging by the captured output, the result has a `rides` count
# per `pickup_hour` and `pickup_location` — confirm the exact schema in src.data.
ts_data = transform_raw_data_into_ts_data(rides)
ts_data

100%|██████████| 95/95 [00:02<00:00, 37.59it/s]


Unnamed: 0,pickup_hour,rides,pickup_longitude,pickup_latitude,pickup_location
0,2024-01-01 00:00:00,2,-87.721559,41.968069,Albany Park
1,2024-01-01 01:00:00,0,-87.721559,41.968069,Albany Park
2,2024-01-01 02:00:00,0,-87.721559,41.968069,Albany Park
3,2024-01-01 03:00:00,3,-87.721559,41.968069,Albany Park
4,2024-01-01 04:00:00,0,-87.721559,41.968069,Albany Park
...,...,...,...,...,...
905250,2025-01-31 20:00:00,0,-87.671446,41.979796,Andersonville
905251,2025-01-31 21:00:00,0,-87.671446,41.979796,Andersonville
905252,2025-01-31 22:00:00,0,-87.671446,41.979796,Andersonville
905253,2025-01-31 23:00:00,0,-87.671446,41.979796,Andersonville


In [7]:
# Normalise the hourly bucket to timezone-aware UTC datetimes. pd.to_datetime
# accepts strings as well as existing datetime values.
ts_data['pickup_hour'] = pd.to_datetime(ts_data['pickup_hour'], utc=True)

# Add the same instant as Unix epoch milliseconds.
# The offset from the epoch is floor-divided by a 1 ms Timedelta rather than
# casting to int64 and dividing by 10**6: the cast is only correct when the
# column has nanosecond resolution, whereas this form yields milliseconds for
# any datetime resolution (ns, us, ms, s).
unix_epoch = pd.Timestamp('1970-01-01', tz='UTC')
ts_data['pickup_ts'] = (ts_data['pickup_hour'] - unix_epoch) // pd.Timedelta(milliseconds=1)

In [8]:
# Print a summary of ts_data: column dtypes, non-null counts and memory usage.
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905255 entries, 0 to 905254
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype              
---  ------            --------------   -----              
 0   pickup_hour       905255 non-null  datetime64[ns, UTC]
 1   rides             905255 non-null  int64              
 2   pickup_longitude  905255 non-null  float64            
 3   pickup_latitude   905255 non-null  float64            
 4   pickup_location   905255 non-null  object             
 5   pickup_ts         905255 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(2), int64(2), object(1)
memory usage: 41.4+ MB


In [9]:
import hopsworks

In [10]:
# Authenticate against Hopsworks with the API key read from the environment
# and keep the returned project handle for the feature-store calls that follow.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY, project=HOPSWORKS_PROJECT_NAME)

2025-03-30 15:30:07,544 INFO: Initializing external client
2025-03-30 15:30:07,545 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-30 15:30:08,805 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1220769


In [11]:
# Handle to the project's feature store; used to get or create the feature group.
feature_store = project.get_feature_store()

In [12]:
# Display the feature store object to confirm a handle was obtained.
feature_store

<hsfs.feature_store.FeatureStore at 0x1e760733d10>

In [13]:
# Name and version identifying the feature group; both are passed to
# feature_store.get_or_create_feature_group().
FEATURE_GROUP_NAME = 'time_series_hourly_feature_group'
FEATURE_GROUP_VERSION = 1

In [14]:
# Definition of the hourly time-series feature group: rows are keyed by
# (pickup_location, pickup_hour), and pickup_hour doubles as the event time.
feature_group_spec = dict(
    name=FEATURE_GROUP_NAME,
    version=FEATURE_GROUP_VERSION,
    description="Time-series data at hourly frequency",
    primary_key=['pickup_location', 'pickup_hour'],
    event_time='pickup_hour',
)

# Reuse the feature group if it already exists, otherwise define it.
feature_group = feature_store.get_or_create_feature_group(**feature_group_spec)

In [15]:
# Workaround for an error raised when uploading data to the feature group.
# NOTE(review): presumably hsfs.core.kafka_engine ends up without these
# confluent_kafka names bound — confirm against the installed hsfs version and
# drop this cell once it is no longer needed.

from confluent_kafka import Producer, Consumer, TopicPartition
import hsfs.core.kafka_engine

# Explicitly bind the confluent_kafka classes onto the hsfs Kafka engine module.
for _attr, _cls in (
    ('Producer', Producer),
    ('Consumer', Consumer),
    ('TopicPartition', TopicPartition),
):
    setattr(hsfs.core.kafka_engine, _attr, _cls)

In [16]:
# Upload ts_data to the feature group. With wait_for_job=False the call is
# asked not to wait for the materialization job it launches; the captured
# output shows the job being started and a (Job, None) tuple returned.
write_options = {"wait_for_job": False}
feature_group.insert(ts_data, write_options=write_options)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220769/fs/1208402/fg/1423019


Uploading Dataframe: 100.00% |██████████| Rows 905255/905255 | Elapsed Time: 00:52 | Remaining Time: 00:00


Launching job: time_series_hourly_feature_group_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220769/jobs/named/time_series_hourly_feature_group_1_offline_fg_materialization/executions


(Job('time_series_hourly_feature_group_1_offline_fg_materialization', 'SPARK'),
 None)