In [1]:
from datetime import datetime, timedelta

import src.config as config
import pandas as pd


In [2]:
# Current UTC time truncated to the top of the hour.
# Timestamp.now(tz='UTC') replaces the deprecated datetime.utcnow(); tz_localize(None)
# drops the timezone so the result stays a naive Timestamp, which is what
# datetime.utcnow() produced. The lowercase 'h' alias replaces the deprecated 'H'.
current_date = pd.Timestamp.now(tz='UTC').tz_localize(None).floor('h')
print(f"{current_date=}")

# Fetch raw data for the 28 days ending at the current hour. The window is
# deliberately wide to give the data pipeline additional redundancy.
fetch_data_to = current_date
fetch_data_from = current_date - timedelta(days=28)

current_date=Timestamp('2023-05-08 06:00:00')


In [3]:
from src.data import loadRawData

def fetchBatchRawData(from_date: datetime,
                      to_date: datetime) -> pd.DataFrame:
    """
    Simulate production data by sampling historical data from a year ago.

    The requested window [from_date, to_date) is moved back by 52 weeks, the
    monthly raw files touched by that historical window are loaded with
    `loadRawData`, rows outside the window are dropped, and the pickup
    timestamps are then moved forward by 52 weeks so they look recent.

    Args:
        from_date: inclusive start of the requested window.
        to_date: exclusive end of the requested window.

    Returns:
        Rides whose (shifted) `pickup_datetime` lies in [from_date, to_date),
        sorted by `pickup_loc_id` and then `pickup_datetime`.
    """

    # 364 days is a whole number of weeks, so every shifted timestamp keeps
    # its day of the week
    shift = timedelta(days=7*52)
    from_date_ = from_date - shift
    to_date_ = to_date - shift

    # One raw file per calendar month touched by the historical window. The
    # starting month is always included, so the loader runs at least once and
    # each month is loaded exactly once (even when the window sits inside a
    # single month or spans more than two).
    year, month = from_date_.year, from_date_.month
    months = [(year, month)]
    while (year, month) < (to_date_.year, to_date_.month):
        year, month = (year + 1, 1) if month == 12 else (year, month + 1)
        months.append((year, month))

    rides = pd.concat([loadRawData(year=y, months=m) for y, m in months])

    # Apply BOTH bounds to the combined data. Filtering each file by only one
    # bound leaks out-of-window rows (and duplicates in-window rows) whenever
    # the start and end of the window fall in the same month.
    in_window = (rides.pickup_datetime >= from_date_) & (rides.pickup_datetime < to_date_)
    rides = rides[in_window].copy()

    # shift data to mimic recent data
    rides['pickup_datetime'] += shift

    return rides.sort_values(by=['pickup_loc_id', 'pickup_datetime'])
    

In [4]:
# Pull the simulated "recent" rides for the fetch window defined above
rides = fetchBatchRawData(from_date=fetch_data_from, to_date=fetch_data_to)

File for 2022_04 found locally...
File for 2022_05 found locally...


In [5]:
from src.data import processRawData

# Run the project's raw-data processing step on the fetched rides.
# NOTE(review): presumably this turns the rides into an hourly time series per
# pickup location (the feature group below is described as hourly time-series
# data) -- confirm against src.data.processRawData.
ts_data = processRawData(rides)

100%|██████████| 255/255 [00:00<00:00, 517.61it/s]


In [None]:
import hopsworks

# Log in to the Hopsworks project named in the config
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY,
)

# Handle to the project's feature store
feature_store = project.get_feature_store()

# Get (or create) the feature group that holds the hourly time-series data.
# NOTE(review): the primary key uses 'pickup_location_id', while the raw rides
# are sorted by 'pickup_loc_id' -- confirm the processed data really exposes
# 'pickup_location_id' and 'pickup_hour' columns.
feature_group = feature_store.get_or_create_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
    description="Time-series data at hourly frequency",
    primary_key=['pickup_location_id', 'pickup_hour'],
    event_time='pickup_hour',
)
