In [1]:
import hopsworks

# Log in to Hopsworks; the returned project handle is the entry point for
# everything else in this notebook.
project = hopsworks.login()

# Handles reused by later cells: the project's feature store and its Kafka API.
fs = project.get_feature_store()
kafka_api = project.get_kafka_api()

# Get orderbook feature group
# One feature group for the per-event order book and one per aggregation level.
# The second argument (1) is presumably the feature group version - TODO confirm.
orderbook_fg = fs.get_feature_group("orderbook", 1)
orderbook_agg_1m_fg = fs.get_feature_group("orderbook_agg_1m", 1)
orderbook_agg_2m_fg = fs.get_feature_group("orderbook_agg_2m", 1)
orderbook_agg_3m_fg = fs.get_feature_group("orderbook_agg_3m", 1)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://hopsworks0.logicalclocks.com/p/121
Connected. Call `.close()` to terminate connection gracefully.


# Streaming feature pipeline

### Create QuixStreams Application

In [2]:
from quixstreams import Application, State

In [3]:
# callbacks

def on_consumer_error(*args, **kwargs):
    """Report a consumer error handed to this callback.

    Prints a banner, then the positional and keyword arguments it received
    (each only when non-empty). Returns None.

    NOTE(review): the Application may interpret the callback's return value
    (e.g. to decide whether the error is ignored) - confirm that returning
    None is the intended behavior.
    """
    print("ON CONSUMER ERROR")
    # *args is always a tuple and **kwargs always a dict (never None), so test
    # for emptiness instead of `is not None` to avoid printing "()" / "{}".
    if args:
        print(args)
    if kwargs:
        print(kwargs)
    
def on_processing_error(*args, **kwargs):
    """Report a processing error handed to this callback.

    Prints a banner, then the positional and keyword arguments it received
    (each only when non-empty). Returns None.

    NOTE(review): the Application may interpret the callback's return value
    (e.g. to decide whether the error is ignored) - confirm that returning
    None is the intended behavior.
    """
    print("ON PROCESSING ERROR")
    # *args is always a tuple and **kwargs always a dict (never None), so test
    # for emptiness instead of `is not None` to avoid printing "()" / "{}".
    if args:
        print(args)
    if kwargs:
        print(kwargs)
    
def on_producer_error(*args, **kwargs):
    """Report a producer error handed to this callback.

    Prints a banner, then the positional and keyword arguments it received
    (each only when non-empty). Returns None.

    NOTE(review): the Application may interpret the callback's return value
    (e.g. to decide whether the error is ignored) - confirm that returning
    None is the intended behavior.
    """
    print("ON PRODUCER ERROR")
    # *args is always a tuple and **kwargs always a dict (never None), so test
    # for emptiness instead of `is not None` to avoid printing "()" / "{}".
    if args:
        print(args)
    if kwargs:
        print(kwargs)

In [4]:
# Get default Kafka configuration

kafka_config = kafka_api.get_default_config()

# Work on a copy: assigning `consumer_config = kafka_config` would alias the
# same dict, so the consumer-only overrides below would silently leak into
# `kafka_config` as well.
consumer_config = dict(kafka_config)
consumer_config['default.topic.config'] = {'auto.offset.reset': 'earliest'}
consumer_config['partition.assignment.strategy'] = "cooperative-sticky"

# Producer settings are obtained from the hsfs engine for the feature store
# that owns `orderbook_fg`.
# NOTE(review): `_get_kafka_config` is a private hsfs method - it may change
# between hsfs releases.
from hsfs import engine
producer_config = engine.get_instance()._get_kafka_config(
    orderbook_fg.feature_store_id, {}
)

In [5]:
# Create QuixStreams Application

app = Application(
    # connection / topic management
    broker_address=kafka_config["bootstrap.servers"],
    auto_create_topics=False,
    use_changelog_topics=False,
    # loglevel="DEBUG",  # enable for verbose logging

    # consumer behaviour
    consumer_group="my-group-id",
    auto_offset_reset="earliest",
    auto_commit_enable=False,

    # extra Kafka client settings prepared in the previous cell
    consumer_extra_config=consumer_config,
    producer_extra_config=producer_config,

    # error callbacks
    on_consumer_error=on_consumer_error,
    on_processing_error=on_processing_error,
    on_producer_error=on_producer_error,
)

# Topic the streaming dataframe reads from; values are handled as JSON
input_topic = app.topic(
    name="btc_usd_orderbook",
    value_serializer="json",
)

In [6]:
# Best-effort reset of the application's state before (re)building the
# pipeline. Catch `Exception` rather than using a bare `except:` so that
# KeyboardInterrupt / SystemExit are not swallowed.
try:
    app.clear_state()
except Exception:
    print("Nothing to clear")

Nothing to clear


### Using Streaming DataFrame with hsfs multi_part_insert

In [7]:
# on-demand transformation

def to_upper(value: str):
    """Return the upper-cased string form of ``value``.

    ``value`` is converted with ``str()`` first, so non-string inputs are
    accepted as well.
    """
    text = str(value)
    return text.upper()

In [8]:
import pandas as pd

def multi_part_insert(event: dict, feature_group):
    """Insert a single event into ``feature_group`` and print a confirmation.

    The event is converted to a one-row DataFrame with ``to_pandas`` and
    written through the writer yielded by ``feature_group.multi_part_insert()``.
    Returns None.
    """
    frame = to_pandas(event)
    with feature_group.multi_part_insert() as writer:
        writer.insert(frame)

    # Destination label: "<online topic name>_<feature group name>".
    # (The spelling of the message below is kept exactly as emitted so far.)
    destination = feature_group._online_topic_name + "_" + feature_group.name
    print(">> Event delived successfully to " + destination)
    
def to_pandas(event: dict):
    """Turn one event dict into a single-row DataFrame.

    Every column except "ticker" and "timestamp" is cast to float32.
    Raises ValueError when either of those two columns is missing.
    """
    frame = pd.DataFrame(event, index=[0])

    # Everything that is not an identifier column gets parsed to float32,
    # e.g. ['bid', 'bid_size', 'ask', 'ask_size', 'spread']
    numeric_cols = frame.columns.tolist()
    for label in ("ticker", "timestamp"):
        numeric_cols.remove(label)

    frame[numeric_cols] = frame[numeric_cols].astype('float32')
    return frame

In [9]:
from datetime import datetime, timezone, timedelta

def _apply_level_update(levels, best_price, price, size, pick_best, fallback):
    """Apply one price-level change to one side of the book; return the new best price.

    ``levels`` maps ``str(price)`` to size and is mutated in place.
    ``pick_best`` is ``min`` for the ask side and ``max`` for the bid side.
    ``fallback`` is the sentinel price restored (with size 0.0) if the side
    becomes empty.
    """
    key = str(price)

    if size != 0.0:
        # add or overwrite the level; it may become the new best price
        levels[key] = size
        return pick_best(price, best_price)

    # size zero means the level has to be removed
    if key not in levels:
        # don't need to add price with size zero
        return best_price
    del levels[key]

    if pick_best(price, best_price) != price:
        # the removed level was not the best one; nothing else to do
        return best_price

    if not levels:
        # keep the side non-empty so the best-price lookup never fails
        levels[str(fallback)] = 0.0
        return fallback

    # Keys are price *strings*: compare them numerically, since lexicographic
    # ordering is wrong for prices (e.g. "100000.0" < "99999.5").
    return float(pick_best(levels, key=float))


def compute_price_and_size(event: dict, state: State):
    """Apply the changes of one order book event and return the top of the book.

    The book is kept in ``state`` under the keys "bids", "asks" (dicts mapping
    ``str(price)`` to size), "bid_price" and "ask_price" (current best prices).

    Args:
        event: dict with "product_id", "time" (formatted as
            "%Y-%m-%dT%H:%M:%S.%fZ") and "changes", an iterable of
            (operation, price, size) entries where operation is "buy" or
            "sell". Entries with any other operation are ignored.
        state: object providing ``get(key, default)`` and ``set(key, value)``.

    Returns:
        dict with "ticker", "timestamp" (UTC-aware datetime), "bid",
        "bid_size", "ask", "ask_size" and "spread" (ask minus bid).
    """
    # Sentinel levels (size 0.0) used until a real quote is seen on a side.
    default_ask = 9999999.9
    default_bid = 0.0

    # load from state
    ask_price = state.get("ask_price", default_ask)
    bid_price = state.get("bid_price", default_bid)
    bids = state.get("bids", {str(bid_price): 0.0})
    asks = state.get("asks", {str(ask_price): 0.0})

    asks_touched = False
    bids_touched = False

    # process changes
    for update in event["changes"]:
        operation = str(update[0])
        price = float(update[1])
        size = float(update[2])

        if operation == "sell":
            ask_price = _apply_level_update(asks, ask_price, price, size, min, default_ask)
            asks_touched = True
        elif operation == "buy":
            bid_price = _apply_level_update(bids, bid_price, price, size, max, default_bid)
            bids_touched = True

    # save to the state - once per event and side instead of once per change
    if asks_touched:
        state.set("asks", asks)
        state.set("ask_price", ask_price)
    if bids_touched:
        state.set("bids", bids)
        state.set("bid_price", bid_price)

    return {
        "ticker": event["product_id"],
        "timestamp": datetime.strptime(
            event["time"], "%Y-%m-%dT%H:%M:%S.%fZ"
        ).replace(tzinfo=timezone.utc),
        "bid": bid_price,
        "bid_size": bids[str(bid_price)],
        "ask": ask_price,
        "ask_size": asks[str(ask_price)],
        "spread": ask_price - bid_price,
    }

# Initializer and Reducer for first aggregation

def initializer_agg(event: dict) -> dict:
    """Build the initial window aggregate from the first event of a window.

    Min, max, mean and sum of each side start at the event's own price, the
    counts start at 1 and the size totals start at the event's size.
    """
    bid, ask = event["bid"], event["ask"]

    aggregate = {"ticker": event["ticker"], "timestamp": event["timestamp"]}
    # bid side
    aggregate.update(
        bid_min=bid,
        bid_max=bid,
        bid_mean=bid,
        bid_sum=bid,
        bid_count=1,
        bid_size_count=event["bid_size"],
    )
    # ask side
    aggregate.update(
        ask_min=ask,
        ask_max=ask,
        ask_mean=ask,
        ask_sum=ask,
        ask_count=1,
        ask_size_count=event["ask_size"],
    )
    aggregate["spread"] = event["spread"]
    return aggregate

def reducer_agg(aggregated: dict, event: dict) -> dict:
    """Fold one more event into the running window aggregate.

    Ticker and timestamp are carried over from ``aggregated``; min/max, sum,
    count, mean and size totals are updated with the event's bid/ask values;
    "spread" is taken from the event.
    """
    bid, ask = event["bid"], event["ask"]

    # running totals including the new event
    bid_total = aggregated["bid_sum"] + bid
    bid_seen = aggregated["bid_count"] + 1
    ask_total = aggregated["ask_sum"] + ask
    ask_seen = aggregated["ask_count"] + 1

    return {
        "ticker": aggregated["ticker"],
        "timestamp": aggregated["timestamp"],
        "bid_min": min(aggregated["bid_min"], bid),
        "bid_max": max(aggregated["bid_max"], bid),
        "bid_mean": float(bid_total) / bid_seen,
        "bid_sum": bid_total,
        "bid_count": bid_seen,
        "bid_size_count": aggregated["bid_size_count"] + event["bid_size"],
        "ask_min": min(aggregated["ask_min"], ask),
        "ask_max": max(aggregated["ask_max"], ask),
        "ask_mean": float(ask_total) / ask_seen,
        "ask_sum": ask_total,
        "ask_count": ask_seen,
        "ask_size_count": aggregated["ask_size_count"] + event["ask_size"],
        "spread": event["spread"],
    }

In [10]:

# Initializer and Reducer for Accumulative Aggregations

def initializer_acc_agg(event: dict) -> dict:
    """Seed an accumulated aggregate from the first lower-level aggregate.

    ``event`` is itself an aggregate (it carries ``*_min``, ``*_max``,
    ``*_mean``, ``*_sum``, ``*_size_count`` and normally ``*_count``).

    The counts are carried over from the incoming aggregate instead of being
    reset to 1: ``*_sum`` is the sum over all underlying events, so the mean
    is only correct when ``*_count`` is the number of those events too.
    If an incoming aggregate has no count, 1 is assumed.
    """
    return {
        "ticker": event["ticker"],
        "timestamp": event["timestamp"],
        "bid_min": event["bid_min"],
        "bid_max": event["bid_max"],
        "bid_mean": event["bid_mean"],
        "bid_sum": event["bid_sum"],
        "bid_count": event.get("bid_count", 1),
        "bid_size_count": event["bid_size_count"],
        "ask_min": event["ask_min"],
        "ask_max": event["ask_max"],
        "ask_mean": event["ask_mean"],
        "ask_sum": event["ask_sum"],
        "ask_count": event.get("ask_count", 1),
        "ask_size_count": event["ask_size_count"],
        "spread": event["spread"],
    }

def reducer_acc_agg(aggregated: dict, event: dict) -> dict:
    """Merge one more lower-level aggregate into the accumulated aggregate.

    Sums and counts of both aggregates are added, and the mean is recomputed
    as total sum / total count. Previously the count only grew by 1 per merged
    aggregate while the sum grew by the sum of all its events, which inflated
    the mean. Ticker and timestamp are carried over from ``aggregated``;
    "spread" is taken from ``event``.
    """
    bid_sum = aggregated["bid_sum"] + event["bid_sum"]
    bid_count = aggregated["bid_count"] + event.get("bid_count", 1)
    ask_sum = aggregated["ask_sum"] + event["ask_sum"]
    ask_count = aggregated["ask_count"] + event.get("ask_count", 1)

    return {
        "ticker": aggregated["ticker"],
        "timestamp": aggregated["timestamp"],
        "bid_min": min(aggregated['bid_min'], event['bid_min']),
        "bid_max": max(aggregated['bid_max'], event['bid_max']),
        "bid_mean": float(bid_sum) / bid_count,
        "bid_sum": bid_sum,
        "bid_count": bid_count,
        "bid_size_count": aggregated["bid_size_count"] + event["bid_size_count"],
        "ask_min": min(aggregated['ask_min'], event['ask_min']),
        "ask_max": max(aggregated['ask_max'], event['ask_max']),
        "ask_mean": float(ask_sum) / ask_count,
        "ask_sum": ask_sum,
        "ask_count": ask_count,
        "ask_size_count": aggregated["ask_size_count"] + event["ask_size_count"],
        "spread": event["spread"],
    }

In [11]:
# Build the streaming dataframe on top of the input topic. Each step below
# rebinds `sdf`, so the statements in this cell are order-dependent.
sdf = app.dataframe(input_topic)

# filter "l2update" events
# (only messages whose "type" field equals "l2update" are kept)
sdf = sdf[sdf["type"] == "l2update"]

# compute final price and size from all changes
# stateful=True: compute_price_and_size takes a `state` argument in which it
# keeps the order book between events
sdf = sdf.apply(compute_price_and_size, stateful=True)

# apply on-demand transformation
# (upper-cases the "ticker" value via to_upper)
sdf["ticker"] = sdf["ticker"].apply(to_upper)

# insert all events to orderbook feature group
# NOTE(review): multi_part_insert returns None; this relies on `update`
# passing the event on to the next step regardless - confirm.
sdf = sdf.update(lambda event: multi_part_insert(event, orderbook_fg))

#
#  1st Aggregation - 1 Minute
#

# convert datetime to str - for serialization of the state
# (compute_price_and_size emits "timestamp" as a datetime object)
sdf["timestamp"] = sdf["timestamp"].apply(lambda t: t.isoformat())

# perform window 1-minute aggregations
# (the window is shortened to 5 seconds here, see the inline note)
sdf = (
    # Define a tumbling window
    sdf.tumbling_window(timedelta(seconds=5))  # set 5 seconds for demo and debugging

    # Create a "reduce" aggregation with "reducer" and "initializer" functions
    # initializer_agg seeds the aggregate from an event, reducer_agg folds
    # further events into it
    .reduce(reducer=reducer_agg, initializer=initializer_agg)

    # Emit results only for closed windows
    .final()

    # extract value
    # (keep only the aggregate stored under the result's "value" key)
    .apply(lambda result: result["value"])
)

# revert timestamp string to datetime
sdf["timestamp"] = sdf["timestamp"].apply(datetime.fromisoformat)

# insert to orderbook_agg feature group
# (first aggregation level: orderbook_agg_1m_fg)
sdf = sdf.update(lambda event: multi_part_insert(event, orderbook_agg_1m_fg))

#
#  2nd Aggregation - 2 Minutes
#

# convert datetime to str - for serialization of the state
# (the previous stage turned "timestamp" back into a datetime)
sdf["timestamp"] = sdf["timestamp"].apply(lambda t: t.isoformat())

# perform window aggregations for 2 minutes
# (the window is shortened to 10 seconds here, see the inline note);
# the input events are the aggregates emitted by the 1st aggregation
sdf = (
    # Define a tumbling window
    sdf.tumbling_window(timedelta(seconds=10)) # set 10 seconds for demo and debugging

    # Create a "reduce" aggregation with "reducer" and "initializer" functions
    # the *_acc_agg functions read the *_min / *_max / *_sum fields of the
    # incoming aggregates
    .reduce(reducer=reducer_acc_agg, initializer=initializer_acc_agg)

    # Emit results only for closed windows
    .final()

    # extract value
    # (keep only the aggregate stored under the result's "value" key)
    .apply(lambda result: result["value"])
)

# revert timestamp string to datetime
sdf["timestamp"] = sdf["timestamp"].apply(datetime.fromisoformat)

# insert to the 2-minute aggregation feature group (orderbook_agg_2m_fg)
sdf = sdf.update(lambda event: multi_part_insert(event, orderbook_agg_2m_fg))

#
#  3rd Aggregation - 3 Minutes
#

# convert datetime to str - for serialization of the state
# (the previous stage turned "timestamp" back into a datetime)
sdf["timestamp"] = sdf["timestamp"].apply(lambda t: t.isoformat())

# perform window aggregations for 3 minutes
# (the window is shortened to 15 seconds here, see the inline note);
# the input events are the aggregates emitted by the 2nd aggregation
sdf = (
    # Define a tumbling window
    sdf.tumbling_window(timedelta(seconds=15))  # set 15 seconds for demo and debugging

    # Create a "reduce" aggregation with "reducer" and "initializer" functions
    # the *_acc_agg functions read the *_min / *_max / *_sum fields of the
    # incoming aggregates
    .reduce(reducer=reducer_acc_agg, initializer=initializer_acc_agg)

    # Emit results only for closed windows
    .final()

    # extract value
    # (keep only the aggregate stored under the result's "value" key)
    .apply(lambda result: result["value"])
)

# revert timestamp string to datetime
sdf["timestamp"] = sdf["timestamp"].apply(datetime.fromisoformat)

# insert to the 3-minute aggregation feature group (orderbook_agg_3m_fg)
sdf = sdf.update(lambda event: multi_part_insert(event, orderbook_agg_3m_fg))

In [12]:
# start materialization job
# (currently disabled - uncomment to run the materialization job of each
# feature group; await_termination=False presumably makes the call return
# without waiting for the job to finish - TODO confirm)
# orderbook_fg.materialization_job.run(await_termination=False)
# orderbook_agg_1m_fg.materialization_job.run(await_termination=False)
# orderbook_agg_2m_fg.materialization_job.run(await_termination=False)
# orderbook_agg_3m_fg.materialization_job.run(await_termination=False)

# start quixstreams app
# Runs the pipeline defined on `sdf`. Judging by the recorded output, this
# call keeps processing messages until the application is stopped.
app.run(sdf)

[2024-04-25 17:04:01,010] [INFO] : Starting the Application with the config: broker_address="10.0.2.15:9091" consumer_group="my-group-id" auto_offset_reset="earliest"


2024-04-25 17:04:01,010 INFO: Starting the Application with the config: broker_address="10.0.2.15:9091" consumer_group="my-group-id" auto_offset_reset="earliest"


[2024-04-25 17:04:01,014] [INFO] : Topics required for this application: "btc_usd_orderbook"


2024-04-25 17:04:01,014 INFO: Topics required for this application: "btc_usd_orderbook"


[2024-04-25 17:04:01,021] [INFO] : Validating Kafka topics exist and are configured correctly...


2024-04-25 17:04:01,021 INFO: Validating Kafka topics exist and are configured correctly...


[2024-04-25 17:04:01,097] [INFO] : Kafka topics validation complete


2024-04-25 17:04:01,097 INFO: Kafka topics validation complete


[2024-04-25 17:04:01,101] [INFO] : Initializing state directory at "/srv/hops/jupyter/Projects/QuixStreams/QuixStreams__meb10000/f3a9d74201380520e05fb447d9e2df25b30f4315e343c2f54db875a1e10c5785/state/my-group-id"


2024-04-25 17:04:01,101 INFO: Initializing state directory at "/srv/hops/jupyter/Projects/QuixStreams/QuixStreams__meb10000/f3a9d74201380520e05fb447d9e2df25b30f4315e343c2f54db875a1e10c5785/state/my-group-id"


[2024-04-25 17:04:01,111] [INFO] : Waiting for incoming messages


2024-04-25 17:04:01,111 INFO: Waiting for incoming messages
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
>> Ev

[2024-04-25 17:05:02,307] [INFO] : Stop processing of StreamingDataFrame


>> Event delived successfully to QuixStreams_orderbook
>> Event delived successfully to QuixStreams_orderbook
2024-04-25 17:05:02,307 INFO: Stop processing of StreamingDataFrame


In [None]:
# orderbook_agg_1m_fg.materialization_job.run(await_termination=False)
# orderbook_agg_2m_fg.materialization_job.run(await_termination=False)
# orderbook_agg_3m_fg.materialization_job.run(await_termination=False)