## Cannot read Kafka messages

In [15]:
import json
import os
import typing

import apache_beam as beam
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.io import kafka
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.runners.interactive import interactive_runner
from apache_beam.runners.portability import flink_runner

In [16]:
class EventLog(typing.NamedTuple):
    """Typed record for a single event decoded from a JSON message.

    ``parse_json`` builds instances with ``EventLog(**row)``, so the field
    names below must match the keys of the incoming JSON object exactly.
    """

    ip: str
    id: str  # emitted as "event_id" in the message key built by create_message
    lat: float  # parse_json substitutes -1 when the source value is empty
    lng: float  # parse_json substitutes -1 when the source value is empty
    user_agent: str
    age_bracket: str
    opted_into_marketing: bool
    http_request: str
    http_response: int
    file_size_bytes: int
    event_datetime: str  # timestamp kept as text; format not shown here - TODO confirm
    event_ts: int  # also part of the message key; unit not shown here - TODO confirm


# Register RowCoder as the coder Beam uses for EventLog elements.
beam.coders.registry.register_coder(EventLog, beam.coders.RowCoder)


def decode_message(kafka_kv: tuple) -> str:
    """Return the value of a Kafka ``(key, value)`` record as UTF-8 text.

    Args:
        kafka_kv: Record pair such as ``(b'key', b'value')``. Only the
            second item is read; the key is ignored.

    Returns:
        The record value decoded from UTF-8.

    Note:
        Incoming Kafka records must carry a key; Beam throws an exception
        on null keys.
    """
    raw_value = kafka_kv[1]
    return raw_value.decode("utf-8")


def create_message(element: EventLog):
    """Serialize an event into a ``(key, value)`` pair of UTF-8 JSON bytes.

    Args:
        element: Event record; its ``id`` and ``event_ts`` form the key and
            all of its fields form the value.

    Returns:
        A 2-tuple ``(key_bytes, value_bytes)``: the key is the JSON object
        ``{"event_id": ..., "event_ts": ...}`` and the value is the JSON
        object of every field of ``element``.

    Side effects:
        Prints the key dictionary to stdout for each element.
    """

    def _to_json_bytes(obj):
        # JSON text encoded as UTF-8, the byte form used for both parts.
        return json.dumps(obj).encode("utf-8")

    message_key = {"event_id": element.id, "event_ts": element.event_ts}
    message_value = element._asdict()
    print(message_key)
    return _to_json_bytes(message_key), _to_json_bytes(message_value)


def parse_json(element: str) -> "EventLog":
    """Parse one JSON-encoded event string into an ``EventLog``.

    Args:
        element: JSON object text whose keys match the ``EventLog`` fields.

    Returns:
        The ``EventLog`` built from the parsed object. When either ``lat``
        or ``lng`` is missing a value (empty string or null), both are set
        to the sentinel ``-1.0``.

    Raises:
        json.JSONDecodeError: If ``element`` is not valid JSON.
        KeyError: If the object has no ``lat`` or ``lng`` key.
        TypeError: If the object's keys do not match the ``EventLog`` fields.
    """
    row = json.loads(element)
    # lat/lng sometimes arrive as an empty string. Compare against the
    # "no value" markers explicitly instead of relying on truthiness, so a
    # legitimate coordinate of 0 (equator / prime meridian) is preserved.
    no_value = ("", None)
    if row["lat"] in no_value or row["lng"] in no_value:
        # Float sentinel, matching the declared type of the two fields.
        row = {**row, "lat": -1.0, "lng": -1.0}
    return EventLog(**row)

In [17]:
# Options for a streaming job named "kafka-io", passed to Beam as keyword flags.
pipeline_opts = {
    "job_name": "kafka-io",
    "environment_type": "LOOPBACK",
    "streaming": True,
    "parallelism": 3,
    "experiments": [
        "use_deprecated_read"
    ],  ## https://github.com/apache/beam/issues/20979
    "checkpointing_interval": "60000",
}
options = PipelineOptions([], **pipeline_opts)
# Required; otherwise workers complain when importing the functions defined
# in this notebook session.
options.view_as(SetupOptions).save_main_session = True

# NOTE(review): this cell was recorded failing with
#   KeyError: 'beam:transform:org.apache.beam:kafka_read_with_metadata:v2'
# Presumably the InteractiveRunner wrapper cannot resolve the cross-language
# Kafka read transform - TODO confirm, e.g. by running on FlinkRunner directly.
p = beam.Pipeline(
    interactive_runner.InteractiveRunner(
        underlying_runner=flink_runner.FlinkRunner()
    ),
    options=options,
)
events = (
    p
    | "Read from Kafka"
    >> kafka.ReadFromKafka(
        consumer_config={
            "bootstrap.servers": os.getenv(
                "BOOTSTRAP_SERVERS",
                "host.docker.internal:29092",
            ),
            "auto.offset.reset": "earliest",
            # "enable.auto.commit": "true",
            "group.id": "kafka-io",
        },
        topics=["website-visit"],
    )
    | "Decode messages" >> beam.Map(decode_message)
    | "Parse elements" >> beam.Map(parse_json).with_output_types(EventLog)
)
# Bind the run handle to `result`: the next statement and the later
# `result.get(events)` cell both use that name (it was previously assigned
# to `results`, which made them fail with NameError).
result = p.run()
result.wait_until_finish()
# ib.options.recording_duration = "120s"
# ib.show(events)



KeyError: 'beam:transform:org.apache.beam:kafka_read_with_metadata:v2'

In [None]:
# Fetch the computed contents of `events` from the pipeline run handle.
# NOTE(review): relies on a `result` variable bound by an earlier cell -
# confirm that cell ran and defines it under this exact name.
result.get(events)