# Consume Avro messages from Kafka

In [None]:
from typing import Dict, List, Optional

from confluent_kafka import Consumer, KafkaException
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.avro import AvroDeserializer
from confluent_kafka.serialization import MessageField, SerializationContext

# Settings passed to confluent_kafka.Consumer inside consume_kafka_messages.
KAFKA_CONFIG = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "notebook-consumer-group",
    # NOTE(review): with the usual Kafka meaning, "earliest" starts from the
    # oldest retained offset when the group has no committed position.
    "auto.offset.reset": "earliest",
    # Auto-commit is disabled; consume_kafka_messages commits explicitly
    # after each decoded message it keeps.
    "enable.auto.commit": False,
}

# Schema Registry endpoint used to build the shared Avro deserializer.
SCHEMA_CONFIG = {"url": "http://localhost:8081"}
schema_registry_client = SchemaRegistryClient(SCHEMA_CONFIG)
# Only the registry client is supplied here; no explicit schema is passed.
avro_deserializer = AvroDeserializer(schema_registry_client)


def consume_kafka_messages(
    topic: str,
    max_messages: int,
    poll_timeout: float = 2.0,
    max_empty_polls: Optional[int] = None,
) -> List[Dict]:
    """Consume and Avro-decode up to ``max_messages`` values from ``topic``.

    A fresh consumer is built from ``KAFKA_CONFIG`` and is always closed
    before returning, including when subscribing, polling, or decoding fails.
    Each message whose decoded value is not ``None`` is appended to the
    result and its offset is committed right away.

    Args:
        topic: Name of the topic to subscribe to.
        max_messages: Number of decoded messages to collect before returning.
        poll_timeout: Seconds each ``poll`` call waits for a message.
        max_empty_polls: Give up after this many consecutive polls that
            return no message. ``None`` (the default) keeps polling until
            ``max_messages`` have been collected.

    Returns:
        The decoded message values in the order they were received. May be
        shorter than ``max_messages`` only when ``max_empty_polls`` is set.

    Raises:
        KafkaException: If a polled message carries an error.
    """
    consumer = Consumer(KAFKA_CONFIG)
    messages: List[Dict] = []
    empty_polls = 0

    try:
        # Subscribe inside the try block so the consumer is closed even if
        # the subscription itself fails.
        consumer.subscribe([topic])
        print(f"Subscribed to topic: {topic}")
        print(f"Waiting for up to {max_messages} Avro-encoded messages...")

        while len(messages) < max_messages:
            msg = consumer.poll(timeout=poll_timeout)

            if msg is None:
                print("No message received.")
                empty_polls += 1
                if max_empty_polls is not None and empty_polls >= max_empty_polls:
                    print(
                        f"Stopping after {empty_polls} consecutive empty polls; "
                        f"received {len(messages)} of {max_messages} messages."
                    )
                    break
                continue

            # Any delivered event (message or error) ends the idle streak.
            empty_polls = 0

            if msg.error():
                raise KafkaException(msg.error())

            value = avro_deserializer(
                msg.value(), SerializationContext(topic, MessageField.VALUE)
            )

            # Values that decode to None are skipped and not counted.
            if value is None:
                continue

            messages.append(value)
            print(f"Received message {len(messages)} of {max_messages}")
            consumer.commit(message=msg)

    finally:
        consumer.close()

    return messages


# Fetch five decoded records from the orders topic; the bare name on the last
# line makes the notebook display the resulting list.
messages = consume_kafka_messages("storefront.public.orders", 5)
messages

Subscribed to topic: storefront.public.orders
Waiting for up to 5 Avro-encoded messages...
No message received.
Received message 1 of 5
Received message 2 of 5
Received message 3 of 5
Received message 4 of 5
Received message 5 of 5


[{'before': None,
  'after': {'id': 7,
   'customer_id': 7,
   'status': 'pending',
   'total': Decimal('2770.64'),
   'created_at': 1742973185794302},
  'source': {'version': '3.0.8.Final',
   'connector': 'postgresql',
   'name': 'storefront',
   'ts_ms': 1742994813553,
   'snapshot': 'true',
   'db': 'storefront',
   'sequence': '[null,"26838336"]',
   'ts_us': 1742994813553780,
   'ts_ns': 1742994813553780000,
   'schema': 'public',
   'table': 'orders',
   'txId': 800,
   'lsn': 26838336,
   'xmin': None},
  'transaction': None,
  'op': 'r',
  'ts_ms': 1742994815695,
  'ts_us': 1742994815695499,
  'ts_ns': 1742994815695499213},
 {'before': None,
  'after': {'id': 8,
   'customer_id': 8,
   'status': 'shipped',
   'total': Decimal('1940.91'),
   'created_at': 1742973186816395},
  'source': {'version': '3.0.8.Final',
   'connector': 'postgresql',
   'name': 'storefront',
   'ts_ms': 1742994813553,
   'snapshot': 'true',
   'db': 'storefront',
   'sequence': '[null,"26838336"]',
   '

In [2]:
# Show only the "after" payload of the first consumed record.
messages[0]["after"]

{'id': 7,
 'customer_id': 7,
 'status': 'pending',
 'total': Decimal('2770.64'),
 'created_at': 1742973185794302}

# Read Avro files from S3 (MinIO)

In [1]:
import os
import boto3
import fastavro
from smart_open import open as smart_open
from dotenv import load_dotenv
from typing import List, Dict

# NOTE(review): load_dotenv() presumably populates the environment from a
# local .env file before the lookups below — confirm the file location.
load_dotenv()

# Connection details for the S3-compatible endpoint, read from environment
# variables; each is None when the variable is unset.
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT")
MINIO_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY_ID")
MINIO_SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

# Create a reusable S3 client pointed at MINIO_ENDPOINT; it is shared by
# read_sample_avro_records for both listing and reading objects.
s3 = boto3.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=MINIO_ACCESS_KEY,
    aws_secret_access_key=MINIO_SECRET_KEY,
    region_name="us-east-1",
)


def read_sample_avro_records(
    bucket: str, topic_prefix: str, max_records: int = 5
) -> List[Dict]:
    """Return up to ``max_records`` records from the Avro files under a prefix.

    Every object under ``topic_prefix`` in ``bucket`` is listed (following
    pagination), keys ending in ``.avro`` are kept, and the files are read in
    listing order until enough records have been collected. A summary with
    the first five matching keys is printed.

    Args:
        bucket: Name of the bucket to list and read from.
        topic_prefix: Key prefix that selects the objects to consider.
        max_records: Upper bound on the number of records returned. Values
            less than or equal to zero yield an empty list.

    Returns:
        The decoded records, at most ``max_records`` of them. Fewer are
        returned only when the matching files hold fewer records in total.
    """
    from itertools import islice

    # Paginate so the listing (and the reported count) is not cut off at the
    # per-request limit of a single list_objects_v2 call.
    paginator = s3.get_paginator("list_objects_v2")
    files = []
    for page in paginator.paginate(Bucket=bucket, Prefix=topic_prefix):
        files.extend(
            obj["Key"]
            for obj in page.get("Contents", [])
            if obj["Key"].endswith(".avro")
        )

    print(
        f"Found {len(files)} Avro files in bucket '{bucket}' with prefix '{topic_prefix}'; top 5:"
    )
    for key in files[:5]:
        print(" -", key)

    records: List[Dict] = []
    for key in files:
        remaining = max_records - len(records)
        if remaining <= 0:
            break
        # Keep reading further files when one file alone holds fewer records
        # than requested.
        with smart_open(
            f"s3://{bucket}/{key}", "rb", transport_params={"client": s3}
        ) as f:
            records.extend(islice(fastavro.reader(f), remaining))

    return records


# Sample up to ten records from the Avro files stored under the orders-topic
# prefix of the "ingest" bucket; the bare name displays them in the notebook.
records = read_sample_avro_records(
    "ingest", "kafka/storefront.public.orders", max_records=10
)
records

Found 20 Avro files in bucket 'ingest' with prefix 'kafka/storefront.public.orders'; top 5:
 - kafka/storefront.public.orders/partition=0/storefront.public.orders+0+0000000000.avro
 - kafka/storefront.public.orders/partition=0/storefront.public.orders+0+0000000003.avro
 - kafka/storefront.public.orders/partition=0/storefront.public.orders+0+0000000006.avro
 - kafka/storefront.public.orders/partition=0/storefront.public.orders+0+0000000009.avro
 - kafka/storefront.public.orders/partition=0/storefront.public.orders+0+0000000012.avro


[{'before': None,
  'after': {'id': 1,
   'customer_id': 1,
   'status': 'pending',
   'total': Decimal('1136.11'),
   'created_at': 1742973179538101},
  'source': {'version': '3.0.8.Final',
   'connector': 'postgresql',
   'name': 'storefront',
   'ts_ms': 1743101733055,
   'snapshot': 'first_in_data_collection',
   'db': 'storefront',
   'sequence': '[null,"26926296"]',
   'ts_us': 1743101733055211,
   'ts_ns': 1743101733055211000,
   'schema': 'public',
   'table': 'orders',
   'txId': 877,
   'lsn': 26926296,
   'xmin': None},
  'transaction': None,
  'op': 'r',
  'ts_ms': 1743101733575,
  'ts_us': 1743101733575508,
  'ts_ns': 1743101733575508253},
 {'before': None,
  'after': {'id': 2,
   'customer_id': 2,
   'status': 'pending',
   'total': Decimal('1357.97'),
   'created_at': 1742973180587430},
  'source': {'version': '3.0.8.Final',
   'connector': 'postgresql',
   'name': 'storefront',
   'ts_ms': 1743101733055,
   'snapshot': 'true',
   'db': 'storefront',
   'sequence': '[nul