# Kafka consume

In [1]:
from confluent_kafka import Consumer, KafkaException
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.avro import AvroDeserializer
from confluent_kafka.serialization import SerializationContext, MessageField
from typing import List, Dict

# Consumer settings handed to Consumer() inside consume_kafka_messages.
KAFKA_CONFIG = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "notebook-consumer-group",
    "auto.offset.reset": "earliest",
    # Auto-commit is disabled; consume_kafka_messages calls consumer.commit() itself.
    "enable.auto.commit": False,
}

# Schema Registry client and the Avro deserializer built on top of it;
# the deserializer is used to decode message values in consume_kafka_messages.
SCHEMA_CONFIG = {"url": "http://localhost:8081"}
schema_registry_client = SchemaRegistryClient(SCHEMA_CONFIG)
avro_deserializer = AvroDeserializer(schema_registry_client)


def consume_kafka_messages(
    topic: str, max_messages: int, max_empty_polls: int = 15
) -> List[Dict]:
    """Consume and Avro-decode up to ``max_messages`` values from a topic.

    A consumer is created from ``KAFKA_CONFIG``, subscribed to ``topic`` and
    polled with a 2-second timeout. Each message value is decoded with
    ``avro_deserializer``; values that decode to ``None`` are skipped. The
    offset of every collected message is committed right after it is stored.

    Args:
        topic: Name of the topic to subscribe to.
        max_messages: Stop as soon as this many decoded values were collected.
        max_empty_polls: Number of consecutive polls that may return nothing
            before the function gives up and returns what it has so far.
            Without this bound the loop never ends when the topic holds fewer
            than ``max_messages`` records. Pass ``0`` (or a negative number)
            to wait indefinitely.

    Returns:
        The decoded message values in the order they were received. The list
        may be shorter than ``max_messages`` when the idle limit is reached.

    Raises:
        KafkaException: If a polled message carries an error.
    """
    consumer = Consumer(KAFKA_CONFIG)
    consumer.subscribe([topic])

    messages = []
    empty_polls = 0
    print(f"Subscribed to topic: {topic}")
    print(f"Waiting for up to {max_messages} Avro-encoded messages...")

    try:
        while len(messages) < max_messages:
            msg = consumer.poll(timeout=2.0)

            if msg is None:
                print("No message received.")
                empty_polls += 1
                # Bail out instead of spinning forever on an exhausted topic.
                if 0 < max_empty_polls <= empty_polls:
                    print(
                        f"Giving up after {empty_polls} consecutive empty polls."
                    )
                    break
                continue

            # Any delivered message resets the idle counter.
            empty_polls = 0

            if msg.error():
                raise KafkaException(msg.error())

            value = avro_deserializer(
                msg.value(), SerializationContext(topic, MessageField.VALUE)
            )

            if value is not None:
                messages.append(value)
                print(f"Received message {len(messages)} of {max_messages}")
                consumer.commit(msg)

    finally:
        # Always release the consumer, even when polling or decoding fails.
        consumer.close()

    return messages


# Request five decoded messages from the orders topic and display the result.
messages = consume_kafka_messages("storefront.public.orders", 5)
messages

Subscribed to topic: storefront.public.orders
Waiting for up to 5 Avro-encoded messages...
No message received.
Received message 1 of 5
Received message 2 of 5
Received message 3 of 5
Received message 4 of 5
Received message 5 of 5


[{'before': None,
  'after': {'id': 16,
   'customer_id': 191,
   'status': 'pending',
   'total': Decimal('1346.03'),
   'created_at': 1743622326513896},
  'source': {'version': '2.5.4.Final',
   'connector': 'postgresql',
   'name': 'storefront',
   'ts_ms': 1743700779075,
   'snapshot': 'true',
   'db': 'storefront',
   'sequence': '[null,"40023408"]',
   'schema': 'public',
   'table': 'orders',
   'txId': 773,
   'lsn': 40023408,
   'xmin': None},
  'op': 'r',
  'ts_ms': 1743700808422,
  'transaction': None},
 {'before': None,
  'after': {'id': 17,
   'customer_id': 465,
   'status': 'pending',
   'total': Decimal('1730.24'),
   'created_at': 1743622326514983},
  'source': {'version': '2.5.4.Final',
   'connector': 'postgresql',
   'name': 'storefront',
   'ts_ms': 1743700779075,
   'snapshot': 'true',
   'db': 'storefront',
   'sequence': '[null,"40023408"]',
   'schema': 'public',
   'table': 'orders',
   'txId': 773,
   'lsn': 40023408,
   'xmin': None},
  'op': 'r',
  'ts_ms':

In [2]:
# Show only the "after" field of the first consumed message.
messages[0]["after"]

{'id': 16,
 'customer_id': 191,
 'status': 'pending',
 'total': Decimal('1346.03'),
 'created_at': 1743622326513896}

# Read S3

In [3]:
import os
import boto3
import fastavro
from smart_open import open as smart_open
from dotenv import load_dotenv
from typing import List, Dict

# Load environment variables from the streaming and lake env files.
load_dotenv("../.streaming.env")
load_dotenv("../.lake.env")

# Connection settings read from the environment; each is None when its variable is unset.
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT")
MINIO_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY_ID")
MINIO_SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

# S3 client pointed at MINIO_ENDPOINT; the helper functions in this notebook
# use it both for listing objects and as the smart_open transport client.
s3 = boto3.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=MINIO_ACCESS_KEY,
    aws_secret_access_key=MINIO_SECRET_KEY,
    region_name="us-east-1",
)


def read_sample_avro_records(
    bucket: str, topic_prefix: str, max_records: int = 5
) -> List[Dict]:
    """Read a small sample of records from the first Avro file under a prefix.

    Lists the objects in ``bucket`` whose keys start with ``topic_prefix``,
    keeps the keys ending in ``.avro``, prints how many were found together
    with the first five keys, and then decodes records from the first of
    those files only.

    Args:
        bucket: Name of the bucket to list.
        topic_prefix: Key prefix used to filter the listing.
        max_records: Maximum number of records to return. Zero or a negative
            value yields an empty list.

    Returns:
        At most ``max_records`` records from the first listed ``.avro`` file,
        in file order. Empty when no ``.avro`` file is found.
    """
    # Only the first page of the listing is inspected, which is enough to
    # pick a sample file; the printed count reflects that page.
    response = s3.list_objects_v2(Bucket=bucket, Prefix=topic_prefix)
    files = [
        obj["Key"]
        for obj in response.get("Contents", [])
        if obj["Key"].endswith(".avro")
    ]

    print(
        f"Found {len(files)} Avro files in bucket '{bucket}' with prefix '{topic_prefix}'; top 5:"
    )
    for f in files[:5]:
        print(" -", f)

    # Nothing to read, or nothing requested: skip opening the file entirely.
    # (Appending before checking the limit used to return one record here.)
    if not files or max_records <= 0:
        return []

    records = []
    with smart_open(
        f"s3://{bucket}/{files[0]}", "rb", transport_params={"client": s3}
    ) as f:
        for record in fastavro.reader(f):
            records.append(record)
            if len(records) >= max_records:
                break

    return records


# Sample up to ten records from the first Avro file of the orders topic in the "raw" bucket.
records = read_sample_avro_records(
    "raw", "kafka/storefront.public.orders", max_records=10
)
records

Found 10 Avro files in bucket 'raw' with prefix 'kafka/storefront.public.orders'; top 5:
 - kafka/storefront.public.orders/partition=0/storefront.public.orders+0+0000000000.avro
 - kafka/storefront.public.orders/partition=0/storefront.public.orders+0+0000001000.avro
 - kafka/storefront.public.orders/partition=0/storefront.public.orders+0+0000002000.avro
 - kafka/storefront.public.orders/partition=0/storefront.public.orders+0+0000003000.avro
 - kafka/storefront.public.orders/partition=0/storefront.public.orders+0+0000004000.avro


[{'before': None,
  'after': {'id': 1,
   'customer_id': 36,
   'status': 'cancelled',
   'total': Decimal('501.85'),
   'created_at': 1743622326493705},
  'source': {'version': '2.5.4.Final',
   'connector': 'postgresql',
   'name': 'storefront',
   'ts_ms': 1743700779075,
   'snapshot': 'first_in_data_collection',
   'db': 'storefront',
   'sequence': '[null,"40023408"]',
   'schema': 'public',
   'table': 'orders',
   'txId': 773,
   'lsn': 40023408,
   'xmin': None},
  'op': 'r',
  'ts_ms': 1743700808415,
  'transaction': None},
 {'before': None,
  'after': {'id': 2,
   'customer_id': 325,
   'status': 'delivered',
   'total': Decimal('348.36'),
   'created_at': 1743622326496597},
  'source': {'version': '2.5.4.Final',
   'connector': 'postgresql',
   'name': 'storefront',
   'ts_ms': 1743700779075,
   'snapshot': 'true',
   'db': 'storefront',
   'sequence': '[null,"40023408"]',
   'schema': 'public',
   'table': 'orders',
   'txId': 773,
   'lsn': 40023408,
   'xmin': None},
  'o

In [4]:
# Verify that each Postgres table's row count equals the number of Avro records stored in S3

import pandas as pd
import psycopg

# Load the source-database env file; override=True lets its values replace
# variables that are already set in the environment.
load_dotenv("../.source.env", override=True)

# Connection to the "storefront" database on localhost:4444, credentials from the environment.
# NOTE(review): no conn.close() appears in the visible code — confirm it is closed later.
conn = psycopg.connect(
    host="localhost",
    port="4444",
    dbname="storefront",
    user=os.getenv("POSTGRES_USER"),
    password=os.getenv("POSTGRES_PASSWORD"),
)

# Tables to verify, plus the bucket and key-prefix pattern used for the S3 side of the check.
tables = ["customers", "order_items", "orders", "payments", "products"]
bucket = "raw"
prefix_template = "kafka/storefront.public.{}"

# Row count per table as reported by Postgres.
tables_actual_counts = {}
with conn.cursor() as cur:
    for table in tables:
        # Table names come from the fixed list above, not from external input.
        cur.execute(f"SELECT COUNT(*) FROM public.{table}")
        count = cur.fetchone()[0]
        tables_actual_counts[table] = count


def count_avro_records(bucket: str, prefix: str) -> int:
    """Count the records in every ``.avro`` object under a key prefix.

    Args:
        bucket: Name of the bucket to list.
        prefix: Key prefix used to filter the listing. Any key that starts
            with this string is included, so a prefix without a trailing
            separator also matches sibling prefixes sharing the same stem.

    Returns:
        The total number of records across all matching ``.avro`` files;
        ``0`` when there are none.
    """
    # A single list_objects_v2 call returns only one page of keys, so walk
    # every page to avoid under-counting prefixes that hold many files.
    paginator = s3.get_paginator("list_objects_v2")
    files = [
        obj["Key"]
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix)
        for obj in page.get("Contents", [])
        if obj["Key"].endswith(".avro")
    ]

    total_records = 0
    for file_key in files:
        with smart_open(
            f"s3://{bucket}/{file_key}", "rb", transport_params={"client": s3}
        ) as f:
            # Stream through the file; records are counted, not kept in memory.
            total_records += sum(1 for _ in fastavro.reader(f))
    return total_records


# Compare each table's Postgres row count with the number of Avro records
# found under its S3 prefix.
record_counts = []
for table, expected in tables_actual_counts.items():
    prefix = prefix_template.format(table)
    actual = count_avro_records(bucket, prefix)
    record_counts.append(
        {
            "table": table,
            "postgres_count": expected,
            "s3_count": actual,
            "match": expected == actual,
        }
    )

# One row per table; "match" is True when both counts are equal.
df_counts = pd.DataFrame(record_counts)
df_counts

Unnamed: 0,table,postgres_count,s3_count,match
0,customers,5000,5000,True
1,order_items,24963,24963,True
2,orders,10000,10000,True
3,payments,10000,10000,True
4,products,10000,10000,True
