In [1]:
import waterhealer as wh
from dask.distributed import Client
import json
import time
import random

In [2]:
# Build the waterhealer stream source that reads from the 'testing' topic.
# The dict holds the Kafka consumer settings: the two local brokers, the
# consumer group id, and the offset-reset policy.
# NOTE(review): debug = True presumably enables the
# "topic: ..., partition: ..., offset: ..., data: ..." lines seen in the
# recorded outputs further down — confirm in waterhealer.
source = wh.from_kafka(
    ['testing'],
    {
        'bootstrap.servers': 'localhost:9095, localhost:9096',
        'group.id': 'group-test-3',
        'auto.offset.reset': 'latest',
    },
    debug = True)

In [3]:
# Connect to the Dask scheduler at localhost:8786, handing it the source's
# loop object so the client and the stream share it.
client = Client('localhost:8786', loop = source.loop)
# Bare expression: the client summary becomes the cell output.
client

0,1
Client  Scheduler: tcp://localhost:8786  Dashboard: http://localhost:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 17.18 GB


In [4]:
def json_loads(row):
    """Return ``(row[0], parsed)`` where ``parsed`` is ``row[1]`` decoded as JSON."""
    head = row[0]
    decoded = json.loads(row[1])
    return (head, decoded)

def plus(row):
    """Increment the ``'no'`` field of ``row[1]`` and pair it with ``row[0]``.

    Roughly half of the calls pause for ten seconds first (simulated slow
    work). A ``'no'`` value above 50 raises ``Exception`` instead of
    returning.
    """
    simulate_slow = random.random() > 0.5
    if simulate_slow:
        time.sleep(10)
    value = row[1]['no']
    if value > 50:
        raise Exception('row is bigger than 50')
    head = row[0]
    return (head, value + 1)

In [5]:
# Every element that reaches the end of the pipeline is appended here.
outputs = []
# Pipeline stages, in call order:
#   scatter -> map(json_loads) -> map(plus) -> buffer(5) -> gather
#   -> partition(5) -> map(wh.healing_batch, stream = source) -> flatten -> sink
# NOTE(review): presumably scatter/gather move elements to and from the Dask
# workers, and partition(5) groups results into batches of five before
# healing_batch sees them — confirm against the streamz / waterhealer docs.
source.scatter().map(json_loads).map(plus)\
.buffer(5).gather()\
.partition(5)\
.map(wh.healing_batch, stream = source)\
.flatten().sink(outputs.append)

Output()

In [6]:
# Start the source; presumably this is what begins polling Kafka — confirm in waterhealer.
source.start()

In [7]:
from kafka import KafkaProducer
from kafka.partitioner import RoundRobinPartitioner

In [8]:
# Producer used to feed test messages into the pipeline; it targets the same
# two local brokers as the consumer configuration.
producer = KafkaProducer(
    bootstrap_servers = ['localhost:9095', 'localhost:9096'],
    api_version = (0, 10),
    # The recorded consumer output shows consecutive messages landing on
    # partitions 0, 1, 2, ... in turn.
    partitioner = RoundRobinPartitioner(),
)

In [9]:
# Sanity check before publishing; the recorded output for this cell is True.
producer.bootstrap_connected()

True

In [10]:
from datetime import datetime
import json
from itertools import cycle

def publish_message(producer_instance, topic_name, key, value):
    """Encode ``key`` and ``value`` as UTF-8 and hand them to the producer.

    Parameters
    ----------
    producer_instance
        Object exposing ``send(topic, key=..., value=...)``; a
        ``KafkaProducer`` in this notebook.
    topic_name : str
        Topic to publish to.
    key : str
        Message key; sent as UTF-8 bytes.
    value : str
        Message payload; sent as UTF-8 bytes.

    Returns
    -------
    bool
        ``True`` when ``send`` was called without raising, ``False`` when
        encoding or ``send`` raised (the exception is printed, not re-raised).
        The value returned by ``send`` is not inspected, so ``True`` only
        reports a successful hand-off to the producer.
    """
    try:
        key_bytes = bytes(key, encoding = 'utf-8')
        value_bytes = bytes(value, encoding = 'utf-8')
        # Forward the encoded key as well: it used to be computed and then
        # dropped, so every message was published without its key.
        producer_instance.send(
            topic_name, key = key_bytes, value = value_bytes
        )
        return True
    except Exception as ex:
        # Best-effort publish: report the failure and let the caller decide.
        print('Exception in publishing message')
        print(str(ex))
        return False

## Purposely raise the error

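Based on the function `plus`, an input bigger than the threshold throws an error. This walkthrough was written for a threshold of 10, whereas `plus` as defined above checks `no > 50`.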

In [11]:
import confluent_kafka as ck

# Look up the offset committed for partition 0 of the 'testing' topic before
# publishing anything; the recorded output for this cell is 444.
source.consumer.committed([ck.TopicPartition('testing', 0)])[0].offset

444

In [12]:
import time

# Publish 50 JSON messages with 'no' running from 0 to 49, one every 0.2 s.
# NOTE(review): these values never exceed the `no > 50` check in `plus` as
# defined in this notebook — confirm the intended threshold.
for count in range(50):
    data = {'no': count, 'datetime': str(datetime.now())}
    # Echo each payload so it can be matched against the consumer's output.
    print(data)
    publish_message(
        producer, 'testing', 'streaming', json.dumps(data)
    )
    time.sleep(0.2)

{'no': 0, 'datetime': '2020-04-07 19:49:48.079269'}


topic: testing, partition: 0, offset: 444, data: b'{"no": 0, "datetime": "2020-04-07 19:49:48.079269"}'
topic: testing, partition: 1, offset: 409, data: b'{"no": 1, "datetime": "2020-04-07 19:49:48.284202"}'


{'no': 1, 'datetime': '2020-04-07 19:49:48.284202'}


topic: testing, partition: 2, offset: 429, data: b'{"no": 2, "datetime": "2020-04-07 19:49:48.486857"}'


{'no': 2, 'datetime': '2020-04-07 19:49:48.486857'}


topic: testing, partition: 3, offset: 397, data: b'{"no": 3, "datetime": "2020-04-07 19:49:48.691427"}'


{'no': 3, 'datetime': '2020-04-07 19:49:48.691427'}
{'no': 4, 'datetime': '2020-04-07 19:49:48.896152'}


topic: testing, partition: 4, offset: 396, data: b'{"no": 4, "datetime": "2020-04-07 19:49:48.896152"}'
topic: testing, partition: 5, offset: 371, data: b'{"no": 5, "datetime": "2020-04-07 19:49:49.100615"}'


{'no': 5, 'datetime': '2020-04-07 19:49:49.100615'}


topic: testing, partition: 6, offset: 385, data: b'{"no": 6, "datetime": "2020-04-07 19:49:49.306138"}'


{'no': 6, 'datetime': '2020-04-07 19:49:49.306138'}


topic: testing, partition: 7, offset: 367, data: b'{"no": 7, "datetime": "2020-04-07 19:49:49.508063"}'


{'no': 7, 'datetime': '2020-04-07 19:49:49.508063'}
{'no': 8, 'datetime': '2020-04-07 19:49:49.712886'}
{'no': 9, 'datetime': '2020-04-07 19:49:49.918293'}
{'no': 10, 'datetime': '2020-04-07 19:49:50.123708'}
{'no': 11, 'datetime': '2020-04-07 19:49:50.327677'}
{'no': 12, 'datetime': '2020-04-07 19:49:50.531298'}
{'no': 13, 'datetime': '2020-04-07 19:49:50.736390'}
{'no': 14, 'datetime': '2020-04-07 19:49:50.941276'}
{'no': 15, 'datetime': '2020-04-07 19:49:51.146028'}
{'no': 16, 'datetime': '2020-04-07 19:49:51.351378'}
{'no': 17, 'datetime': '2020-04-07 19:49:51.556762'}
{'no': 18, 'datetime': '2020-04-07 19:49:51.761714'}
{'no': 19, 'datetime': '2020-04-07 19:49:51.966037'}
{'no': 20, 'datetime': '2020-04-07 19:49:52.169764'}
{'no': 21, 'datetime': '2020-04-07 19:49:52.376065'}
{'no': 22, 'datetime': '2020-04-07 19:49:52.581145'}
{'no': 23, 'datetime': '2020-04-07 19:49:52.786285'}
{'no': 24, 'datetime': '2020-04-07 19:49:52.990812'}
{'no': 25, 'datetime': '2020-04-07 19:49:53.19599

In [18]:
# Inspect the source's offset bookkeeping. In the recorded output the keys
# look like '<topic><!><partition>' and each entry pairs an offset with a
# boolean (presumably whether that offset has been processed — confirm in waterhealer).
source.memory

defaultdict(<function waterhealer.core.from_kafka.__init__.<locals>.<lambda>()>,
            {'testing<!>0': ExpiringDict([(445, False)]),
             'testing<!>1': ExpiringDict([(410, False)]),
             'testing<!>2': ExpiringDict([(430, False)]),
             'testing<!>3': ExpiringDict([(398, False)]),
             'testing<!>4': ExpiringDict([(397, False)]),
             'testing<!>5': ExpiringDict([(371, True), (372, False)]),
             'testing<!>6': ExpiringDict([(385, False)]),
             'testing<!>7': ExpiringDict([(367, False)]),
             'testing<!>8': ExpiringDict([(382, False)]),
             'testing<!>9': ExpiringDict([(363, False)])})

topic: testing, partition: 6, offset: 386, data: b'{"no": 16, "datetime": "2020-04-07 19:49:51.351378"}'
topic: testing, partition: 7, offset: 368, data: b'{"no": 17, "datetime": "2020-04-07 19:49:51.556762"}'
topic: testing, partition: 8, offset: 383, data: b'{"no": 18, "datetime": "2020-04-07 19:49:51.761714"}'
topic: testing, partition: 9, offset: 364, data: b'{"no": 19, "datetime": "2020-04-07 19:49:51.966037"}'
topic: testing, partition: 0, offset: 446, data: b'{"no": 20, "datetime": "2020-04-07 19:49:52.169764"}'
topic: testing, partition: 1, offset: 411, data: b'{"no": 21, "datetime": "2020-04-07 19:49:52.376065"}'
topic: testing, partition: 2, offset: 431, data: b'{"no": 22, "datetime": "2020-04-07 19:49:52.581145"}'
topic: testing, partition: 3, offset: 399, data: b'{"no": 23, "datetime": "2020-04-07 19:49:52.786285"}'
topic: testing, partition: 4, offset: 398, data: b'{"no": 24, "datetime": "2020-04-07 19:49:52.990812"}'
topic: testing, partition: 5, offset: 373, data: b'{"no

In [None]:
# Show the elements the sink has appended so far.
outputs

The problem with `healing_batch`: if a single message throws an exception, the entire batch will not be updated.

As you can see, we don't get data for 11 and 12.