In [1]:
import waterhealer as wh
from waterhealer import Stream
import json
import time
import confluent_kafka as ck

In [2]:
# Build the Kafka source that feeds the whole pipeline, subscribed to the
# 'testing' topic.
# - 'auto.offset.reset': 'earliest' is the standard Kafka consumer setting that
#   starts from the oldest available offset when the consumer group has no
#   committed offset yet.
# - debug = True: presumably what prints the per-message
#   "topic: ..., partition: ..., offset: ..., data: ..." lines seen once
#   messages arrive -- TODO confirm against waterhealer's from_kafka.
source = wh.from_kafka(
    ['testing'],
    {
        'bootstrap.servers': 'localhost:9095, localhost:9096',
        'group.id': 'group-test-3',
        'auto.offset.reset': 'earliest',
    },
    debug = True)

In [3]:
from dask.distributed import Client

# Create the dask distributed client on the same event loop object the source
# exposes, so the stream and the client share one loop.
client = Client(loop = source.loop)
# Bare expression: renders the client's scheduler / worker summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:51722  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 17.18 GB


In [4]:
def json_loads(rows):
    """Decode the JSON payload carried by each consumed record.

    Every element of ``rows`` is indexed at position 1 to obtain the raw JSON
    text/bytes; whatever sits at position 0 is ignored by this function.

    Returns a list holding one decoded object per input row, in input order.
    """
    return [json.loads(record[1]) for record in rows]

def increment_left(rows):
    """Return copies of ``rows`` with ``'left'`` set to ``row['data'] + 1``.

    Each input row is copied with ``.copy()`` before being modified, so the
    caller's rows are not reassigned. The function sleeps one second per row
    (presumably to simulate slow work) before computing the new field.
    """
    def _with_left(record):
        updated = record.copy()
        time.sleep(1)
        updated['left'] = updated['data'] + 1
        return updated

    return [_with_left(record) for record in rows]

def increment_right(rows):
    """Return copies of ``rows`` with ``'right'`` set to ``row['data'] + 1``.

    Rows are duplicated via ``.copy()`` first, so the objects passed in keep
    their original keys. One second of sleep is spent on every row
    (presumably to simulate slow work) before the field is added.
    """
    def _with_right(item):
        clone = item.copy()
        time.sleep(1)
        clone['right'] = clone['data'] + 1
        return clone

    return list(map(_with_right, rows))

def combine(tuples):
    """Add the ``'left'`` and ``'right'`` values of position-aligned rows.

    ``tuples`` unpacks into exactly two sequences, ``(left, right)``. For each
    position in ``left`` the result holds
    ``left[i]['left'] + right[i]['right']``. ``right`` is indexed by position,
    so a ``right`` shorter than ``left`` raises ``IndexError``. The function
    sleeps one second per pair before computing the sum.
    """
    left, right = tuples
    totals = []
    for position, left_row in enumerate(left):
        time.sleep(1)
        totals.append(left_row['left'] + right[position]['right'])
    return totals

In [5]:
# Wire the processing graph. Every stage created with checkpoint = True is
# recorded in source.checkpoint under a dotted name built from the chain,
# e.g. 'from_kafka.scatter.partition_time' and
# 'from_kafka.scatter.partition_time.map.json_loads'.
# NOTE(review): scatter()/gather() presumably move data to and from the dask
# workers, and partition_time(5) presumably batches records per 5-second
# window -- confirm against the waterhealer / streamz docs.
partition = source.scatter().partition_time(5, checkpoint = True)
# Decode each batch of raw Kafka records into Python objects.
s = partition.map(json_loads, checkpoint = True)
# The final chain is not bound to a name; as the last expression of the cell
# its value is displayed instead.
s.buffer(8).gather().flatten().partition_time(5, checkpoint = True)

Output()

In [6]:
# source.visualize()

In [7]:
# Start the source. NOTE(review): presumably this is what begins polling Kafka
# and pushing records into the graph built above -- confirm in waterhealer.
source.start()

In [8]:
from kafka import KafkaProducer
from kafka.partitioner import RoundRobinPartitioner

In [9]:
# kafka-python producer used to feed test messages to the same brokers the
# source consumes from.
# RoundRobinPartitioner: presumably assigns successive messages to partitions
# in turn -- consistent with the debug output, where messages i = 0..9 land on
# partitions 0..9.
producer = KafkaProducer(
    bootstrap_servers = ['localhost:9095', 'localhost:9096'],
    api_version = (0, 10),
    partitioner = RoundRobinPartitioner(),
)

In [10]:
# Sanity check before publishing: should display True when the producer is
# connected to a bootstrap broker.
producer.bootstrap_connected()

True

In [11]:
from datetime import datetime
import json
from itertools import cycle

def publish_message(producer_instance, topic_name, key, value):
    """Send one UTF-8 encoded, keyed message to a Kafka topic.

    Parameters
    ----------
    producer_instance : object
        Anything exposing ``send(topic, value=..., key=...)``; a
        ``kafka.KafkaProducer`` in this notebook.
    topic_name : str
        Topic to publish to.
    key : str
        Message key; encoded to UTF-8 and sent with the record.
    value : str
        Message payload; encoded to UTF-8.

    Returns
    -------
    bool
        True when ``send`` was invoked without raising, False when encoding
        or sending raised (the exception text is printed, not re-raised).

    Notes
    -----
    With kafka-python, ``send`` is asynchronous, so True only means the record
    was handed to the producer, not that a broker acknowledged it.
    """
    try:
        key_bytes = bytes(key, encoding = 'utf-8')
        value_bytes = bytes(value, encoding = 'utf-8')
        # The key used to be encoded and then dropped; pass it on so the
        # record is actually published with the caller's key.
        producer_instance.send(topic_name, key = key_bytes, value = value_bytes)
        return True
    except Exception as ex:
        # Best-effort helper: report the failure and signal it via the return
        # value instead of interrupting the publishing loop.
        print('Exception in publishing message')
        print(str(ex))
        return False

In [12]:
import time

# Publish ten small JSON records to the 'testing' topic, echoing each one and
# pausing briefly between sends.
for i in range(10):
    data = {'i': i, 'data': i + 1}
    print(data)
    publish_message(
        producer, 'testing', 'streaming', json.dumps(data)
    )
    time.sleep(0.1)

# KafkaProducer.send() only queues records; flush so every message has really
# been handed to the brokers before later cells inspect the stream.
producer.flush()

topic: testing, partition: 0, offset: 3, data: b'{"i": 0, "data": 1}'


{'i': 0, 'data': 1}
{'i': 1, 'data': 2}


topic: testing, partition: 2, offset: 2, data: b'{"i": 2, "data": 3}'
topic: testing, partition: 1, offset: 2, data: b'{"i": 1, "data": 2}'
topic: testing, partition: 3, offset: 2, data: b'{"i": 3, "data": 4}'


{'i': 2, 'data': 3}
{'i': 3, 'data': 4}


topic: testing, partition: 4, offset: 2, data: b'{"i": 4, "data": 5}'
topic: testing, partition: 5, offset: 2, data: b'{"i": 5, "data": 6}'


{'i': 4, 'data': 5}
{'i': 5, 'data': 6}


topic: testing, partition: 6, offset: 2, data: b'{"i": 6, "data": 7}'
topic: testing, partition: 7, offset: 2, data: b'{"i": 7, "data": 8}'


{'i': 6, 'data': 7}
{'i': 7, 'data': 8}


topic: testing, partition: 8, offset: 2, data: b'{"i": 8, "data": 9}'
topic: testing, partition: 9, offset: 2, data: b'{"i": 9, "data": 10}'


{'i': 8, 'data': 9}
{'i': 9, 'data': 10}


In [14]:
%%time

# NOTE(review): presumably waits for in-flight batches to finish before
# returning -- confirm the exact semantics in waterhealer.
source.wait()

CPU times: user 12.6 ms, sys: 2.5 ms, total: 15.1 ms
Wall time: 16.7 ms


True

In [15]:
# Inspect what the checkpoint = True stages recorded: a dict keyed by dotted
# stage name, holding the data captured at that stage.
source.checkpoint

{'from_kafka.scatter.partition_time': [({'partition': 0,
    'offset': 3,
    'topic': 'testing'},
   b'{"i": 0, "data": 1}'),
  ({'partition': 2, 'offset': 2, 'topic': 'testing'}, b'{"i": 2, "data": 3}'),
  ({'partition': 1, 'offset': 2, 'topic': 'testing'}, b'{"i": 1, "data": 2}'),
  ({'partition': 3, 'offset': 2, 'topic': 'testing'}, b'{"i": 3, "data": 4}'),
  ({'partition': 4, 'offset': 2, 'topic': 'testing'}, b'{"i": 4, "data": 5}'),
  ({'partition': 5, 'offset': 2, 'topic': 'testing'}, b'{"i": 5, "data": 6}'),
  ({'partition': 6, 'offset': 2, 'topic': 'testing'}, b'{"i": 6, "data": 7}'),
  ({'partition': 7, 'offset': 2, 'topic': 'testing'}, b'{"i": 7, "data": 8}'),
  ({'partition': 8, 'offset': 2, 'topic': 'testing'}, b'{"i": 8, "data": 9}'),
  ({'partition': 9, 'offset': 2, 'topic': 'testing'},
   b'{"i": 9, "data": 10}')],
 'from_kafka.scatter.partition_time.map.json_loads': [{'i': 0, 'data': 1},
  {'i': 2, 'data': 3},
  {'i': 1, 'data': 2},
  {'i': 3, 'data': 4},
  {'i': 4, 'd