Definition: Zookeeper is used for distributed coordination and management of Kafka brokers. It helps in managing broker metadata and leader election.

In [None]:
brew services start zookeeper


In [None]:
brew services start kafka


In [None]:
kafka-topics --create --topic test_topic --bootstrap-server localhost:9092 --partitions 3 --replication-factor 1


In [None]:
kafka-console-producer --topic test_topic --bootstrap-server localhost:9092


In [None]:
kafka-console-consumer --topic test_topic --bootstrap-server localhost:9092 --from-beginning


In [1]:
pip install confluent_kafka

Note: you may need to restart the kernel to use updated packages.


In [3]:
from confluent_kafka import Producer

# Configuration for the Kafka producer
conf = {
    'bootstrap.servers': 'localhost:9092'}

# Create a Producer instance
producer = Producer(conf)

# Define a callback function to be called upon message delivery
def delivery_report(err, msg):
    if err is not None:
        print(f"Message delivery failed: {err}")
    else:
        print(f"Message delivered to {msg.topic()} [{msg.partition()}]")

def send_message(topic, message):
    try:
        # Produce a message to the specified topic
        producer.produce(topic, message, callback=delivery_report)
        # Wait up to 1 second for events to be delivered
        producer.flush()
    except Exception as e:
        print(f"An error occurred: {e}")

In [4]:
topic = 'sample_topic'
send_message(topic, 'message to consumer from python')


Message delivered to sample_topic [0]


In [5]:
from confluent_kafka import Consumer, KafkaError

# Configuration for the Kafka consumer
conf = {
    'bootstrap.servers': 'localhost:9092',  # Address of the Kafka cluster
    'group.id': 'my-consumer-group-name',         # Consumer group ID
    'auto.offset.reset': 'earliest'          # Start reading from the earliest message
}

# Create a Consumer instance
consumer = Consumer(conf)

# Subscribe to the topic
topic = 'sample_topic'
consumer.subscribe([topic])

print(f"Consuming messages from topic '{topic}'...")

try:
    while True:
        # Poll for new messages
        msg = consumer.poll(timeout=1.0)  # Adjust timeout as needed

        if msg is None:
            continue
        if msg.error():
            if msg.error().code() == KafkaError._PARTITION_EOF:
                # End of partition
                continue
            elif msg.error():
                # Log or handle other errors
                print(f"Error: {msg.error()}")
                break

        # Print the message
        print(f"Received message: {msg.value().decode('utf-8')}")

except KeyboardInterrupt:
    # Handle the interrupt gracefully
    print("Interrupted by user")

finally:
    # Close the consumer to clean up resources
    consumer.close()


Consuming messages from topic 'sample_topic'...
Received message: sample message
Received message: adfa
Received message: df
Received message: sdg
Received message: sdg
Received message: s
Received message: dg
Received message: s
Received message: dg
Received message: s
Received message: dg
Received message: s
Received message: g
Received message: 
Received message: s
Received message: gdsd
Received message: g
Received message: sdg
Received message: 
Received message: s
Received message: gds
Received message: a
Received message: g
Received message: asgf
Received message: asfgfsg
Received message: s
Received message: a
Received message: 
Received message: message to consumer from python
Received message: new message from producer to python
Interrupted by user


In [None]:
pip install pyspark


In [10]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

# Initialize SparkSession with Kafka package
spark = SparkSession.builder \
    .appName("KafkaSparkStreaming") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1") \
    .getOrCreate()

# Define Kafka parameters
kafka_bootstrap_servers = 'localhost:9092'
kafka_topic = 'sample_topic'

# Define a function to start the stream and handle termination
def start_streaming():
    # Read data from Kafka
    df = spark.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
        .option("subscribe", kafka_topic) \
        .load()

    # Convert Kafka data into DataFrame
    df = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

    # Define a file path to save the final data
    final_output_path = "/path/to/final_output"  # Adjust path as needed

    # Write Stream to a file
    query = df.writeStream \
        .format("console") \
        .outputMode("append") \
        .start()

    # Await termination
    query.awaitTermination()

    # Return the query so it can be managed or stopped later
    return query

# Start streaming and get the query object
query = start_streaming()

# Check the status of the query
query.status


24/09/19 22:03:00 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/7w/0689m70n4h7_6s_sjy4tfkmh0000gn/T/temporary-4fca20d9-33d1-4390-aa13-ad22754360d7. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/09/19 22:03:00 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/09/19 22:03:00 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
24/09/19 22:03:00 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
24/09/19 22:03:00 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
24/09/19 22:03:00 WARN AdminClientConfig: The configuration '

-------------------------------------------
Batch: 0
-------------------------------------------
+---+-----+
|key|value|
+---+-----+
+---+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+----+-------------------+
| key|              value|
+----+-------------------+
|NULL|create new messages|
+----+-------------------+

-------------------------------------------
Batch: 2
-------------------------------------------
+----+------+
| key| value|
+----+------+
|NULL|sample|
+----+------+

-------------------------------------------
Batch: 3
-------------------------------------------
+----+-----+
| key|value|
+----+-----+
|NULL|dummy|
+----+-----+



24/09/19 22:05:13 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 0 (localhost/127.0.0.1:9092) could not be established. Broker may not be available.
24/09/19 22:05:13 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 0 (localhost/127.0.0.1:9092) could not be established. Broker may not be available.
24/09/19 22:05:13 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 0 (localhost/127.0.0.1:9092) could not be established. Broker may not be available.
24/09/19 22:05:13 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 0 (localhost/127.0.0.1:9092) could not be established. Broker may not be available.
%3|1726763713.976|FAIL|rdkafka#producer-2| [thrd:localhost:9092/bootstrap]: localhost:9092/0: Connect to ipv6#[::1]:9092 failed: Connection refused (after 0ms in state CONNECT)
24/09/19 22:05:14 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 0 (localhost/127.

KeyboardInterrupt: 

%3|1726763737.043|FAIL|rdkafka#producer-2| [thrd:localhost:9092/bootstrap]: localhost:9092/0: Connect to ipv6#[::1]:9092 failed: Connection refused (after 1ms in state CONNECT)
24/09/19 22:05:37 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 0 (localhost/127.0.0.1:9092) could not be established. Broker may not be available.
%3|1726763738.047|FAIL|rdkafka#producer-2| [thrd:localhost:9092/bootstrap]: localhost:9092/0: Connect to ipv4#127.0.0.1:9092 failed: Connection refused (after 0ms in state CONNECT)
24/09/19 22:05:38 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 0 (localhost/127.0.0.1:9092) could not be established. Broker may not be available.
24/09/19 22:05:39 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 0 (localhost/127.0.0.1:9092) could not be established. Broker may not be available.
%3|1726763740.056|FAIL|rdkafka#producer-2| [thrd:localhost:9092/bootstrap]: localhost:9092/0: Connect to ipv