Definition: ZooKeeper is a distributed coordination service that Kafka uses to manage its brokers. It stores broker metadata and handles leader election.

In [None]:
brew services start zookeeper


In [None]:
brew services start kafka


In [None]:
kafka-topics --create --topic test_topic --bootstrap-server localhost:9092 --partitions 3 --replication-factor 1


In [None]:
# NOTE(review): this is a terminal command, not Python. The console producer is
# presumably interactive (reads messages typed on stdin), so it is likely meant
# to be run in a separate shell rather than in a notebook cell — confirm.
kafka-console-producer --topic test_topic --bootstrap-server localhost:9092


In [None]:
# NOTE(review): this is a terminal command, not Python. The console consumer
# presumably keeps running until interrupted, so it is likely meant to be run in
# a separate shell rather than in a notebook cell — confirm.
kafka-console-consumer --topic test_topic --bootstrap-server localhost:9092 --from-beginning


In [None]:
from confluent_kafka import Producer

# Configuration for the Kafka producer.
# The broker address matches the `--bootstrap-server localhost:9092` used by the
# CLI cells; an empty string would leave the client with no broker to contact.
conf = {
    'bootstrap.servers': 'localhost:9092',
}

# Create a Producer instance
producer = Producer(conf)

# Delivery callback: reports the outcome of each produced message
def delivery_report(err, msg):
    """Print whether a produced message was delivered.

    Args:
        err: The delivery error, or None when delivery succeeded.
        msg: The produced message; its ``topic()`` and ``partition()`` are
            printed on success. Not touched when ``err`` is set.
    """
    if err is None:
        print(f"Message delivered to {msg.topic()} [{msg.partition()}]")
        return
    print(f"Message delivery failed: {err}")

def send_message(topic, message, timeout=1.0):
    """Produce one message to a Kafka topic and wait briefly for delivery.

    Args:
        topic: Name of the topic to publish to.
        message: Payload handed to ``producer.produce``.
        timeout: Maximum number of seconds to wait in ``producer.flush`` for
            outstanding deliveries. Defaults to 1.0.

    Any exception raised while producing or flushing is caught and printed
    rather than propagated, so a failed send does not abort the notebook.
    """
    try:
        # Queue the message; delivery_report is invoked with the outcome
        producer.produce(topic, message, callback=delivery_report)
        # Bound the wait: a bare flush() has no timeout and can block
        # indefinitely when the broker is unreachable.
        remaining = producer.flush(timeout)
        if remaining:
            print(f"{remaining} message(s) still awaiting delivery after {timeout}s")
    except Exception as e:
        print(f"An error occurred: {e}")

In [None]:
# Publish a single demo message to the sample topic
topic = 'sample_topic'
send_message(topic=topic, message='message to consumer ')


In [None]:
from confluent_kafka import Consumer, KafkaError

# Configuration for the Kafka consumer.
# The broker address matches the `--bootstrap-server localhost:9092` used by the
# CLI cells; an empty string would leave the client with no broker to contact.
conf = {
    'bootstrap.servers': 'localhost:9092',   # Address of the Kafka cluster
    'group.id': 'my-consumer-group',         # Consumer group ID
    'auto.offset.reset': 'earliest'          # Start reading from the earliest message
}

# Create a Consumer instance
consumer = Consumer(conf)

# Subscribe to the topic
topic = 'sample_topic'
consumer.subscribe([topic])

print(f"Consuming messages from topic '{topic}'...")

# Poll loop: prints every received message until an error other than
# end-of-partition occurs or the user interrupts the kernel.
try:
    while True:
        # Poll for new messages
        msg = consumer.poll(timeout=1.0)  # Adjust timeout as needed

        if msg is None:
            # Nothing arrived within the timeout; poll again
            continue

        error = msg.error()
        if error:
            if error.code() == KafkaError._PARTITION_EOF:
                # End of partition is informational, not a failure
                continue
            # Any other error: report it and stop consuming
            print(f"Error: {error}")
            break

        # A message may carry no value; decoding None would raise
        # AttributeError and kill the loop.
        payload = msg.value()
        text = payload.decode('utf-8') if payload is not None else None
        print(f"Received message: {text}")

except KeyboardInterrupt:
    # Handle the interrupt gracefully
    print("Interrupted by user")

finally:
    # Close the consumer to clean up resources
    consumer.close()


In [None]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

# Initialize SparkSession with the Kafka connector package on the classpath
spark = (
    SparkSession.builder
    .appName("KafkaSparkStreaming")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1")
    .getOrCreate()
)

# Define Kafka parameters.
# The broker address matches the `--bootstrap-server localhost:9092` used by the
# CLI cells; an empty string gives the Kafka source no broker to connect to.
kafka_bootstrap_servers = 'localhost:9092'
kafka_topic = 'sample_topic'

# Define a function to start the stream and hand back the running query
def start_streaming(await_termination=False):
    """Start a streaming query that prints Kafka records to the console.

    Reads from the topic ``kafka_topic`` on ``kafka_bootstrap_servers``, casts
    the ``key`` and ``value`` columns to strings, and writes each batch to the
    console sink in append mode.

    Args:
        await_termination: When True, block on ``query.awaitTermination()``
            before returning. Defaults to False so the caller gets the query
            object back immediately and can inspect or stop it.

    Returns:
        The started streaming query object.
    """
    # Read data from Kafka
    kafka_stream = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
        .option("subscribe", kafka_topic)
        .load()
    )

    # Cast the key and value columns to strings
    records = kafka_stream.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

    # Write the stream to the console sink
    query = (
        records.writeStream
        .format("console")
        .outputMode("append")
        .start()
    )

    # Blocking is opt-in: waiting here unconditionally would keep the function
    # from returning while the stream is running.
    if await_termination:
        query.awaitTermination()

    # Return the query so it can be managed or stopped later
    return query

# Start streaming and keep the returned query object in `query`
# so that later cells can refer to it.
query = start_streaming()

# Check the status of the query.
# Left as a bare last expression so the notebook displays its value.
query.status
