In [2]:
import xml.etree.ElementTree as ET
from xml.dom import minidom
import random
import time
from datetime import datetime
from pyspark.sql import SparkSession
import traceback

def create_sample_xml():
    """Build a pretty-printed XML document of random sample transactions.

    The document has a ``<Data>`` root holding 3-5 ``<Transaction>``
    elements. Each transaction carries, in order, a TransactionId, Amount,
    CustomerId, DateTime (current local time, second precision), Location
    and Result child element, all filled with randomly chosen values.

    Returns:
        str: The indented XML text, including the XML declaration.
    """
    document = ET.Element("Data")

    for _ in range(random.randint(3, 5)):
        record = ET.SubElement(document, "Transaction")

        # Tuple items are evaluated left to right, so the random draws happen
        # in the same order as the child elements appear in the output.
        fields = (
            ("TransactionId", str(random.randint(1, 100))),
            ("Amount", str(round(random.uniform(0, 1000), 2))),
            ("CustomerId", str(random.randint(1, 100))),
            ("DateTime", datetime.now().strftime("%Y-%m-%dT%H:%M:%S")),
            ("Location", random.choice(["New York", "London", "Tokyo", "Sydney", "Paris"])),
            ("Result", random.choice(["Success", "Failure"])),
        )
        for tag, text in fields:
            ET.SubElement(record, tag).text = text

    # Serialize, then round-trip through minidom to get indented output.
    raw_bytes = ET.tostring(document, encoding='utf8', method='xml')
    return minidom.parseString(raw_bytes).toprettyxml(indent="   ")

# Spark session used for writing to Kafka; the Kafka SQL connector is pulled
# in through the spark.jars.packages setting.
spark = (
    SparkSession.builder
    .appName("XMLToKafkaProducer")
    .master("spark://spark-test1:7077")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0")
    .getOrCreate()
)

# Kafka bootstrap server and destination topic used by send_to_kafka.
kafka_server = "spark-test1:9092"
topic_name = "test-topic"

def send_to_kafka(xml_data):
    """Write one XML document to the Kafka topic as a single record.

    A one-row DataFrame with a ``value`` column is built from ``xml_data``
    and saved through Spark's ``kafka`` format, using the module-level
    ``kafka_server`` and ``topic_name`` settings.

    Args:
        xml_data: XML text to send as the record value.
    """
    single_row = spark.createDataFrame([(xml_data,)], ["value"])
    payload = single_row.selectExpr("CAST(value AS STRING)")

    writer = (
        payload.write
        .format("kafka")
        .option("kafka.bootstrap.servers", kafka_server)
        .option("topic", topic_name)
    )
    writer.save()

# Generate and send XML data every 10 seconds (up to 1000 documents).
#
# The KeyboardInterrupt handler wraps the whole loop so that Ctrl+C actually
# stops generation instead of only printing a message and carrying on with
# the next iteration.
try:
    for i in range(1000):
        try:
            xml_data = create_sample_xml()
            if i == 0:
                print(f"Streaming xml every 10 seconds (sample):\n{xml_data}")
            send_to_kafka(xml_data)
        except Exception:
            # Best effort: report the failure and keep producing.
            print(f"Exception occurred:\n{traceback.format_exc()}")
        # Pause after every attempt, successful or not, so a failing broker
        # is not retried in a tight loop.
        time.sleep(10)
except KeyboardInterrupt:
    print("Stopped XML Data Generation")


Streaming xml every 10 seconds (sample):
<?xml version="1.0" ?>
<Data>
   <Transaction>
      <TransactionId>68</TransactionId>
      <Amount>407.7</Amount>
      <CustomerId>99</CustomerId>
      <DateTime>2023-12-09T20:28:51</DateTime>
      <Location>Sydney</Location>
      <Result>Failure</Result>
   </Transaction>
   <Transaction>
      <TransactionId>33</TransactionId>
      <Amount>793.09</Amount>
      <CustomerId>50</CustomerId>
      <DateTime>2023-12-09T20:28:51</DateTime>
      <Location>New York</Location>
      <Result>Success</Result>
   </Transaction>
   <Transaction>
      <TransactionId>92</TransactionId>
      <Amount>325.33</Amount>
      <CustomerId>38</CustomerId>
      <DateTime>2023-12-09T20:28:51</DateTime>
      <Location>Tokyo</Location>
      <Result>Success</Result>
   </Transaction>
   <Transaction>
      <TransactionId>2</TransactionId>
      <Amount>840.58</Amount>
      <CustomerId>25</CustomerId>
      <DateTime>2023-12-09T20:28:51</DateTime>
      <Lo

                                                                                