# Producer

In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from confluent_kafka import Producer
import json
import time

# URL of the page to scrape
url = "https://www.statistik.at/statistiken/tourismus-und-verkehr/fahrzeuge/kfz-bestand"

# Start a headless Chrome session. The driver binary path comes from
# ChromeDriverManager().install(), so no manual PATH setup is expected here.
options = Options()
options.add_argument('--headless')  # Run in headless mode (no GUI)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

try:
    # Fetch the page content
    driver.get(url)

    # Fixed pause so the JavaScript-rendered table has time to appear
    # (adjust the time as necessary)
    time.sleep(10)

    # Snapshot the rendered DOM and parse it with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always release the browser, even when loading or parsing fails;
    # otherwise the headless Chrome process is left running.
    driver.quit()

# Maps year (int) -> raw cell text (str) for the selected column
data = {}

# Find the table with the required heading
table_heading = soup.find('span', class_='title-customized-padding', string='Kfz-Bestand 1990 bis 2023 (Tabelle)')
if not table_heading:
    # Raise instead of print + exit(): exit() targets the interpreter/kernel
    # itself rather than cleanly aborting just this cell.
    raise RuntimeError("Table heading not found.")

# The data table is the first matching <table> that follows the heading in document order
table = table_heading.find_next('table', class_='datatable')
if not table:
    raise RuntimeError("Table not found.")

# Find all rows in the table
rows = table.find_all("tr", class_=["datatable__tr odd", "datatable__tr even"])

print(f"Found {len(rows)} rows")

for row in rows:
    # Extract the year
    year_td = row.find("td", class_="datatable__td dtr-control")
    if not year_td:
        continue

    year_text = year_td.get_text().strip()
    print(f"Found year: {year_text}")  # Debug print
    try:
        year = int(year_text)
    except ValueError:
        continue  # Skip rows where the year is not a valid integer

    # Interested in years 2000 to 2020 (inclusive)
    if not 2000 <= year <= 2020:
        continue

    # The second right-aligned cell is taken as the Personenkraftwagen column
    # NOTE(review): column position is assumed from the page layout - confirm if the table changes
    values = row.find_all("td", class_="datatable__td datatable__td--right")
    if len(values) >= 2:
        # Strip non-breaking spaces (used as thousands separators in the cell text)
        raw_value = values[1].get_text().strip().replace('\xa0', '')
        print(f"Year: {year}, Value: {raw_value}")  # Debug print
        data[year] = raw_value

print(data)

# Kafka topic that receives the scraped records
topic = 'kfz_bestand'

# Client settings: only the bootstrap broker address is configured
conf = {'bootstrap.servers': '127.0.0.1:29092'}

# Producer instance built from the settings above
producer = Producer(conf)

def delivery_report(err, msg):
    """Print the outcome of one produced message.

    Passed as the ``callback`` argument when producing.

    Args:
        err: Delivery error, or ``None`` when the message was delivered.
        msg: Message object; its ``topic()`` and ``partition()`` are printed
            on success.
    """
    if err is None:
        print(f"Message delivered to {msg.topic()} [{msg.partition()}]")
        return
    print(f"Message delivery failed: {err}")

# Send each year's record in a loop with a one-second interval
for year, value in data.items():
    # Emit named fields. The streaming consumer in this notebook parses each
    # message with a schema of `year` (integer) and `value` (string); the
    # previous payload shape {year: value} has neither key, so it could not
    # be mapped onto that schema.
    json_data = json.dumps({"year": year, "value": value})
    producer.produce(topic, value=json_data, callback=delivery_report)
    # Flush before pausing so each message is handed off (and its delivery
    # callback served) one at a time rather than batched at the end.
    producer.flush()
    time.sleep(1)

print("Data sent to Kafka topic")


Found 22 rows
Found year: 1990
Found year: 1995
Found year: 2000
Year: 2000, Value: 4097145
Found year: 2005
Year: 2005, Value: 4156743
Found year: 2006
Year: 2006, Value: 4204969
Found year: 2007
Year: 2007, Value: 4245583
Found year: 2008
Year: 2008, Value: 4284919
Found year: 2009
Year: 2009, Value: 4359944
Found year: 2010
Year: 2010, Value: 4441027
Found year: 2011
Year: 2011, Value: 4513421
Found year: 2012
Year: 2012, Value: 4584202
Found year: 2013
Year: 2013, Value: 4641308
Found year: 2014
Year: 2014, Value: 4694921
Found year: 2015
Year: 2015, Value: 4748048
Found year: 2016
Year: 2016, Value: 4821557
Found year: 2017
Year: 2017, Value: 4898578
Found year: 2018
Year: 2018, Value: 4978852
Found year: 2019
Year: 2019, Value: 5039548
Found year: 2020
Year: 2020, Value: 5091827
Found year: 2021
Found year: 2022
Found year: 2023
{2000: '4097145', 2005: '4156743', 2006: '4204969', 2007: '4245583', 2008: '4284919', 2009: '4359944', 2010: '4441027', 2011: '4513421', 2012: '4584202',

In [4]:
# Stop the Spark session


NameError: name 'spark' is not defined

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

import matplotlib.pyplot as plt
import seaborn as sns
from time import sleep
from IPython.display import clear_output
import pandas as pd

# Build (or reuse) the Spark session and request the Kafka source package
# NOTE(review): the connector coordinate is pinned to 3.3.0 / Scala 2.12 -
# confirm it matches the installed Spark version.
spark = (
    SparkSession.builder
    .appName("KafkaSparkStreaming")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0")
    .getOrCreate()
)

# Layout of each JSON message: a nullable integer `year` and a nullable string `value`
schema = StructType([
    StructField("year", IntegerType(), nullable=True),
    StructField("value", StringType(), nullable=True),
])

# Streaming pipeline: Kafka source -> message value as string -> JSON parsed
# with `schema` -> flattened into top-level `year` / `value` columns.
# NOTE(review): messages must be JSON objects with `year` and `value` keys to
# line up with `schema` - confirm against the producer payload.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "127.0.0.1:29092")
    .option("subscribe", "kfz_bestand")
    .option("startingOffsets", "earliest")
    .load()
    .select(col("value").cast("string").alias("value"))
    .select(from_json(col("value"), schema).alias("data"))
    .select("data.*")
)

# Start the query, appending parsed rows to the in-memory table
# `kfz_bestand_table` so they can be read back with spark.sql
queryStream = (
    df.writeStream
    .format("memory")
    .queryName("kfz_bestand_table")
    .outputMode("append")
    .start()
)

# Initialize Seaborn
sns.set(style="whitegrid")
plt.rc('font', family='DejaVu Sans')

# Wait for the streaming query to be ready
sleep(10)  # Adjust the sleep time as needed to ensure the stream starts

# Poll the in-memory table every few seconds, redrawing the status block, the
# bar chart and the raw table until interrupted; the streaming query and the
# Spark session are always stopped on the way out.
try:
    i = 1
    while True:
        # Clear output
        clear_output(wait=True)
        print("**********************")
        print("General Info")
        print("**********************")
        print("Run:{}".format(i))

        # Read the progress snapshot once so every printed field comes from
        # the same update.
        progress = queryStream.lastProgress
        if progress is not None:
            print(progress)
            print("Stream timestamp:{}".format(progress.get("timestamp", "N/A")))
            event_time = progress.get("eventTime", {})
            if "watermark" in event_time:
                print("Watermark:{}".format(event_time["watermark"]))
            state_operators = progress.get("stateOperators", [])
            if state_operators:
                print("Total Rows:{}".format(state_operators[0].get("numRowsTotal", "N/A")))
                print("Updated Rows:{}".format(state_operators[0].get("numRowsUpdated", "N/A")))
                print("Memory used MB:{}".format((state_operators[0].get("memoryUsedBytes", 0)) * 0.000001))

        # Fetch data from the in-memory table
        df_pandas = spark.sql("SELECT * FROM kfz_bestand_table").toPandas()

        # `value` is declared as a string column in the stream schema, so it
        # must be converted to a number before it can be drawn as bar heights.
        # Rows missing either field are left out of the plot only, and `year`
        # is made an int so tick labels do not render as floats.
        plot_df = (
            df_pandas
            .assign(value=pd.to_numeric(df_pandas["value"], errors="coerce"))
            .dropna(subset=["year", "value"])
            .astype({"year": int})
            .sort_values("year")
        )

        if plot_df.empty:
            # Nothing usable has arrived yet; skip plotting an empty frame.
            print("Waiting for data...")
        else:
            # Plot the data
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.barplot(x='year', y='value', data=plot_df, ax=ax)
            ax.set_xlabel('Year')
            ax.set_ylabel('Kfz-Bestand Value')
            ax.set_title('Real-time Kfz-Bestand over Years')
            ax.tick_params(axis='x', rotation=45)
            fig.tight_layout()
            plt.show()
            # Release the figure explicitly; this loop creates one per iteration.
            plt.close(fig)

        # Display DataFrame (the unmodified rows as read from the table)
        print("**********************")
        print("Table - Kfz-Bestand Data")
        print("**********************")
        display(df_pandas)

        # Sleep before the next update
        sleep(3)
        i += 1
except KeyboardInterrupt:
    print("Process interrupted.")
finally:
    queryStream.stop()
    spark.stop()


PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.