In [None]:
pip install pyspark

In [None]:
pip install kafka-python

In [None]:
pip install bokeh

In [None]:
import json
import os
from kafka import KafkaConsumer
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from datetime import datetime, timedelta
from pyspark.sql.functions import from_json, col, when,mean,var_pop
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql.functions import to_timestamp, from_unixtime, unix_timestamp, row_number, desc
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from pyspark.sql.window import Window

In [None]:
# Set SPARK_LOCAL_IP to the loopback address; this has to be in the process
# environment before the Spark context is created further down.
os.environ.update({"SPARK_LOCAL_IP": "127.0.0.1"})

In [None]:
# Spark entry points for this report.
#
# getOrCreate() is used instead of calling the constructors directly so the
# cell can be re-run in the same kernel: constructing a second SparkContext
# raises "Cannot run multiple SparkContexts at once".
conf = SparkConf().setAppName("Report 3 : Component Temperature Realtime Report")
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [None]:
# Kafka connection settings.
#
# Both values can be overridden through environment variables
# (KAFKA_BOOTSTRAP_SERVERS, KAFKA_TOPIC) so the notebook does not have to be
# edited when the broker address changes. The defaults are the endpoint and
# topic that were previously hard-coded here.
kafkaParams = {
    "bootstrap_servers": os.environ.get(
        "KAFKA_BOOTSTRAP_SERVERS",
        "ec2-65-0-72-75.ap-south-1.compute.amazonaws.com:9092",
    ),
}
topic = os.environ.get("KAFKA_TOPIC", "IOTTemperatureStream01")

In [None]:
# Create the Kafka consumer for `topic`; every entry of `kafkaParams` is
# forwarded to the constructor as a keyword argument.
consumer = KafkaConsumer(
    topic,
    **kafkaParams,
)

In [None]:
# Empty, explicitly-typed DataFrame that `process` unions incoming readings
# into. All columns are nullable.
df = spark.createDataFrame(
    spark.sparkContext.emptyRDD(),
    schema=(
        StructType()
        .add("lane_number", StringType(), True)
        .add("plant_name", StringType(), True)
        .add("temperature", IntegerType(), True)
        .add("timestamp", TimestampType(), True)
        .add("component_type", StringType(), True)
        .add("component_manufacturer", StringType(), True)
    ),
)

In [None]:
def filterData(dataframe, alert_window_minutes=10, stats_window_minutes=30,
               temperature_threshold=50):
    """Print the temperature reports for `dataframe` and plot per-lane values.

    Three tables are shown, in this order:

    1. every reading from the last `stats_window_minutes`, annotated with the
       per-lane mean (`avg_value`) and population variance (`variance`);
    2. the number of readings per `component_type` from the last
       `alert_window_minutes` whose temperature exceeds
       `temperature_threshold`;
    3. the hottest reading of each lane within the statistics window.

    Finally a Bokeh line chart of max and average temperature per lane is
    rendered in the notebook.

    Args:
        dataframe: Spark DataFrame with at least the columns `lane_number`,
            `temperature`, `timestamp` and `component_type`.
        alert_window_minutes: look-back for the over-threshold count.
        stats_window_minutes: look-back for the per-lane statistics and plot.
        temperature_threshold: readings strictly above this value are counted.

    Returns:
        None. The function only prints tables and displays a plot.
    """
    # Sample the clock once so both windows are measured from the same instant.
    now = datetime.now()
    alert_cutoff = now - timedelta(minutes=alert_window_minutes)
    stats_cutoff = now - timedelta(minutes=stats_window_minutes)

    # Over-threshold readings per component type (short window).
    component_counts = (
        dataframe
        .filter(col("timestamp") > alert_cutoff)
        .filter(col("temperature") > temperature_threshold)
        .groupBy("component_type")
        .count()
    )

    # Whole-partition frame: every row of a lane sees the lane-wide aggregate.
    lane_all_rows = Window.partitionBy("lane_number").rowsBetween(
        Window.unboundedPreceding, Window.unboundedFollowing)
    # Ordered frame used to pick the single hottest row of each lane.
    lane_hottest_first = Window.partitionBy("lane_number").orderBy(desc("temperature"))

    lane_stats = (
        dataframe
        .filter(col("timestamp") > stats_cutoff)
        .withColumn("avg_value", mean("temperature").over(lane_all_rows))
        .withColumn("variance", var_pop("temperature").over(lane_all_rows))
    )
    lane_stats.show()

    max_temp_per_lane = (
        lane_stats
        .withColumn("row_number", row_number().over(lane_hottest_first))
        .filter(col("row_number") == 1)
    )
    component_counts.show()
    max_temp_per_lane.show()

    # Sort by lane so the line connects lanes in a stable order; the row order
    # coming out of the window/filter is otherwise unspecified.
    df_pandas = max_temp_per_lane.orderBy("lane_number").toPandas()
    if df_pandas.empty:
        print("No readings in the last %s minutes; nothing to plot" % stats_window_minutes)
        return

    # lane_number is a string column, so the x axis must be categorical:
    # Bokeh needs the factors passed as x_range, otherwise string x values
    # cannot be placed on the default numeric axis.
    lanes = df_pandas["lane_number"].astype(str).tolist()
    p = figure(title="Line Plot", x_range=lanes,
               x_axis_label="lane Number", y_axis_label="temperature")
    p.line(lanes, df_pandas['temperature'], legend_label="max temperature", line_color="red")
    p.line(lanes, df_pandas['avg_value'], legend_label="avg temperature")
    output_notebook()
    show(p)

In [None]:
def process(rdd):
    """Parse a batch of JSON readings, append them to `df` and refresh the report.

    The JSON strings in `rdd` are read with schema inference, rows without
    `component_info` or `timestamp` are dropped, the nested component fields
    are flattened, and the result is reshaped to the accumulator's schema
    before being unioned into the module-level `df`. `filterData(df)` is then
    called whether or not new rows were added.

    Args:
        rdd: RDD of JSON strings, one reading per element.

    Returns:
        None. Updates the global `df` and triggers the report output.
    """
    # Local import: `lit` is not part of the notebook's import cell.
    from pyspark.sql.functions import lit

    global df
    data = spark.read.json(rdd)

    # An empty or unexpected payload may not produce these columns at all;
    # referencing a missing column would raise instead of reporting "No data".
    has_required = all(name in data.columns for name in ("component_info", "timestamp"))
    if has_required:
        data = data.filter(col("component_info").isNotNull() & col("timestamp").isNotNull())

    if has_required and len(data.head(1)) > 0:
        # Numeric timestamps are treated as epoch seconds; anything else is
        # parsed as a timestamp string. Both branches yield a timestamp so the
        # column has a single, well-defined type.
        epoch_seconds = col("timestamp").cast("double")
        data = data.withColumn(
            "timestamp",
            when(epoch_seconds.isNotNull(), epoch_seconds.cast("timestamp"))
            .otherwise(col("timestamp").cast("timestamp")),
        )

        # Flatten the nested component fields; a field absent from the
        # inferred struct becomes NULL rather than an analysis error.
        info_type = data.schema["component_info"].dataType
        info_fields = info_type.fieldNames() if isinstance(info_type, StructType) else []
        for field_name in ("component_manufacturer", "component_type"):
            value = col("component_info")[field_name] if field_name in info_fields else lit(None)
            data = data.withColumn(field_name, value)

        # union() matches columns by POSITION, not by name. Select the columns
        # in the accumulator's order and cast them to its types so that, for
        # example, component_type and component_manufacturer are not swapped
        # and temperature/timestamp keep their declared types.
        aligned = data.select([
            (col(field.name) if field.name in data.columns else lit(None))
            .cast(field.dataType)
            .alias(field.name)
            for field in df.schema.fields
        ])
        # NOTE(review): every call adds another union to df's plan, so the
        # plan grows with each processed message — consider periodic pruning.
        df = df.union(aligned)
    else:
        print("No data")
    filterData(df)

In [None]:
def _is_valid_reading(payload):
    """Return True when a decoded message carries the fields the report needs.

    A reading is accepted when it is a JSON object with a non-empty
    `component_info` object containing a truthy `component_type`, and a
    `temperature` that is present and not null.
    """
    if not isinstance(payload, dict):
        return False
    info = payload.get("component_info")
    return (
        isinstance(info, dict)
        and bool(info.get("component_type"))
        and payload.get("temperature") is not None
    )


# Poll Kafka until the cell is interrupted, handing each valid reading to
# `process`. The consumer is left open on interrupt so the cell can be re-run;
# call consumer.close() once the report is no longer needed.
try:
    while True:
        messages = consumer.poll(timeout_ms=1000)
        for records in messages.values():
            for record in records:
                if record.value is None:
                    continue
                try:
                    raw = record.value.decode('utf-8')
                    payload = json.loads(raw)
                except ValueError:
                    # Covers both invalid UTF-8 and invalid JSON; one bad
                    # message must not stop the stream.
                    print("Skipping malformed message at offset", record.offset)
                    continue
                if _is_valid_reading(payload):
                    process(sc.parallelize([raw]))
except KeyboardInterrupt:
    print("Stopped polling")