In [1]:
import json
from kafka import KafkaConsumer
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from datetime import datetime, timedelta
from pyspark.sql.functions import from_json, col, when
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql.functions import to_timestamp, from_unixtime, unix_timestamp
from pymongo import MongoClient

In [2]:
conf = SparkConf().setAppName("TemperatureProcessing")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)


23/01/12 15:52:48 WARN Utils: Your hostname, user-hp-pavilion-gaming-laptop-15-ec2xxx resolves to a loopback address: 127.0.1.1; using 192.168.184.92 instead (on interface wlo1)
23/01/12 15:52:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/12 15:52:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
kafkaParams = {
    "bootstrap_servers": "ec2-65-0-72-75.ap-south-1.compute.amazonaws.com:9092"}
topic = "IOTTemperatureStream01"


In [4]:
consumer = KafkaConsumer(topic, **kafkaParams)


In [5]:
df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema=StructType(
    [StructField("plant_name", StringType(), True),
        StructField("lane_number", StringType(), True),
        StructField("temperature", IntegerType(), True),
        StructField("timestamp", TimestampType(), True),
        StructField("component_type", StringType(), True),
        StructField("component_manufacturer", StringType(), True),
     ])
)


In [6]:
def filterData(dataframe):
    last_10_minutes = datetime.now() - timedelta(minutes=10)
    last_30_minutes= datetime.now() - timedelta(minutes=30)
    filtered_max_temp_data = dataframe.filter(dataframe.timestamp < last_30_minutes)
    filtered_data = dataframe.filter(dataframe.timestamp > last_10_minutes)
    filtered_data = filtered_data.filter(col("temperature") > 50)
    component_counts = filtered_data.groupBy("component_type").count()
    max_temp_per_lane = filtered_max_temp_data.groupBy("lane_number", "component_type").agg(
        {"temperature": "max"}).withColumnRenamed("max(temperature)", "Max Temp")
    component_counts.show()
    max_temp_per_lane.show()

In [7]:
def process(rdd):
    print(rdd)
    global df
    data = spark.read.json(rdd)
    data = data.filter(data["component_info"].isNotNull())
    data = data.filter(data["timestamp"].isNotNull())
    if(data.count() > 0):
        data = data.withColumn("timestamp", when(col("timestamp").cast("double").isNotNull(
        ), col("timestamp").cast("double").cast("timestamp")).otherwise(col("timestamp")))
        data = data.withColumn("component_manufacturer",
                               data["component_info"]["component_manufacturer"])
        data = data.withColumn(
            "component_type", data["component_info"]["component_type"])
        data = data.drop("component_info")
        df=df.union(data)
    else:
        print("No data")
    df.show()
    filterData(df)


In [8]:
while True:
    messages = consumer.poll(1000)
    for tp, message in messages.items():
        for record in message:
            data = json.loads(record.value)
            print(data)
            if "component_info" in data and data["component_info"] and "component_type" in data["component_info"] and data["component_info"]["component_type"] and data['temperature'] is not None:
                rdd = sc.parallelize([record.value.decode('utf-8')])
                process(rdd)


{'plant_name': 'Plant1', 'lane_number': '1', 'timestamp': '1673518974', 'temperature': '47.93', 'component_info': {'component_type': 'solder_paste_printer', 'component_manufacturer': 'manufact2'}}
ParallelCollectionRDD[5] at readRDDFromFile at PythonRDD.scala:274


                                                                                

+----------+-----------+-----------+-------------------+--------------+----------------------+
|plant_name|lane_number|temperature|          timestamp|component_type|component_manufacturer|
+----------+-----------+-----------+-------------------+--------------+----------------------+
|         1|     Plant1|      47.93|2023-01-12 15:52:54|     manufact2|  solder_paste_printer|
+----------+-----------+-----------+-------------------+--------------+----------------------+

+--------------+-----+
|component_type|count|
+--------------+-----+
+--------------+-----+

+-----------+--------------+--------+
|lane_number|component_type|Max Temp|
+-----------+--------------+--------+
+-----------+--------------+--------+

{'plant_name': 'Plant2', 'lane_number': '6', 'timestamp': '1673518975', 'temperature': '59.83', 'component_info': {'component_type': 'optical_inspection_module', 'component_manufacturer': 'manufact2'}}
ParallelCollectionRDD[40] at readRDDFromFile at PythonRDD.scala:274
+-------

                                                                                

+--------------+-----+
|component_type|count|
+--------------+-----+
|     manufact2|    3|
|     manufact1|    2|
+--------------+-----+



                                                                                

+-----------+--------------+--------+
|lane_number|component_type|Max Temp|
+-----------+--------------+--------+
|     Plant1|     manufact1|    null|
+-----------+--------------+--------+

{'plant_name': 'Plant4', 'lane_number': '6', 'timestamp': '1673518983', 'temperature': '26.26', 'component_info': {'component_type': 'optical_inspection_module', 'component_manufacturer': 'manufact3'}}
ParallelCollectionRDD[390] at readRDDFromFile at PythonRDD.scala:274
+----------+-----------+-----------+-------------------+--------------+----------------------+
|plant_name|lane_number|temperature|          timestamp|component_type|component_manufacturer|
+----------+-----------+-----------+-------------------+--------------+----------------------+
|         1|     Plant1|      47.93|2023-01-12 15:52:54|     manufact2|  solder_paste_printer|
|         6|     Plant2|      59.83|2023-01-12 15:52:55|     manufact2|  optical_inspectio...|
|         3|     Plant4|      44.86|2023-01-12 15:52:56|     ma

                                                                                

+--------------+-----+
|component_type|count|
+--------------+-----+
|     manufact2|    3|
|     manufact1|    2|
+--------------+-----+



                                                                                

+-----------+--------------+--------+
|lane_number|component_type|Max Temp|
+-----------+--------------+--------+
|     Plant1|     manufact1|    null|
+-----------+--------------+--------+

{'plant_name': 'Plant3', 'lane_number': '2', 'timestamp': '1673518984', 'temperature': '33.94', 'component_info': {'component_type': 'solder_paste_printer', 'component_manufacturer': 'manufact2'}}
ParallelCollectionRDD[452] at readRDDFromFile at PythonRDD.scala:274
+----------+-----------+-----------+-------------------+--------------+----------------------+
|plant_name|lane_number|temperature|          timestamp|component_type|component_manufacturer|
+----------+-----------+-----------+-------------------+--------------+----------------------+
|         1|     Plant1|      47.93|2023-01-12 15:52:54|     manufact2|  solder_paste_printer|
|         6|     Plant2|      59.83|2023-01-12 15:52:55|     manufact2|  optical_inspectio...|
|         3|     Plant4|      44.86|2023-01-12 15:52:56|     manufac

                                                                                

+--------------+-----+
|component_type|count|
+--------------+-----+
|     manufact2|    3|
|     manufact1|    2|
+--------------+-----+



                                                                                

+-----------+--------------+--------+
|lane_number|component_type|Max Temp|
+-----------+--------------+--------+
|     Plant1|     manufact1|    null|
+-----------+--------------+--------+

{'plant_name': 'Plant1', 'lane_number': '1', 'timestamp': '1673518985', 'temperature': '30.88', 'component_info': {'component_type': 'solder_paste_printer', 'component_manufacturer': 'manufact2'}}
ParallelCollectionRDD[517] at readRDDFromFile at PythonRDD.scala:274


                                                                                

+----------+-----------+-----------+-------------------+--------------+----------------------+
|plant_name|lane_number|temperature|          timestamp|component_type|component_manufacturer|
+----------+-----------+-----------+-------------------+--------------+----------------------+
|         1|     Plant1|      47.93|2023-01-12 15:52:54|     manufact2|  solder_paste_printer|
|         6|     Plant2|      59.83|2023-01-12 15:52:55|     manufact2|  optical_inspectio...|
|         3|     Plant4|      44.86|2023-01-12 15:52:56|     manufact2|  optical_inspectio...|
|         1|     Plant5|      52.44|2023-01-12 15:52:57|     manufact2|  solder_paste_printer|
|         2|     Plant4|      59.44|2023-01-12 15:52:58|     manufact1|         label_printer|
|         2|     Plant3|      89.20|2023-01-12 15:53:00|     manufact2|  solder_paste_printer|
|         3|     Plant1|       null|2023-01-12 10:23:01|     manufact1|  solder_paste_printer|
|         4|     Plant5|      59.98|2023-01-12 15:

                                                                                

+--------------+-----+
|component_type|count|
+--------------+-----+
|     manufact2|    3|
|     manufact1|    2|
+--------------+-----+



                                                                                

+-----------+--------------+--------+
|lane_number|component_type|Max Temp|
+-----------+--------------+--------+
|     Plant1|     manufact1|    null|
+-----------+--------------+--------+

{'plant_name': 'Plant2', 'lane_number': '6', 'timestamp': '1673518986', 'temperature': '96.49', 'component_info': {'component_type': 'optical_inspection_module', 'component_manufacturer': 'manufact3'}}
ParallelCollectionRDD[585] at readRDDFromFile at PythonRDD.scala:274
+----------+-----------+-----------+-------------------+--------------+----------------------+
|plant_name|lane_number|temperature|          timestamp|component_type|component_manufacturer|
+----------+-----------+-----------+-------------------+--------------+----------------------+
|         1|     Plant1|      47.93|2023-01-12 15:52:54|     manufact2|  solder_paste_printer|
|         6|     Plant2|      59.83|2023-01-12 15:52:55|     manufact2|  optical_inspectio...|
|         3|     Plant4|      44.86|2023-01-12 15:52:56|     ma

                                                                                

+--------------+-----+
|component_type|count|
+--------------+-----+
|     manufact2|    3|
|     manufact1|    2|
|     manufact3|    1|
+--------------+-----+



                                                                                

+-----------+--------------+--------+
|lane_number|component_type|Max Temp|
+-----------+--------------+--------+
|     Plant1|     manufact1|    null|
+-----------+--------------+--------+

{'plant_name': 'Plant2', 'lane_number': '1', 'timestamp': '20230112102307', 'temperature': None, 'component_info': {}}
{'plant_name': 'Plant3', 'lane_number': '5', 'timestamp': '1673518988', 'temperature': '41.99', 'component_info': {'component_type': 'optical_inspection_module', 'component_manufacturer': 'manufact2'}}
ParallelCollectionRDD[656] at readRDDFromFile at PythonRDD.scala:274
+----------+-----------+-----------+-------------------+--------------+----------------------+
|plant_name|lane_number|temperature|          timestamp|component_type|component_manufacturer|
+----------+-----------+-----------+-------------------+--------------+----------------------+
|         1|     Plant1|      47.93|2023-01-12 15:52:54|     manufact2|  solder_paste_printer|
|         6|     Plant2|      59.83|202

                                                                                

+--------------+-----+
|component_type|count|
+--------------+-----+
|     manufact2|    3|
|     manufact1|    2|
|     manufact3|    1|
+--------------+-----+



                                                                                

+-----------+--------------+--------+
|lane_number|component_type|Max Temp|
+-----------+--------------+--------+
|     Plant1|     manufact1|    null|
+-----------+--------------+--------+

{'plant_name': 'Plant4', 'lane_number': '1', 'timestamp': '1673518989', 'temperature': '87.10', 'component_info': {'component_type': 'label_printer', 'component_manufacturer': 'manufact1'}}
ParallelCollectionRDD[730] at readRDDFromFile at PythonRDD.scala:274
+----------+-----------+-----------+-------------------+--------------+----------------------+
|plant_name|lane_number|temperature|          timestamp|component_type|component_manufacturer|
+----------+-----------+-----------+-------------------+--------------+----------------------+
|         1|     Plant1|      47.93|2023-01-12 15:52:54|     manufact2|  solder_paste_printer|
|         6|     Plant2|      59.83|2023-01-12 15:52:55|     manufact2|  optical_inspectio...|
|         3|     Plant4|      44.86|2023-01-12 15:52:56|     manufact2|  op

                                                                                

+--------------+-----+
|component_type|count|
+--------------+-----+
|     manufact2|    3|
|     manufact1|    3|
|     manufact3|    1|
+--------------+-----+



                                                                                

+-----------+--------------+--------+
|lane_number|component_type|Max Temp|
+-----------+--------------+--------+
|     Plant1|     manufact1|    null|
+-----------+--------------+--------+

{'plant_name': 'Plant4', 'lane_number': '4', 'timestamp': '1673518990', 'temperature': '25.09', 'component_info': {'component_type': 'solder_paste_printer', 'component_manufacturer': 'manufact1'}}
ParallelCollectionRDD[807] at readRDDFromFile at PythonRDD.scala:274


                                                                                

+----------+-----------+-----------+-------------------+--------------+----------------------+
|plant_name|lane_number|temperature|          timestamp|component_type|component_manufacturer|
+----------+-----------+-----------+-------------------+--------------+----------------------+
|         1|     Plant1|      47.93|2023-01-12 15:52:54|     manufact2|  solder_paste_printer|
|         6|     Plant2|      59.83|2023-01-12 15:52:55|     manufact2|  optical_inspectio...|
|         3|     Plant4|      44.86|2023-01-12 15:52:56|     manufact2|  optical_inspectio...|
|         1|     Plant5|      52.44|2023-01-12 15:52:57|     manufact2|  solder_paste_printer|
|         2|     Plant4|      59.44|2023-01-12 15:52:58|     manufact1|         label_printer|
|         2|     Plant3|      89.20|2023-01-12 15:53:00|     manufact2|  solder_paste_printer|
|         3|     Plant1|       null|2023-01-12 10:23:01|     manufact1|  solder_paste_printer|
|         4|     Plant5|      59.98|2023-01-12 15:

                                                                                

+--------------+-----+
|component_type|count|
+--------------+-----+
|     manufact2|    3|
|     manufact1|    3|
|     manufact3|    1|
+--------------+-----+



                                                                                

+-----------+--------------+--------+
|lane_number|component_type|Max Temp|
+-----------+--------------+--------+
|     Plant1|     manufact1|    null|
+-----------+--------------+--------+

{'plant_name': 'Plant2', 'lane_number': '2', 'timestamp': '1673518991', 'temperature': '68.57', 'component_info': {'component_type': 'label_printer', 'component_manufacturer': 'manufact1'}}
ParallelCollectionRDD[887] at readRDDFromFile at PythonRDD.scala:274


                                                                                

+----------+-----------+-----------+-------------------+--------------+----------------------+
|plant_name|lane_number|temperature|          timestamp|component_type|component_manufacturer|
+----------+-----------+-----------+-------------------+--------------+----------------------+
|         1|     Plant1|      47.93|2023-01-12 15:52:54|     manufact2|  solder_paste_printer|
|         6|     Plant2|      59.83|2023-01-12 15:52:55|     manufact2|  optical_inspectio...|
|         3|     Plant4|      44.86|2023-01-12 15:52:56|     manufact2|  optical_inspectio...|
|         1|     Plant5|      52.44|2023-01-12 15:52:57|     manufact2|  solder_paste_printer|
|         2|     Plant4|      59.44|2023-01-12 15:52:58|     manufact1|         label_printer|
|         2|     Plant3|      89.20|2023-01-12 15:53:00|     manufact2|  solder_paste_printer|
|         3|     Plant1|       null|2023-01-12 10:23:01|     manufact1|  solder_paste_printer|
|         4|     Plant5|      59.98|2023-01-12 15:

                                                                                

+--------------+-----+
|component_type|count|
+--------------+-----+
|     manufact2|    3|
|     manufact1|    4|
|     manufact3|    1|
+--------------+-----+



                                                                                

+-----------+--------------+--------+
|lane_number|component_type|Max Temp|
+-----------+--------------+--------+
|     Plant1|     manufact1|    null|
+-----------+--------------+--------+

{'plant_name': 'Plant4', 'lane_number': '3', 'timestamp': '1673518992', 'temperature': '33.82', 'component_info': {'component_type': 'label_printer', 'component_manufacturer': 'manufact3'}}
ParallelCollectionRDD[970] at readRDDFromFile at PythonRDD.scala:274
+----------+-----------+-----------+-------------------+--------------+----------------------+
|plant_name|lane_number|temperature|          timestamp|component_type|component_manufacturer|
+----------+-----------+-----------+-------------------+--------------+----------------------+
|         1|     Plant1|      47.93|2023-01-12 15:52:54|     manufact2|  solder_paste_printer|
|         6|     Plant2|      59.83|2023-01-12 15:52:55|     manufact2|  optical_inspectio...|
|         3|     Plant4|      44.86|2023-01-12 15:52:56|     manufact2|  op

                                                                                

+--------------+-----+
|component_type|count|
+--------------+-----+
|     manufact2|    3|
|     manufact1|    4|
|     manufact3|    1|
+--------------+-----+



                                                                                

+-----------+--------------+--------+
|lane_number|component_type|Max Temp|
+-----------+--------------+--------+
|     Plant1|     manufact1|    null|
+-----------+--------------+--------+

{'plant_name': 'Plant4', 'lane_number': '4', 'timestamp': '1673518993', 'temperature': '87.19', 'component_info': {'component_type': 'solder_paste_printer', 'component_manufacturer': 'manufact1'}}
ParallelCollectionRDD[1056] at readRDDFromFile at PythonRDD.scala:274


                                                                                

+----------+-----------+-----------+-------------------+--------------+----------------------+
|plant_name|lane_number|temperature|          timestamp|component_type|component_manufacturer|
+----------+-----------+-----------+-------------------+--------------+----------------------+
|         1|     Plant1|      47.93|2023-01-12 15:52:54|     manufact2|  solder_paste_printer|
|         6|     Plant2|      59.83|2023-01-12 15:52:55|     manufact2|  optical_inspectio...|
|         3|     Plant4|      44.86|2023-01-12 15:52:56|     manufact2|  optical_inspectio...|
|         1|     Plant5|      52.44|2023-01-12 15:52:57|     manufact2|  solder_paste_printer|
|         2|     Plant4|      59.44|2023-01-12 15:52:58|     manufact1|         label_printer|
|         2|     Plant3|      89.20|2023-01-12 15:53:00|     manufact2|  solder_paste_printer|
|         3|     Plant1|       null|2023-01-12 10:23:01|     manufact1|  solder_paste_printer|
|         4|     Plant5|      59.98|2023-01-12 15:

                                                                                

+--------------+-----+
|component_type|count|
+--------------+-----+
|     manufact2|    3|
|     manufact1|    5|
|     manufact3|    1|
+--------------+-----+



                                                                                

+-----------+--------------+--------+
|lane_number|component_type|Max Temp|
+-----------+--------------+--------+
|     Plant1|     manufact1|    null|
+-----------+--------------+--------+

{'plant_name': 'Plant2', 'lane_number': '1', 'timestamp': '1673518994', 'temperature': '78.51', 'component_info': {'component_type': 'label_printer', 'component_manufacturer': 'manufact1'}}
ParallelCollectionRDD[1145] at readRDDFromFile at PythonRDD.scala:274


                                                                                

+----------+-----------+-----------+-------------------+--------------+----------------------+
|plant_name|lane_number|temperature|          timestamp|component_type|component_manufacturer|
+----------+-----------+-----------+-------------------+--------------+----------------------+
|         1|     Plant1|      47.93|2023-01-12 15:52:54|     manufact2|  solder_paste_printer|
|         6|     Plant2|      59.83|2023-01-12 15:52:55|     manufact2|  optical_inspectio...|
|         3|     Plant4|      44.86|2023-01-12 15:52:56|     manufact2|  optical_inspectio...|
|         1|     Plant5|      52.44|2023-01-12 15:52:57|     manufact2|  solder_paste_printer|
|         2|     Plant4|      59.44|2023-01-12 15:52:58|     manufact1|         label_printer|
|         2|     Plant3|      89.20|2023-01-12 15:53:00|     manufact2|  solder_paste_printer|
|         3|     Plant1|       null|2023-01-12 10:23:01|     manufact1|  solder_paste_printer|
|         4|     Plant5|      59.98|2023-01-12 15:

                                                                                

+--------------+-----+
|component_type|count|
+--------------+-----+
|     manufact2|    3|
|     manufact1|    6|
|     manufact3|    1|
+--------------+-----+



                                                                                

+-----------+--------------+--------+
|lane_number|component_type|Max Temp|
+-----------+--------------+--------+
|     Plant1|     manufact1|    null|
+-----------+--------------+--------+

{'plant_name': 'Plant5', 'lane_number': '2', 'timestamp': '1673518995', 'temperature': '48.61', 'component_info': {'component_type': 'label_printer', 'component_manufacturer': 'manufact1'}}
ParallelCollectionRDD[1237] at readRDDFromFile at PythonRDD.scala:274


                                                                                

+----------+-----------+-----------+-------------------+--------------+----------------------+
|plant_name|lane_number|temperature|          timestamp|component_type|component_manufacturer|
+----------+-----------+-----------+-------------------+--------------+----------------------+
|         1|     Plant1|      47.93|2023-01-12 15:52:54|     manufact2|  solder_paste_printer|
|         6|     Plant2|      59.83|2023-01-12 15:52:55|     manufact2|  optical_inspectio...|
|         3|     Plant4|      44.86|2023-01-12 15:52:56|     manufact2|  optical_inspectio...|
|         1|     Plant5|      52.44|2023-01-12 15:52:57|     manufact2|  solder_paste_printer|
|         2|     Plant4|      59.44|2023-01-12 15:52:58|     manufact1|         label_printer|
|         2|     Plant3|      89.20|2023-01-12 15:53:00|     manufact2|  solder_paste_printer|
|         3|     Plant1|       null|2023-01-12 10:23:01|     manufact1|  solder_paste_printer|
|         4|     Plant5|      59.98|2023-01-12 15:

                                                                                

+--------------+-----+
|component_type|count|
+--------------+-----+
|     manufact2|    3|
|     manufact1|    6|
|     manufact3|    1|
+--------------+-----+



                                                                                

+-----------+--------------+--------+
|lane_number|component_type|Max Temp|
+-----------+--------------+--------+
|     Plant1|     manufact1|    null|
+-----------+--------------+--------+

{'plant_name': 'Plant4', 'lane_number': '8', 'timestamp': '2023/01/12 10:23:16', 'temperature': '79.24', 'component_info': {}}
{'plant_name': 'Plant1', 'lane_number': '4', 'timestamp': '1673518997', 'temperature': '98.82', 'component_info': {'component_type': 'optical_inspection_module', 'component_manufacturer': 'manufact2'}}
ParallelCollectionRDD[1332] at readRDDFromFile at PythonRDD.scala:274


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/user/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/user/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

                                                                                