In [1]:
from hops import hdfs
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.types import LongType
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1559755371379_0005,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7fd424fb6f60>

In [2]:
    df = spark \
        .read \
        .format("parquet") \
        .load(hdfs.project_path() + "Resources/iot-benchmarks/data/topic-lwm2m-3303-temperature") \
        .filter(F.col('endpointClientName').contains('node-latency-rem-20'))

    df.cache().count()

17076

In [3]:
df.printSchema()

root
 |-- measurement: struct (nullable = true)
 |    |-- timestamp: long (nullable = false)
 |    |-- endpointClientName: string (nullable = false)
 |    |-- instanceId: integer (nullable = false)
 |    |-- gatewayId: integer (nullable = false)
 |    |-- ipsoObject: struct (nullable = false)
 |    |    |-- sensorValue: double (nullable = false)
 |    |    |-- minMeasuredValue: double (nullable = true)
 |    |    |-- maxMeasuredValue: double (nullable = true)
 |    |    |-- minRangeValue: double (nullable = true)
 |    |    |-- maxRangeValue: double (nullable = true)
 |    |    |-- sensorUnits: string (nullable = true)
 |    |    |-- resetMinAndMaxMeasuredValues: boolean (nullable = true)
 |-- kafkaTimestamp: timestamp (nullable = true)
 |-- endpointClientName: string (nullable = true)

In [28]:
timeFmt = "yyyy-MM-dd HH:mm:ss.SSS"
sensDf = df \
    .select(F.to_timestamp(F.col('measurement.timestamp')/1000).alias('measurementTmp'), \
            'kafkaTimestamp', \
            (F.col('measurement.timestamp')).alias('measurementTmpLong'), \
            'endpointClientName') \
    .withColumn('kafkaTmpLong', F.substring_index('kafkaTimestamp', '.', -1).cast('float')) \
    .withColumn('kafkaTmpLong', F.when(F.col('kafkaTmpLong') < 10, F.col('kafkaTmpLong')*100).otherwise(F.col('kafkaTmpLong')).cast('long')) \
    .withColumn('kafkaTmpLong', F.when(F.col('kafkaTmpLong') < 100, F.col('kafkaTmpLong')*10).otherwise(F.col('kafkaTmpLong')).cast('long')) \
    .withColumn('kafkaTmpLong', (F.unix_timestamp('kafkaTimestamp', format=timeFmt)*1000 + F.col('kafkaTmpLong'))) \
    .withColumn("diffMilliseconds", (F.col('kafkaTmpLong') - F.col('measurementTmpLong')))
    

In [29]:
sensDf.show(5, False)

+-----------------------+-----------------------+------------------+---------------------+-------------+----------------+
|measurementTmp         |kafkaTimestamp         |measurementTmpLong|endpointClientName   |kafkaTmpLong |diffMilliseconds|
+-----------------------+-----------------------+------------------+---------------------+-------------+----------------+
|2019-06-06 12:45:12.39 |2019-06-06 12:45:15.713|1559825112390     |node-latency-rem-20-2|1559825115713|3323            |
|2019-06-06 12:45:14.388|2019-06-06 12:45:17.102|1559825114388     |node-latency-rem-20-2|1559825117102|2714            |
|2019-06-06 12:45:16.389|2019-06-06 12:45:18.473|1559825116389     |node-latency-rem-20-2|1559825118473|2084            |
|2019-06-06 12:45:18.389|2019-06-06 12:45:19.946|1559825118389     |node-latency-rem-20-2|1559825119946|1557            |
|2019-06-06 12:45:20.388|2019-06-06 12:45:21.571|1559825120388     |node-latency-rem-20-2|1559825121571|1183            |
+-----------------------

In [30]:
sensDf.printSchema()

root
 |-- measurementTmp: timestamp (nullable = true)
 |-- kafkaTimestamp: timestamp (nullable = true)
 |-- measurementTmpLong: long (nullable = true)
 |-- endpointClientName: string (nullable = true)
 |-- kafkaTmpLong: long (nullable = true)
 |-- diffMilliseconds: long (nullable = true)

In [31]:
sensDf.agg(F.avg(F.col('diffMilliseconds'))).show()

+---------------------+
|avg(diffMilliseconds)|
+---------------------+
|   106.40114780979152|
+---------------------+

In [32]:
sensDf.sort(F.col('diffMilliseconds').desc()).show(10, False)

+-----------------------+-----------------------+------------------+---------------------+-------------+----------------+
|measurementTmp         |kafkaTimestamp         |measurementTmpLong|endpointClientName   |kafkaTmpLong |diffMilliseconds|
+-----------------------+-----------------------+------------------+---------------------+-------------+----------------+
|2019-06-06 12:45:13.114|2019-06-06 12:45:16.092|1559825113114     |node-latency-rem-20-4|1559825116920|3806            |
|2019-06-06 12:45:12.066|2019-06-06 12:45:15.713|1559825112066     |node-latency-rem-20-1|1559825115713|3647            |
|2019-06-06 12:45:12.39 |2019-06-06 12:45:15.713|1559825112390     |node-latency-rem-20-2|1559825115713|3323            |
|2019-06-06 12:45:13.449|2019-06-06 12:45:16.469|1559825113449     |node-latency-rem-20-5|1559825116469|3020            |
|2019-06-06 12:45:12.71 |2019-06-06 12:45:15.703|1559825112710     |node-latency-rem-20-3|1559825115703|2993            |
|2019-06-06 12:45:13.737

In [33]:
%%spark -o sensDf

In [34]:
%%local
sensDf





VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()