In [1]:
from hops import hdfs
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.types import LongType
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
7,application_1559755371379_0024,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f6a8c45eef0>

In [2]:
    df = spark \
        .read \
        .format("parquet") \
        .load(hdfs.project_path() + "Resources/iot-benchmarks/data/topic-lwm2m-3303-temperature") \
        .filter(F.col('endpointClientName').contains('node-latency-rem-100'))

    df.cache().count()

15613

In [3]:
df.printSchema()

root
 |-- measurement: struct (nullable = true)
 |    |-- timestamp: long (nullable = false)
 |    |-- endpointClientName: string (nullable = false)
 |    |-- instanceId: integer (nullable = false)
 |    |-- gatewayName: string (nullable = false)
 |    |-- ipsoObject: struct (nullable = false)
 |    |    |-- sensorValue: double (nullable = false)
 |    |    |-- minMeasuredValue: double (nullable = true)
 |    |    |-- maxMeasuredValue: double (nullable = true)
 |    |    |-- minRangeValue: double (nullable = true)
 |    |    |-- maxRangeValue: double (nullable = true)
 |    |    |-- sensorUnits: string (nullable = true)
 |    |    |-- resetMinAndMaxMeasuredValues: boolean (nullable = true)
 |-- kafkaTimestamp: timestamp (nullable = true)
 |-- endpointClientName: string (nullable = true)

In [4]:
timeFmt = "yyyy-MM-dd HH:mm:ss.SSS"
sensDf = df \
    .select(F.to_timestamp(F.col('measurement.timestamp')/1000).alias('measurementTmp'), \
            'kafkaTimestamp', \
            (F.col('measurement.timestamp')).alias('measurementTmpLong'), \
            'endpointClientName') \
    .withColumn('kafkaTmpLong', F.substring_index('kafkaTimestamp', '.', -1).cast('float')) \
    .withColumn('kafkaTmpLong', F.when(F.col('kafkaTmpLong') < 10, F.col('kafkaTmpLong')*100).otherwise(F.col('kafkaTmpLong')).cast('long')) \
    .withColumn('kafkaTmpLong', F.when(F.col('kafkaTmpLong') < 100, F.col('kafkaTmpLong')*10).otherwise(F.col('kafkaTmpLong')).cast('long')) \
    .withColumn('kafkaTmpLong', (F.unix_timestamp('kafkaTimestamp', format=timeFmt)*1000 + F.col('kafkaTmpLong'))) \
    .withColumn("diffMilliseconds", (F.col('kafkaTmpLong') - F.col('measurementTmpLong')))
    

In [5]:
sensDf.show(5, False)

+-----------------------+-----------------------+------------------+-----------------------+-------------+----------------+
|measurementTmp         |kafkaTimestamp         |measurementTmpLong|endpointClientName     |kafkaTmpLong |diffMilliseconds|
+-----------------------+-----------------------+------------------+-----------------------+-------------+----------------+
|2019-06-13 08:31:16.064|2019-06-13 08:31:44.652|1560414676064     |node-latency-rem-100-15|1560414704652|28588           |
|2019-06-13 08:31:14.065|2019-06-13 08:31:44.653|1560414674065     |node-latency-rem-100-15|1560414704653|30588           |
|2019-06-13 08:31:22.065|2019-06-13 08:31:44.667|1560414682065     |node-latency-rem-100-15|1560414704667|22602           |
|2019-06-13 08:31:20.064|2019-06-13 08:31:45.108|1560414680064     |node-latency-rem-100-15|1560414705108|25044           |
|2019-06-13 08:31:24.064|2019-06-13 08:31:45.168|1560414684064     |node-latency-rem-100-15|1560414705168|21104           |
+-------

In [6]:
sensDf.printSchema()

root
 |-- measurementTmp: timestamp (nullable = true)
 |-- kafkaTimestamp: timestamp (nullable = true)
 |-- measurementTmpLong: long (nullable = true)
 |-- endpointClientName: string (nullable = true)
 |-- kafkaTmpLong: long (nullable = true)
 |-- diffMilliseconds: long (nullable = true)

In [7]:
sensDf.agg(F.avg(F.col('diffMilliseconds'))).show()

+---------------------+
|avg(diffMilliseconds)|
+---------------------+
|   371.04413271842174|
+---------------------+

In [8]:
sensDf.sort(F.col('diffMilliseconds').desc()).show(10, False)

+-----------------------+-----------------------+------------------+-----------------------+-------------+----------------+
|measurementTmp         |kafkaTimestamp         |measurementTmpLong|endpointClientName     |kafkaTmpLong |diffMilliseconds|
+-----------------------+-----------------------+------------------+-----------------------+-------------+----------------+
|2019-06-13 08:31:13.587|2019-06-13 08:31:44.628|1560414673587     |node-latency-rem-100-7 |1560414704628|31041           |
|2019-06-13 08:31:13.658|2019-06-13 08:31:44.628|1560414673658     |node-latency-rem-100-14|1560414704628|30970           |
|2019-06-13 08:31:13.834|2019-06-13 08:31:44.564|1560414673834     |node-latency-rem-100-9 |1560414704564|30730           |
|2019-06-13 08:31:13.962|2019-06-13 08:31:44.628|1560414673962     |node-latency-rem-100-2 |1560414704628|30666           |
|2019-06-13 08:31:14.065|2019-06-13 08:31:44.653|1560414674065     |node-latency-rem-100-15|1560414704653|30588           |
|2019-06

In [9]:
%%spark -o sensDf

In [10]:
%%local
sensDf





VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()