In [1]:
from hops import hdfs
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.types import LongType
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6,application_1559755371379_0021,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f7633867ef0>

In [2]:
    df = spark \
        .read \
        .format("parquet") \
        .load(hdfs.project_path() + "Resources/iot-benchmarks/data/topic-lwm2m-3303-temperature") \
        .filter(F.col('endpointClientName').contains('node-latency-rem-20'))

    df.cache().count()

3240

In [3]:
df.printSchema()

root
 |-- measurement: struct (nullable = true)
 |    |-- timestamp: long (nullable = false)
 |    |-- endpointClientName: string (nullable = false)
 |    |-- instanceId: integer (nullable = false)
 |    |-- gatewayName: string (nullable = false)
 |    |-- ipsoObject: struct (nullable = false)
 |    |    |-- sensorValue: double (nullable = false)
 |    |    |-- minMeasuredValue: double (nullable = true)
 |    |    |-- maxMeasuredValue: double (nullable = true)
 |    |    |-- minRangeValue: double (nullable = true)
 |    |    |-- maxRangeValue: double (nullable = true)
 |    |    |-- sensorUnits: string (nullable = true)
 |    |    |-- resetMinAndMaxMeasuredValues: boolean (nullable = true)
 |-- kafkaTimestamp: timestamp (nullable = true)
 |-- endpointClientName: string (nullable = true)

In [4]:
timeFmt = "yyyy-MM-dd HH:mm:ss.SSS"
sensDf = df \
    .select(F.to_timestamp(F.col('measurement.timestamp')/1000).alias('measurementTmp'), \
            'kafkaTimestamp', \
            (F.col('measurement.timestamp')).alias('measurementTmpLong'), \
            'endpointClientName') \
    .withColumn('kafkaTmpLong', F.substring_index('kafkaTimestamp', '.', -1).cast('float')) \
    .withColumn('kafkaTmpLong', F.when(F.col('kafkaTmpLong') < 10, F.col('kafkaTmpLong')*100).otherwise(F.col('kafkaTmpLong')).cast('long')) \
    .withColumn('kafkaTmpLong', F.when(F.col('kafkaTmpLong') < 100, F.col('kafkaTmpLong')*10).otherwise(F.col('kafkaTmpLong')).cast('long')) \
    .withColumn('kafkaTmpLong', (F.unix_timestamp('kafkaTimestamp', format=timeFmt)*1000 + F.col('kafkaTmpLong'))) \
    .withColumn("diffMilliseconds", (F.col('kafkaTmpLong') - F.col('measurementTmpLong')))
    

In [5]:
sensDf.show(5, False)

+-----------------------+-----------------------+------------------+---------------------+-------------+----------------+
|measurementTmp         |kafkaTimestamp         |measurementTmpLong|endpointClientName   |kafkaTmpLong |diffMilliseconds|
+-----------------------+-----------------------+------------------+---------------------+-------------+----------------+
|2019-06-12 16:28:30.573|2019-06-12 16:28:33.341|1560356910573     |node-latency-rem-20-4|1560356913341|2768            |
|2019-06-12 16:28:32.573|2019-06-12 16:28:34.883|1560356912573     |node-latency-rem-20-4|1560356914883|2310            |
|2019-06-12 16:28:34.577|2019-06-12 16:28:36.333|1560356914577     |node-latency-rem-20-4|1560356916333|1756            |
|2019-06-12 16:28:36.573|2019-06-12 16:28:37.549|1560356916573     |node-latency-rem-20-4|1560356917549|976             |
|2019-06-12 16:28:38.572|2019-06-12 16:28:38.845|1560356918572     |node-latency-rem-20-4|1560356918845|273             |
+-----------------------

In [6]:
sensDf.printSchema()

root
 |-- measurementTmp: timestamp (nullable = true)
 |-- kafkaTimestamp: timestamp (nullable = true)
 |-- measurementTmpLong: long (nullable = true)
 |-- endpointClientName: string (nullable = true)
 |-- kafkaTmpLong: long (nullable = true)
 |-- diffMilliseconds: long (nullable = true)

In [7]:
sensDf.agg(F.avg(F.col('diffMilliseconds'))).show()

+---------------------+
|avg(diffMilliseconds)|
+---------------------+
|    94.64012345679012|
+---------------------+

In [8]:
sensDf.sort(F.col('diffMilliseconds').desc()).show(10, False)

+-----------------------+-----------------------+------------------+----------------------+-------------+----------------+
|measurementTmp         |kafkaTimestamp         |measurementTmpLong|endpointClientName    |kafkaTmpLong |diffMilliseconds|
+-----------------------+-----------------------+------------------+----------------------+-------------+----------------+
|2019-06-12 16:28:27.77 |2019-06-12 16:28:31.699|1560356907770     |node-latency-rem-20-2 |1560356911699|3929            |
|2019-06-12 16:28:27.89 |2019-06-12 16:28:31.697|1560356907890     |node-latency-rem-20-1 |1560356911697|3807            |
|2019-06-12 16:28:28.274|2019-06-12 16:28:31.698|1560356908274     |node-latency-rem-20-3 |1560356911698|3424            |
|2019-06-12 16:28:31.588|2019-06-12 16:28:34.085|1560356911588     |node-latency-rem-20-6 |1560356914850|3262            |
|2019-06-12 16:28:31.543|2019-06-12 16:28:34.068|1560356911543     |node-latency-rem-20-14|1560356914680|3137            |
|2019-06-12 16:2

In [9]:
%%spark -o sensDf

In [10]:
%%local
sensDf





VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()