In [3]:
from hops import hdfs
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.types import LongType
# Evaluate the session handle as the cell's last expression so its repr is
# displayed (confirms a SparkSession is bound to `spark`).
spark

<pyspark.sql.session.SparkSession object at 0x7f4862e34f60>

In [4]:
    # Read the temperature-topic parquet data from the project's HDFS
    # directory and keep only the rows of a single endpoint.
    df = (
        spark.read
        .format("parquet")
        .load(hdfs.project_path() + "Resources/iot-benchmarks/data/topic-lwm2m-3303-temperature")
        .where(F.col("endpointClientName") == "node-latency-rem-0-1")
    )

    # Mark the frame as cached and run an action so the cache is populated;
    # the row count is the cell's displayed result.
    df.cache().count()

1182

In [5]:
# Print the column tree of the filtered frame; the later select relies on the
# nested `measurement.timestamp` field and the top-level `kafkaTimestamp`.
df.printSchema()

root
 |-- measurement: struct (nullable = true)
 |    |-- timestamp: long (nullable = false)
 |    |-- endpointClientName: string (nullable = false)
 |    |-- instanceId: integer (nullable = false)
 |    |-- gatewayId: integer (nullable = false)
 |    |-- ipsoObject: struct (nullable = false)
 |    |    |-- sensorValue: double (nullable = false)
 |    |    |-- minMeasuredValue: double (nullable = true)
 |    |    |-- maxMeasuredValue: double (nullable = true)
 |    |    |-- minRangeValue: double (nullable = true)
 |    |    |-- maxRangeValue: double (nullable = true)
 |    |    |-- sensorUnits: string (nullable = true)
 |    |    |-- resetMinAndMaxMeasuredValues: boolean (nullable = true)
 |-- kafkaTimestamp: timestamp (nullable = true)
 |-- endpointClientName: string (nullable = true)

In [6]:
# No longer used by this cell (the epoch-millisecond value is now derived
# numerically); kept only in case another cell still references the name.
timeFmt = "yyyy-MM-dd HH:mm:ss.SSS"

# Build the latency table: one row per message with the device-side
# measurement time, the Kafka record time, both as epoch milliseconds, and
# their difference.
#
# Columns (in order):
#   measurementTmp      measurement.timestamp (epoch ms) converted to a timestamp
#   kafkaTimestamp      Kafka record timestamp, unchanged
#   measurementTmpLong  measurement.timestamp as epoch milliseconds (long)
#   endpointClientName  unchanged
#   kafkaTmpLong        kafkaTimestamp as epoch milliseconds (long)
#   diffMilliseconds    kafkaTmpLong - measurementTmpLong
#
# kafkaTmpLong is computed from the numeric value of the timestamp rather than
# by parsing the fractional digits out of its string form. The string form
# drops trailing zeros and has no '.' at all for whole seconds, so the old
# parsing produced wrong values for fractions such as .700 or .005 and NULL
# for whole-second timestamps. Casting a timestamp to double yields epoch
# seconds including the fraction; rounding after scaling guards against
# floating-point representation error (e.g. ...782.9999 instead of ...783).
sensDf = (
    df
    .select(
        F.to_timestamp(F.col('measurement.timestamp') / 1000).alias('measurementTmp'),
        'kafkaTimestamp',
        F.col('measurement.timestamp').alias('measurementTmpLong'),
        'endpointClientName',
    )
    .withColumn(
        'kafkaTmpLong',
        F.round(F.col('kafkaTimestamp').cast('double') * 1000).cast('long'),
    )
    .withColumn(
        'diffMilliseconds',
        F.col('kafkaTmpLong') - F.col('measurementTmpLong'),
    )
)
    

In [7]:
# Preview the first five latency rows without truncating column values.
sensDf.show(n=5, truncate=False)

+-----------------------+-----------------------+------------------+--------------------+-------------+----------------+
|measurementTmp         |kafkaTimestamp         |measurementTmpLong|endpointClientName  |kafkaTmpLong |diffMilliseconds|
+-----------------------+-----------------------+------------------+--------------------+-------------+----------------+
|2019-06-06 10:52:21.77 |2019-06-06 10:52:22.783|1559818341770     |node-latency-rem-0-1|1559818342783|1013            |
|2019-06-06 10:52:23.762|2019-06-06 10:52:23.779|1559818343762     |node-latency-rem-0-1|1559818343779|17              |
|2019-06-06 10:52:25.761|2019-06-06 10:52:25.782|1559818345761     |node-latency-rem-0-1|1559818345782|21              |
|2019-06-06 10:52:27.762|2019-06-06 10:52:27.775|1559818347762     |node-latency-rem-0-1|1559818347775|13              |
|2019-06-06 10:52:29.762|2019-06-06 10:52:29.776|1559818349762     |node-latency-rem-0-1|1559818349776|14              |
+-----------------------+-------

In [8]:
# Print the derived frame's schema to check the types of the computed
# epoch-millisecond and difference columns.
sensDf.printSchema()

root
 |-- measurementTmp: timestamp (nullable = true)
 |-- kafkaTimestamp: timestamp (nullable = true)
 |-- measurementTmpLong: long (nullable = true)
 |-- endpointClientName: string (nullable = true)
 |-- kafkaTmpLong: long (nullable = true)
 |-- diffMilliseconds: long (nullable = true)

In [9]:
# Average of diffMilliseconds over all rows (outliers still included here).
sensDf.agg(F.avg('diffMilliseconds')).show()

+---------------------+
|avg(diffMilliseconds)|
+---------------------+
|   15.094754653130288|
+---------------------+

In [10]:
# List the ten rows with the largest diffMilliseconds to spot outliers.
sensDf.orderBy(F.desc('diffMilliseconds')).show(n=10, truncate=False)

+-----------------------+-----------------------+------------------+--------------------+-------------+----------------+
|measurementTmp         |kafkaTimestamp         |measurementTmpLong|endpointClientName  |kafkaTmpLong |diffMilliseconds|
+-----------------------+-----------------------+------------------+--------------------+-------------+----------------+
|2019-06-06 10:52:21.77 |2019-06-06 10:52:22.783|1559818341770     |node-latency-rem-0-1|1559818342783|1013            |
|2019-06-06 10:53:11.761|2019-06-06 10:53:11.789|1559818391761     |node-latency-rem-0-1|1559818391789|28              |
|2019-06-06 11:05:51.76 |2019-06-06 11:05:51.782|1559819151760     |node-latency-rem-0-1|1559819151782|22              |
|2019-06-06 10:54:05.763|2019-06-06 10:54:05.785|1559818445763     |node-latency-rem-0-1|1559818445785|22              |
|2019-06-06 10:54:29.761|2019-06-06 10:54:29.783|1559818469761     |node-latency-rem-0-1|1559818469783|22              |
|2019-06-06 11:26:45.76 |2019-06

In [11]:
# Keep only rows whose diffMilliseconds is below 1000 ms, then count what is
# left. The name sensDf is rebound to the filtered frame.
sensDf = sensDf.filter(sensDf['diffMilliseconds'] < 1000)
sensDf.count()

1181

In [12]:
%%spark -o sensDf
# sparkmagic cell magic: `-o sensDf` asks for the Spark DataFrame `sensDf` to
# be made available under the same name in the local kernel
# (NOTE(review): presumably as a pandas DataFrame, subject to sparkmagic's
# row limit — confirm against the sparkmagic configuration in use).

In [15]:
%%local
# Display the `sensDf` object in the notebook; the captured output is an
# interactive widget (VBox with a "Table" button).
# NOTE(review): `%%local` presumably runs this cell in the local kernel rather
# than the remote Spark session — confirm against the sparkmagic docs.
sensDf





VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()