In [1]:
from hops import hdfs
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.types import LongType
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
9,application_1559316644877_0006,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7fca07f08f60>

In [2]:
df = spark \
    .read \
    .format("parquet") \
    .load(hdfs.project_path() + "Resources/iot-benchmarks/data/topic-lwm2m-3303-temperature") \
    .filter(F.col('endpointClientName').contains('node-latency-20-'))

df.cache().count()

6033

In [3]:
df.printSchema()

root
 |-- measurement: struct (nullable = true)
 |    |-- timestamp: long (nullable = false)
 |    |-- endpointClientName: string (nullable = false)
 |    |-- instanceId: integer (nullable = false)
 |    |-- gatewayId: integer (nullable = false)
 |    |-- ipsoObject: struct (nullable = false)
 |    |    |-- sensorValue: double (nullable = false)
 |    |    |-- minMeasuredValue: double (nullable = true)
 |    |    |-- maxMeasuredValue: double (nullable = true)
 |    |    |-- minRangeValue: double (nullable = true)
 |    |    |-- maxRangeValue: double (nullable = true)
 |    |    |-- sensorUnits: string (nullable = true)
 |    |    |-- resetMinAndMaxMeasuredValues: boolean (nullable = true)
 |-- kafkaTimestamp: timestamp (nullable = true)
 |-- endpointClientName: string (nullable = true)

In [4]:
timeFmt = "yyyy-MM-dd HH:mm:ss.SSS"
sensDf = df \
    .select(F.to_timestamp(F.col('measurement.timestamp')/1000).alias('measurementTmp'), \
            'kafkaTimestamp', \
            (F.col('measurement.timestamp')).alias('measurementTmpLong'), \
            'endpointClientName') \
    .withColumn('kafkaTmpLong', F.substring_index('kafkaTimestamp', '.', -1).cast('float')) \
    .withColumn('kafkaTmpLong', F.when(F.col('kafkaTmpLong') < 100, F.col('kafkaTmpLong')*10).otherwise(F.col('kafkaTmpLong')).cast('long')) \
    .withColumn('kafkaTmpLong', (F.unix_timestamp('kafkaTimestamp', format=timeFmt)*1000 + F.col('kafkaTmpLong'))) \
    .withColumn("diffMilliseconds", (F.col('kafkaTmpLong') - F.col('measurementTmpLong')))
    

In [5]:
sensDf.show(5, False)

+-----------------------+-----------------------+------------------+------------------+-------------+----------------+
|measurementTmp         |kafkaTimestamp         |measurementTmpLong|endpointClientName|kafkaTmpLong |diffMilliseconds|
+-----------------------+-----------------------+------------------+------------------+-------------+----------------+
|2019-06-03 17:13:32.753|2019-06-03 17:13:32.767|1559582012753     |node-latency-20-3 |1559582012767|14              |
|2019-06-03 17:13:34.749|2019-06-03 17:13:34.764|1559582014749     |node-latency-20-3 |1559582014764|15              |
|2019-06-03 17:13:36.749|2019-06-03 17:13:36.764|1559582016749     |node-latency-20-3 |1559582016764|15              |
|2019-06-03 17:13:38.749|2019-06-03 17:13:38.764|1559582018749     |node-latency-20-3 |1559582018764|15              |
|2019-06-03 17:13:40.749|2019-06-03 17:13:40.765|1559582020749     |node-latency-20-3 |1559582020765|16              |
+-----------------------+-----------------------

In [6]:
sensDf.printSchema()

root
 |-- measurementTmp: timestamp (nullable = true)
 |-- kafkaTimestamp: timestamp (nullable = true)
 |-- measurementTmpLong: long (nullable = true)
 |-- endpointClientName: string (nullable = true)
 |-- kafkaTmpLong: long (nullable = true)
 |-- diffMilliseconds: long (nullable = true)

In [7]:
sensDf.agg(F.avg(F.col("diffMilliseconds"))).show()

+---------------------+
|avg(diffMilliseconds)|
+---------------------+
|    26.21316094811868|
+---------------------+

In [8]:
sensDf.sort(F.col('diffMilliseconds').desc()).show(10, False)

+-----------------------+-----------------------+------------------+------------------+-------------+----------------+
|measurementTmp         |kafkaTimestamp         |measurementTmpLong|endpointClientName|kafkaTmpLong |diffMilliseconds|
+-----------------------+-----------------------+------------------+------------------+-------------+----------------+
|2019-06-03 17:13:58.025|2019-06-03 17:13:58.044|1559582038025     |node-latency-20-14|1559582038440|415             |
|2019-06-03 17:13:36.023|2019-06-03 17:13:36.038|1559582016023     |node-latency-20-14|1559582016380|357             |
|2019-06-03 17:16:40.022|2019-06-03 17:16:40.037|1559582200022     |node-latency-20-14|1559582200370|348             |
|2019-06-03 17:15:20.022|2019-06-03 17:15:20.037|1559582120022     |node-latency-20-14|1559582120370|348             |
|2019-06-03 17:15:08.022|2019-06-03 17:15:08.037|1559582108022     |node-latency-20-14|1559582108370|348             |
|2019-06-03 17:13:50.023|2019-06-03 17:13:50.037

In [9]:
%%spark -o sensDf

In [10]:
%%local
sensDf





VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()