In [16]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('streaming1') \
    .master("local") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [17]:
static = spark.read.json('/Users/hyunseokjung/data/spark_guide/activity-data/')
dataSchema = static.schema

                                                                                

In [18]:
dataSchema

StructType([StructField('Arrival_Time', LongType(), True), StructField('Creation_Time', LongType(), True), StructField('Device', StringType(), True), StructField('Index', LongType(), True), StructField('Model', StringType(), True), StructField('User', StringType(), True), StructField('gt', StringType(), True), StructField('x', DoubleType(), True), StructField('y', DoubleType(), True), StructField('z', DoubleType(), True)])

In [19]:
static.first()

                                                                                

Row(Arrival_Time=1424686735090, Creation_Time=1424686733090638193, Device='nexus4_1', Index=18, Model='nexus4', User='g', gt='stand', x=0.0003356934, y=-0.0005645752, z=-0.018814087)

In [20]:
streaming = spark.readStream.schema(dataSchema) \
    .option("maxFilesPerTrigger", 1) \
    .json('/Users/hyunseokjung/data/spark_guide/activity-data/')

In [21]:
activityCounts = streaming.groupBy("gt").count()

In [22]:
spark.conf.set("spark.sql.shuffle.partitions", 5)

In [23]:
activityQuery = activityCounts.writeStream \
    .queryName("activity_counts") \
    .format("memory").outputMode("complete") \
    .start()

22/11/29 20:10:10 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/b9/8j1qkty17r9244bwmgfs9kdc0000gn/T/temporary-7cc618b6-42d4-47fd-9748-446b2411bea2. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/11/29 20:10:10 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


IllegalArgumentException: Cannot start query with name activity_counts as a query with that name is already active in this SparkSession

In [None]:
# activityQuery.awaitTermination()

In [None]:
spark.streams.active

[<pyspark.sql.streaming.StreamingQuery at 0x10b5adcd0>]

In [None]:
streaming.columns

['Arrival_Time',
 'Creation_Time',
 'Device',
 'Index',
 'Model',
 'User',
 'gt',
 'x',
 'y',
 'z']

In [None]:
from pyspark.sql.functions import expr

simpleTransform = streaming.withColumn("stairs", expr("gt like '%stairs%'")) \
    .where("stairs") \
    .where("gt is not null") \
    .select("gt", "Model", "Arrival_Time", "Creation_Time") \
    .writeStream \
    .queryName("simple_transform") \
    .format("memory") \
    .outputMode("append") \
    .start()

22/11/29 20:08:56 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/b9/8j1qkty17r9244bwmgfs9kdc0000gn/T/temporary-4c390311-7274-421d-bc00-1ca89c362220. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/11/29 20:08:56 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [None]:
spark.sql("select * from simple_transform").show(3)

+---+-----+------------+-------------+
| gt|Model|Arrival_Time|Creation_Time|
+---+-----+------------+-------------+
+---+-----+------------+-------------+



In [None]:
deviceModelStats = streaming.cube("gt", "model").avg() \
    .drop("avg(Arrival_Time)") \
    .drop("avg(Creation_Time)") \
    .drop("avg(Index)") \
    .writeStream.queryName("device_counts").format("memory") \
    .outputMode("complete") \
    .start()

22/11/29 20:08:57 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/b9/8j1qkty17r9244bwmgfs9kdc0000gn/T/temporary-6e9cea27-a5bb-4ffb-a1c4-ead24545ed29. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/11/29 20:08:57 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [None]:
spark.sql("select * from deviceModelStats").show(3)

[Stage 6:>                                                          (0 + 1) / 1]

AnalysisException: Table or view not found: deviceModelStats; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [deviceModelStats], [], false


In [None]:
historicalAgg = static.groupBy("gt", "model").avg()
deviceModelStats = streaming.drop("Arrival_Time", "Creation_Time", "Index") \
    .cube("gt", "model").avg() \
    .join(historicalAgg, ["gt", "model"]) \
    .writeStream.queryName("device_counts").format("memory") \
    .outputMode("complete") \
    .start()