## Import libraries

In [1]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip, DeltaTable
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

## Create SparkSession object

In [2]:
# Session settings: standalone cluster master, 512 MB executors, and the
# Delta Lake SQL extension plus catalog implementation.
builder = SparkSession.builder.appName("config-triggers")
builder = builder.master("spark://spark-master:7077")
builder = builder.config("spark.executor.memory", "512m")
builder = builder.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# Extra Maven coordinates added alongside the Delta package: the Kafka
# connector used by the streaming read below.
extra_packages = ['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1']
spark = configure_spark_with_delta_pip(builder, extra_packages).getOrCreate()

# Keep driver log output limited to errors.
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9d18aa23-7a8e-459a-95d1-194b7cffabef;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.1 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in centra

## Create a streaming DataFrame from Kafka

In [3]:
# Streaming reader over the Kafka topic "users", starting from the earliest
# available offsets.
kafka_reader = spark.readStream.format("kafka")
kafka_reader = kafka_reader.option("kafka.bootstrap.servers", "kafka:9092")
kafka_reader = kafka_reader.option("subscribe", "users")
kafka_reader = kafka_reader.option("startingOffsets", "earliest")

df = kafka_reader.load()


## Parse the JSON messages

In [5]:
# (field name, Spark type) pairs expected in each JSON message; every field
# is declared nullable.
user_fields = [
    ('id', IntegerType()),
    ('name', StringType()),
    ('age', IntegerType()),
    ('gender', StringType()),
    ('country', StringType()),
]
schema = StructType([StructField(name, dtype, True) for name, dtype in user_fields])

# Replace the `value` column with the struct parsed from its string form.
json_text = col('value').cast('STRING')
df = df.withColumn('value', from_json(json_text, schema))
                   

## Extract the nested fields

In [6]:
# Promote each field of the parsed `value` struct to a top-level column of
# the same name.
user_columns = ('id', 'name', 'age', 'gender', 'country')
df = df.select(*[col(f'value.{name}').alias(name) for name in user_columns])


    
    

## Applying a processing-time trigger

In [7]:
# Console sink in append mode, with a processing-time trigger of 20 seconds
# between micro-batches.
console_writer = df.writeStream.format('console').outputMode('append')
console_writer = console_writer.trigger(processingTime='20 seconds')

query = console_writer.start()

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---+------+---+------+---------+
| id|  name|age|gender|  country|
+---+------+---+------+---------+
| 57|user51| 59|     F|      USA|
| 61|user12| 43|     M|    China|
| 47|user54| 20|     F|       UK|
| 80|user23| 39|     F|       UK|
| 42|user40| 57|     F|    China|
| 37|user28| 42|     F|   Brazil|
| 70|user34| 20|     F|       UK|
| 55|user52| 35|     F|    China|
| 43|user43| 56|     M|   Brazil|
| 14|user65| 46|     F|      USA|
| 40|user59| 49|     M|    China|
| 82|user35| 33|     F|      USA|
| 46| user1| 44|     F|    China|
| 97|user83| 59|     M|   Brazil|
| 75|user90| 49|     M|    China|
| 29|user16| 50|     F|Australia|
| 27|user14| 19|     F|   Brazil|
| 46|user62| 36|     M|    China|
| 67|user76| 29|     F|   Canada|
| 22|user88| 64|     M|   Brazil|
+---+------+---+------+---------+
only showing top 20 rows



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+---+------+---+------+---------+
| id|  name|age|gender|  country|
+---+------+---+------+---------+
| 68| user3| 43|     F|    India|
| 16|user51| 43|     F|Australia|
+---+------+---+------+---------+

-------------------------------------------
Batch: 2
-------------------------------------------
+---+------+---+------+---------+
| id|  name|age|gender|  country|
+---+------+---+------+---------+
| 36|user14| 60|     M|   Canada|
|  3|user90| 50|     F|Australia|
+---+------+---+------+---------+

-------------------------------------------
Batch: 3
-------------------------------------------
+---+------+---+------+---------+
| id|  name|age|gender|  country|
+---+------+---+------+---------+
| 59|user74| 61|     M|       UK|
| 77|user37| 36|     M|Australia|
+---+------+---+------+---------+



In [8]:
# Stop the processing-time streaming query started in the previous cell
# before starting the next demonstration.
query.stop()

## Applying one-time trigger

In [9]:
# Process the data currently available in the topic once, then let the query
# finish on its own. `availableNow=True` replaces the `once=True` trigger,
# which is deprecated as of Spark 3.4.
query = (df.writeStream
         .format('console')
         .outputMode('append')
         .trigger(availableNow=True)
         .start())

# Block until that pass completes so a following `query.stop()` (e.g. under
# "Run All") cannot cancel the query before its output is printed.
query.awaitTermination()



-------------------------------------------
Batch: 0
-------------------------------------------
+---+------+---+------+---------+
| id|  name|age|gender|  country|
+---+------+---+------+---------+
| 57|user51| 59|     F|      USA|
| 61|user12| 43|     M|    China|
| 47|user54| 20|     F|       UK|
| 80|user23| 39|     F|       UK|
| 42|user40| 57|     F|    China|
| 37|user28| 42|     F|   Brazil|
| 70|user34| 20|     F|       UK|
| 55|user52| 35|     F|    China|
| 43|user43| 56|     M|   Brazil|
| 14|user65| 46|     F|      USA|
| 40|user59| 49|     M|    China|
| 82|user35| 33|     F|      USA|
| 46| user1| 44|     F|    China|
| 97|user83| 59|     M|   Brazil|
| 75|user90| 49|     M|    China|
| 29|user16| 50|     F|Australia|
| 27|user14| 19|     F|   Brazil|
| 46|user62| 36|     M|    China|
| 67|user76| 29|     F|   Canada|
| 22|user88| 64|     M|   Brazil|
+---+------+---+------+---------+
only showing top 20 rows



In [10]:
# Stop the streaming query started in the previous cell.
query.stop()

In [None]:
# End the Spark session created at the top of the notebook.
spark.stop()