## Import libraries

In [1]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,from_json,avg
from pyspark.sql.types import StructType,StructField,IntegerType,StringType

## Create a SparkSession object

In [2]:
# Assemble the session builder one setting at a time: application name and
# cluster master first, then executor memory and the two Delta Lake SQL settings.
builder = SparkSession.builder.appName("transform-filter-streaming")
builder = builder.master("spark://spark-master:7077")
builder = builder.config("spark.executor.memory", "512m")
builder = builder.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
builder = builder.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Extra Maven coordinates resolved together with the Delta package; this one
# supplies the Kafka source used by the streaming read below.
kafka_packages = ['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1']
spark = configure_spark_with_delta_pip(builder, kafka_packages).getOrCreate()

# Limit Spark's log output to ERROR-level messages.
spark.sparkContext.setLogLevel("ERROR")


:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-abbca0f9-203a-4c86-8483-49669895c72b;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.1 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in centra

## Create a streaming DataFrame from the Kafka topic

In [3]:
# Kafka source settings: broker address, topic to subscribe to, and the
# offset position from which the stream starts reading.
kafka_options = {
    'kafka.bootstrap.servers': 'kafka:9092',
    'subscribe': 'users',
    'startingOffsets': 'earliest',
}

df = spark.readStream.format('kafka').options(**kafka_options).load()



## Parse the JSON messages

In [4]:
# Expected layout of each JSON message; every field is declared nullable.
schema = (StructType()
          .add('id', IntegerType(), True)
          .add('name', StringType(), True)
          .add('age', IntegerType(), True)
          .add('gender', StringType(), True)
          .add('country', StringType(), True)
         )

# Cast the `value` column to a string, parse it as JSON against the schema,
# and overwrite `value` with the resulting struct.
value_as_text = col('value').cast('STRING')
parsed_value = from_json(value_as_text, schema)
df = df.withColumn('value', parsed_value)
    

## Extract the nested fields

In [5]:
# Promote each field of the parsed `value` struct to a top-level column that
# keeps the field's own name.
user_fields = ('id', 'name', 'age', 'gender', 'country')
df = df.select([col(f'value.{field}').alias(field) for field in user_fields])
    

## Define transformations and filters

In [6]:
# Keep only the columns needed for the aggregation and drop rows with age below 21.
adults = df.select('age','country','gender').where("age >= 21")

# Average age per (country, gender) pair, exposed as `avg_age`.
by_country_gender = adults.groupBy('country','gender')
df = by_country_gender.agg(avg('age').alias('avg_age'))


## Write the transformed data to the console

In [7]:
# Console sink in 'complete' mode: the full aggregated table is printed for
# every batch, as the captured output below shows.
console_writer = df.writeStream.format('console').outputMode('complete')

# Start the streaming query and keep its handle so it can be stopped later.
query = console_writer.start()



                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|37.714285714285715|
|    Chine|     M|              42.5|
|   Brazil|     M|              37.0|
|      USA|     F|            37.875|
|Australia|     F| 43.72727272727273|
|   Canada|     M|46.666666666666664|
|       UK|     M|              43.0|
|      USA|     M|             38.25|
|    India|     M|              46.0|
|    China|     M|              37.0|
|    China|     F|38.857142857142854|
|   Canada|     F|              36.0|
|Australia|     M|              40.0|
|    India|     F|             46.05|
|       UK|     F| 39.77777777777778|
|    Chine|     F|             40.25|
+---------+------+------------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|37.714285714285715|
|    Chine|     M|              42.5|
|   Brazil|     M|              37.0|
|      USA|     F|            37.875|
|Australia|     F| 43.72727272727273|
|   Canada|     M|46.666666666666664|
|       UK|     M|              43.0|
|      USA|     M|             38.25|
|    India|     M|              46.0|
|    China|     M|              37.0|
|    China|     F|             40.75|
|   Canada|     F|              36.0|
|Australia|     M|              40.0|
|    India|     F|             46.05|
|       UK|     F| 39.77777777777778|
|    Chine|     F|             40.25|
+---------+------+------------------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|37.714285714285715|
|    Chine|     M|              42.5|
|   Brazil|     M|              37.0|
|      USA|     F|            37.875|
|Australia|     F| 43.72727272727273|
|   Canada|     M|46.666666666666664|
|       UK|     M|              43.0|
|      USA|     M|             38.25|
|    India|     M|              46.0|
|    China|     M|              37.0|
|    China|     F|             40.75|
|   Canada|     F|              36.0|
|Australia|     M| 39.23076923076923|
|    India|     F|             46.05|
|       UK|     F| 39.77777777777778|
|    Chine|     F|             40.25|
+---------+------+------------------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|37.714285714285715|
|    Chine|     M|              42.5|
|   Brazil|     M|              37.0|
|      USA|     F|            37.875|
|Australia|     F| 43.72727272727273|
|   Canada|     M|46.666666666666664|
|       UK|     M|              43.0|
|      USA|     M|             38.25|
|    India|     M|              46.0|
|    China|     M|              37.0|
|    China|     F|43.333333333333336|
|   Canada|     F|              36.0|
|Australia|     M| 39.23076923076923|
|    India|     F|             46.05|
|       UK|     F| 39.77777777777778|
|    Chine|     F|             40.25|
+---------+------+------------------+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|37.714285714285715|
|    Chine|     M|              42.5|
|   Brazil|     M|              37.0|
|      USA|     F|            37.875|
|Australia|     F| 43.72727272727273|
|   Canada|     M|46.666666666666664|
|       UK|     M|              43.0|
|      USA|     M|             38.25|
|    India|     M|              46.0|
|    China|     M|              37.0|
|    China|     F|43.333333333333336|
|   Canada|     F|              36.0|
|Australia|     M| 39.23076923076923|
|    India|     F|46.142857142857146|
|       UK|     F| 39.77777777777778|
|    Chine|     F|             40.25|
+---------+------+------------------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|37.714285714285715|
|    Chine|     M|              42.5|
|   Brazil|     M|              37.0|
|      USA|     F|            37.875|
|Australia|     F| 43.72727272727273|
|   Canada|     M|46.666666666666664|
|       UK|     M|              43.0|
|      USA|     M|             38.25|
|    India|     M|              46.0|
|    China|     M|              37.0|
|    China|     F|43.333333333333336|
|   Canada|     F|              36.0|
|Australia|     M|39.285714285714285|
|    India|     F|46.142857142857146|
|       UK|     F| 39.77777777777778|
|    Chine|     F|             40.25|
+---------+------+------------------+



In [8]:
# Stop the streaming query started by the console writer.
query.stop()

In [9]:
# Shut down the SparkSession now that the streaming query has been stopped.
spark.stop()