In [1]:
from pyspark.sql import SparkSession

import getpass
# OS user running the notebook; used to namespace the warehouse dir and app name.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session on YARN that pulls in the
# Kafka source package for Structured Streaming.
spark = (
    SparkSession.builder
    .appName(f'{username} | Python - Kafka and Spark Integration')
    .master('yarn')
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1')
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse')
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# Kafka brokers used for bootstrapping, joined into the comma-separated
# host:port list expected by the `kafka.bootstrap.servers` option.
kafka_bootstrap_servers = ','.join((
    'w01.itversity.com:9092',
    'w02.itversity.com:9092',
))

In [3]:
# Streaming DataFrame subscribed to the current user's `<username>_retail`
# topic on the configured Kafka brokers.
df = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', kafka_bootstrap_servers)
    .option('subscribe', f'{username}_retail')
    .load()
)

In [4]:
# Print the schema of the streaming DataFrame; note that `key` and `value`
# are binary, which is why later cells cast them to STRING.
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



When running from the CLI, we can use the code snippet below to preview the data using the `console` sink.
```python
# Cast the binary key/value to strings and print each micro-batch
# (triggered every 5 seconds) to the console without truncating rows.
(
    df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .writeStream
    .outputMode("update")
    .format("console")
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)
```

In [12]:
from pyspark.sql.functions import date_format, to_date, split, lit, substring, length

In [21]:
# Cast the Kafka key/value bytes to strings, derive a date from the bracketed
# timestamp token of each log line, and stream the result into the in-memory
# table `log_messages` so it can be queried with spark.sql.
log_messages_query = (
    df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    # split(value, ' ')[3] is the token that looks like `[22/Aug/2021:13:45:18`.
    # substring(..., 2, 21) starts at position 2 and therefore drops the
    # leading `[`, so the parse pattern must not contain it (in Spark 3
    # datetime patterns a `[` opens an optional section rather than matching
    # a literal bracket).
    # NOTE(review): the column is named `year` but holds a full date; the name
    # is kept so the schema of `log_messages` stays unchanged.
    .withColumn(
        'year',
        to_date(substring(split('value', ' ')[3], 2, 21), 'dd/MMM/yyyy:HH:mm:ss')
    )
    .writeStream
    .format("memory")
    .queryName("log_messages")
    .start()
)

# Keep the handle so the stream can be inspected or stopped later with
# log_messages_query.stop(); as the last expression it is still displayed.
log_messages_query

<pyspark.sql.streaming.StreamingQuery at 0x7f25642d0828>

In [22]:
# Query the in-memory table populated by the `log_messages` streaming query
# and show full (untruncated) rows; contents depend on what has streamed so far.
log_messages_preview = spark.sql('SELECT * FROM log_messages')
log_messages_preview.show(truncate=False)

+----+------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|key |value                                                                                                                                                             |year      |
+----+------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|null|116.89.138.146 - - [22/Aug/2021:13:45:18 -0800] "GET /departments HTTP/1.1" 200 362 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"|2021-08-22|
+----+------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+



In [23]:
# Count the rows currently held in the in-memory `log_messages` table.
log_messages_count = spark.sql('SELECT count(1) FROM log_messages')
log_messages_count.show(truncate=False)

+--------+
|count(1)|
+--------+
|10      |
+--------+

