In [1]:
import getpass

from pyspark.sql import SparkSession

# OS login name; reused in the warehouse path and the application name.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled session on YARN with the Kafka SQL
# connector package added to the session's jars.
spark = (
    SparkSession.builder
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1')
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse')
    .enableHiveSupport()
    .appName(f'{username} | Python - Kafka and Spark Integration')
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Comma-separated host:port list handed to the Kafka source.
kafka_bootstrap_servers = ','.join((
    'w01.itversity.com:9092',
    'w02.itversity.com:9092',
))

In [3]:
# Streaming DataFrame subscribed to this user's `<username>_retail` Kafka topic.
df = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', kafka_bootstrap_servers)
    .option('subscribe', f'{username}_retail')
    .load()
)

In [4]:
# Show the columns of the Kafka source DataFrame; note that key and value
# arrive as binary and need a cast before they can be read as text.
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [13]:
from pyspark.sql.functions import lit, date_format, to_date, split, substring

In [14]:
# A single one-field row; used as the data for a one-row helper DataFrame.
l = [('X',)]

In [15]:
# One-row DataFrame with a single string column `dummy`; lets us evaluate
# expressions on literals with select(...).show().
dual = spark.createDataFrame(l, schema='dummy STRING')

In [16]:
# Sanity check: a 'yyyy-MMM-dd' pattern parses an abbreviated month name.
(
    dual
    .select(to_date(lit('2021-Jan-21'), 'yyyy-MMM-dd'))
    .show()
)

+-------------------------------------+
|to_date('2021-Jan-21', 'yyyy-MMM-dd')|
+-------------------------------------+
|                           2021-01-21|
+-------------------------------------+



In [17]:
# Sanity check: the access-log timestamp layout (without the leading '[')
# parses with 'dd/MMM/yyyy:HH:mm:ss'.
(
    dual
    .select(to_date(lit('31/Dec/2021:00:37:39'), 'dd/MMM/yyyy:HH:mm:ss'))
    .show()
)

+-------------------------------------------------------+
|to_date('31/Dec/2021:00:37:39', 'dd/MMM/yyyy:HH:mm:ss')|
+-------------------------------------------------------+
|                                             2021-12-31|
+-------------------------------------------------------+



In [10]:
# Stream each Kafka record's key and value, cast to text, to the console
# sink without truncation, triggering a micro-batch every 5 seconds.
(
    df
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .writeStream
    .format("console")
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x7fa48af6c240>

In [None]:
# Stream key/value as text plus year, month and day-of-month extracted from
# the access-log timestamp, to the console sink every 5 seconds.
#
# The 4th space-separated token of `value` looks like '[22/Aug/2021:13:48:58'.
# In Spark 3 datetime patterns '[' opens an optional section instead of
# matching a literal bracket, so a pattern starting with '[' cannot consume
# that token. The bracket is therefore stripped with substring() (position 2,
# 20 characters = 'dd/MMM/yyyy:HH:mm:ss') and parsed with a plain pattern.
#
# The date is parsed once into a helper column, the three parts are derived
# from it, and the helper is dropped so the output columns stay
# key, value, year, month, dayofmonth.
(
    df
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .withColumn(
        'log_date',
        to_date(substring(split('value', ' ')[3], 2, 20), 'dd/MMM/yyyy:HH:mm:ss')
    )
    .withColumn('year', date_format('log_date', 'yyyy'))
    .withColumn('month', date_format('log_date', 'MM'))
    .withColumn('dayofmonth', date_format('log_date', 'dd'))
    .drop('log_date')
    .writeStream
    .format("console")
    .option('truncate', 'false')
    .trigger(processingTime='5 seconds')
    .start()
)

In [18]:
# Stream key/value as text plus the parsed log date and its year, month and
# day-of-month into an in-memory table named `log_messages`, which can then
# be queried with spark.sql.
#
# The 4th space-separated token of `value` looks like '[22/Aug/2021:13:48:58'.
# substring(..., 2, 20) drops the leading '[' and keeps exactly the
# 20-character timestamp. The pattern deliberately has no '[': in Spark 3
# datetime patterns '[' starts an optional section rather than matching a
# literal bracket, so including it would make the whole pattern optional.
(
    df
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .withColumn(
        'log_date',
        to_date(substring(split('value', ' ')[3], 2, 20), 'dd/MMM/yyyy:HH:mm:ss')
    )
    .withColumn('year', date_format('log_date', 'yyyy'))
    .withColumn('month', date_format('log_date', 'MM'))
    .withColumn('dayofmonth', date_format('log_date', 'dd'))
    .writeStream
    .format("memory")
    .queryName("log_messages")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x7fa48af6cbe0>

In [19]:
# Peek at the rows collected so far in the in-memory `log_messages` table.
(
    spark
    .sql('SELECT * FROM log_messages')
    .show(truncate=False)
)

+----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+----+-----+----------+
|key |value                                                                                                                                                                                                                                      |log_date  |year|month|dayofmonth|
+----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+----+-----+----------+
|null|148.168.116.252 - - [22/Aug/2021:13:48:58 -0800] "GET /departments HTTP/1.1" 200 1192 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"      

In [None]:
# Count the rows collected so far in the in-memory `log_messages` table.
(
    spark
    .sql('SELECT count(1) FROM log_messages')
    .show(truncate=False)
)