In [2]:
from pyspark.sql import SparkSession

import getpass
# OS login name; reused below for the warehouse directory and the application name.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession on YARN with the Kafka SQL
# connector package requested via spark.jars.packages.
spark = (
    SparkSession.builder
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1')
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse')
    .enableHiveSupport()
    .appName(f'{username} | Python - Kafka and Spark Integration')
    .master('yarn')
    .getOrCreate()
)

In [3]:
# Comma-separated host:port list passed to the Kafka source as
# 'kafka.bootstrap.servers' (one broker per line for readability).
kafka_bootstrap_servers = (
    'w01.itversity.com:9092,'
    'w02.itversity.com:9092'
)

In [4]:
# Streaming DataFrame backed by the Kafka topic '<username>_retail'.
df = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', kafka_bootstrap_servers)
    .option('subscribe', f'{username}_retail')
    .load()
)

In [5]:
from pyspark.sql.functions import date_format, to_date, split, substring

In [6]:
# Print the schema obtained after casting the Kafka key and value columns to strings.
(
    df
    .selectExpr(
        "CAST(key AS STRING) AS key",
        "CAST(value AS STRING) AS value",
    )
    .printSchema()
)

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [7]:
# Stream the Kafka records to HDFS as CSV, partitioned by year/month/dayofmonth
# derived from the timestamp embedded in each log line.
#
# The 4th space-separated token of `value` is expected to look like
# '[22/Aug/2021:13:51:00' (TODO confirm against the producer's log format).
# substring(..., 2, 21) starts at the 2nd character, so the leading '[' is
# already gone before parsing; the to_date pattern therefore must not contain
# it. A stray '[' in the pattern is an optional-section marker for Spark 3's
# datetime parser and a required literal for the legacy parser, where it would
# never match the stripped text.
#
# A micro-batch is triggered every 30 seconds; the cell's value is the
# StreamingQuery handle returned by start().
df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)"). \
    withColumn('log_date', to_date(substring(split('value', ' ')[3], 2, 21), 'dd/MMM/yyyy:HH:mm:ss')). \
    withColumn('year', date_format('log_date', 'yyyy')). \
    withColumn('month', date_format('log_date', 'MM')). \
    withColumn('dayofmonth', date_format('log_date', 'dd')). \
    writeStream. \
    partitionBy('year', 'month', 'dayofmonth'). \
    format('csv'). \
    option("checkpointLocation", f'/user/{username}/kafka/retail_logs/gen_logs/checkpoint'). \
    option('path', f'/user/{username}/kafka/retail_logs/gen_logs/data'). \
    trigger(processingTime='30 seconds'). \
    start()

<pyspark.sql.streaming.StreamingQuery at 0x7ff1e5f972b0>

In [8]:
!hdfs dfs -ls /user/${USER}/kafka/retail_logs/gen_logs

Found 2 items
drwxr-xr-x   - itversity itversity          0 2021-08-22 13:51 /user/itversity/kafka/retail_logs/gen_logs/checkpoint
drwxr-xr-x   - itversity itversity          0 2021-08-22 13:51 /user/itversity/kafka/retail_logs/gen_logs/data


In [10]:
!hdfs dfs -ls -R /user/${USER}/kafka/retail_logs/gen_logs/data

drwxr-xr-x   - itversity itversity          0 2021-08-22 13:53 /user/itversity/kafka/retail_logs/gen_logs/data/_spark_metadata
-rw-r--r--   3 itversity itversity          2 2021-08-22 13:52 /user/itversity/kafka/retail_logs/gen_logs/data/_spark_metadata/0
-rw-r--r--   3 itversity itversity        884 2021-08-22 13:52 /user/itversity/kafka/retail_logs/gen_logs/data/_spark_metadata/1
-rw-r--r--   3 itversity itversity        887 2021-08-22 13:52 /user/itversity/kafka/retail_logs/gen_logs/data/_spark_metadata/2
-rw-r--r--   3 itversity itversity        887 2021-08-22 13:53 /user/itversity/kafka/retail_logs/gen_logs/data/_spark_metadata/3
drwxr-xr-x   - itversity itversity          0 2021-08-22 13:52 /user/itversity/kafka/retail_logs/gen_logs/data/year=2021
drwxr-xr-x   - itversity itversity          0 2021-08-22 13:52 /user/itversity/kafka/retail_logs/gen_logs/data/year=2021/month=08
drwxr-xr-x   - itversity itversity          0 2021-08-22 13:53 /user/itversity/kafka/retail_logs/gen_logs/