## Convert Streaming Log Messages to JSON

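Let us apply the required transformations to convert streaming log messages to JSON: we use a regular expression to extract the fields of interest from each log line and then serialize those fields as a JSON string.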

In [1]:
from pyspark.sql import SparkSession

import getpass
# OS user running the notebook; reused in the warehouse path and the app name.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Overview of Structured Streaming')
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Override the number of shuffle partitions Spark SQL uses in this session.
spark.conf.set('spark.sql.shuffle.partitions', '2')

In [3]:
import socket
# Host name of the machine running this kernel; passed as the "host" option
# of the socket stream source.
hostname = socket.gethostname()

In [4]:
# Streaming DataFrame that reads text lines from a socket on this host, port 9000.
log_messages = (
    spark.readStream
    .format("socket")
    .option("host", hostname)
    .option("port", 9000)
    .load()
)

In [5]:
# Check that the DataFrame is backed by a streaming source (displays True here).
log_messages.isStreaming

True

In [6]:
# Print the schema of the stream; the output shows a single string column named `value`.
log_messages.printSchema()

root
 |-- value: string (nullable = true)



In [7]:
from pyspark.sql.functions import regexp_extract

In [8]:
# Regular expression for one access-log line.
# Capture groups used in this notebook:
#   1 - first whitespace-delimited token (extracted as `ipaddress`)
#   4 - bracketed timestamp such as "06/Sep/2021:23:10:51 -0800" (extracted as `message_ts`)
#   5 - first double-quoted field, e.g. "GET /departments HTTP/1.1" (extracted as `message_endpoint`)
# Written as a raw string so that backslash sequences such as \S, \[ and \d reach the
# regex engine untouched and Python does not warn about invalid escape sequences.
APACHE_ACCESS_LOG_PATTERN = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(.*?)" (\d+) (\d+) (\S+) "(.*?)"'

In [9]:
# Pull the fields of interest out of the raw `value` column using the capture
# groups of APACHE_ACCESS_LOG_PATTERN, then discard the raw line.
log_messages_extracted = (
    log_messages
    .withColumn('ipaddress', regexp_extract('value', APACHE_ACCESS_LOG_PATTERN, 1))
    .withColumn('message_ts', regexp_extract('value', APACHE_ACCESS_LOG_PATTERN, 4))
    .withColumn('message_endpoint', regexp_extract('value', APACHE_ACCESS_LOG_PATTERN, 5))
    .drop('value')
)

In [10]:
from pyspark.sql.functions import to_json, struct

In [11]:
# Pack every extracted column into a struct and serialize it as a JSON string
# in a single column named `value`.
log_messages_extracted_json = log_messages_extracted.select(
    to_json(
        struct(*[
            log_messages_extracted[column_name]
            for column_name in log_messages_extracted.columns
        ])
    ).alias("value")
)

In [12]:
# Start a streaming query that writes the JSON rows to the in-memory sink,
# queryable through Spark SQL under the name `log_messages`.
(
    log_messages_extracted_json
    .writeStream
    .format("memory")
    .queryName("log_messages")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x7f9407761e80>

In [18]:
# Show the rows currently in the `log_messages` in-memory table (the name given
# to the streaming query via queryName); truncate=False prints the full JSON strings.
spark.sql('SELECT * FROM log_messages').show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                               |
+----------------------------------------------------------------------------------------------------------------------------------------------------+
|{"ipaddress":"189.192.235.73","message_ts":"06/Sep/2021:23:10:51 -0800","message_endpoint":"GET /department/footwear/products HTTP/1.1"}            |
|{"ipaddress":"157.139.197.15","message_ts":"06/Sep/2021:23:10:53 -0800","message_endpoint":"GET /departments HTTP/1.1"}                             |
|{"ipaddress":"167.76.15.9","message_ts":"06/Sep/2021:23:10:55 -0800","message_endpoint":"GET /departments HTTP/1.1"}                                |
|{"ipaddress":"46.230.139.195","message_ts":"06/Sep/2021:23:10:57 -0800","message_endpoint":"G

In [None]:
# Count the rows currently in the `log_messages` in-memory table.
spark.sql('SELECT count(1) FROM log_messages').show(truncate=False)