In [1]:
from pyspark.sql import SparkSession

import getpass
# Login name of the user running the kernel; it is interpolated into the
# warehouse directory and the application name below.
username = getpass.getuser()

# Build (or reuse, via getOrCreate) a Hive-enabled SparkSession with YARN as
# the master. The builder calls are kept in their original order.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Overview of Structured Streaming')
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Sample web-server access-log lines (one request per line) used as input for
# the regex extraction cells below. Kept as a single multi-line string; it is
# split into individual messages with str.splitlines() later on.
log_messages = '''152.66.31.153 - - [21/Aug/2021:22:57:29 -0800] "GET /department/fitness/products HTTP/1.1" 404 436 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
7.75.106.19 - - [21/Aug/2021:22:57:30 -0800] "GET /add_to_cart/1228 HTTP/1.1" 200 1938 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.76.4 (KHTML, like Gecko) Version/7.0.4 Safari/537.76.4"
178.182.201.250 - - [21/Aug/2021:22:57:31 -0800] "GET /login HTTP/1.1" 200 1028 "-" "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"
161.251.115.98 - - [21/Aug/2021:22:57:32 -0800] "GET /departments HTTP/1.1" 200 915 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"
32.162.64.231 - - [21/Aug/2021:22:57:33 -0800] "GET /departments HTTP/1.1" 200 1040 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:30.0) Gecko/20100101 Firefox/30.0"
126.205.158.172 - - [21/Aug/2021:22:57:34 -0800] "GET /checkout HTTP/1.1" 200 1802 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:30.0) Gecko/20100101 Firefox/30.0"
131.250.79.136 - - [21/Aug/2021:22:57:35 -0800] "GET /support HTTP/1.1" 200 1015 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
11.186.99.191 - - [21/Aug/2021:22:57:36 -0800] "GET /department/team%20sports/categories HTTP/1.1" 200 2014 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"
70.96.85.132 - - [21/Aug/2021:22:57:37 -0800] "GET /checkout HTTP/1.1" 200 648 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.76.4 (KHTML, like Gecko) Version/7.0.4 Safari/537.76.4"
52.27.254.176 - - [21/Aug/2021:22:57:38 -0800] "GET /departments HTTP/1.1" 200 268 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"'''

In [3]:
# Regular expression for one access-log line. Capture groups, in order:
#   1: first whitespace-free token (the client IP in the sample data)
#   2, 3: the next two whitespace-free tokens ("-" "-" in the sample data)
#   4: bracketed timestamp including the +/-HHMM offset
#   5: quoted request line (method, path, protocol)
#   6, 7: two integer fields (status and size in the sample data)
#   8: next whitespace-free token (the quoted referrer, quotes included)
#   9: quoted trailing field (the user agent in the sample data)
# Written as a raw string so the regex backslashes (\S, \[, \d, ...) are not
# treated as (invalid) Python string escapes, which emit a DeprecationWarning
# or SyntaxWarning on recent Python versions. The resulting value is unchanged.
APACHE_ACCESS_LOG_PATTERN = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(.*?)" (\d+) (\d+) (\S+) "(.*?)"'

In [6]:
# Wrap each log line in a 1-tuple so every line becomes one single-column row.
messages = [(line,) for line in log_messages.splitlines()]

In [7]:
# DataFrame with a single string column named `message`, one row per log line.
messages_df = spark.createDataFrame(messages, schema='message STRING')

In [9]:
from pyspark.sql.functions import regexp_extract

In [10]:
# Preview: pull regex group 1 (as `ipaddress`) and group 4 (as `message_ts`)
# out of each raw message; the raw `message` column is not carried over.
messages_df.select(
    regexp_extract('message', APACHE_ACCESS_LOG_PATTERN, 1).alias('ipaddress'),
    regexp_extract('message', APACHE_ACCESS_LOG_PATTERN, 4).alias('message_ts'),
).show(truncate=False)

+---------------+--------------------------+
|ipaddress      |message_ts                |
+---------------+--------------------------+
|152.66.31.153  |21/Aug/2021:22:57:29 -0800|
|7.75.106.19    |21/Aug/2021:22:57:30 -0800|
|178.182.201.250|21/Aug/2021:22:57:31 -0800|
|161.251.115.98 |21/Aug/2021:22:57:32 -0800|
|32.162.64.231  |21/Aug/2021:22:57:33 -0800|
|126.205.158.172|21/Aug/2021:22:57:34 -0800|
|131.250.79.136 |21/Aug/2021:22:57:35 -0800|
|11.186.99.191  |21/Aug/2021:22:57:36 -0800|
|70.96.85.132   |21/Aug/2021:22:57:37 -0800|
|52.27.254.176  |21/Aug/2021:22:57:38 -0800|
+---------------+--------------------------+



In [11]:
# Preview: same extraction as before plus regex group 5 (the quoted request
# line) as `message_endpoint`; the raw `message` column is not carried over.
messages_df.select(
    regexp_extract('message', APACHE_ACCESS_LOG_PATTERN, 1).alias('ipaddress'),
    regexp_extract('message', APACHE_ACCESS_LOG_PATTERN, 4).alias('message_ts'),
    regexp_extract('message', APACHE_ACCESS_LOG_PATTERN, 5).alias('message_endpoint'),
).show(truncate=False)

+---------------+--------------------------+-------------------------------------------------+
|ipaddress      |message_ts                |message_endpoint                                 |
+---------------+--------------------------+-------------------------------------------------+
|152.66.31.153  |21/Aug/2021:22:57:29 -0800|GET /department/fitness/products HTTP/1.1        |
|7.75.106.19    |21/Aug/2021:22:57:30 -0800|GET /add_to_cart/1228 HTTP/1.1                   |
|178.182.201.250|21/Aug/2021:22:57:31 -0800|GET /login HTTP/1.1                              |
|161.251.115.98 |21/Aug/2021:22:57:32 -0800|GET /departments HTTP/1.1                        |
|32.162.64.231  |21/Aug/2021:22:57:33 -0800|GET /departments HTTP/1.1                        |
|126.205.158.172|21/Aug/2021:22:57:34 -0800|GET /checkout HTTP/1.1                           |
|131.250.79.136 |21/Aug/2021:22:57:35 -0800|GET /support HTTP/1.1                            |
|11.186.99.191  |21/Aug/2021:22:57:36 -0800|GET /d

In [12]:
from pyspark.sql.functions import *

In [16]:
# IPython help lookup: shows the signature and docstring of to_json.
to_json?

Signature: to_json(col, options={})
Docstring:
Converts a column containing a :class:`StructType`, :class:`ArrayType` or a :class:`MapType`
into a JSON string. Throws an exception, in the case of an unsupported type.

:param col: name of column containing a struct, an array or a map.
:param options: options to control converting. accepts the same options as the JSON datasource.
                Additionally the function supports the `pretty` option which enables
                pretty JSON generation.

>>> from pyspark.sql import Row
>>> from pyspark.sql.types import *
>>> data = [(1, Row(name='Alice', age=2))]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(to_json(df.value).alias("json")).collect()
[Row(json='{"age":2,"name":"Alice"}')]
>>> data = [(1, [Row(name='Alice', age=2), Row(name='Bob', age=3)])]
>>> df = spark.createDat

In [14]:
# Structured view of the log lines: regex groups 1, 4 and 5 of each message,
# exposed as `ipaddress`, `message_ts` and `message_endpoint`. The raw
# `message` column is not carried over.
messages_extracted = messages_df.select(
    regexp_extract('message', APACHE_ACCESS_LOG_PATTERN, 1).alias('ipaddress'),
    regexp_extract('message', APACHE_ACCESS_LOG_PATTERN, 4).alias('message_ts'),
    regexp_extract('message', APACHE_ACCESS_LOG_PATTERN, 5).alias('message_endpoint'),
)

In [27]:
# Pack every column of messages_extracted into one struct, serialize that
# struct to a JSON string column named `value`, and display it untruncated.
messages_extracted. \
    select(to_json(struct(*messages_extracted.columns)).alias("value")). \
    show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                         |
+----------------------------------------------------------------------------------------------------------------------------------------------+
|{"ipaddress":"152.66.31.153","message_ts":"21/Aug/2021:22:57:29 -0800","message_endpoint":"GET /department/fitness/products HTTP/1.1"}        |
|{"ipaddress":"7.75.106.19","message_ts":"21/Aug/2021:22:57:30 -0800","message_endpoint":"GET /add_to_cart/1228 HTTP/1.1"}                     |
|{"ipaddress":"178.182.201.250","message_ts":"21/Aug/2021:22:57:31 -0800","message_endpoint":"GET /login HTTP/1.1"}                            |
|{"ipaddress":"161.251.115.98","message_ts":"21/Aug/2021:22:57:32 -0800","message_endpoint":"GET /departments HTTP/1.1"}          

In [17]:
from pyspark.sql import Row

In [18]:
from pyspark.sql.types import *

In [20]:
# One sample record: an integer key paired with a Row holding name and age.
data = [
    (1, Row(name='Alice', age=2)),
]

In [21]:
# Two-column DataFrame built from `data`; the columns are named key and value.
df = spark.createDataFrame(data, schema=("key", "value"))

In [23]:
# Print the schema of df to inspect the column names and types.
df.printSchema()

root
 |-- key: long (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- age: long (nullable = true)



In [22]:
# Serialize the `value` column to a JSON string column named `json` and
# bring the resulting rows back to the driver.
df.select(
    to_json(df['value']).alias("json")
).collect()

[Row(json='{"name":"Alice","age":2}')]