In [3]:
# Regex for one access-log line, anchored at the start of the string.
# Written as a raw string so every backslash reaches the regex engine verbatim
# (a plain literal here triggers "invalid escape sequence" warnings).
# Capture groups, as exercised by the cells below:
#   1     leading non-space token (the client IP address in the sample data)
#   2, 3  next two non-space tokens ('-' in the sample data)
#   4     timestamp between the square brackets, brackets excluded
#   5     first double-quoted field (the request line), quotes excluded
#   6, 7  two integer fields ('404' and '436' for the first sample line)
#   8     next non-space token; it is matched with \S+, so any surrounding
#         double quotes are kept in the captured value (e.g. '"-"')
#   9     last double-quoted field (the user-agent string), quotes excluded
APACHE_ACCESS_LOG_PATTERN = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(.*?)" (\d+) (\d+) (\S+) "(.*?)"'

In [4]:
# Ten sample access-log lines kept in a single triple-quoted string, one entry per line.
log_messages = '''152.66.31.153 - - [21/Aug/2021:22:57:29 -0800] "GET /department/fitness/products HTTP/1.1" 404 436 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
7.75.106.19 - - [21/Aug/2021:22:57:30 -0800] "GET /add_to_cart/1228 HTTP/1.1" 200 1938 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.76.4 (KHTML, like Gecko) Version/7.0.4 Safari/537.76.4"
178.182.201.250 - - [21/Aug/2021:22:57:31 -0800] "GET /login HTTP/1.1" 200 1028 "-" "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"
161.251.115.98 - - [21/Aug/2021:22:57:32 -0800] "GET /departments HTTP/1.1" 200 915 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"
32.162.64.231 - - [21/Aug/2021:22:57:33 -0800] "GET /departments HTTP/1.1" 200 1040 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:30.0) Gecko/20100101 Firefox/30.0"
126.205.158.172 - - [21/Aug/2021:22:57:34 -0800] "GET /checkout HTTP/1.1" 200 1802 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:30.0) Gecko/20100101 Firefox/30.0"
131.250.79.136 - - [21/Aug/2021:22:57:35 -0800] "GET /support HTTP/1.1" 200 1015 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
11.186.99.191 - - [21/Aug/2021:22:57:36 -0800] "GET /department/team%20sports/categories HTTP/1.1" 200 2014 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"
70.96.85.132 - - [21/Aug/2021:22:57:37 -0800] "GET /checkout HTTP/1.1" 200 648 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.76.4 (KHTML, like Gecko) Version/7.0.4 Safari/537.76.4"
52.27.254.176 - - [21/Aug/2021:22:57:38 -0800] "GET /departments HTTP/1.1" 200 268 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"'''

In [5]:
# Break the multi-line sample into a list holding one log line per element.
messages = str.splitlines(log_messages)

In [6]:
import re

In [7]:
# Compile the pattern, then search the first sample line with it.
# `match` is None when the line does not fit the pattern.
match = re.compile(APACHE_ACCESS_LOG_PATTERN).search(messages[0])

In [8]:
# Group 0 is the complete text matched by the pattern.
match.group(0)

'152.66.31.153 - - [21/Aug/2021:22:57:29 -0800] "GET /department/fitness/products HTTP/1.1" 404 436 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"'

In [9]:
# Capture 4: the timestamp found between the square brackets.
match[4]

'21/Aug/2021:22:57:29 -0800'

In [10]:
# Capture 5: the first double-quoted field (method, path and protocol).
match[5]

'GET /department/fitness/products HTTP/1.1'

In [11]:
# Capture 6: the first of the two integer fields after the request.
match[6]

'404'

In [12]:
# Capture 7: the second of the two integer fields after the request.
match[7]

'436'

In [13]:
# Capture 8: matched with \S+, so the surrounding double quotes stay in the value.
match[8]

'"-"'

In [14]:
# Capture 9: the last double-quoted field (the user-agent string).
match[9]

'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'

In [15]:
# All nine captures as one tuple; a group that did not participate would appear as None.
match.groups(default=None)

('152.66.31.153',
 '-',
 '-',
 '21/Aug/2021:22:57:29 -0800',
 'GET /department/fitness/products HTTP/1.1',
 '404',
 '436',
 '"-"',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36')

In [1]:
from pyspark.sql import SparkSession

import getpass
# The OS login name is used in both the warehouse directory and the application name.
username = getpass.getuser()

# Create (or reuse) a Hive-enabled session on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse')
    .enableHiveSupport()
    .appName(f'{username} | Log Message Parsing')
    .master('yarn')
    .getOrCreate()
)

In [17]:
# Wrap every log line in a one-element tuple so each becomes a row of the
# single string column `message`.
df = spark.createDataFrame(
    [(line,) for line in messages],
    schema='message STRING',
)

In [18]:
# Print the column names and types of `df` to confirm the single `message` string column.
df.printSchema()

root
 |-- message: string (nullable = true)



In [19]:
# Preview the rows; with default arguments long values are cut short with '...'.
df.show()

+--------------------+
|             message|
+--------------------+
|152.66.31.153 - -...|
|7.75.106.19 - - [...|
|178.182.201.250 -...|
|161.251.115.98 - ...|
|32.162.64.231 - -...|
|126.205.158.172 -...|
|131.250.79.136 - ...|
|11.186.99.191 - -...|
|70.96.85.132 - - ...|
|52.27.254.176 - -...|
+--------------------+



In [20]:
from pyspark.sql.functions import *

In [21]:
# NOTE(review): IPython-only help lookup (not plain Python) showing the signature
# and docstring of regexp_extract; consider dropping it before sharing the notebook.
regexp_extract?

Signature: regexp_extract(str, pattern, idx)
Docstring:
Extract a specific group matched by a Java regex, from the specified string column.
If the regex did not match, or the specified group did not match, an empty string is returned.

>>> df = spark.createDataFrame([('100-200',)], ['str'])
>>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()
[Row(d='100')]
>>> df = spark.createDataFrame([('foo',)], ['str'])
>>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect()
[Row(d='')]
>>> df = spark.createDataFrame([('aaaac',)], ['str'])
>>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
[Row(d='')]

.. versionadded:: 1.5
File:      /opt/spark-3.0.1-bin-hadoop3.2/python/pyspark/sql/functions.py
Type:      function


In [26]:
# Derive two columns from capture groups 1 and 4 of the log pattern, discard the
# raw line, and display the result without truncating values.
(
    df
    .withColumn('ipaddress', regexp_extract('message', APACHE_ACCESS_LOG_PATTERN, 1))
    .withColumn('message_ts', regexp_extract('message', APACHE_ACCESS_LOG_PATTERN, 4))
    .drop('message')
    .show(truncate=False)
)

+---------------+--------------------------+
|ipaddress      |message_ts                |
+---------------+--------------------------+
|152.66.31.153  |21/Aug/2021:22:57:29 -0800|
|7.75.106.19    |21/Aug/2021:22:57:30 -0800|
|178.182.201.250|21/Aug/2021:22:57:31 -0800|
|161.251.115.98 |21/Aug/2021:22:57:32 -0800|
|32.162.64.231  |21/Aug/2021:22:57:33 -0800|
|126.205.158.172|21/Aug/2021:22:57:34 -0800|
|131.250.79.136 |21/Aug/2021:22:57:35 -0800|
|11.186.99.191  |21/Aug/2021:22:57:36 -0800|
|70.96.85.132   |21/Aug/2021:22:57:37 -0800|
|52.27.254.176  |21/Aug/2021:22:57:38 -0800|
+---------------+--------------------------+



In [27]:
# Derive three columns from capture groups 1, 4 and 5 of the log pattern, discard
# the raw line, and display the result without truncating values.
(
    df
    .withColumn('ipaddress', regexp_extract('message', APACHE_ACCESS_LOG_PATTERN, 1))
    .withColumn('message_ts', regexp_extract('message', APACHE_ACCESS_LOG_PATTERN, 4))
    .withColumn('message_endpoint', regexp_extract('message', APACHE_ACCESS_LOG_PATTERN, 5))
    .drop('message')
    .show(truncate=False)
)

+---------------+--------------------------+-------------------------------------------------+
|ipaddress      |message_ts                |message_endpoint                                 |
+---------------+--------------------------+-------------------------------------------------+
|152.66.31.153  |21/Aug/2021:22:57:29 -0800|GET /department/fitness/products HTTP/1.1        |
|7.75.106.19    |21/Aug/2021:22:57:30 -0800|GET /add_to_cart/1228 HTTP/1.1                   |
|178.182.201.250|21/Aug/2021:22:57:31 -0800|GET /login HTTP/1.1                              |
|161.251.115.98 |21/Aug/2021:22:57:32 -0800|GET /departments HTTP/1.1                        |
|32.162.64.231  |21/Aug/2021:22:57:33 -0800|GET /departments HTTP/1.1                        |
|126.205.158.172|21/Aug/2021:22:57:34 -0800|GET /checkout HTTP/1.1                           |
|131.250.79.136 |21/Aug/2021:22:57:35 -0800|GET /support HTTP/1.1                            |
|11.186.99.191  |21/Aug/2021:22:57:36 -0800|GET /d