In [20]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [21]:
import re

from pyspark.sql.functions import coalesce, col, datediff, dense_rank, desc, explode, from_unixtime, lag, lit, rank, sum as sum_, trim, udf, unix_timestamp, window
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, StructField, StructType

In [22]:
DATA_PATH = "/tmp/spark_access.log"
DEVICE_MAP_PATH = "/home/daniel/code/gdd/accelerator/trainings/de_accelerator/spark/hackathon/setup/user_agents_map.csv"

In [28]:
DATA_PATH = "/home/daniel/code/gdd/accelerator/trainings/de_accelerator/spark/data/apachelog_large/"

# Step 1: Parse raw data (with correct dates)

In [35]:
df = spark.read.parquet(DATA_PATH)

In [36]:
df.show()

+--------------------+
|               value|
+--------------------+
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
|26.158.235.134 - ...|
+--------------------+
only showing top 20 rows



In [37]:
log_line_pattern = re.compile(r'(\S+).*\[(.*)\].*\"GET\s(.*)\sHTTP.*\".*(\d{3})\s(\d{1,5})\s\"\-\"\s\"(.*)\"$')

schema = StructType([
    StructField("ip_address", StringType()),
    StructField("timestamp", StringType()),
    StructField("url", StringType()),
    StructField("status_code", StringType()),
    StructField("response_bytes_size", StringType()),
    StructField("user_agent", StringType())
])

def parse_log_line(line):
    groups = log_line_pattern.findall(line)
    return [g for g in groups[0]]
    
parse_udf = udf(parse_log_line, schema)

In [38]:
parsed_lines = df.select(parse_udf("value").alias("parsed"))

In [40]:
parsed_lines.printSchema()

root
 |-- parsed: struct (nullable = true)
 |    |-- ip_address: string (nullable = true)
 |    |-- timestamp: string (nullable = true)
 |    |-- url: string (nullable = true)
 |    |-- status_code: string (nullable = true)
 |    |-- response_bytes_size: string (nullable = true)
 |    |-- user_agent: string (nullable = true)



In [23]:
# gotcha: there are 2 different timestamp formats in the dataset
def to_timestamps(col, formats=("dd/MMM/yyyy:HH:mm:ss", "yyyy-MM-dd HH:mm:ss")):
    return coalesce(*[unix_timestamp(col, f) for f in formats])

In [41]:
timestamps_parsed = parsed_lines.select("parsed.*").withColumn("timestamp", to_timestamps("timestamp").cast("timestamp"))

In [44]:
timestamps_parsed.show()

+--------------+-------------------+----------------+-----------+-------------------+--------------------+
|    ip_address|          timestamp|             url|status_code|response_bytes_size|          user_agent|
+--------------+-------------------+----------------+-----------+-------------------+--------------------+
|26.158.235.134|2019-01-14 06:09:13|http://www.nu.nl|        200|               1233|Mozilla/5.0 (Linu...|
|26.158.235.134|2019-01-15 06:02:22|http://www.nu.nl|        200|               1193|Mozilla/5.0 (Linu...|
|26.158.235.134|2019-01-15 06:04:53|http://www.nu.nl|        200|               1520|Mozilla/5.0 (Linu...|
|26.158.235.134|2019-01-15 06:05:22|http://www.nu.nl|        200|                953|Mozilla/5.0 (Linu...|
|26.158.235.134|2019-01-15 06:06:27|http://www.nu.nl|        200|               1936|Mozilla/5.0 (Linu...|
|26.158.235.134|2019-01-15 06:07:14|http://www.nu.nl|        200|               2041|Mozilla/5.0 (Linu...|
|26.158.235.134|2019-01-15 06:08:37|h

In [26]:
parsed_data = timestamps_parsed.withColumn("status_code", col("status_code").cast("int")).withColumn("response_bytes_size", col("response_bytes_size").cast("int"))

In [27]:
parsed_data = parsed_data.withColumn("user_agent", trim(parsed_data.user_agent))

In [28]:
parsed_data.printSchema()

root
 |-- ip_address: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- url: string (nullable = true)
 |-- status_code: integer (nullable = true)
 |-- response_bytes_size: integer (nullable = true)
 |-- user_agent: string (nullable = true)



In [29]:
parsed_data.select("user_agent").distinct().count()

982

# Step 2: Extract news categories per logline and display top N most popular categories overall

In [30]:
category_pattern = re.compile(r'(http|https)://www\.nu\.nl(/[A-Z,a-z,-]*/)?')

def extract_category(line):
    groups = category_pattern.findall(line)
    category = groups[0][1].replace("/", "")
    # explicitly return `None` in order to set category to null instead of empty string
    if category:
        return category
    return None

extract_category_udf = udf(extract_category, StringType())

In [31]:
category_extracted_data = parsed_data.withColumn("category", extract_category_udf("url"))

In [32]:
category_extracted_data.groupby("category").count().sort(desc("count")).show()

+-------------------+------+
|           category| count|
+-------------------+------+
|               null|284012|
|         wielrennen|  7689|
|             leiden|  7686|
|              games|  7374|
|          geldzaken|  7300|
|              wonen|  7257|
|               tech|  7237|
|               auto|  7170|
|         gezondheid|  7107|
|           politiek|  6952|
|              breda|  6870|
|              sport|  6849|
|alphen-aan-den-rijn|  6817|
|            voetbal|  6797|
|        opmerkelijk|  6776|
|           internet|  6704|
|            utrecht|  6694|
|         wetenschap|  6624|
|         achterklap|  6470|
|         buitenland|  6442|
+-------------------+------+
only showing top 20 rows



# Step 3: Show the top N most popular pages per (configurable) time window

In [None]:
categories_with_counts = category_extracted_data.groupBy(col("category"), window(col("timestamp"), "1 day")).count()

In [None]:
categories_with_counts.orderBy(col("window"), desc("count")).show(200)

In [33]:
# Step 4: Enrich with device types


In [34]:
categories_with_counts.orderBy(col("window"), desc("count")).show(200)

+-------------------+--------------------+-----+
|           category|              window|count|
+-------------------+--------------------+-----+
|               null|[2019-01-01 01:00...|  137|
|           internet|[2019-01-01 01:00...|   21|
|         wielrennen|[2019-01-01 01:00...|   16|
|         binnenland|[2019-01-01 01:00...|   10|
|           economie|[2019-01-01 01:00...|   10|
|         buitenland|[2019-01-01 01:00...|    8|
|              wonen|[2019-01-01 01:00...|    8|
|            weekend|[2019-01-01 01:00...|    8|
|            utrecht|[2019-01-01 01:00...|    8|
|              sport|[2019-01-01 01:00...|    6|
|               auto|[2019-01-01 01:00...|    5|
|             leiden|[2019-01-01 01:00...|    5|
|        opmerkelijk|[2019-01-01 01:00...|    5|
|         gezondheid|[2019-01-01 01:00...|    3|
|              breda|[2019-01-01 01:00...|    3|
|           politiek|[2019-01-01 01:00...|    2|
|           algemeen|[2019-01-01 01:00...|    1|
|         achterklap

# Step 4: Enrich with device types


In [43]:
devices = spark.read.csv(DEVICE_MAP_PATH).toDF("user_agent","device_type")

In [44]:
# Gotcha: not all user agents in the mapping actually have a corresponding device type. This leads to nulls further on.
devices.filter(devices.device_type.isNull()).show()

+--------------------+-----------+
|          user_agent|device_type|
+--------------------+-----------+
|Mozilla/5.0 (X11;...|       null|
|Mozilla/5.0 (iPho...|       null|
|Mozilla/5.0 (Wind...|       null|
|Mozilla/5.0 Apple...|       null|
|Mozilla/5.0 (Wind...|       null|
|http://www.userag...|       null|
|Mozilla/5.0 (X11;...|       null|
|Mozilla/5.0 (comp...|       null|
+--------------------+-----------+



In [45]:
# Gotcha: make sure to do a left join, otherwise you'll lose records.
enriched = category_extracted_data.join(devices, "user_agent", "left")

In [46]:
enriched.groupBy("device_type").count().show()

+-----------+------+
|device_type| count|
+-----------+------+
|    desktop|379624|
| smartphone| 29068|
|       null|  5001|
|     tablet| 94651|
+-----------+------+



In [47]:
enriched.filter(enriched.device_type.isNull()).select("user_agent").distinct().count()

8

In [48]:
enriched.printSchema()

root
 |-- user_agent: string (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- url: string (nullable = true)
 |-- status_code: integer (nullable = true)
 |-- response_bytes_size: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- device_type: string (nullable = true)



# Step 5: Compute the top N most popular news categories per device type per time window

In [49]:
categories_with_counts = enriched.groupBy(col("category"), col("device_type"), window(col("timestamp"), "1 week"))
res = categories_with_counts.count()

In [50]:
enriched.groupBy(col("ip_address"), col("user_agent")).count().orderBy(desc("count")).show()

+---------------+--------------------+-----+
|     ip_address|          user_agent|count|
+---------------+--------------------+-----+
|   1.224.57.173|Mozilla/5.0 (X11;...|  212|
| 207.77.122.202|Mozilla/5.0 (X11;...|  212|
| 203.207.221.76|Mozilla/5.0 (Wind...|  211|
|  63.24.164.223|Mozilla/5.0 (iPad...|  211|
|  182.238.43.53|Mozilla/5.0 (Wind...|  210|
|  35.247.89.168|Mozilla/5.0 (Wind...|  210|
| 139.223.175.47|Mozilla/5.0 (Linu...|  207|
|196.122.195.248|Mozilla/5.0 (Maci...|  207|
|   8.189.50.250|Mozilla/5.0 (iPad...|  203|
| 71.100.160.217|Mozilla/5.0 (Maci...|  203|
|132.118.133.110|http://www.userag...|  203|
|  95.36.138.114|Mozilla/5.0 (Linu...|  202|
|     8.98.41.55|Mozilla/5.0 (Wind...|  201|
| 168.70.209.174|Mozilla/5.0 (Wind...|  201|
| 95.111.141.235|Mozilla/5.0 (Wind...|  201|
| 191.90.221.142|Mozilla/5.0 (Wind...|  199|
|   13.2.191.123|Mozilla/5.0 (Wind...|  199|
|    174.2.37.21|Mozilla/5.0 (Wind...|  199|
|205.212.170.250|Mozilla/5.0 (iPho...|  198|
| 39.153.2

# 6: Trending Categories

In [51]:
categoryWindow = Window \
  .partitionBy("category") \
  .orderBy("window")
rankingWindow = Window \
  .partitionBy(col("window")) \
  .orderBy(desc("diff"))

dfWithLag = res \
  .withColumn("prev_count", lag("count", 1, 0) \
  .over(categoryWindow)) \
  .withColumn("diff", (col("count") - col("prev_count"))) \

result = dfWithLag \
  .withColumn("rank", rank().over(rankingWindow)) \
  .where(col("rank") <= 5) \
  .orderBy("window", "rank")


In [52]:
result.show()

+--------------+-----------+--------------------+-----+----------+-----+----+
|      category|device_type|              window|count|prev_count| diff|rank|
+--------------+-----------+--------------------+-----+----------+-----+----+
|          null|    desktop|[2018-12-27 01:00...|  347|         0|  347|   1|
|          null|     tablet|[2018-12-27 01:00...|  123|        25|   98|   2|
|      internet|     tablet|[2018-12-27 01:00...|   40|         0|   40|   3|
| entertainment|    desktop|[2018-12-27 01:00...|   25|         0|   25|   4|
|    wielrennen|    desktop|[2018-12-27 01:00...|   25|         0|   25|   4|
|          null|    desktop|[2019-01-03 01:00...| 8227|       123| 8104|   1|
|    wielrennen|    desktop|[2019-01-03 01:00...|  327|        48|  279|   2|
|cultuur-overig|    desktop|[2019-01-03 01:00...|  278|         6|  272|   3|
|    gezondheid|    desktop|[2019-01-03 01:00...|  262|         4|  258|   4|
|          auto|    desktop|[2019-01-03 01:00...|  280|        2

# 7: Session Duration

In [53]:
# assign each pair of ip_address + user_agent a unique id (i.e. a rank)
enriched_with_id = enriched.withColumn("user_id", dense_rank().over(Window.orderBy(enriched.ip_address, enriched.user_agent)))

In [54]:
w = Window.partitionBy("user_id").orderBy("timestamp")
timeDiff = coalesce(unix_timestamp('timestamp')- lag(unix_timestamp('timestamp'),1).over(w), lit(0))

indicator = (timeDiff > 3600).cast("integer")
session_id = sum_(indicator).over(w).alias("session_id")
enriched_sessionized = enriched_with_id.select("*", session_id)

In [55]:
enriched_sessionized.printSchema()

root
 |-- user_agent: string (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- url: string (nullable = true)
 |-- status_code: integer (nullable = true)
 |-- response_bytes_size: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- device_type: string (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- session_id: long (nullable = true)



In [56]:
enriched_sessionized.show()

+--------------------+----------+-------------------+--------------------+-----------+-------------------+--------+-----------+-------+----------+
|          user_agent|ip_address|          timestamp|                 url|status_code|response_bytes_size|category|device_type|user_id|session_id|
+--------------------+----------+-------------------+--------------------+-----------+-------------------+--------+-----------+-------+----------+
|Mozilla/5.0 (Wind...|1.10.66.79|2019-01-24 08:07:35|https://www.nu.nl...|        200|                635|    apps|     tablet|      1|         0|
|Mozilla/5.0 (Wind...|1.10.66.79|2019-01-26 08:00:01|https://www.nu.nl...|        200|               1624|    apps|     tablet|      1|         1|
|Mozilla/5.0 (Wind...|1.10.66.79|2019-01-27 08:05:24|    http://www.nu.nl|        200|               1174|    null|     tablet|      1|         2|
|Mozilla/5.0 (Wind...|1.10.66.79|2019-01-28 08:02:35|    http://www.nu.nl|        200|               2078|    null|   