### After Week 12

In [6]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf

In [7]:
@udf('boolean')
def is_purchase(event_as_json):
    """UDF predicate: True only for 'purchase_sword' events.

    Args:
        event_as_json: the event payload as a JSON string (may be None when
            the Kafka message has a null value).

    Returns:
        True if the payload parses to a JSON object whose 'event_type' is
        'purchase_sword'; False otherwise.
    """
    try:
        event = json.loads(event_as_json)
    except (TypeError, ValueError):
        # TypeError: null payload; ValueError: malformed JSON
        # (json.JSONDecodeError subclasses ValueError). One bad record must
        # not fail the whole Spark job, so it is simply filtered out.
        return False
    # Valid JSON is not necessarily an object (could be a list, number, ...),
    # and an object may lack 'event_type'; neither is a sword purchase.
    return isinstance(event, dict) and event.get('event_type') == 'purchase_sword'

In [8]:
# Batch-read the Kafka topic "events", from the earliest available offset
# up to the latest offset at the time of the read.
raw_events = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "events")
    .option("startingOffsets", "earliest")
    .option("endingOffsets", "latest")
    .load()
)

In [9]:
# Project the Kafka value (cast to string, aliased 'raw') and the Kafka
# timestamp (cast to string), then keep only rows accepted by is_purchase.
purchase_events = (
    raw_events
    .select(
        raw_events.value.cast('string').alias('raw'),
        raw_events.timestamp.cast('string'),
    )
    .filter(is_purchase('raw'))
)

In [10]:
# Expand each event's JSON keys into columns: every record becomes a Row
# holding the timestamp plus one field per key of the parsed 'raw' payload.
extracted_purchase_events = (
    purchase_events.rdd
    .map(lambda rec: Row(timestamp=rec.timestamp, **json.loads(rec.raw)))
    .toDF()
)

In [11]:
# Print the column names and types of the extracted purchase events.
extracted_purchase_events.printSchema()

root
 |-- Accept: string (nullable = true)
 |-- Host: string (nullable = true)
 |-- User-Agent: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- sword_type: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [12]:
# Preview the extracted purchase events as a text table.
extracted_purchase_events.show()

+------+-----------------+---------------+--------------+----------+--------------------+
|Accept|             Host|     User-Agent|    event_type|sword_type|           timestamp|
+------+-----------------+---------------+--------------+----------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|us

In [13]:
# Persist the extracted purchase events as Parquet under /tmp/purchases,
# replacing whatever a previous run left there ('overwrite' mode).
(
    extracted_purchase_events.write
    .mode('overwrite')
    .parquet('/tmp/purchases')
)

PySpark Code

In [14]:
# Load the Parquet data at /tmp/purchases back into a DataFrame.
purchases = spark.read.parquet('/tmp/purchases')

In [15]:
# Preview the purchases loaded from Parquet.
purchases.show()

+------+-----------------+---------------+--------------+----------+--------------------+
|Accept|             Host|     User-Agent|    event_type|sword_type|           timestamp|
+------+-----------------+---------------+--------------+----------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|us

In [16]:
# Expose the DataFrame to Spark SQL under the name 'purchases'.
# createOrReplaceTempView() is the supported replacement for
# registerTempTable(), which has been deprecated since Spark 2.0; it also
# makes this cell safe to re-run because an existing view is replaced.
purchases.createOrReplaceTempView('purchases')

In [18]:
# Query the 'purchases' table with Spark SQL, keeping only the rows whose
# Host column equals 'user1.comcast.com', then display the result.
purchases_by_example2 = spark.sql(
    "select * from purchases where Host = 'user1.comcast.com'"
)
purchases_by_example2.show()

+------+-----------------+---------------+--------------+----------+--------------------+
|Accept|             Host|     User-Agent|    event_type|sword_type|           timestamp|
+------+-----------------+---------------+--------------+----------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|user1.comcast.com|ApacheBench/2.3|purchase_sword|  falchion|2020-12-04 20:54:...|
|   */*|us

In [19]:
# Convert the filtered Spark DataFrame into a pandas DataFrame.
# NOTE(review): toPandas() materializes the full result on the driver, so
# this is only appropriate for small result sets — confirm the size first.
df = purchases_by_example2.toPandas()

# Summary statistics of each column (count / unique / top / freq for strings).
df.describe()

Unnamed: 0,Accept,Host,User-Agent,event_type,sword_type,timestamp
count,10,10,10,10,10,10
unique,1,1,1,1,1,10
top,*/*,user1.comcast.com,ApacheBench/2.3,purchase_sword,falchion,2020-12-04 20:54:58.286
freq,10,10,10,10,10,1


## Simple Analytics

- How many events?
- How many join-guild events?
- How many events from each user?

### After Week 13

In [20]:
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, from_json
from pyspark.sql.types import StructType, StructField, StringType

In [21]:
def purchase_sword_event_schema():
    """Return the schema used to parse a purchase_sword event's JSON payload.

    root
    |-- Accept: string (nullable = true)
    |-- Host: string (nullable = true)
    |-- User-Agent: string (nullable = true)
    |-- event_type: string (nullable = true)
    |-- sword_type: string (nullable = true)

    Returns:
        A StructType with one nullable string field per JSON key above.

    Notes:
        - 'sword_type' is included because purchase events carry it; a key
          that is absent from the schema is silently dropped by from_json.
        - 'timestamp' is deliberately NOT part of this schema: it is not a
          key of the JSON payload but the Kafka record timestamp, which the
          streaming query selects separately. Adding it here would create a
          duplicate 'timestamp' column when the struct is expanded with
          'json.*'.
    """
    return StructType([
        StructField("Accept", StringType(), True),
        StructField("Host", StringType(), True),
        StructField("User-Agent", StringType(), True),
        StructField("event_type", StringType(), True),
        StructField("sword_type", StringType(), True),
    ])

In [22]:
@udf('boolean')
def is_sword_purchase(event_as_json):
    """UDF for filtering events: True only for 'purchase_sword' events.

    Args:
        event_as_json: the event payload as a JSON string (may be None when
            the Kafka message has a null value).

    Returns:
        True if the payload parses to a JSON object whose 'event_type' is
        'purchase_sword'; False otherwise.
    """
    try:
        event = json.loads(event_as_json)
    except (TypeError, ValueError):
        # TypeError: null payload; ValueError: malformed JSON
        # (json.JSONDecodeError subclasses ValueError). In a streaming query
        # an exception here would terminate the query, so a bad record is
        # filtered out instead.
        return False
    # Valid JSON is not necessarily an object, and an object may lack
    # 'event_type'; neither counts as a sword purchase.
    return isinstance(event, dict) and event.get('event_type') == 'purchase_sword'

In [23]:
# Open a streaming read on the Kafka topic "events".
# Note: this rebinds `raw_events`, now as a streaming DataFrame.
raw_events = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "events")
    .load()
)

In [24]:
# Keep only sword-purchase events, then project three things: the raw JSON
# string, the Kafka timestamp (as a string), and the payload parsed into a
# struct. The final select flattens the struct's fields into columns.
sword_purchases = (
    raw_events
    .filter(is_sword_purchase(raw_events.value.cast('string')))
    .select(
        raw_events.value.cast('string').alias('raw_event'),
        raw_events.timestamp.cast('string'),
        from_json(
            raw_events.value.cast('string'),
            purchase_sword_event_schema(),
        ).alias('json'),
    )
    .select('raw_event', 'timestamp', 'json.*')
)

In [25]:
# Start a streaming query that writes sword purchases as Parquet to
# /tmp/sword_purchases, triggering a micro-batch every 10 seconds and
# tracking progress in the given checkpoint directory.
sink = (
    sword_purchases.writeStream
    .format("parquet")
    .option("checkpointLocation", "/tmp/checkpoints_for_sword_purchases")
    .option("path", "/tmp/sword_purchases")
    .trigger(processingTime="10 seconds")
    .start()
)

In [26]:
# Stop the streaming query held in `sink`.
sink.stop()