In [None]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf

In [None]:
@udf('string')
def munge_event(event_as_json):
    """Overwrite two fields of a JSON-encoded event and re-serialize it.

    Args:
        event_as_json: JSON text of a single event object.

    Returns:
        JSON text of the same event with ``Host`` set to ``"moe"`` and
        ``Cache-Control`` set to ``"no-cache"``; all other keys are kept.
    """
    parsed = json.loads(event_as_json)
    # Apply the overrides in a fixed order so new keys are appended in the
    # same order as before.
    for field, forced_value in (('Host', "moe"), ('Cache-Control', "no-cache")):
        parsed[field] = forced_value
    return json.dumps(parsed)

In [None]:
# Batch-read the "events" Kafka topic, bounded by the explicit
# startingOffsets / endingOffsets options below.
raw_events = (
    spark.read
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka:29092",
        "subscribe": "events",
        "startingOffsets": "earliest",
        "endingOffsets": "latest",
    })
    .load()
)

In [None]:
# Decode the Kafka payload to text ('raw'), keep the message timestamp as a
# string, and add a 'munged' column computed by the munge_event UDF.
munged_events = (
    raw_events
    .select(
        raw_events['value'].cast('string').alias('raw'),
        raw_events['timestamp'].cast('string'),
    )
    .withColumn('munged', munge_event('raw'))
)

In [None]:
# Preview the munged events; show() is an action, so this runs the read and the UDF.
munged_events.show()

In [None]:
# Expand each munged JSON document into one column per JSON key, alongside
# the message timestamp, by mapping over the underlying RDD.
extracted_events = (
    munged_events.rdd
    .map(lambda rec: Row(timestamp=rec.timestamp, **json.loads(rec.munged)))
    .toDF()
)

In [None]:
# Keep only the rows whose event_type is 'purchase_sword'.
sword_purchases = extracted_events.where(
    extracted_events['event_type'] == 'purchase_sword'
)

In [None]:
# Preview the rows selected as sword purchases.
sword_purchases.show()

In [None]:
# Keep only the rows whose event_type is 'default'.
default_hits = extracted_events.where(
    extracted_events['event_type'] == 'default'
)

In [None]:
# Preview the rows selected as 'default' events.
default_hits.show()

# New in Week 12

In [1]:
import json
from pyspark.sql import Row
from pyspark.sql.functions import udf

In [2]:
@udf('boolean')
def is_buy_sword(event_as_json):
    """Return True when the JSON-encoded event is a ``buy_sword`` event.

    Args:
        event_as_json: JSON text of a single event, or None when the
            column value is null.

    Returns:
        True if the decoded event is a JSON object whose ``event_type`` is
        ``'buy_sword'``; False otherwise, including for null input and for
        events that carry no ``event_type`` key.
    """
    # Null column values reach a Python UDF as None, and json.loads(None)
    # raises; treat a missing payload as "not a match" instead.
    if event_as_json is None:
        return False
    event = json.loads(event_as_json)
    # .get avoids a KeyError (which would fail the whole job) on events
    # without an 'event_type' field; non-object JSON never matches.
    return isinstance(event, dict) and event.get('event_type') == 'buy_sword'

In [4]:
@udf('boolean')
def is_join_guild(event_as_json):
    """Return True when the JSON-encoded event is a ``join_guild`` event.

    Args:
        event_as_json: JSON text of a single event, or None when the
            column value is null.

    Returns:
        True if the decoded event is a JSON object whose ``event_type`` is
        ``'join_guild'``; False otherwise, including for null input and for
        events that carry no ``event_type`` key.
    """
    # Null column values reach a Python UDF as None, and json.loads(None)
    # raises; treat a missing payload as "not a match" instead.
    if event_as_json is None:
        return False
    event = json.loads(event_as_json)
    # .get avoids a KeyError (which would fail the whole job) on events
    # without an 'event_type' field; non-object JSON never matches.
    return isinstance(event, dict) and event.get('event_type') == 'join_guild'

In [5]:
# Batch-read the "events" Kafka topic, bounded by the explicit
# startingOffsets / endingOffsets options below.
raw_events = (
    spark.read
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka:29092",
        "subscribe": "events",
        "startingOffsets": "earliest",
        "endingOffsets": "latest",
    })
    .load()
)

In [6]:
# Preview the first 5 raw Kafka records (value is still binary at this point).
raw_events.show(5)

+----+--------------------+------+---------+------+--------------------+-------------+
| key|               value| topic|partition|offset|           timestamp|timestampType|
+----+--------------------+------+---------+------+--------------------+-------------+
|null|[7B 22 48 6F 73 7...|events|        0|     0|2021-04-07 23:17:...|            0|
|null|[7B 22 48 6F 73 7...|events|        0|     1|2021-04-07 23:17:...|            0|
|null|[7B 22 48 6F 73 7...|events|        0|     2|2021-04-07 23:17:...|            0|
|null|[7B 22 48 6F 73 7...|events|        0|     3|2021-04-07 23:17:...|            0|
|null|[7B 22 48 6F 73 7...|events|        0|     4|2021-04-07 23:17:...|            0|
+----+--------------------+------+---------+------+--------------------+-------------+
only showing top 5 rows



### Join Guild Events

In [7]:
# Decode the Kafka payload to text ('raw'), keep the message timestamp as a
# string, and retain only the rows the is_join_guild UDF accepts.
join_guild_events = (
    raw_events
    .select(
        raw_events['value'].cast('string').alias('raw'),
        raw_events['timestamp'].cast('string'),
    )
    .where(is_join_guild('raw'))
)

In [8]:
# Preview the filtered join_guild events (raw JSON text plus timestamp).
join_guild_events.show()

+--------------------+--------------------+
|                 raw|           timestamp|
+--------------------+--------------------+
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07 23:17:...|
|{"Accept": "*/*",...|2021-04-07

In [9]:
# Expand each raw JSON document into one column per JSON key, alongside the
# message timestamp, by mapping over the underlying RDD.
extracted_join_guild_events = (
    join_guild_events.rdd
    .map(lambda rec: Row(timestamp=rec.timestamp, **json.loads(rec.raw)))
    .toDF()
)

In [10]:
# Preview the first 5 extracted join_guild rows, now one column per JSON key.
extracted_join_guild_events.show(5)

+------+-----------------+---------------+----------+----------+--------------------+
|Accept|             Host|     User-Agent|event_type|guild_type|           timestamp|
+------+-----------------+---------------+----------+----------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
+------+-----------------+---------------+----------+----------+--------------------+
only showing top 5 rows



In [11]:
# Write to HDFS as Parquet, replacing any previous output at this path.
extracted_join_guild_events.write.parquet('/tmp/join_guild', mode='overwrite')

In [12]:
# Read back the Parquet data that was just written to HDFS.
join_guild = (
    spark.read
    .format('parquet')
    .load('/tmp/join_guild')
)

In [14]:
# Preview the first 5 rows read back from Parquet to confirm the round trip.
join_guild.show(5)

+------+-----------------+---------------+----------+----------+--------------------+
|Accept|             Host|     User-Agent|event_type|guild_type|           timestamp|
+------+-----------------+---------------+----------+----------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
+------+-----------------+---------------+----------+----------+--------------------+
only showing top 5 rows



In [15]:
# Expose the DataFrame to Spark SQL under the name 'join_guild'.
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0; the registered name is unchanged.
join_guild.createOrReplaceTempView('join_guild')

In [16]:
# Query the 'join_guild' table with Spark SQL, keeping only the rows whose
# Host is 'user1.comcast.com'.
join_guild_spark_df = spark.sql("select * from join_guild where Host = 'user1.comcast.com'")

In [17]:
# Preview the SQL query result.
join_guild_spark_df.show()

+------+-----------------+---------------+----------+----------+--------------------+
|Accept|             Host|     User-Agent|event_type|guild_type|           timestamp|
+------+-----------------+---------------+----------+----------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild|     smith|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3|join_guild| 

In [18]:
# Convert the Spark query result into a pandas DataFrame (toPandas collects
# the rows to the driver), then display its first rows.
join_guild_df = join_guild_spark_df.toPandas()
join_guild_df.head()

Unnamed: 0,Accept,Host,User-Agent,event_type,guild_type,timestamp
0,*/*,user1.comcast.com,ApacheBench/2.3,join_guild,smith,2021-04-07 23:17:35.622
1,*/*,user1.comcast.com,ApacheBench/2.3,join_guild,smith,2021-04-07 23:17:35.626
2,*/*,user1.comcast.com,ApacheBench/2.3,join_guild,smith,2021-04-07 23:17:35.63
3,*/*,user1.comcast.com,ApacheBench/2.3,join_guild,smith,2021-04-07 23:17:35.634
4,*/*,user1.comcast.com,ApacheBench/2.3,join_guild,smith,2021-04-07 23:17:35.64


In [19]:
# Summarize every column of the pandas DataFrame (count / unique / top / freq).
join_guild_df.describe()

Unnamed: 0,Accept,Host,User-Agent,event_type,guild_type,timestamp
count,10,10,10,10,10,10
unique,1,1,1,1,1,10
top,*/*,user1.comcast.com,ApacheBench/2.3,join_guild,smith,2021-04-07 23:17:35.626
freq,10,10,10,10,10,1


### Buy Sword Events

In [21]:
# Decode the Kafka payload to text ('raw'), keep the message timestamp as a
# string, and retain only the rows the is_buy_sword UDF accepts.
buy_sword_events = (
    raw_events
    .select(
        raw_events['value'].cast('string').alias('raw'),
        raw_events['timestamp'].cast('string'),
    )
    .where(is_buy_sword('raw'))
)

In [23]:
# Preview the first 5 filtered buy_sword events (raw JSON text plus timestamp).
buy_sword_events.show(5)

+--------------------+--------------------+
|                 raw|           timestamp|
+--------------------+--------------------+
|{"Host": "user1.c...|2021-04-07 23:17:...|
|{"Host": "user1.c...|2021-04-07 23:17:...|
|{"Host": "user1.c...|2021-04-07 23:17:...|
|{"Host": "user1.c...|2021-04-07 23:17:...|
|{"Host": "user1.c...|2021-04-07 23:17:...|
+--------------------+--------------------+
only showing top 5 rows



In [24]:
# Expand each raw JSON document into one column per JSON key, alongside the
# message timestamp, by mapping over the underlying RDD.
extracted_buy_sword_events = (
    buy_sword_events.rdd
    .map(lambda rec: Row(timestamp=rec.timestamp, **json.loads(rec.raw)))
    .toDF()
)

In [25]:
# Preview the first 5 extracted buy_sword rows, now one column per JSON key.
extracted_buy_sword_events.show(5)

+------+-----------------+---------------+----------+----------+--------------------+
|Accept|             Host|     User-Agent|event_type|sword_type|           timestamp|
+------+-----------------+---------------+----------+----------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
+------+-----------------+---------------+----------+----------+--------------------+
only showing top 5 rows



In [26]:
# Write to HDFS as Parquet, replacing any previous output at this path.
extracted_buy_sword_events.write.parquet('/tmp/buy_sword', mode='overwrite')

In [27]:
# Read back the Parquet data that was just written to HDFS.
buy_sword = (
    spark.read
    .format('parquet')
    .load('/tmp/buy_sword')
)

In [29]:
# Preview the first 5 rows read back from Parquet to confirm the round trip.
buy_sword.show(5)

+------+-----------------+---------------+----------+----------+--------------------+
|Accept|             Host|     User-Agent|event_type|sword_type|           timestamp|
+------+-----------------+---------------+----------+----------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
+------+-----------------+---------------+----------+----------+--------------------+
only showing top 5 rows



In [30]:
# Expose the DataFrame to Spark SQL under the name 'buy_sword'.
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0; the registered name is unchanged.
buy_sword.createOrReplaceTempView('buy_sword')

In [31]:
# Query the 'buy_sword' table with Spark SQL, keeping only the rows whose
# Host is 'user1.comcast.com'.
buy_sword_spark_df = spark.sql("select * from buy_sword where Host = 'user1.comcast.com'")

In [33]:
# Preview the first 5 rows of the SQL query result.
buy_sword_spark_df.show(5)

+------+-----------------+---------------+----------+----------+--------------------+
|Accept|             Host|     User-Agent|event_type|sword_type|           timestamp|
+------+-----------------+---------------+----------+----------+--------------------+
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
|   */*|user1.comcast.com|ApacheBench/2.3| buy_sword|     short|2021-04-07 23:17:...|
+------+-----------------+---------------+----------+----------+--------------------+
only showing top 5 rows



In [34]:
# Convert the Spark query result into a pandas DataFrame (toPandas collects
# the rows to the driver), then display its first rows.
buy_sword_df = buy_sword_spark_df.toPandas()
buy_sword_df.head()

Unnamed: 0,Accept,Host,User-Agent,event_type,sword_type,timestamp
0,*/*,user1.comcast.com,ApacheBench/2.3,buy_sword,short,2021-04-07 23:17:29.199
1,*/*,user1.comcast.com,ApacheBench/2.3,buy_sword,short,2021-04-07 23:17:29.203
2,*/*,user1.comcast.com,ApacheBench/2.3,buy_sword,short,2021-04-07 23:17:29.21
3,*/*,user1.comcast.com,ApacheBench/2.3,buy_sword,short,2021-04-07 23:17:29.214
4,*/*,user1.comcast.com,ApacheBench/2.3,buy_sword,short,2021-04-07 23:17:29.218


In [35]:
# Summarize every column of the pandas DataFrame (count / unique / top / freq).
buy_sword_df.describe()

Unnamed: 0,Accept,Host,User-Agent,event_type,sword_type,timestamp
count,10,10,10,10,10,10
unique,1,1,1,1,1,10
top,*/*,user1.comcast.com,ApacheBench/2.3,buy_sword,short,2021-04-07 23:17:29.21
freq,10,10,10,10,10,1


# New in Week 13

# Simply Analytics