## Assignment 9.3 ##
### By Kurt Stoneburner ###

In [1]:
import os
import shutil
import json
from pathlib import Path

import pandas as pd

from kafka import KafkaProducer, KafkaAdminClient
from kafka.admin.new_topic import NewTopic
from kafka.errors import TopicAlreadyExistsError

from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark import SparkConf
from pyspark.sql.functions import window, from_json, col, expr, to_json, struct, when
from pyspark.sql.types import StringType, TimestampType, DoubleType, StructField, StructType
from pyspark.sql.functions import udf

current_dir = Path(os.getcwd()).absolute()
checkpoint_dir = current_dir.joinpath('checkpoints')
joined_checkpoint_dir = checkpoint_dir.joinpath('joined')
locations_checkpoint_dir = checkpoint_dir.joinpath('locations')
accelerations_checkpoint_dir = checkpoint_dir.joinpath('accelerations')

if joined_checkpoint_dir.exists():
    shutil.rmtree(joined_checkpoint_dir)

joined_checkpoint_dir.mkdir(parents=True, exist_ok=True)

### Configuration Parameters 

> **TODO:** Change the configuration prameters to the appropriate values for your setup.

In [4]:
config = dict(
    bootstrap_servers=['kafka.kafka.svc.cluster.local:9092'],
    first_name='Kurt',
    last_name='Stoneburner'
)

config['client_id'] = '{}{}'.format(
    config['last_name'], 
    config['first_name']
)
config['topic_prefix'] = '{}{}'.format(
    config['last_name'], 
    config['first_name']
)

config['locations_topic'] = '{}-locations'.format(config['topic_prefix'])
config['accelerations_topic'] = '{}-accelerations'.format(config['topic_prefix'])
config['joined_topic'] = '{}-joined'.format(config['topic_prefix'])

config

{'bootstrap_servers': ['kafka.kafka.svc.cluster.local:9092'],
 'first_name': 'Kurt',
 'last_name': 'Stoneburner',
 'client_id': 'StoneburnerKurt',
 'topic_prefix': 'StoneburnerKurt',
 'locations_topic': 'StoneburnerKurt-locations',
 'accelerations_topic': 'StoneburnerKurt-accelerations',
 'joined_topic': 'StoneburnerKurt-joined'}

### Create Topic Utility Function

The `create_kafka_topic` helps create a Kafka topic based on your configuration settings.  For instance, if your first name is *John* and your last name is *Doe*, `create_kafka_topic('locations')` will create a topic with the name `DoeJohn-locations`.  The function will not create the topic if it already exists. 

In [5]:
def create_kafka_topic(topic_name, config=config, num_partitions=1, replication_factor=1):
    bootstrap_servers = config['bootstrap_servers']
    client_id = config['client_id']
    topic_prefix = config['topic_prefix']
    name = '{}-{}'.format(topic_prefix, topic_name)
    
    admin_client = KafkaAdminClient(
        bootstrap_servers=bootstrap_servers, 
        client_id=client_id
    )
    
    topic = NewTopic(
        name=name,
        num_partitions=num_partitions,
        replication_factor=replication_factor
    )

    topic_list = [topic]
    try:
        admin_client.create_topics(new_topics=topic_list)
        print('Created topic "{}"'.format(name))
    except TopicAlreadyExistsError as e:
        print('Topic "{}" already exists'.format(name))

create_kafka_topic('joined')

Topic "StoneburnerKurt-joined" already exists


**TODO:** This code is identical to the code used in 9.1 to publish acceleration and location data to the `LastnameFirstname-simple` topic. You will need to add in the code you used to create the `df_accelerations` dataframe. In order to read data from this topic, make sure that you are running the notebook you created in assignment 8 that publishes acceleration and location data to the LastnameFirstname-simple topic.

In [6]:
spark = SparkSession\
    .builder\
    .appName("Assignment09")\
    .getOrCreate()

df_locations = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka.kafka.svc.cluster.local:9092") \
  .option("subscribe", config['locations_topic']) \
  .load()

## TODO: Add code to create the df_accelerations dataframe
df_accelerations = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka.kafka.svc.cluster.local:9092") \
  .option("subscribe", config['accelerations_topic']) \
  .load()


The following code defines a Spark schema for location and acceleration data as well as a user-defined function (UDF) for parsing the location and acceleration JSON data. 

In [7]:
location_schema = StructType([
    StructField('offset', DoubleType(), nullable=True),
    StructField('id', StringType(), nullable=True),
    StructField('ride_id', StringType(), nullable=True),
    StructField('uuid', StringType(), nullable=True),
    StructField('course', DoubleType(), nullable=True),
    StructField('latitude', DoubleType(), nullable=True),
    StructField('longitude', DoubleType(), nullable=True),
    StructField('geohash', StringType(), nullable=True),
    StructField('speed', DoubleType(), nullable=True),
    StructField('accuracy', DoubleType(), nullable=True),
])

acceleration_schema = StructType([
    StructField('offset', DoubleType(), nullable=True),
    StructField('id', StringType(), nullable=True),
    StructField('ride_id', StringType(), nullable=True),
    StructField('uuid', StringType(), nullable=True),
    StructField('x', DoubleType(), nullable=True),
    StructField('y', DoubleType(), nullable=True),
    StructField('z', DoubleType(), nullable=True),
])

udf_parse_acceleration = udf(lambda x: json.loads(x.decode('utf-8')), acceleration_schema)
udf_parse_location = udf(lambda x: json.loads(x.decode('utf-8')), location_schema)

**TODO:**  

* Complete the code to create the `accelerationsWithWatermark` dataframe. 
  * Select the `timestamp` field with the alias `acceleration_timestamp`
  * Use the `udf_parse_acceleration` UDF to parse the JSON values
  * Select the `ride_id` as `acceleration_ride_id`
  * Select the `x`, `y`, and `z` columns
  * Use the same watermark timespan used in the `locationsWithWatermark` dataframe

In [8]:
locationsWithWatermark = df_locations \
  .select(
    col('timestamp').alias('location_timestamp'), 
    udf_parse_location(df_locations['value']).alias('json_value')
   ) \
  .select(
    col('location_timestamp'), 
    col('json_value.ride_id').alias('location_ride_id'),
    col('json_value.speed').alias('speed'),
    col('json_value.latitude').alias('latitude'),
    col('json_value.longitude').alias('longitude'),
    col('json_value.geohash').alias('geohash'),
    col('json_value.accuracy').alias('accuracy')
  ) \
 .withWatermark('location_timestamp', "2 seconds")



In [14]:
df_accelerations.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [10]:
accelerationsWithWatermark = df_accelerations \
  .select(
    col('timestamp').alias('accelerations_timestamp'), 
    udf_parse_acceleration(df_accelerations['value']).alias('json_value')
   )  \
   .select(
        col('accelerations_timestamp'),
        col('json_value.ride_id').alias('acceleration_ride_id'),
        col('json_value.x').alias('x'),
        col('json_value.y').alias('y'),
        col('json_value.z').alias('z'),
)\
.withWatermark('accelerations_timestamp', "2 seconds")
accelerationsWithWatermark.printSchema()

root
 |-- accelerations_timestamp: timestamp (nullable = true)
 |-- acceleration_ride_id: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)



**TODO:**  

* Complete the code to create the `df_joined` dataframe. See http://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#stream-stream-joins for additional information. 

In [11]:
df_joined = ''
df_joined = locationsWithWatermark \
.join(accelerationsWithWatermark, 
      expr("""
      location_ride_id = acceleration_ride_id
      """
          )) \
.select(
    col('location_ride_id').alias('ride_id'),
    col('location_timestamp'),
    col('speed'),
    col('latitude'),
    col('longitude'),
    col('geohash'),
    col('accuracy'),
    col('accelerations_timestamp'),
    col('x'),
    col('y'),
    col('z'),
   
)



df_joined.printSchema()

root
 |-- ride_id: string (nullable = true)
 |-- location_timestamp: timestamp (nullable = true)
 |-- speed: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- geohash: string (nullable = true)
 |-- accuracy: double (nullable = true)
 |-- accelerations_timestamp: timestamp (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)



In [12]:
#df_joined = locationsWithWatermark \
#.join(accelerationsWithWatermark)
#df_joined.printSchema()

If you correctly created the `df_joined` dataframe, you should be able to use the following code to create a streaming query that outputs results to the `LastnameFirstname-joined` topic. 

In [13]:
ds_joined = df_joined \
  .withColumn(
    'value', 
    to_json(
        struct(
            'ride_id', 'location_timestamp', 'speed', 
            'latitude', 'longitude', 'geohash', 'accuracy', 
            'accelerations_timestamp', 'x', 'y', 'z'
        )
    )
    ).withColumn(
     'key', col('ride_id')
    ) \
  .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
  .writeStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka.kafka.svc.cluster.local:9092") \
  .option("topic", config['joined_topic']) \
  .option("checkpointLocation", str(joined_checkpoint_dir)) \
  .start()

try:
    ds_joined.awaitTermination()
except KeyboardInterrupt:
    print("STOPPING STREAMING DATA")

StreamingQueryException: Writing job aborted.
=== Streaming Query ===
Identifier: [id = 46058cd5-ae85-417e-bebc-70a3138b6ea4, runId = d25b0093-fa9f-40c3-8638-34efab131f8a]
Current Committed Offsets: {KafkaV2[Subscribe[StoneburnerKurt-locations]]: {"StoneburnerKurt-locations":{"0":10705}},KafkaV2[Subscribe[StoneburnerKurt-accelerations]]: {"StoneburnerKurt-accelerations":{"0":10602}}}
Current Available Offsets: {KafkaV2[Subscribe[StoneburnerKurt-locations]]: {"StoneburnerKurt-locations":{"0":10706}},KafkaV2[Subscribe[StoneburnerKurt-accelerations]]: {"StoneburnerKurt-accelerations":{"0":10602}}}

Current State: ACTIVE
Thread State: RUNNABLE

Logical Plan:
WriteToMicroBatchDataSource org.apache.spark.sql.kafka010.KafkaStreamingWrite@5f1eb19a
+- Project [cast(key#145 as string) AS key#159, cast(value#132 as string) AS value#160]
   +- Project [ride_id#120, location_timestamp#42-T2000ms, speed#48, latitude#49, longitude#50, geohash#51, accuracy#52, accelerations_timestamp#66-T2000ms, x#72, y#73, z#74, value#132, ride_id#120 AS key#145]
      +- Project [ride_id#120, location_timestamp#42-T2000ms, speed#48, latitude#49, longitude#50, geohash#51, accuracy#52, accelerations_timestamp#66-T2000ms, x#72, y#73, z#74, to_json(struct(ride_id, ride_id#120, location_timestamp, location_timestamp#42-T2000ms, speed, speed#48, latitude, latitude#49, longitude, longitude#50, geohash, geohash#51, accuracy, accuracy#52, accelerations_timestamp, accelerations_timestamp#66-T2000ms, x, x#72, y, y#73, z, z#74), Some(Etc/UTC)) AS value#132]
         +- Project [location_ride_id#47 AS ride_id#120, location_timestamp#42-T2000ms, speed#48, latitude#49, longitude#50, geohash#51, accuracy#52, accelerations_timestamp#66-T2000ms, x#72, y#73, z#74]
            +- Join Inner, (location_ride_id#47 = acceleration_ride_id#71)
               :- EventTimeWatermark location_timestamp#42: timestamp, 2 seconds
               :  +- Project [location_timestamp#42, json_value#44.ride_id AS location_ride_id#47, json_value#44.speed AS speed#48, json_value#44.latitude AS latitude#49, json_value#44.longitude AS longitude#50, json_value#44.geohash AS geohash#51, json_value#44.accuracy AS accuracy#52]
               :     +- Project [timestamp#12 AS location_timestamp#42, <lambda>(value#8) AS json_value#44]
               :        +- StreamingDataSourceV2Relation [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@5788f11c, KafkaV2[Subscribe[StoneburnerKurt-locations]]
               +- EventTimeWatermark accelerations_timestamp#66: timestamp, 2 seconds
                  +- Project [accelerations_timestamp#66, json_value#68.ride_id AS acceleration_ride_id#71, json_value#68.x AS x#72, json_value#68.y AS y#73, json_value#68.z AS z#74]
                     +- Project [timestamp#33 AS accelerations_timestamp#66, <lambda>(value#29) AS json_value#68]
                        +- StreamingDataSourceV2Relation [key#28, value#29, topic#30, partition#31, offset#32L, timestamp#33, timestampType#34], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@791cfe1e, KafkaV2[Subscribe[StoneburnerKurt-accelerations]]
