# 1. Imports

In [1]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import json
import logging
import pickle
import time
import datetime

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("consumer")


config = json.load(open("config.json"))
print(json.dumps(config, indent=2))

{
  "global": {
    "kafka_bootstrap_servers": "kafka:9092",
    "kafka_topic": "test-structured-streaming",
    "kafka_consumer_group": "ss_job",
    "max_records_per_batch": 20
  }
}


In [2]:
kafka_packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{"2.12"}:{"3.3.0"}',
    'org.apache.kafka:kafka-clients:2.8.0',
    "org.apache.hadoop:hadoop-aws:3.3.0",
    "com.google.guava:guava:21.0",
    "org.apache.httpcomponents:httpcore:4.4.8"
]

# 2. Initialize Spark with Kafa Consumer

In [3]:
# Create Spark Configuration and set application name
conf = SparkConf().setAppName("KafkaExp")

# Default pyspark installation lacks kafka consumer libraries. Install kafka-client libs manually
kafka_packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{"2.12"}:{"3.3.0"}',
    'org.apache.kafka:kafka-clients:2.8.0',
    "org.apache.hadoop:hadoop-aws:3.3.0",
    "com.google.guava:guava:21.0",
    "org.apache.httpcomponents:httpcore:4.4.8"
]

# Provide kafka jar paths to driver and executors
kafka_jar_paths = '/mnt/home/prathyush/.ivy2/jars/'.join([
    "org.apache.hadoop_hadoop-client-runtime-3.3.2.jar",
    "org.apache.kafka_kafka-clients-2.8.1.jar",
    "hadoop-aws-2.7.5.jar",
    "aws-java-sdk-core-1.12.268.jar"
])

# Connect to Spark cluster (Cluster mode instead of local mode)
conf = (conf.setMaster('spark://spark:7077')
        .set('spark.jars.packages', ','.join(kafka_packages))
        .set('spark.driver.extraClassPath', '/mnt/home/prathyush/.ivy2/jars/*')
        .set('spark.executor.extraClassPath', '/mnt/home/prathyush/.ivy2/jars/*')
        )

# Create spark context
sc = SparkContext(conf=conf)

logger.info(f"Spark Driver memory: {sc._conf.get('spark.driver.memory')}")
logger.info(f"Spark Executor memory: {sc._conf.get('spark.executor.memory')}")
logger.info(
    f'Loaded jars:\n{json.dumps((sc._jsc.sc().listJars().toList().toString().replace("List(", "").replace(")", "").split(", ")), indent=2)}')
sc.setLogLevel("ERROR")

# Create spark session
spark = SparkSession(sc)
spark.conf.set("spark.sql.parquet.compression.codec", "gzip")
spark.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
spark.conf.set("parquet.enable.summary-metadata", "false")

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.google.guava#guava added as a dependency
org.apache.httpcomponents#httpcore added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6f782454-06ad-4326-9f34-8e1c0d277c00;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.3.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.3.0 in central
	found org.apache.kafka#kafka-clients;2.8.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.32 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.2 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.

22/12/16 07:22:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
INFO:consumer:Spark Driver memory: None
INFO:consumer:Spark Executor memory: None
INFO:consumer:Loaded jars:
[
  "spark://b4d43d382907:46871/jars/org.xerial.snappy_snappy-java-1.1.8.4.jar",
  "spark://b4d43d382907:46871/jars/org.lz4_lz4-java-1.8.0.jar",
  "spark://b4d43d382907:46871/jars/org.wildfly.openssl_wildfly-openssl-1.0.7.Final.jar",
  "spark://b4d43d382907:46871/jars/com.amazonaws_aws-java-sdk-bundle-1.11.563.jar",
  "spark://b4d43d382907:46871/jars/org.apache.spark_spark-sql-kafka-0-10_2.12-3.3.0.jar",
  "spark://b4d43d382907:46871/jars/commons-logging_commons-logging-1.1.3.jar",
  "spark://b4d43d382907:46871/jars/com.google.code.findbugs_jsr305-3.0.0.jar",
  "spark://b4d43d382907:46871/jars/org.apache.kafka_kafka-clients-2.8.1.jar",
  "spark://b4d43d382907:46871/jars/org.spark-project.spark_unused-1.0.0.jar",
  "spark://b4d43d382907:46871/jars/org.

# 3. Test Kafka topic and connection

In [5]:
from confluent_kafka.admin import AdminClient

def test_kafka_connection(broker_conf:dict) -> None:
    """
    Function to test kafka connection
    :param broker_conf: Broker configuration
    :returns: None
    """
    client = AdminClient(broker_conf)
    topics = client.list_topics().topics
    if not topics:
        raise RuntimeError()
    print("Kafka Connection successful!")


broker_conf = {
    'bootstrap.servers': config["global"]["kafka_bootstrap_servers"]
}

# Test kafka connection
test_kafka_connection(broker_conf)


Kafka Connection successful!


# 5. Configure Spark-Kafka consumer options and Subscribe to Kafka Topic

In [6]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, LongType, ArrayType,FloatType,DoubleType
schema = StructType().add("VendorID", StringType(), True)\
                    .add("total_amount",FloatType(),True)\
                    .add("trip_distancet",DoubleType(),True)

In [7]:
spark.sparkContext.setLogLevel("ERROR")

# Configure spark kafka client options
spark_kafka_options = {
    "kafka.bootstrap.servers": config["global"]["kafka_bootstrap_servers"],
    "subscribe": config["global"]["kafka_topic"],
    "kafka.group.id": config["global"]["kafka_consumer_group"],
    "maxOffsetsPerTrigger": config["global"]["max_records_per_batch"],
    "startingOffsets": "earliest",
}

# Enable spark read stream
df = spark.readStream.format("kafka").options(**spark_kafka_options).load()

In [None]:
df.write.format("mongo").mode("append").option("uri",mongoURL).save()

# 6. Start spark structred streaming job

In [9]:
def min_max(awards_year):
    temp = []
    if awards_year is None:
        return []
    for year in awards_year:
        temp.append(int(year))
    return [min(temp),max(temp)]




In [12]:
# Lambda Function for processing each batch of record
def process_batch(batch_df, batch_idx):
    print(f"{batch_idx} | {batch_df.count()}")
    batch_df = batch_df.selectExpr("CAST(value AS STRING)").select(from_json("value", schema).alias("data"))
    batch_df = batch_df.selectExpr("data.name","data.contribs","data.awards","CAST(data.birth AS DATE) as birth","CAST(data.death AS DATE) as death")
    batch_df = batch_df.withColumn('death', F.when(F.col('death').isNull(), datetime.datetime.now().date()).otherwise(F.col('death')))
    batch_df= batch_df.withColumn("age", F.year(F.col("death"))-F.year(F.col("birth")))
    convertUDF = F.udf(lambda z: min_max(z))

    batch_df = batch_df.select(F.col("name"), F.col("age"),F.size(F.col("contribs")).alias("num_contribs"), convertUDF(F.col("awards.year")).alias("min_max"))
 
    # batch_df.write.mode('append').parquet("parqi.parquet")
    batch_df.write.mode('append').format("console")
    batch_df.collect()
    batch_df.show()
    return batch_df

# Structred streaming query

query = df.writeStream.trigger(processingTime='10 seconds').foreachBatch(process_batch).start()

INFO:py4j.clientserver:Python Server ready to receive messages
INFO:py4j.clientserver:Received command c on object id p1


0 | 20


                                                                                

+--------------------+---+------------+------------+
|                name|age|num_contribs|     min_max|
+--------------------+---+------------+------------+
|{null, John, Backus}| 83|           4|[1967, 1993]|
|{null, John, McCa...| 84|           3|[1971, 1990]|
|{null, Grace, Hop...| 86|           4|[1969, 1991]|
|{null, Kristen, N...| 76|           2|[1999, 2001]|
|{null, Ole-Johan,...| 71|           2|[1999, 2001]|
|{null, Guido, van...| 66|           1|[2001, 2003]|
|{null, Dennis, Ri...| 70|           2|[1983, 2011]|
|{Matz, Yukihiro, ...| 57|           1|[2011, 2011]|
|{null, James, Gos...| 67|           1|[2002, 2007]|
|{null, Martin, Od...| 67|           1|          []|
|{null, John, Backus}| 83|           4|[1967, 1993]|
|{null, John, McCa...| 84|           3|[1971, 1990]|
|{null, Grace, Hop...| 86|           4|[1969, 1991]|
|{null, Kristen, N...| 76|           2|[1999, 2001]|
|{null, Ole-Johan,...| 71|           2|[1999, 2001]|
|{null, Guido, van...| 66|           1|[2001, 

INFO:py4j.clientserver:Received command c on object id p1


1 | 20
+--------------------+---+------------+------------+
|                name|age|num_contribs|     min_max|
+--------------------+---+------------+------------+
|{null, John, Backus}| 83|           4|[1967, 1993]|
|{null, John, McCa...| 84|           3|[1971, 1990]|
|{null, Grace, Hop...| 86|           4|[1969, 1991]|
|{null, Kristen, N...| 76|           2|[1999, 2001]|
|{null, Ole-Johan,...| 71|           2|[1999, 2001]|
|{null, Guido, van...| 66|           1|[2001, 2003]|
|{null, Dennis, Ri...| 70|           2|[1983, 2011]|
|{Matz, Yukihiro, ...| 57|           1|[2011, 2011]|
|{null, James, Gos...| 67|           1|[2002, 2007]|
|{null, Martin, Od...| 67|           1|          []|
|{null, John, Backus}| 83|           4|[1967, 1993]|
|{null, John, McCa...| 84|           3|[1971, 1990]|
|{null, Grace, Hop...| 86|           4|[1969, 1991]|
|{null, Kristen, N...| 76|           2|[1999, 2001]|
|{null, Ole-Johan,...| 71|           2|[1999, 2001]|
|{null, Guido, van...| 66|           1|

# 7. Monitor structred streaming job progress

In [13]:
# Add startup delay
time.sleep(5)
# Update Job Status

print(query.status)
while query.status['isDataAvailable'] or query.status['isTriggerActive']:
    print(query.status)
    time.sleep(5)

# Stop query
query.stop()

logger.info("Structred streaming job completed successfully")

INFO:py4j.clientserver:Received command c on object id p1


2 | 19
+--------------------+---+------------+------------+
|                name|age|num_contribs|     min_max|
+--------------------+---+------------+------------+
|{null, John, Backus}| 83|           4|[1967, 1993]|
|{null, John, McCa...| 84|           3|[1971, 1990]|
|{null, Grace, Hop...| 86|           4|[1969, 1991]|
|{null, Kristen, N...| 76|           2|[1999, 2001]|
|{null, Ole-Johan,...| 71|           2|[1999, 2001]|
|{null, Guido, van...| 66|           1|[2001, 2003]|
|{null, Dennis, Ri...| 70|           2|[1983, 2011]|
|{Matz, Yukihiro, ...| 57|           1|[2011, 2011]|
|{null, James, Gos...| 67|           1|[2002, 2007]|
|{null, Martin, Od...| 67|           1|          []|
|{null, John, Backus}| 83|           4|[1967, 1993]|
|{null, John, McCa...| 84|           3|[1971, 1990]|
|{null, Grace, Hop...| 86|           4|[1969, 1991]|
|{null, Ole-Johan,...| 71|           2|[1999, 2001]|
|{null, Guido, van...| 66|           1|[2001, 2003]|
|{null, Dennis, Ri...| 70|           2|

INFO:py4j.clientserver:Received command c on object id p1


3 | 2
+--------------------+---+------------+------------+
|                name|age|num_contribs|     min_max|
+--------------------+---+------------+------------+
|{null, John, Backus}| 83|           4|[1967, 1993]|
|{null, John, McCa...| 84|           3|[1971, 1990]|
+--------------------+---+------------+------------+

{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}
{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}


INFO:py4j.clientserver:Received command c on object id p1


4 | 2


                                                                                

+--------------------+---+------------+------------+
|                name|age|num_contribs|     min_max|
+--------------------+---+------------+------------+
|{null, Grace, Hop...| 86|           4|[1969, 1991]|
|{null, Kristen, N...| 76|           2|[1999, 2001]|
+--------------------+---+------------+------------+

{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}
{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}


INFO:py4j.clientserver:Received command c on object id p1


5 | 3
+--------------------+---+------------+------------+
|                name|age|num_contribs|     min_max|
+--------------------+---+------------+------------+
|{null, Ole-Johan,...| 71|           2|[1999, 2001]|
|{null, Guido, van...| 66|           1|[2001, 2003]|
|{null, Dennis, Ri...| 70|           2|[1983, 2011]|
+--------------------+---+------------+------------+

{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}
{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}


INFO:py4j.clientserver:Received command c on object id p1


6 | 2
+--------------------+---+------------+------------+
|                name|age|num_contribs|     min_max|
+--------------------+---+------------+------------+
|{Matz, Yukihiro, ...| 57|           1|[2011, 2011]|
|{null, James, Gos...| 67|           1|[2002, 2007]|
+--------------------+---+------------+------------+

{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}
{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}


INFO:py4j.clientserver:Received command c on object id p1


7 | 2
+--------------------+---+------------+------------+
|                name|age|num_contribs|     min_max|
+--------------------+---+------------+------------+
|{null, Martin, Od...| 67|           1|          []|
|{null, John, Backus}| 83|           4|[1967, 1993]|
+--------------------+---+------------+------------+

{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}
{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}


INFO:py4j.clientserver:Received command c on object id p1


8 | 3


                                                                                

+--------------------+---+------------+------------+
|                name|age|num_contribs|     min_max|
+--------------------+---+------------+------------+
|{null, John, McCa...| 84|           3|[1971, 1990]|
|{null, Grace, Hop...| 86|           4|[1969, 1991]|
|{null, Kristen, N...| 76|           2|[1999, 2001]|
+--------------------+---+------------+------------+

{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}
{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}


INFO:py4j.clientserver:Received command c on object id p1


9 | 1
+--------------------+---+------------+------------+
|                name|age|num_contribs|     min_max|
+--------------------+---+------------+------------+
|{null, Ole-Johan,...| 71|           2|[1999, 2001]|
+--------------------+---+------------+------------+

{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}
{'message': 'Waiting for next trigger', 'isDataAvailable': True, 'isTriggerActive': False}


KeyboardInterrupt: 

INFO:py4j.clientserver:Received command c on object id p1


10 | 3


                                                                                

+--------------------+---+------------+------------+
|                name|age|num_contribs|     min_max|
+--------------------+---+------------+------------+
|{null, Guido, van...| 66|           1|[2001, 2003]|
|{null, Dennis, Ri...| 70|           2|[1983, 2011]|
|{Matz, Yukihiro, ...| 57|           1|[2011, 2011]|
+--------------------+---+------------+------------+



INFO:py4j.clientserver:Received command c on object id p1


11 | 1


                                                                                

+--------------------+---+------------+------------+
|                name|age|num_contribs|     min_max|
+--------------------+---+------------+------------+
|{null, James, Gos...| 67|           1|[2002, 2007]|
+--------------------+---+------------+------------+



In [12]:
df = spark.read.parquet("parqi.parquet")
df.show()

                                                                                

+--------------------+---+------------+------------+
|                name|age|num_contribs|     min_max|
+--------------------+---+------------+------------+
|{null, John, Backus}| 83|           4|[1967, 1993]|
|{null, John, McCa...| 84|           3|[1971, 1990]|
|{null, Grace, Hop...| 86|           4|[1969, 1991]|
|{null, Kristen, N...| 76|           2|[1999, 2001]|
|{null, Ole-Johan,...| 71|           2|[1999, 2001]|
|{null, Guido, van...| 66|           1|[2001, 2003]|
|{null, Dennis, Ri...| 70|           2|[1983, 2011]|
|{Matz, Yukihiro, ...| 57|           1|[2011, 2011]|
|{null, James, Gos...| 67|           1|[2002, 2007]|
|{null, Martin, Od...| 67|           1|          []|
+--------------------+---+------------+------------+

