### PySpark code to read from the 'orders' Kafka stream, preprocess it, and write to BigQuery

In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, concat, col, lit
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, DateType
from time import sleep

# Cluster address, application name and resource limits for this streaming job.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("assignment2_stream1")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine;
# getOrCreate() reuses an already-running session if there is one.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

In [3]:
from pyspark.sql import functions as f
from pyspark.sql.functions import from_csv

# Schema of one 'orders' record, declared before the stream is read.
# Every column is a nullable string here; the date-like columns are cast
# to DateType in the preprocessing step further down.
order_column_names = [
    "order_id",
    "customer_id",
    "order_status",
    "order_purchase_time",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]
orders_dataSchema = StructType(
    [StructField(column_name, StringType(), True) for column_name in order_column_names]
)

# Subscribe to the 'orders' Kafka topic as a *streaming* source.
# startingOffsets=earliest makes the query begin at the oldest offset still
# available in the topic, so records published before the query starts are read too.
kafkaStream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:9093")
    .option("subscribe", "orders")
    .option("startingOffsets", "earliest")
    .load()
)

# Keep only the Kafka message payload, converted to a string column named `value`.
df = kafkaStream.selectExpr("CAST(value AS STRING)")

# Parse each CSV line into a struct that follows orders_dataSchema.
# The struct gets an explicit alias so that the expansion below does not rely on
# Spark's auto-generated column name ("from_csv(value)"), which is an
# implementation detail of how Spark renders the expression.
df1 = df.select(from_csv(df.value, orders_dataSchema.simpleString()).alias("order"))

# Flatten the struct: one top-level column per schema field.
orders = df1.select("order.*")
orders.printSchema()

# Preprocessing: the date-like columns were parsed as strings; cast each one to DateType.
# NOTE(review): DateType holds a calendar date only — confirm that dropping any
# time-of-day part of these values is intended (TimestampType would keep it).
date_columns = [
    "order_purchase_time",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]
for date_column in date_columns:
    orders = orders.withColumn(date_column, orders[date_column].cast(DateType()))

orders.printSchema()


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_time: string (nullable = true)
 |-- order_approved_at: string (nullable = true)
 |-- order_delivered_carrier_date: string (nullable = true)
 |-- order_delivered_customer_date: string (nullable = true)
 |-- order_estimated_delivery_date: string (nullable = true)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_time: date (nullable = true)
 |-- order_approved_at: date (nullable = true)
 |-- order_delivered_carrier_date: date (nullable = true)
 |-- order_delivered_customer_date: date (nullable = true)
 |-- order_estimated_delivery_date: date (nullable = true)



### Saving to BigQuery as batch

In [4]:
# Required whenever GCS is used: register the Hadoop filesystem
# implementations that handle the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for hadoop_key, implementation in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(hadoop_key, implementation)

# Cloud Storage bucket the BigQuery connector uses for its temporary export data.
bucket = "de_jads_temp_2093373"
spark.conf.set('temporaryGcsBucket', bucket)

In [5]:
def my_foreach_batch_function(df, batch_id):
    """Append one micro-batch of the stream to the BigQuery orders table.

    Intended to be passed to ``DataStreamWriter.foreachBatch``, which calls it
    once per micro-batch.

    Args:
        df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch supplied by Spark (unused here).

    The write mode is "append": with "overwrite" every micro-batch would replace
    the whole table, so only the rows of the most recent batch would survive.

    NOTE(review): appending is not idempotent. The stream is read from the
    earliest offsets and the query is started without a checkpoint location,
    so re-running the notebook appends the same rows again — truncate the
    table or configure checkpointing before a re-run.
    """
    # Batch sink: the regular DataFrameWriter API (write / format / save) is used.
    (
        df.write.format('bigquery')
        .option('table', 'de2022-362707.assignment2.orders')
        .mode("append")
        .save()
    )

# Sink: each micro-batch is handed to my_foreach_batch_function, which writes it to a BigQuery table.
# Trigger: a ProcessingTime trigger with a 60-second micro-batch interval, chosen because the
# dataset is large and does not get updated within a 60-second timeframe.
# Output mode: append, because only new rows need to be appended to BigQuery and
# no aggregation over previous data is done.
orderQuery = (
    orders.writeStream
    .outputMode("append")
    .trigger(processingTime='60 seconds')
    .foreachBatch(my_foreach_batch_function)
    .start()
)

try:
    # Blocks the cell until the query terminates or the kernel is interrupted.
    orderQuery.awaitTermination()
except KeyboardInterrupt:
    # Kernel interrupt: shut down the streaming query first, then the Spark context.
    orderQuery.stop()
    spark.stop()
    print("Stoped the streaming query and the spark context")

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


Stoped the streaming query and the spark context


In [6]:
# Stop the Spark session/context. The KeyboardInterrupt handler of the streaming
# cell already calls spark.stop(); this cell presumably covers the case where the
# query ended without an interrupt.
spark.stop()