In [1]:
from caseconverter import snakecase
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DateType, DecimalType, IntegerType, LongType, StringType, StructField, StructType

In [2]:
spark = (
    SparkSession.builder.appName(
        "SampleEtlJob2"
    )
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql_2.12:3.1.2,org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2,com.datastax.spark:spark-cassandra-connector_2.12:3.1.0",
    )
    .config(
        "spark.sql.streaming.checkpointLocation",
        "/tmp/speak_streaming/checkpoint",
    )
    .master(
        "spark://spark:7077"
    )
    .getOrCreate()
)



:: loading settings :: url = jar:file:/usr/local/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.spark#spark-sql_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
com.datastax.spark#spark-cassandra-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7fbade6d-ffa9-410f-b976-78aaadb94c0c;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.1.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.1.2 in central
	found org.apache.kafka#kafka-clients;2.6.0 in central
	found com.github.luben#zstd-jni;1.4.8-1 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.2 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
	found com.datastax.spark#spark-cassandra-connector_2.12;3.1.0 in 

In [3]:
catalog_df = spark.read.options(header=True, delimiter="|").csv("/mounts/spark-share/Products1.txt")

# replace missing itemType values with "OTHER"
catalog_df = catalog_df.fillna("OTHER", ["itemType"])

for col in catalog_df.columns:
    # capitalize all string fields
    catalog_df = catalog_df.withColumn(col, F.upper(catalog_df[col]))
    
    # convert column titles to snake case
    catalog_df = catalog_df.withColumnRenamed(col, snakecase(col))

# we will be splitting the incoming stream by manufacturer, need to replace invalid characters
catalog_df = catalog_df.withColumn("manufacturer", F.regexp_replace("manufacturer", "^", "VENDOR_"))
catalog_df = catalog_df.withColumn("manufacturer", F.regexp_replace("manufacturer", "\\s+", "_"))
catalog_df = catalog_df.withColumn("manufacturer", F.regexp_replace("manufacturer", "&", "AND"))
catalog_df = catalog_df.withColumn("manufacturer", F.regexp_replace("manufacturer", "'", ""))

catalog_df.show()

                                                                                

+-------------------+--------------------+--------+--------------+--------+----------+
|       manufacturer|        product_name|    size|     item_type|     sku|base_price|
+-------------------+--------------------+--------+--------------+--------+----------+
|   VENDOR_ZATARAINS|  JAMBALAYA RICE MIX|   12 OZ| RICE/RICE MIX|42081001|     $2.49|
|   VENDOR_ZATARAINS|  JAMBALAYA RICE MIX|    8 OZ| RICE/RICE MIX|42082001|     $1.79|
|     VENDOR_YUCATAN|   GUACAMOLE REGULAR|    8 OZ|         OTHER|42083001|     $3.99|
|       VENDOR_YUBAN|COFFEE ORIGINAL B...|   12 OZ|COFFEE/CREAMER|42084001|     $3.99|
|     VENDOR_YOPLAIT| GOGURT VARIETY PACK|    8 CT|        YOGURT|42085001|     $2.99|
|    VENDOR_WISHBONE|    ITALIAN DRESSING|   16 OZ|SALAD DRESSING|42086001|     $2.00|
|VENDOR_WHITE_CASTLE|CHEESEBURGER HEAT...|29.28 OZ|         OTHER|42087001|    $11.59|
|     VENDOR_WHISKAS| CHOICE CUTS POULTRY|   36 OZ|      PET FOOD|42088001|     $4.99|
|      VENDOR_WELCHS|FARMERS PICK CONC...| 

                                                                                

In [4]:
transaction_schema = StructType(
    [
        StructField("transaction_id", LongType(), False),
        StructField("customer_id", LongType(), False),
        StructField("sku", LongType(), False),
        StructField("sale_price", DecimalType(38, 2), False),
        StructField("date", DateType(), False),
        StructField("items_left", IntegerType(), False),
        StructField("total_cases_ordered", IntegerType(), False),
    ]
)

In [5]:
streaming_df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "transactions")
    .option("startingOffsets", "earliest")
    .load()
)
streaming_df = (
    streaming_df.selectExpr("CAST(value AS STRING)")
    .select(F.from_json(F.col("value"), transaction_schema).alias("t"))
    .select("t.*")
)
streaming_df.printSchema()

root
 |-- transaction_id: long (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- sku: long (nullable = true)
 |-- sale_price: decimal(38,2) (nullable = true)
 |-- date: date (nullable = true)
 |-- items_left: integer (nullable = true)
 |-- total_cases_ordered: integer (nullable = true)



In [6]:
joined_df = (
    streaming_df.alias("t")
    .join(
        catalog_df.alias("c"),
        streaming_df.sku == catalog_df.sku,
    )
    .select(
        F.col("c.manufacturer").alias("topic"),
        F.col("c.sku").alias("key"),
        F.to_json(
            F.struct(
                F.struct(
                    "t.transaction_id",
                    "t.customer_id",
                    "t.sale_price",
                    "t.date",
                ).alias("transaction"),
                F.struct(
                    "c.product_name",
                    "c.size",
                    "c.item_type",
                ).alias("product_information"),
                F.struct(
                    "t.items_left",
                    "t.total_cases_ordered",
                ).alias("inventory_information"),
            )
        ).alias("value"),
    )
)
joined_df.printSchema()

root
 |-- topic: string (nullable = true)
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [7]:
joined_df.writeStream.outputMode("update").format(
    "kafka"
).option(
    "kafka.bootstrap.servers",
    "kafka:29092",
).start().awaitTermination()

                                                                                

KeyboardInterrupt: 

In [None]:
spark.stop()