In [None]:
from pyspark.sql import SparkSession

# Point the builder at the standalone cluster master, then name the application
# and create the session (or reuse one that already exists in this kernel).
session_builder = SparkSession.builder.master("spark://spark-master:7077")
spark = session_builder.appName("Streaming with Static DF").getOrCreate()

# Bare expression: the notebook renders the session summary as the cell output.
spark

In [None]:
# spark.sql.shuffle.partitions falls back to 200 when it is not set explicitly.
# Run the notebook once with the default and once with 8 to compare both values.
spark.conf.set(key="spark.sql.shuffle.partitions", value=8)

In [None]:
from pyspark.sql import functions as F

# Create a streaming DataFrame from the built-in "rate" source, which emits a
# monotonically increasing `value` column at a rate of 1 row per second.
df_streaming = (
    spark.readStream
    .format("rate")
    .load()
)

In [None]:
# Optional: uncomment the lines below to inspect the raw rate output on the console before using it downstream
# df_streaming \
# .writeStream \
# .format("console") \
# .option("truncate", False) \
# .outputMode("update") \
# .trigger(processingTime="2 seconds") \
# .start() \
# .awaitTermination()

In [None]:
# Read the static employee records from HDFS (header row present, column types
# inferred) and keep only rows with salary below 100000 to shrink the join side.
df_static = (
    spark.read.option("header", True)
    .option("inferSchema", True)
    .csv("hdfs://namenode:9000/input/data/employee_records.csv")
    .where(F.col("salary") < 100000)
)
# Action: materializes the read + filter and shows the remaining row count.
df_static.count()

In [None]:
# Join the stream with the static employee data. The rate source's `value`
# counter is folded into 11 buckets (value % 11) and used as a synthetic
# department_id, which is the join key shared with df_static.
df_streaming = df_streaming.withColumn("department_id", F.col("value") % 11)
df = df_streaming.join(df_static, on="department_id")

# Experiment: test with different processingTime triggers and with different
# spark.sql.shuffle.partitions values.
query = (
    df.writeStream.format("console")
    .option("truncate", False)
    .outputMode("update")
    .trigger(processingTime="8 seconds")
    .start()
)

# awaitTermination() blocks this cell until the query ends or the kernel is
# interrupted. Keep the query handle and stop it on the way out: interrupting
# the cell unblocks the Python call but does not by itself stop the running
# query, which would otherwise keep printing batches in the background.
try:
    query.awaitTermination()
finally:
    query.stop()

In [None]:
# Shut down the SparkSession created at the top of the notebook once the
# streaming experiment is finished.
spark.stop()