In [1]:
# Spark Session
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .appName("Optimizing Skewness and Spillage")
    .master("spark://spark-master:7077")
    .config("spark.cores.max", 8)
    .config("spark.executor.cores", 4)
    .config("spark.executor.memory", "512M")
    .getOrCreate()
)

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/26 17:44:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [36]:
# Disable AQE and Broadcast join

spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [31]:
# Read Employee data
_schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"

emp = spark.read.format("csv").schema(_schema).option("header", True).load("hdfs://namenode:9000/input/data/employee_records_skewed.csv")

In [33]:
# Read DEPT CSV data
_dept_schema = "department_id int, department_name string, description string, city string, state string, country string"

dept = spark.read.format("csv").schema(_dept_schema).option("header", True).load("hdfs://namenode:9000/input/data/department_data.csv")

In [37]:
# Join Datasets

df_joined = emp.join(dept, on=emp.department_id==dept.department_id, how="left_outer")

In [38]:
df_joined.write.format("noop").mode("overwrite").save()

                                                                                

In [40]:
# Check the partition details to understand distribution
from pyspark.sql import functions as F

part_df = df_joined.withColumn("partition_id", F.spark_partition_id()).groupBy("partition_id").agg(F.count(F.lit(1)).alias("count"))

part_df.show()

                                                                                

+------------+------+
|partition_id| count|
+------------+------+
|         103| 20261|
|         122| 19887|
|          43|820545|
|         107| 19839|
|          49| 19799|
|          51| 19670|
|         102| 20120|
|          66| 19946|
|         174| 19936|
|          89| 19997|
+------------+------+



In [42]:
# Verify Employee data based on department_id
emp.groupBy("department_id").count().show()

+-------------+------+
|department_id| count|
+-------------+------+
|            1|820545|
|            6| 19799|
|            3| 19670|
|            5| 19946|
|            9| 19997|
|            4| 20120|
|            8| 20261|
|            7| 19839|
|           10| 19887|
|            2| 19936|
+-------------+------+



In [43]:
# Set shuffle partitions to a lesser number - 16

spark.conf.set("spark.sql.shuffle.partitions", 32)

In [55]:
# Salted Employee
salted_emp = emp.withColumn("salted_dept_id", F.concat("department_id", lit("_"), F.round(10*F.rand(), 0).cast("int")))
salted_emp.show()                                                     

+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+--------------+
|first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|salted_dept_id|
+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+--------------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653.0|            1|          1_10|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836.0|            1|           1_3|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900.0|           10|          10_3|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506.0|            1|           1_5|
|  Michelle|   Elliott|      Air cabin crew|1975

In [56]:
salted_emp.groupby("salted_dept_id").count().orderBy("count", ascending=False).show()

[Stage 97:>                                                         (0 + 8) / 8]

+--------------+-----+
|salted_dept_id|count|
+--------------+-----+
|           1_2|82469|
|           1_7|82305|
|           1_3|82234|
|           1_4|82232|
|           1_9|82209|
|           1_1|82133|
|           1_5|81999|
|           1_6|81836|
|           1_8|81663|
|          1_10|40739|
|           1_0|40726|
|           9_9| 2088|
|           9_8| 2068|
|           5_9| 2066|
|           7_3| 2064|
|           2_7| 2062|
|           2_4| 2057|
|           4_8| 2055|
|           9_4| 2053|
|           4_7| 2045|
+--------------+-----+
only showing top 20 rows



                                                                                

In [57]:
# Salted Department

salted_dept = dept.withColumn("salted_dept_id", F.concat("department_id", lit("_"), F.round(10*F.rand(), 0).cast("int")))
salted_dept.where("department_id = 9").show()

+-------------+--------------------+--------------------+-----------+-----+-------+--------------+
|department_id|     department_name|         description|       city|state|country|salted_dept_id|
+-------------+--------------------+--------------------+-----------+-----+-------+--------------+
|            9|Mcmahon, Terrell ...|De-engineered hig...|Marychester|   MN|  Italy|           9_5|
+-------------+--------------------+--------------------+-----------+-----+-------+--------------+



In [58]:
# Lets make the salted join now
salted_joined_df = salted_emp.join(salted_dept, on=salted_emp.salted_dept_id==salted_dept.salted_dept_id, how="left_outer")

In [59]:
salted_joined_df.write.format("noop").mode("overwrite").save()

                                                                                

In [60]:
# Check the partition details to understand distribution
from pyspark.sql.functions import spark_partition_id, count

part_df = salted_joined_df.withColumn("partition_num", spark_partition_id()).groupBy("partition_num").agg(count(lit(1)).alias("count"))

part_df.show()

                                                                                

+-------------+------+
|partition_num| count|
+-------------+------+
|           12|   962|
|           18| 92394|
|           10| 93205|
|            1|  4006|
|            3|  5959|
|           27| 89132|
|           20|  6003|
|           29|  5945|
|           13|  8043|
|           14|  8081|
|            6| 89623|
|            9|  6197|
|           23|130801|
|            7| 85772|
|           11|  2977|
|           26|  3061|
|           30|  2032|
|           28| 86307|
|            8|   966|
|            0| 83967|
+-------------+------+
only showing top 20 rows

