In [None]:
# Spark Session
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("Optimizing Skewness and Spillage")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/07 16:55:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Disable AQE and Broadcast join

spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)

In [None]:
# Read Employee data
_schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"

emp = (
    spark.read.format("csv")
    .schema(_schema)
    .option("header", True)
    .load("hdfs://namenode:9000/input/data/employee_records_skewed.csv")
)

In [None]:
# Check the partition details to understand distribution
from pyspark.sql import functions as F

emp.withColumn("partition_id", F.spark_partition_id()).groupBy(
    "partition_id"
).count().show()

                                                                                

+------------+------+
|partition_id| count|
+------------+------+
|           1|130358|
|           6|130263|
|           3|130345|
|           5|130355|
|           4|130328|
|           7| 87633|
|           2|130349|
|           0|130369|
+------------+------+



In [5]:
# Verify Employee data based on department_id
emp.groupBy("department_id").count().show()

                                                                                

+-------------+------+
|department_id| count|
+-------------+------+
|            1|820545|
|            6| 19799|
|            3| 19670|
|            5| 19946|
|            9| 19997|
|            4| 20120|
|            8| 20261|
|            7| 19839|
|           10| 19887|
|            2| 19936|
+-------------+------+



In [6]:
emp.show()

+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+
|first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|
+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653.0|            1|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836.0|            1|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900.0|           10|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506.0|            1|
|  Michelle|   Elliott|      Air cabin crew|1975-03-31|tiffanyjohnston@e...|       (705)900-5337|604738.0|            1|
|    Ashley|   Montoya|        C

In [None]:
# THIS JOIN OPERATION IS GOING TO BE VERY EXPENSIVE IN ONLY 1 TASK (BECAUSE OF DATA SKEWNESS)
df = emp.filter(F.col("salary") < 20000).join(
    emp.filter(F.col("salary") < 20000).select(
        "department_id", *[F.col(c).alias(f"{c}_2") for c in emp.columns]
    ),
    on="department_id",
)

In [8]:
df.write.format("noop").mode("overwrite").save()

                                                                                

In [None]:
# Salted Employee
salted_emp = emp.withColumn(
    "salted_dept_id",
    F.concat("department_id", F.lit("_"), F.round(10 * F.rand(), 0).cast("int")),
)
salted_emp.show()

+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+--------------+
|first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|salted_dept_id|
+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+--------------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653.0|            1|           1_7|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836.0|            1|           1_9|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900.0|           10|          10_5|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506.0|            1|          1_10|
|  Michelle|   Elliott|      Air cabin crew|1975

In [None]:
# AFTER
salted_emp.groupby("salted_dept_id").count().orderBy("count", ascending=False).show()



+--------------+-----+
|salted_dept_id|count|
+--------------+-----+
|           1_9|82418|
|           1_2|82360|
|           1_6|82343|
|           1_3|82327|
|           1_5|82122|
|           1_4|81982|
|           1_8|81961|
|           1_7|81943|
|           1_1|81749|
|          1_10|40769|
|           1_0|40571|
|           8_1| 2135|
|           4_1| 2078|
|           4_3| 2073|
|           7_1| 2070|
|           8_7| 2068|
|           4_9| 2067|
|           8_2| 2063|
|           9_1| 2056|
|           6_7| 2052|
+--------------+-----+
only showing top 20 rows



                                                                                

In [None]:
salted_df = salted_emp.filter(F.col("salary") < 20000).join(
    salted_emp.filter(F.col("salary") < 20000).select(
        "salted_dept_id", F.col("email").alias("email2")
    ),
    on="salted_dept_id",
)
salted_df.write.format("noop").mode("overwrite").save()

                                                                                

In [None]:
salted_df.filter(F.col("department_id") == 1).groupBy("email").agg(
    F.last("salted_dept_id"), F.last("department_id")
).show()



+--------------------+--------------------+-------------------+
|               email|last(salted_dept_id)|last(department_id)|
+--------------------+--------------------+-------------------+
|adamsstephen@exam...|                 1_5|                  1|
|amymatthews@examp...|                 1_8|                  1|
|andres94@example.net|                 1_3|                  1|
| anita45@example.net|                 1_9|                  1|
|anthonylewis@exam...|                 1_3|                  1|
| aprille@example.net|                 1_2|                  1|
| aprilli@example.com|                 1_3|                  1|
|arobertson@exampl...|                 1_9|                  1|
|ashley77@example.net|                 1_0|                  1|
|benjaminsalas@exa...|                 1_0|                  1|
|bensonsamuel@exam...|                 1_2|                  1|
|cabrerasheri@exam...|                 1_4|                  1|
|cardenastimothy@e...|                 1

                                                                                

In [15]:
spark.stop()