In [None]:
# Spark Session
# Connect to the standalone master and reuse an existing session if one is already active.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Unique data & Window Functions")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

In [53]:
# Disable AQE and Broadcast join
# Each entry is applied to the running session in the order listed.
for conf_key, conf_value in {
    "spark.sql.adaptive.enabled": False,
    "spark.sql.adaptive.coalescePartitions.enabled": False,
    "spark.sql.autoBroadcastJoinThreshold": -1,
}.items():
    spark.conf.set(conf_key, conf_value)

In [None]:
# Emp Data & Schema
# Twenty employee rows; every value is a string literal and the schema below
# declares every column as string. Row "018" has an empty gender value.

emp_data = [
    ["001", "101", "John Doe", "30", "Male", "50000", "2015-01-01"],
    ["002", "101", "Jane Smith", "25", "Female", "45000", "2016-02-15"],
    ["003", "102", "Bob Brown", "35", "Male", "55000", "2014-05-01"],
    ["004", "102", "Alice Lee", "28", "Female", "48000", "2017-09-30"],
    ["005", "103", "Jack Chan", "40", "Male", "60000", "2013-04-01"],
    ["006", "103", "Jill Wong", "32", "Female", "52000", "2018-07-01"],
    ["007", "101", "James Johnson", "42", "Male", "70000", "2012-03-15"],
    ["008", "102", "Kate Kim", "29", "Female", "51000", "2019-10-01"],
    ["009", "103", "Tom Tan", "33", "Male", "58000", "2016-06-01"],
    ["010", "104", "Lisa Lee", "27", "Female", "47000", "2018-08-01"],
    ["011", "104", "David Park", "38", "Male", "65000", "2015-11-01"],
    ["012", "105", "Susan Chen", "31", "Female", "54000", "2017-02-15"],
    ["013", "106", "Brian Kim", "45", "Male", "75000", "2011-07-01"],
    ["014", "107", "Emily Lee", "26", "Female", "46000", "2019-01-01"],
    ["015", "106", "Michael Lee", "37", "Male", "63000", "2014-09-30"],
    ["016", "107", "Kelly Zhang", "30", "Female", "49000", "2018-04-01"],
    ["017", "105", "George Wang", "34", "Male", "57000", "2016-03-15"],
    ["018", "104", "Nancy Liu", "29", "", "50000", "2017-06-01"],
    ["019", "103", "Steven Chen", "36", "Male", "62000", "2015-08-01"],
    ["020", "102", "Grace Kim", "32", "Female", "53000", "2018-11-01"],
]

# DDL-style schema string: "<column> string" for each column, comma-separated.
emp_schema = ", ".join(
    f"{column} string"
    for column in (
        "employee_id",
        "department_id",
        "name",
        "age",
        "gender",
        "salary",
        "hire_date",
    )
)

In [66]:
# Create emp DataFrame
# Rows come from emp_data; column names and types come from emp_schema.

emp = spark.createDataFrame(emp_data, schema=emp_schema)

In [67]:
# Get unique data
# SQL equivalent: select distinct emp.* from emp
emp_unique = emp.distinct()

# Writing to the "noop" format runs the job without persisting any output.
emp_unique.write.save(format="noop", mode="overwrite")

                                                                                

In [70]:
# Number of rows in emp_unique.
emp_unique.count()

20

In [47]:
# Number of partitions of the RDD underlying emp.
emp.rdd.getNumPartitions()

8

In [49]:
# Number of partitions of the RDD underlying emp_unique.
emp_unique.rdd.getNumPartitions()

1

In [43]:
# Show which Spark partition each de-duplicated row lives in.
from pyspark.sql import functions as F

# Bind the annotated frame to a new name instead of overwriting emp_unique:
# re-running this cell, or running other emp_unique cells out of order, then
# always sees the original de-duplicated frame without the extra column.
emp_unique_with_pid = emp_unique.withColumn("partition_id", F.spark_partition_id())
emp_unique_with_pid.show()

+-----------+-------------+-------------+---+------+------+----------+------------+
|employee_id|department_id|         name|age|gender|salary| hire_date|partition_id|
+-----------+-------------+-------------+---+------+------+----------+------------+
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|           0|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-09-30|           0|
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|           0|
|        010|          104|     Lisa Lee| 27|Female| 47000|2018-08-01|           0|
|        009|          103|      Tom Tan| 33|  Male| 58000|2016-06-01|           0|
|        008|          102|     Kate Kim| 29|Female| 51000|2019-10-01|           0|
|        014|          107|    Emily Lee| 26|Female| 46000|2019-01-01|           0|
|        013|          106|    Brian Kim| 45|  Male| 75000|2011-07-01|           0|
|        019|          103|  Steven Chen| 36|  Male| 62000|2015-08-01|      

In [51]:
# Unique of department_ids
# SQL equivalent: select distinct department_id from emp
emp_dept_id = emp.select(emp["department_id"]).distinct()

# Writing to the "noop" format runs the job without persisting any output.
emp_dept_id.write.save(format="noop", mode="overwrite")

In [52]:
# Number of partitions of the RDD underlying emp_dept_id.
emp_dept_id.rdd.getNumPartitions()

1

LET'S READ A BIG DATAFRAME AND SEE WHAT HAPPENS WITH SHUFFLE

In [None]:
# LOAD LOCAL FILES INTO HADOOP
# Copies the local data directory under /input on HDFS, replacing existing files.
from hdfs import InsecureClient

client = InsecureClient(url="http://namenode:9870")
client.upload(
    hdfs_path="/input",
    local_path="/home/jovyan/data",
    overwrite=True,
)

'/input/data'

In [None]:
# Read the employee records CSV from HDFS, treating the first line as the header.
emp_big = spark.read.csv(
    "hdfs://namenode:9000/input/data/employee_records.csv",
    header=True,
)
emp_big.count()

TypeError: DataFrameReader.option() missing 2 required positional arguments: 'key' and 'value'

In [64]:
# De-duplicate the big DataFrame across all columns.
emp_big_unique = emp_big.distinct()

# Writing to the "noop" format runs the job without persisting any output.
emp_big_unique.write.save(format="noop", mode="overwrite")

                                                                                

In [65]:
# Run the no-op write of emp_big_unique on its own (no re-definition of the frame).
# NOTE(review): presumably kept to compare a second run in the Spark UI - confirm or remove.
emp_big_unique.write.format("noop").mode("overwrite").save()

                                                                                

In [62]:
# Number of partitions of the RDD underlying emp_big.
emp_big.rdd.getNumPartitions()

8

In [61]:
# Number of partitions of the RDD underlying emp_big_unique.
emp_big_unique.rdd.getNumPartitions()

200

WINDOW FUNCTIONS

In [55]:
# Window Functions
# select *, max(cast(salary as int)) over(partition by department_id order by cast(salary as int) desc) as max_salary from emp
#
# Import the functions module under an alias rather than `from ... import max`:
# a bare `max` import replaces Python's builtin max() for the rest of the kernel.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# salary is declared as a string column in emp_schema, so ordering or taking max
# on it directly compares text ("9000" would rank above "10000"). Cast it once
# and use the numeric value for both; max_salary is therefore an integer column.
salary_num = F.col("salary").cast("int")

window_spec = Window.partitionBy(F.col("department_id")).orderBy(salary_num.desc())
max_func = F.max(salary_num).over(window_spec)

emp_1 = emp.withColumn("max_salary", max_func)
emp_1.write.format("noop").mode("overwrite").save()
emp_1.show()

                                                                                

+-----------+-------------+-------------+---+------+------+----------+----------+
|employee_id|department_id|         name|age|gender|salary| hire_date|max_salary|
+-----------+-------------+-------------+---+------+------+----------+----------+
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|     70000|
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|     70000|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|     70000|
|        016|          107|  Kelly Zhang| 30|Female| 49000|2018-04-01|     49000|
|        014|          107|    Emily Lee| 26|Female| 46000|2019-01-01|     49000|
|        011|          104|   David Park| 38|  Male| 65000|2015-11-01|     65000|
|        018|          104|    Nancy Liu| 29|      | 50000|2017-06-01|     65000|
|        010|          104|     Lisa Lee| 27|Female| 47000|2018-08-01|     65000|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|     55000|
|        020|   

In [None]:
# Window Functions - 2nd highest salary of each department
# select * from (
#     select *, row_number() over(partition by department_id order by cast(salary as int) desc) as rn
#     from emp
# ) where rn = 2
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

# salary is declared as a string column in emp_schema; cast it so rows are ranked
# by numeric value rather than by text comparison.
window_spec = Window.partitionBy(col("department_id")).orderBy(
    col("salary").cast("int").desc()
)
rn = row_number().over(window_spec)

# Keep only the second-ranked row of every department.
emp_2 = emp.withColumn("rn", rn).where("rn = 2")


+-----------+-------------+-------------+---+------+------+----------+----------+
|employee_id|department_id|         name|age|gender|salary| hire_date|max_salary|
+-----------+-------------+-------------+---+------+------+----------+----------+
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|     70000|
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|     70000|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|     70000|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|     55000|
|        020|          102|    Grace Kim| 32|Female| 53000|2018-11-01|     55000|
|        008|          102|     Kate Kim| 29|Female| 51000|2019-10-01|     55000|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-09-30|     55000|
|        019|          103|  Steven Chen| 36|  Male| 62000|2015-08-01|     62000|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-04-01|     62000|
|        009|   

                                                                                

In [12]:
# Display the rows of emp_2.
emp_2.show()

+-----------+-------------+-----------+---+------+------+----------+---+
|employee_id|department_id|       name|age|gender|salary| hire_date| rn|
+-----------+-------------+-----------+---+------+------+----------+---+
|        001|          101|   John Doe| 30|  Male| 50000|2015-01-01|  2|
|        020|          102|  Grace Kim| 32|Female| 53000|2018-11-01|  2|
|        005|          103|  Jack Chan| 40|  Male| 60000|2013-04-01|  2|
|        018|          104|  Nancy Liu| 29|      | 50000|2017-06-01|  2|
|        012|          105| Susan Chen| 31|Female| 54000|2017-02-15|  2|
|        015|          106|Michael Lee| 37|  Male| 63000|2014-09-30|  2|
|        014|          107|  Emily Lee| 26|Female| 46000|2019-01-01|  2|
+-----------+-------------+-----------+---+------+------+----------+---+



                                                                                

In [None]:
# Window function using expr
# select * from (
#     select *, row_number() over(partition by department_id order by cast(salary as int) desc) as rn
#     from emp
# ) where rn = 2
from pyspark.sql.functions import expr

# salary is declared as a string column in emp_schema; the cast inside the SQL
# expression ranks rows by numeric value rather than by text comparison.
emp_3 = emp.withColumn(
    "rn",
    expr(
        "row_number() over(partition by department_id order by cast(salary as int) desc)"
    ),
).where("rn = 2")


In [14]:
# Display the rows of emp_3.
emp_3.show()

+-----------+-------------+-----------+---+------+------+----------+---+
|employee_id|department_id|       name|age|gender|salary| hire_date| rn|
+-----------+-------------+-----------+---+------+------+----------+---+
|        001|          101|   John Doe| 30|  Male| 50000|2015-01-01|  2|
|        020|          102|  Grace Kim| 32|Female| 53000|2018-11-01|  2|
|        005|          103|  Jack Chan| 40|  Male| 60000|2013-04-01|  2|
|        018|          104|  Nancy Liu| 29|      | 50000|2017-06-01|  2|
|        012|          105| Susan Chen| 31|Female| 54000|2017-02-15|  2|
|        015|          106|Michael Lee| 37|  Male| 63000|2014-09-30|  2|
|        014|          107|  Emily Lee| 26|Female| 46000|2019-01-01|  2|
+-----------+-------------+-----------+---+------+------+----------+---+



In [15]:
# Stop the Spark session created at the top of the notebook.
spark.stop()