## Dynamic Partition Overwrite

In [None]:
# Create Spark Session

from pyspark.sql import SparkSession

# Attach to the standalone cluster master. getOrCreate() returns the session
# that is already running in this kernel, if any, instead of building a new one.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Partition Overwrite")
    .getOrCreate()
)

# Bare last expression so the notebook renders the session summary.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/07 17:20:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Example dataset
from pyspark.sql import functions as F

# Column names, in the same order as the values of each record below.
_cols = ["order_id", "prod_id", "qty", "order_date"]

# Five sample orders spread over four distinct dates (01-10 occurs twice).
# order_date is still an MM-dd-yyyy string at this point.
_data = [
    ["ORD1001", "P003", 70, "01-21-2022"],
    ["ORD1004", "P033", 12, "01-24-2022"],
    ["ORD1005", "P036", 10, "01-20-2022"],
    ["ORD1002", "P016", 2, "01-10-2022"],
    ["ORD1003", "P012", 6, "01-10-2022"],
]

# Build the DataFrame and parse order_date from string into a date column
# in a single chained expression.
df = (
    spark.createDataFrame(_data, _cols)
    .withColumn("order_date", F.to_date(F.col("order_date"), "MM-dd-yyyy"))
)

# Print the schema first, then the rows.
df.printSchema()
df.show()

root
 |-- order_id: string (nullable = true)
 |-- prod_id: string (nullable = true)
 |-- qty: long (nullable = true)
 |-- order_date: date (nullable = true)



                                                                                

+--------+-------+---+----------+
|order_id|prod_id|qty|order_date|
+--------+-------+---+----------+
| ORD1001|   P003| 70|2022-01-21|
| ORD1004|   P033| 12|2022-01-24|
| ORD1005|   P036| 10|2022-01-20|
| ORD1002|   P016|  2|2022-01-10|
| ORD1003|   P012|  6|2022-01-10|
+--------+-------+---+----------+



In [3]:
# Read the session's current partition overwrite mode before writing anything.
# This setting decides what mode("overwrite") does on a partitioned target:
# replace everything under the path, or only the partitions being written.
spark.conf.get("spark.sql.sources.partitionOverwriteMode")

'STATIC'

In [None]:
# Write the orders as Parquet, laid out on disk by order_date (partitionBy on
# the writer only controls the directory layout; it does not repartition df).
# "overwrite" replaces whatever already exists at the target path.
(
    df.write
    .mode("overwrite")
    .partitionBy("order_date")
    .format("parquet")
    .save("hdfs://namenode:9000/output/advanced/14/orders_partitioned")
)

                                                                                

In [None]:
# Read the dataset back and count rows per order_date to see which
# partitions are present at the target path after the write.
(
    spark.read.format("parquet")
    .load("hdfs://namenode:9000/output/advanced/14/orders_partitioned")
    .groupBy("order_date")
    .count()
    .show()
)

[Stage 4:>                                                          (0 + 4) / 4]

+----------+-----+
|order_date|count|
+----------+-----+
|2022-01-21|    1|
|2022-01-20|    1|
|2022-01-24|    1|
|2022-01-10|    2|
+----------+-----+



                                                                                

In [None]:
# Delta batch used to overwrite the existing dataset: one order for a date
# already written above (01-20) and one for a date not written yet (01-26).
_cols = ["order_id", "prod_id", "qty", "order_date"]

_data = [
    ["ORD1010", "P053", 78, "01-26-2022"],
    ["ORD1011", "P076", 21, "01-20-2022"],
]

# Build the delta DataFrame and parse order_date from string into a date
# column in a single chained expression.
delta_df = (
    spark.createDataFrame(_data, _cols)
    .withColumn("order_date", F.to_date(F.col("order_date"), "MM-dd-yyyy"))
)

# Print the schema first, then the rows.
delta_df.printSchema()
delta_df.show()

root
 |-- order_id: string (nullable = true)
 |-- prod_id: string (nullable = true)
 |-- qty: long (nullable = true)
 |-- order_date: date (nullable = true)

+--------+-------+---+----------+
|order_id|prod_id|qty|order_date|
+--------+-------+---+----------+
| ORD1010|   P053| 78|2022-01-26|
| ORD1011|   P076| 21|2022-01-20|
+--------+-------+---+----------+



In [None]:
# Overwrite the same orders_partitioned location with the delta batch.
# While partitionOverwriteMode is STATIC (the value reported earlier), this
# clears every existing partition under the path, not just the delta's dates.
(
    delta_df.write
    .mode("overwrite")
    .partitionBy("order_date")
    .format("parquet")
    .save("hdfs://namenode:9000/output/advanced/14/orders_partitioned")
)

                                                                                

In [None]:
# Count rows per order_date again to see which partitions are left after
# the overwrite with the delta batch.
(
    spark.read.format("parquet")
    .load("hdfs://namenode:9000/output/advanced/14/orders_partitioned")
    .groupBy("order_date")
    .count()
    .show()
)

+----------+-----+
|order_date|count|
+----------+-----+
|2022-01-20|    1|
|2022-01-26|    1|
+----------+-----+



### Let's follow the same example, but this time with partitionOverwriteMode set to "DYNAMIC"

In [9]:
# Switch the session to dynamic partition overwrite. In this mode an
# "overwrite" write replaces only the partitions that receive data from the
# DataFrame being written and leaves all other partitions at the path alone.

spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
# Read the setting back to confirm the change took effect for this session.
spark.conf.get("spark.sql.sources.partitionOverwriteMode")

'dynamic'

In [None]:
# Example dataset
from pyspark.sql.functions import cast, to_date

# Same five sample orders as in the static-mode walkthrough, rebuilt here so
# the dynamic-mode run starts from the same input.
_cols = ["order_id", "prod_id", "qty", "order_date"]

_data = [
    ["ORD1001", "P003", 70, "01-21-2022"],
    ["ORD1004", "P033", 12, "01-24-2022"],
    ["ORD1005", "P036", 10, "01-20-2022"],
    ["ORD1002", "P016", 2, "01-10-2022"],
    ["ORD1003", "P012", 6, "01-10-2022"],
]

# Build the DataFrame and parse order_date from string into a date column
# in a single chained expression.
df = (
    spark.createDataFrame(_data, _cols)
    .withColumn("order_date", F.to_date(F.col("order_date"), "MM-dd-yyyy"))
)

# Print the schema first, then the rows.
df.printSchema()
df.show()

root
 |-- order_id: string (nullable = true)
 |-- prod_id: string (nullable = true)
 |-- qty: long (nullable = true)
 |-- order_date: date (nullable = true)

+--------+-------+---+----------+
|order_id|prod_id|qty|order_date|
+--------+-------+---+----------+
| ORD1001|   P003| 70|2022-01-21|
| ORD1004|   P033| 12|2022-01-24|
| ORD1005|   P036| 10|2022-01-20|
| ORD1002|   P016|  2|2022-01-10|
| ORD1003|   P012|  6|2022-01-10|
+--------+-------+---+----------+



In [None]:
# Write the base orders again, laid out on disk by order_date. With dynamic
# mode active, only the order_date partitions present in df are replaced;
# partitions left over from earlier writes to this path are not removed.
(
    df.write
    .mode("overwrite")
    .partitionBy("order_date")
    .format("parquet")
    .save("hdfs://namenode:9000/output/advanced/14/orders_partitioned")
)

                                                                                

In [None]:
# Count rows per order_date to see which partitions exist after the
# dynamic-mode write of the base orders.
(
    spark.read.format("parquet")
    .load("hdfs://namenode:9000/output/advanced/14/orders_partitioned")
    .groupBy("order_date")
    .count()
    .show()
)

+----------+-----+
|order_date|count|
+----------+-----+
|2022-01-26|    1|
|2022-01-10|    2|
|2022-01-24|    1|
|2022-01-21|    1|
|2022-01-20|    1|
+----------+-----+



In [None]:
# Delta batch for the dynamic-mode overwrite: two orders whose dates
# (01-29 and 01-19) do not appear in the base orders written above.
_cols = ["order_id", "prod_id", "qty", "order_date"]

_data = [
    ["ORD1010", "P053", 78, "01-29-2022"],
    ["ORD1011", "P076", 21, "01-19-2022"],
]

# Build the delta DataFrame and parse order_date from string into a date
# column in a single chained expression.
delta_df = (
    spark.createDataFrame(_data, _cols)
    .withColumn("order_date", F.to_date(F.col("order_date"), "MM-dd-yyyy"))
)

# Print the schema first, then the rows.
delta_df.printSchema()
delta_df.show()

root
 |-- order_id: string (nullable = true)
 |-- prod_id: string (nullable = true)
 |-- qty: long (nullable = true)
 |-- order_date: date (nullable = true)

+--------+-------+---+----------+
|order_id|prod_id|qty|order_date|
+--------+-------+---+----------+
| ORD1010|   P053| 78|2022-01-29|
| ORD1011|   P076| 21|2022-01-19|
+--------+-------+---+----------+



In [None]:
# Overwrite the same orders_partitioned location with the delta batch.
# In dynamic mode only the partitions for the delta's own order_date values
# are written; every other partition already at the path is kept.
(
    delta_df.write
    .mode("overwrite")
    .partitionBy("order_date")
    .format("parquet")
    .save("hdfs://namenode:9000/output/advanced/14/orders_partitioned")
)

                                                                                

In [None]:
# Final check: count rows per order_date to confirm the earlier partitions
# are still present alongside the ones added by the delta batch.
(
    spark.read.format("parquet")
    .load("hdfs://namenode:9000/output/advanced/14/orders_partitioned")
    .groupBy("order_date")
    .count()
    .show()
)

+----------+-----+
|order_date|count|
+----------+-----+
|2022-01-10|    1|
|2022-01-29|    1|
|2022-01-24|    1|
|2022-01-26|    1|
|2022-01-19|    1|
|2022-01-21|    1|
|2022-01-20|    1|
+----------+-----+



In [19]:
# Stop the Spark session now that the walkthrough is complete.
spark.stop()