<a href="https://colab.research.google.com/github/joao-dias-25/dataeng-spark/blob/main/spark/examples/06-write_partitioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Write
- .write
- .format (parquet, csv, json)
- options
- `spark.sql.sources.partitionOverwriteMode` = `dynamic` (when overwriting, replace only the partitions present in the incoming data)

# Write Mode
- overwrite - Replaces any data that already exists at the target path; alternatively, you can use SaveMode.Overwrite.
- append - Adds the new data to the data that already exists at the target path; alternatively, you can use SaveMode.Append.
- ignore - Silently skips the write operation when data already exists at the target path; alternatively, you can use SaveMode.Ignore.
- errorifexists or error - This is the default mode: when data already exists at the target path, the write returns an error; alternatively, you can use SaveMode.ErrorIfExists.

# Partitioning
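Partitioning is the process of organizing the data into multiple chunks based on some criteria (typically the values of one or more columns).
Each partition is stored in its own sub-folder (for example `date_part=20240501`).
Partitioning can improve performance in Spark, because a query that filters on the partition column only has to read the matching sub-folders.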

# Setting up PySpark

In [None]:
# Install PySpark with the %pip magic, which installs into the running kernel's environment.
%pip install pyspark



In [1]:
from pyspark.sql import SparkSession

# Create the Spark session used by every cell below, or reuse the one that
# already exists in this kernel. 'local' runs Spark inside this machine.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Spark Course')
    .getOrCreate()
)

# Preparing data

In [2]:
!pip install faker

Collecting faker
  Downloading Faker-33.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-33.0.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-33.0.0


In [3]:
from faker import Faker
from datetime import datetime

# Seed Faker so the generated sample data is repeatable between runs
# (for the same Faker version). Values derived from the current date,
# such as `dob`, may still shift from day to day.
Faker.seed(42)
fake = Faker()

# Build 50 fake user records. `date` falls between 2024-05-01 and 2024-05-05,
# so the data spans only a few calendar days -- convenient partition values later on.
users = [
    {
        'date': fake.date_time_between_dates(datetime(2024, 5, 1), datetime(2024, 5, 5)),
        'name': fake.name(),
        'address': fake.address(),
        'email': fake.email(),
        'dob': fake.date_of_birth(),
        'phone': fake.phone_number()
    }
    for _ in range(50)
]

# Let Spark infer the schema from the list of dicts
df = spark.createDataFrame(users)

# Preview the first 10 rows without truncating long values
df.show(10, truncate=False)


+----------------------------------------------------+--------------------------+----------+-------------------------+-----------------+--------------------+
|address                                             |date                      |dob       |email                    |name             |phone               |
+----------------------------------------------------+--------------------------+----------+-------------------------+-----------------+--------------------+
|9500 Carter Ridges\nKimberlyhaven, MO 21519         |2024-05-04 11:51:10.699278|1965-09-29|djacobs@example.com      |Erika Wallace    |885-586-3585x301    |
|USS Wallace\nFPO AE 72382                           |2024-05-02 13:23:17.583099|1984-09-04|wwalker@example.net      |Lisa Mullins     |001-467-351-1718x604|
|9218 Karen Manor\nBarberville, ND 40430             |2024-05-04 02:03:43.955199|1958-01-27|coltonjohnson@example.org|Karen Thornton   |4818032072          |
|6873 Ramos Station Suite 575\nPort Bridget, ID 5791

# Writing as PARQUET



In [4]:
# Writing as PARQUET with no partitions

path = "/content/write_partitioning/parquet_no_partitions"

df.write.mode("overwrite").format("parquet").save(path)

!ls /content/write_partitioning/parquet_no_partitions

spark.read.format("parquet").load(path).count()

part-00000-66c796b1-ccda-4d72-a113-f0f73d25bb44-c000.snappy.parquet  _SUCCESS


50

In [8]:
# Delete one partition folder of the partitioned Parquet output.
# `-f` means a folder that does not exist yet (e.g. on a fresh run) is not an error.
# NOTE(review): presumably this prepares the dynamic partition-overwrite demo in the next cell -- confirm.
!rm -rf /content/write_partitioning/parquet_with_partitions/date_part=20240503

In [9]:
# Writing as PARQUET with partitions
from pyspark.sql.functions import *

path = "/content/write_partitioning/parquet_with_partitions"

# Creating partition column
df = df.withColumn("date_part", date_format(col("date"), "yyyyMMdd"))

spark.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic") # enable dynamic partition overwrite - only overwrites partitions that are coming in the dataframe

(df#.where("date_part = '20240503'")
 .write
 .mode("overwrite")                                               # overwrites the entire path with the new data
 .partitionBy("date_part")                                        # partition the data by column - creates sub-folders for each partition
 .format("parquet")                                               # format of output
 .save(path))                                                     # path

!ls /content/write_partitioning/parquet_with_partitions

spark.read.format("parquet").load(path).count()

'date_part=20240501'  'date_part=20240502'  'date_part=20240503'  'date_part=20240504'


50

In [10]:
# Checking single partition: read only the files inside one partition sub-folder
(spark.read
   .parquet("/content/write_partitioning/parquet_with_partitions/date_part=20240502")
   .show())

+--------------------+--------------------+----------+--------------------+--------------------+--------------------+
|             address|                date|       dob|               email|                name|               phone|
+--------------------+--------------------+----------+--------------------+--------------------+--------------------+
|USS Wallace\nFPO ...|2024-05-02 13:23:...|1984-09-04| wwalker@example.net|        Lisa Mullins|001-467-351-1718x604|
|4926 Douglas Trai...|2024-05-02 19:37:...|1934-03-13|  mark04@example.net|         Donna Olson|  (895)378-2800x9355|
|57106 James Plain...|2024-05-02 19:28:...|1921-05-09|denniskyle@exampl...|         Terry Brown|001-847-976-5674x084|
|USNS Moore\nFPO A...|2024-05-02 03:34:...|1996-08-18|frankdavid@exampl...|     Kimberly Nguyen|    212.793.6367x561|
|7497 Meza Drive A...|2024-05-02 00:56:...|2006-03-29| tharris@example.org|       Valerie Mccoy|    271.264.6096x104|
|USCGC Washington\...|2024-05-02 12:44:...|1991-06-05|  

# Writing as CSV

https://spark.apache.org/docs/3.5.1/sql-data-sources-csv.html

In [11]:
# Row count of the DataFrame before it is written out as CSV below.
df.count()

50

In [12]:
path = "/content/write_partitioning/csv_no_partitioning/"

# write as csv
(df
  .write
  .format("csv")
  .mode("overwrite")
  .option("delimiter", "|")
  .option("header", True)
  .save(path))

# listing files in the folder
!ls /content/write_partitioning/csv_no_partitioning/

# read as csv
(spark
  .read
  .options(sep="|", multiLine=True, header=True)
  .csv(path)
  .count())

part-00000-215aeb5c-3bb9-4b90-bfc5-03b26bff4d25-c000.csv  _SUCCESS


50

In [14]:
# Display the CSV output written above (`path` still points to the CSV folder)
(spark.read
   .option("sep", "|")
   .option("multiLine", True)
   .option("header", True)
   .csv(path)
   .show())

+--------------------+--------------------+----------+--------------------+-----------------+--------------------+---------+
|             address|                date|       dob|               email|             name|               phone|date_part|
+--------------------+--------------------+----------+--------------------+-----------------+--------------------+---------+
|9500 Carter Ridge...|2024-05-04T11:51:...|1965-09-29| djacobs@example.com|    Erika Wallace|    885-586-3585x301| 20240504|
|USS Wallace\nFPO ...|2024-05-02T13:23:...|1984-09-04| wwalker@example.net|     Lisa Mullins|001-467-351-1718x604| 20240502|
|9218 Karen Manor\...|2024-05-04T02:03:...|1958-01-27|coltonjohnson@exa...|   Karen Thornton|          4818032072| 20240504|
|6873 Ramos Statio...|2024-05-01T12:23:...|1972-02-27|juliejones@exampl...|Jeffrey Henderson|       (730)511-2221| 20240501|
|4926 Douglas Trai...|2024-05-02T19:37:...|1934-03-13|  mark04@example.net|      Donna Olson|  (895)378-2800x9355| 20240502|


# Writing as JSON

https://spark.apache.org/docs/3.5.1/sql-data-sources-json.html

In [17]:
path = "/content/write_partitioning/json_no_partitioning/"

# write as json
(df
.write
.mode("overwrite")
.format("json")
.save(path))

# listing files in the folder
!ls /content/write_partitioning/json_no_partitioning/

# read as json
(spark
  .read
  .json(path)
  .count())

part-00000-419d3691-98ae-47c4-a74b-6dc64e310efc-c000.json  _SUCCESS


50

In [18]:
# Reading the JSON files as plain text: every line becomes one row in a single `value` column
spark.read.text(path).show(n=10, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                                             |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"address":"9500 Carter Ridges\nKimberlyhaven, MO 21519","date":"2024-05-04T11:51:10.699Z","dob":"1965-09-29","email":"djacobs@example.com","name":"Erika Wallace","phone":"885-586-3585x301","date_part":"20240504"}             |
|{"address":"USS Wallace\nFPO AE 72382","date":"2024-05-02T13:23:17.583Z","dob":"198

In [19]:
# reading json as a DataFrame (each JSON field is parsed into its own column)
spark.read.json(path).show(10, False)

+----------------------------------------------------+------------------------+---------+----------+-------------------------+-----------------+--------------------+
|address                                             |date                    |date_part|dob       |email                    |name             |phone               |
+----------------------------------------------------+------------------------+---------+----------+-------------------------+-----------------+--------------------+
|9500 Carter Ridges\nKimberlyhaven, MO 21519         |2024-05-04T11:51:10.699Z|20240504 |1965-09-29|djacobs@example.com      |Erika Wallace    |885-586-3585x301    |
|USS Wallace\nFPO AE 72382                           |2024-05-02T13:23:17.583Z|20240502 |1984-09-04|wwalker@example.net      |Lisa Mullins     |001-467-351-1718x604|
|9218 Karen Manor\nBarberville, ND 40430             |2024-05-04T02:03:43.955Z|20240504 |1958-01-27|coltonjohnson@example.org|Karen Thornton   |4818032072          |
|687

In [21]:
# Partitioned JSON data saved as a table (saveAsTable)

# (Re)create the partition column: the day of `date` rendered as a yyyyMMdd string
df = df.withColumn("date_part", date_format(col("date"), "yyyyMMdd"))

# Write the DataFrame as JSON, partitioned by date_part, and register it as table tbl_json_part
(df.write
   .format("json")
   .mode("overwrite")
   .partitionBy("date_part")
   .saveAsTable("tbl_json_part"))

# Read the table back by name and print its row count
print(spark.table("tbl_json_part").count())

# List the partitions of the table
spark.sql("show partitions tbl_json_part").show()

50
+------------------+
|         partition|
+------------------+
|date_part=20240501|
|date_part=20240502|
|date_part=20240503|
|date_part=20240504|
+------------------+



In [23]:
# List the tables registered in the `default` namespace.
spark.sql("show tables from default").show()

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|  default|tbl_json_part|      false|
+---------+-------------+-----------+



# Append Mode

In [None]:
# Writing as PARQUET with APPEND

path = "/content/write_partitioning/parquet_append"

df.write.mode("append").format("parquet").save(path)

!ls /content/write_partitioning/parquet_append

spark.read.format("parquet").load(path).count()

part-00000-19403de2-076f-46ee-849e-cb4d71597ef7-c000.snappy.parquet
part-00000-3b17956e-ed3d-4cbe-bebf-6602d8e45c1d-c000.snappy.parquet
part-00000-dc9d4535-21b1-452d-97eb-cc6cc34a5870-c000.snappy.parquet
_SUCCESS


150