In [1]:
# ------------------------------
# 1. SparkSession with Delta Config
# ------------------------------
import shutil

from pyspark.sql import SparkSession
from delta.pip_utils import configure_spark_with_delta_pip

# Session settings that register Delta Lake's SQL extension and catalog
# implementation on the Spark session.
delta_conf = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Start from the app name and cluster master, then layer the Delta settings on top.
builder = SparkSession.builder.appName("Test DeltaLake Connection").master("spark://spark:7077")
for conf_key, conf_value in delta_conf.items():
    builder = builder.config(conf_key, conf_value)

# The delta helper decorates the builder before the session is created
# (the Ivy log for this cell shows the delta-spark package being resolved).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/prj/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6d02036b-267d-418a-bcf6-555382222e10;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
:: resolution report :: resolve 309ms :: artifacts dl 16ms
	:: modules in use:
	io.delta#delta-spark_2.13;4.0.0 from central in [default]
	io.delta#delta-storage;4.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.13.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number

In [2]:
# ------------------------------
# 2. Load Source Data
# ------------------------------
# Customers come from a Parquet file; sales come from a CSV that has a header
# row and whose column types are inferred by Spark.
customer_df = spark.read.parquet("/s3-datalake/source/customer_data.parquet")
sales_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/s3-datalake/source/sales_data.csv")
)

# Preview the first three rows of each frame under its own heading.
for label, frame in (("Customer Data:", customer_df), ("Sales Data:", sales_df)):
    print(label)
    frame.show(3)

                                                                                

Customer Data:


                                                                                

+-----------+------+---+--------------+
|customer_id|gender|age|payment_method|
+-----------+------+---+--------------+
|    C241288|Female| 28|   Credit Card|
|    C111565|  Male| 21|    Debit Card|
|    C266599|  Male| 20|          Cash|
+-----------+------+---+--------------+
only showing top 3 rows
Sales Data:
+----------+-----------+--------+--------+-------+------------+--------------+
|invoice_no|customer_id|category|quantity|  price|invoice_date| shopping_mall|
+----------+-----------+--------+--------+-------+------------+--------------+
|   I138884|    C241288|Clothing|       5| 1500.4|  05-08-2022|        Kanyon|
|   I317333|    C111565|   Shoes|       3|1800.51|  12-12-2021|Forum Istanbul|
|   I127801|    C266599|Clothing|       1| 300.08|  09-11-2021|     Metrocity|
+----------+-----------+--------+--------+-------+------------+--------------+
only showing top 3 rows


In [3]:
# ------------------------------
# 3. Write Sales Data as a Delta Table
# ------------------------------
# Target directory for the Delta table; later cells reuse this variable.
output_path = "/s3-datalake/backup/sales_delta"

# Configure the writer first: Delta format, replace any existing data, and
# allow the stored schema to be replaced as well.
delta_writer = (
    sales_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
delta_writer.save(output_path)

print("Delta files written to:", output_path)

                                                                                

Delta files written to: /s3-datalake/backup/sales_delta


In [4]:
# ------------------------------
# 4. Run SQL on the Delta Table
# ------------------------------
# The table is addressed by path (delta.`<path>`) rather than by catalog name.

# Update statement: there is no WHERE clause, so invoice_no is overwritten
# on every row of the table.
update_sql = f"""
    UPDATE delta.`{output_path}` 
    SET invoice_no = 'TEST'
"""
spark.sql(update_sql)

# Read the table back and display the first three rows.
preview_sql = f"""
    SELECT * FROM delta.`{output_path}`
"""
spark.sql(preview_sql).show(3)

25/07/18 17:06:14 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/07/18 17:06:40 WARN UpdateCommand: Could not validate number of records due to missing statistics.
                                                                                

+----------+-----------+---------------+--------+-----+------------+--------------+
|invoice_no|customer_id|       category|quantity|price|invoice_date| shopping_mall|
+----------+-----------+---------------+--------+-----+------------+--------------+
|      TEST|    C870742|Food & Beverage|       4|20.92|  07-10-2021|  Istinye Park|
|      TEST|    C289408|      Cosmetics|       2|81.32|  11-10-2021|        Kanyon|
|      TEST|    C138416|Food & Beverage|       3|15.69|  14-01-2022|Viaport Outlet|
+----------+-----------+---------------+--------+-----+------------+--------------+
only showing top 3 rows


In [5]:
# ------------------------------
# 5. Cleanup Delta Table and Files
# ------------------------------
# Drop metadata reference
# NOTE(review): the table was written by path, not registered by name;
# confirm this DROP does anything beyond being a harmless no-op.
spark.sql(f"""
    DROP TABLE IF EXISTS delta.`{output_path}`
""")

# Remove files
# Pure-Python removal instead of `!rm -Rf {output_path}`: the path is never
# passed through a shell, so spaces or shell metacharacters in it cannot change
# what gets deleted, and the cell no longer depends on IPython `!` syntax.
try:
    shutil.rmtree(output_path)
except FileNotFoundError:
    # Directory is already gone; nothing to clean up (mirrors `rm -f`).
    pass

# Reached only if removal succeeded or there was nothing to remove; any other
# failure (e.g. permissions) raises above instead of being silently ignored.
print("Clean-up complete.")
spark.stop()

Clean-up complete.
