## Import libraries

In [1]:
from delta import configure_spark_with_delta_pip, DeltaTable
from pyspark.sql import SparkSession

## Create a SparkSession Object

In [2]:
# Session-level settings: executor memory plus the two options that switch on
# Delta Lake's SQL extension and catalog implementation.
delta_session_conf = {
    "spark.executor.memory": "512m",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

builder = SparkSession.builder.appName("merge-delta-table").master("spark://spark-master:7077")
for conf_key, conf_value in delta_session_conf.items():
    builder = builder.config(conf_key, conf_value)

# configure_spark_with_delta_pip wraps the builder before the session is
# created (or an already-running one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Keep the notebook output readable: only errors are logged.
spark.sparkContext.setLogLevel("ERROR")



:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-712db762-f0ad-462a-9f65-539ce13bc4dc;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 67ms :: artifacts dl 3ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0   

## Create a delta table

In [3]:
# DDL for the merge target: a Delta table registered as
# default.movie_and_show_titles with twelve STRING columns, stored at an
# explicit LOCATION rather than in the default warehouse directory.
# NOTE: because the statement is CREATE OR REPLACE, re-running this cell
# replaces any existing definition of the table.
sql_query = """
CREATE OR REPLACE TABLE default.movie_and_show_titles (
    show_id STRING,
    type STRING,
    title STRING,
    director STRING,
    cast STRING,
    country STRING,
    date_added STRING,
    release_year STRING,
    rating STRING,
    duration STRING,
    listed_in STRING,
    description STRING
    ) USING DELTA LOCATION "/opt/workspace/data/delta_lake/movie_and_show_titles";

"""
# Execute the DDL; the returned DataFrame is the cell's displayed value.
spark.sql(sql_query)


                                                                                

DataFrame[]

## Create a delta table object, convert to dataframe and show

In [4]:
# Handle on the target Delta table, addressed by its storage path.
titles_table_path = "/opt/workspace/data/delta_lake/movie_and_show_titles"
deltaTable_titles = DeltaTable.forPath(spark, titles_table_path)

# Preview the current contents (at most five rows) as a DataFrame.
titles_preview = deltaTable_titles.toDF()
titles_preview.show(5)



+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+
|show_id|type|title|director|cast|country|date_added|release_year|rating|duration|listed_in|description|
+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+
+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+



                                                                                

## Read a Delta Table

In [5]:
# Load the source Delta table that will be merged into the target.
delta_reader = spark.read.format("delta")
df_netflix = delta_reader.load("/opt/workspace/data/delta_lake/netflix_titles")

# Keep a single row per (type, title, director, date_added) combination.
netflix_key_columns = ["type", "title", "director", "date_added"]
df_netflix_deduped = df_netflix.dropDuplicates(subset=netflix_key_columns)




## Merge the source dataframe with the target Delta Table

In [6]:
# Upsert the de-duplicated Netflix titles into the target Delta table.
#
# A row is identified by (type, title, director, date_added), compared
# case-insensitively. Two details matter for the merge to be repeatable:
#   * lower() is applied to BOTH sides of every comparison. Lower-casing only
#     the target side of date_added would make textual dates (mixed case)
#     never match, so the update branch would silently never fire.
#   * The null-safe operator <=> is used instead of =. With plain equality a
#     NULL key column (e.g. a missing director) never matches, so re-running
#     the merge would insert those rows again as duplicates. dropDuplicates
#     already treats NULL keys as equal, so <=> keeps the two steps consistent.
netflix_merge_condition = """
    lower(movie_and_show_titles.type) <=> lower(updates.type)
    AND lower(movie_and_show_titles.title) <=> lower(updates.title)
    AND lower(movie_and_show_titles.director) <=> lower(updates.director)
    AND lower(movie_and_show_titles.date_added) <=> lower(updates.date_added)
"""

# Target column -> source expression. The same mapping is used for the update
# and the insert branch, so it is defined once instead of being repeated.
netflix_column_map = {
    column: f"updates.{column}"
    for column in (
        "show_id",
        "type",
        "title",
        "director",
        "cast",
        "country",
        "date_added",
        "release_year",
        "rating",
        "duration",
        "listed_in",
        "description",
    )
}

(deltaTable_titles.alias("movie_and_show_titles")
 .merge(df_netflix_deduped.alias("updates"), netflix_merge_condition)
 .whenMatchedUpdate(set=netflix_column_map)        # existing row: overwrite every column
 .whenNotMatchedInsert(values=netflix_column_map)  # new row: insert every column
 .execute()
)
     

                                                                                

## Retrieve delta table history

In [12]:


# Ask Delta for the commit log of the target table, addressed by path.
sql_query = """
DESCRIBE HISTORY "/opt/workspace/data/delta_lake/movie_and_show_titles"
"""

# One row per table version; printed with show()'s default limits spelled out.
history = spark.sql(sql_query)
history.show(n=20, truncate=True)

+-------+--------------------+------+--------+--------------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|           operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+--------------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|      1|2025-04-11 22:34:...|  null|    null|               MERGE|{predicate -> ["(...|null|    null|     null|          0|  Serializable|        false|{numTargetRowsCop...|        null|Apache-Spark/3.4....|
|      0|2025-04-11 22:12:...|  null|    null|CREATE OR REPLACE...|{isManaged -> fal...|null|    null|     null|       null|  Serializable|         true|           

## Read additional data (CSV file)

In [14]:
# Read the additional titles from CSV. The first line is the header; no schema
# inference is requested, so every column is read as a string.
df_titles = (spark.read.format("csv")
             .option("header", "true")
             .load("../../data/titles.csv")
            )

# De-duplicate on the same columns the merge below uses to identify a row
# (type, title, release_year). De-duplicating on (type, title) alone would
# arbitrarily drop same-titled works from different years before the merge
# ever sees them, even though the merge treats them as distinct rows.
titles_key_columns = ["type", "title", "release_year"]
df_titles_deduped = df_titles.dropDuplicates(titles_key_columns)



## Merge the CSV source dataframe with the target Delta Table

In [17]:
# Rows are matched on type and title (case-insensitively) plus release_year.
titles_merge_condition = """ lower(movie_and_show_titles.type) = lower(updates.type)
        AND
        lower(movie_and_show_titles.title) = lower(updates.title)
        AND
        movie_and_show_titles.release_year = updates.release_year
        """

# Target column -> source expression. The CSV uses different column names
# than the target table, and only these nine target columns are populated
# from it; the same mapping drives both the update and the insert branch.
titles_column_map = {
    "show_id": "updates.id",
    "type": "updates.type",
    "title": "updates.title",
    "country": "updates.production_countries",
    "release_year": "updates.release_year",
    "rating": "updates.age_certification",
    "duration": "updates.runtime",
    "listed_in": "updates.genres",
    "description": "updates.description",
}

titles_target = deltaTable_titles.alias('movie_and_show_titles')
titles_source = df_titles_deduped.alias("updates")

(titles_target
 .merge(titles_source, titles_merge_condition)
 .whenMatchedUpdate(set=titles_column_map)
 .whenNotMatchedInsert(values=titles_column_map)
 .execute()
)
     

### Retrieve the Delta table history after the second merge (using SQL)

In [19]:
# Same history query as before, re-run so the second MERGE shows up as a new
# table version.
sql_query = """

DESCRIBE HISTORY "/opt/workspace/data/delta_lake/movie_and_show_titles"

"""

result = spark.sql(sql_query)
result.show(n=20, truncate=True)

+-------+--------------------+------+--------+--------------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|           operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+--------------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|      2|2025-04-11 22:58:...|  null|    null|               MERGE|{predicate -> ["(...|null|    null|     null|          1|  Serializable|        false|{numTargetRowsCop...|        null|Apache-Spark/3.4....|
|      1|2025-04-11 22:34:...|  null|    null|               MERGE|{predicate -> ["(...|null|    null|     null|          0|  Serializable|        false|{numTargetR

In [20]:
# Final cell: shut down the Spark session created at the top of the notebook.
spark.stop()