## Import libraries

In [1]:
from delta import configure_spark_with_delta_pip, DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr,lit

## Create SparkSession object

In [2]:
builder = (SparkSession.builder
           .appName("upsert-delta-table")
           .master("spark://spark-master:7077")
           .config("spark.executor.memory","512m")
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
          )

spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-78fe5b3a-81b0-4311-90d0-d1beca3a1d12;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 72ms :: artifacts dl 3ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0   

## Read the delta table

In [3]:
deltaTable = DeltaTable.forPath(spark,"/opt/workspace/data/delta_lake/netflix_titles")


                                                                                

## Explore the data

In [4]:
deltaTable.toDF().show(5)

                                                                                

+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                null|United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|           null|Ama Qamata, Khosi...| South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglands|Julien Leclercq|Sami Bouajila, Tr...|         null|Septem

## Update records where director is null

In [5]:
deltaTable.update(
    condition=expr("director IS NULL"),
    set = {"director" : lit("")}
           )

                                                                                

In [6]:
deltaTable.toDF().select("director").show()

+--------------------+
|            director|
+--------------------+
|     Kirsten Johnson|
|                    |
|     Julien Leclercq|
|                    |
|                    |
|       Mike Flanagan|
|Robert Cullen, Jo...|
|        Haile Gerima|
|     Andy Devonshire|
|      Theodore Melfi|
|                    |
|   Kongkiat Komesiri|
| Christian Schwochow|
|       Bruno Garotti|
|                    |
|                    |
|Pedro de Echave G...|
|                    |
|          Adam Salky|
|                    |
+--------------------+
only showing top 20 rows



## Alternate way to update using SQL


In [7]:
sql_query = """
UPDATE delta.`/opt/workspace/data/delta_lake/netflix_titles` SET director = NULL WHERE director = "";
"""

result = spark.sql(sql_query)

print(result)


DataFrame[num_affected_rows: bigint]


In [8]:
deltaTable.toDF().select("director").show()

+--------------------+
|            director|
+--------------------+
|     Kirsten Johnson|
|                null|
|     Julien Leclercq|
|                null|
|                null|
|       Mike Flanagan|
|Robert Cullen, Jo...|
|        Haile Gerima|
|     Andy Devonshire|
|      Theodore Melfi|
|                null|
|   Kongkiat Komesiri|
| Christian Schwochow|
|       Bruno Garotti|
|                null|
|                null|
|Pedro de Echave G...|
|                null|
|          Adam Salky|
|                null|
+--------------------+
only showing top 20 rows



In [9]:
spark.stop()