In [1]:
from pyspark.sql import SparkSession

### The following code initializes the Spark session. The Delta Lake package is loaded when the session is created, which allows Spark DataFrames to be saved as Delta tables.

In [5]:
# Build (or reuse) a SparkSession configured for Delta Lake.
spark = (
    SparkSession.builder
    .appName("add-delta-lake")
    .master("spark://spark-master:7077")
    # Delta Lake package, needed to write DataFrames in Delta format.
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0")
    # Delta's SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)


In [4]:
# To stop SparkSession
# spark.stop()

In [14]:
# Sample data: three (Name, Age) rows turned into a Spark DataFrame.
columns = ["Name", "Age"]
data = [
    ("Jerin", 29),
    ("Aayush", 35),
    ("Neeraj", 28),
]
df = spark.createDataFrame(data, schema=columns)

In [15]:
# Print the sample DataFrame to confirm its rows and columns.
df.show()

+------+---+
|  Name|Age|
+------+---+
| Jerin| 29|
|Aayush| 35|
|Neeraj| 28|
+------+---+



### Create an Unmanaged Delta Table. It can be accessed only through the location of its data.
- This unmanaged table is not registered in the Delta catalog, so it cannot be accessed via a table reference.

In [23]:
# Unmanaged Delta table: written directly to a path and not registered in the catalog.
# mode("overwrite") keeps this cell re-runnable; the default save mode raises an
# error when the target path already contains data.
df.write.format("delta").mode("overwrite").save("/opt/spark/data/test-delta-table")

                                                                                

In [24]:
# Query the unmanaged Delta table by its storage path using the delta.`<path>` syntax.
unmanaged_rows = spark.sql(
    "SELECT * FROM delta.`/opt/spark/data/test-delta-table`;"
)
unmanaged_rows.show()

                                                                                

+------+---+
|  Name|Age|
+------+---+
|Aayush| 35|
|Neeraj| 28|
| Jerin| 29|
+------+---+



### Create a Managed Delta Table. It can be accessed using a SQL table reference.
- This managed table is registered in the Delta catalog, so it can be queried by table name in SQL.

In [32]:
# Save the DataFrame as a Delta table registered in the catalog under `test_delta_table`.
# mode("overwrite") keeps this cell re-runnable; the default save mode raises an
# error when the table already exists.
df.write.format("delta").mode("overwrite").saveAsTable("`test_delta_table`")

In [34]:
# Query the catalog-registered Delta table by name.
managed_rows = spark.sql("select * from `test_delta_table`;")
managed_rows.show()

                                                                                

+------+---+
|  Name|Age|
+------+---+
|Aayush| 35|
|Neeraj| 28|
| Jerin| 29|
+------+---+



In [39]:
# Another way to register a Delta table in the catalog: bind a table name to
# Delta files that already exist at a path. Because LOCATION is given, the
# table is external (the data lives at the path, not in the warehouse directory).
# IF NOT EXISTS keeps this cell re-runnable when the table is already registered.
spark.sql("""
    CREATE TABLE IF NOT EXISTS test_delta_table_method2
    USING DELTA
    LOCATION '/opt/spark/data/test-delta-table'
""")

DataFrame[]

In [40]:
# Query the table registered over the existing Delta location, by name.
method2_rows = spark.sql("select * from `test_delta_table_method2`;")
method2_rows.show()

                                                                                

+------+---+
|  Name|Age|
+------+---+
|Aayush| 35|
|Neeraj| 28|
| Jerin| 29|
+------+---+



In [41]:
# Remove `test_delta_table_method2` from the catalog.
# IF EXISTS keeps this cell re-runnable when the table has already been dropped.
# NOTE(review): the table was created with an explicit LOCATION, so per Spark's
# DROP TABLE semantics for external tables the underlying files should remain
# on disk — confirm before relying on it.
spark.sql("DROP TABLE IF EXISTS `test_delta_table_method2`;").show()

++
||
++
++



In [47]:
# List every database (namespace) known to the catalog.
databases = spark.sql("SHOW DATABASES;")
databases.show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [45]:
# List the tables registered in the current database.
registered_tables = spark.sql("SHOW TABLES;")
registered_tables.show()

+---------+----------------+-----------+
|namespace|       tableName|isTemporary|
+---------+----------------+-----------+
|  default|test_delta_table|      false|
+---------+----------------+-----------+

