In [1]:
import os

from pyspark.sql import SparkSession

In [2]:
# Load the sparksql_magic extension, which provides the %%sparksql cell magic used in later cells.
%load_ext sparksql_magic

### The following code initializes the Spark session.
- The Delta Lake package is included when creating the Spark session, which allows Spark DataFrames to be saved as Delta tables.
- The Delta catalog is used as the default catalog.
- Data is stored in MinIO, an S3-compatible object store.

In [43]:
# Initialize the SparkSession with Delta Lake support, storing data in MinIO over s3a://.
# The Delta and hadoop-aws jar versions are pinned below; they presumably need to match
# the cluster's Spark/Hadoop build -- confirm before bumping them.

# Read the object-store credentials from the environment so that no secret is
# committed with the notebook. Export both variables before starting the kernel.
try:
    s3_access_key = os.environ["AWS_ACCESS_KEY_ID"]
    s3_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]
except KeyError as missing_var:
    raise RuntimeError(
        f"Environment variable {missing_var} is not set; "
        "export AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY with the MinIO credentials."
    ) from missing_var

spark = (
    SparkSession.builder
    .appName("add-delta-lake_1")
    .master("spark://spark-master:7077")
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.3.0,org.apache.hadoop:hadoop-aws:3.3.4")
    # Enable Delta SQL extensions and make the Delta catalog the session catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # S3A settings for the MinIO endpoint (path-style access instead of virtual-host buckets).
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", s3_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Warehouse directory: where tables created with saveAsTable store their data.
    .config("spark.sql.warehouse.dir", "s3a://delta-bucket/delta-lake/data")
    .getOrCreate()
)

In [44]:
# Confirm which warehouse directory the active session is using.
# (spark.sparkContext.getConf().getAll() lists every setting if a full dump is needed.)
warehouse_dir = spark.conf.get("spark.sql.warehouse.dir")
print(warehouse_dir)


s3a://delta-bucket/delta-lake/data


In [42]:
# Teardown: stop the SparkSession.
# Guarded by a flag so that running the notebook top to bottom does not stop the
# session before the cells below use it. Set the flag to True and run this cell
# once you are finished with the session.
STOP_SPARK_SESSION = False
if STOP_SPARK_SESSION:
    spark.stop()

In [45]:
# Build a small sample DataFrame (name, age) for the Delta table examples.
columns = ["Name", "Age"]
data = [
    ("Jerin", 29),
    ("Aayush", 35),
    ("Neeraj", 28),
]
df = spark.createDataFrame(data, schema=columns)

In [46]:
# Preview the sample rows to confirm the DataFrame was built as expected.
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+------+---+
|  Name|Age|
+------+---+
| Jerin| 29|
|Aayush| 35|
|Neeraj| 28|
+------+---+



                                                                                

In [48]:
%%sparksql
-- List the catalogs visible to this Spark session.

SHOW CATALOGS

0
catalog
spark_catalog


In [50]:
%%sparksql
-- List the schemas (namespaces) inside the session catalog.

SHOW SCHEMAS IN spark_catalog


0
namespace
default


In [52]:
%%sparksql
-- List the tables registered in the default schema.

SHOW TABLES IN default

0,1,2
namespace,tableName,isTemporary


### Create an unmanaged Delta table. It can be accessed only through the location of its data.
- This unmanaged table is not registered in the Delta catalog, so it cannot be accessed via a table reference.

In [10]:
# Unmanaged Delta table: written straight to a storage path and not registered in the catalog.
# mode("overwrite") keeps this cell re-runnable. The default save mode ("errorifexists")
# fails once the path already holds data, and the later DROP TABLE on this location
# leaves the files in place.
df.write.format("delta").mode("overwrite").save("s3a://delta-bucket/delta-lake/data/test-delta-table-unmanaged")

25/01/15 09:14:34 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [32]:
# Query the unmanaged table directly by its storage path, using the delta.`<path>` syntax.
unmanaged_rows = spark.sql(
    "SELECT * FROM delta.`s3a://delta-bucket/delta-lake/data/test-delta-table-unmanaged`;"
)
unmanaged_rows.show()

                                                                                

+------+---+
|  Name|Age|
+------+---+
|Aayush| 35|
|Neeraj| 28|
| Jerin| 29|
+------+---+



### Create a managed Delta table. It can be accessed using a SQL table reference.
- This managed table is registered in the Delta catalog, so it can be accessed via a SQL table reference.

#### In this scenario, a DataFrame is saved as a managed Delta table.

In [56]:
# Create a Delta table from the DataFrame and register it in the Delta catalog.
# With saveAsTable the catalog manages both the table metadata and the data location,
# so the physical location of the data is controlled by the catalog rather than by this call.

# The data is stored under the MinIO warehouse location configured when the Spark session was initialized.
(
    df.write
    .format("delta")
    .saveAsTable("test_delta_table_managed")
)

In [57]:
# Query the managed table by name; it is registered in the Delta catalog.
managed_rows = spark.sql("select * from test_delta_table_managed;")
managed_rows.show()

                                                                                

+------+---+
|  Name|Age|
+------+---+
|Aayush| 35|
|Neeraj| 28|
| Jerin| 29|
+------+---+



In [58]:
%%sparksql
-- List the tables registered in the current schema.
SHOW TABLES

0,1,2
namespace,tableName,isTemporary
default,test_delta_table_managed,False
default,test_delta_table_managed1,False


In [60]:
# Drop the managed table. The Delta catalog owns both its metadata and its data location,
# so dropping the table removes the catalog entry along with the actual data at that location.
drop_managed_result = spark.sql("DROP TABLE test_delta_table_managed;")
drop_managed_result.show()

++
||
++
++



#### In this method, the location of the unmanaged Delta table is registered in the Delta catalog under a table name. Dropping this table therefore does not delete the data at that location; it only removes the metadata from the Delta catalog.
- If the unmanaged table's folder is renamed, querying the table by name will raise an error.

In [61]:
# Second way to get a named Delta table: register an existing Delta location in the catalog.
# The table is created on top of the unmanaged table's path, so only the table metadata is
# recorded in the Delta catalog; the data location itself is not managed by the catalog.

register_table_sql = """
    CREATE TABLE test_delta_table_managed_method2
    USING DELTA
    LOCATION 's3a://delta-bucket/delta-lake/data/test-delta-table-unmanaged'
"""
spark.sql(register_table_sql)

DataFrame[]

In [64]:
# Read the table by name. Despite "managed" in its name, this table was registered on an
# existing storage location, so the catalog holds only its metadata.
spark.sql("select * from test_delta_table_managed_method2;").show()

                                                                                

+------+---+
|  Name|Age|
+------+---+
|Aayush| 35|
|Neeraj| 28|
| Jerin| 29|
+------+---+



In [63]:
%%sparksql
-- List the tables registered in the current schema.
SHOW TABLES

0,1,2
namespace,tableName,isTemporary
default,test_delta_table_managed_method2,False


In [65]:
# Drop the table that was registered on the unmanaged Delta location. Because the catalog
# does not manage that location, only the catalog metadata is removed; the data stays in place.

spark.sql("DROP TABLE test_delta_table_managed_method2;").show()

++
||
++
++



In [66]:
# List all databases (schemas) in the catalog -- this is not limited to the current database.
spark.sql("SHOW DATABASES;").show()

+---------+
|namespace|
+---------+
|  default|
+---------+

