In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by the rest of the notebook, with
# Delta Lake support wired in through the session configuration.
spark = (
    SparkSession.builder
    .appName("Delta Lake - HDFS")
    # Memory limits for executors, the driver, and results collected to the driver.
    .config("spark.executor.memory", "1G")
    .config("spark.driver.memory", "1G")
    .config("spark.driver.maxResultSize", "1G")
    # Enable Arrow-based conversion between Spark and pandas (toPandas() is used throughout).
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # Delta Lake: pull the package, then register its SQL extension and catalog.
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Switch off Delta's retention-duration check.
    # NOTE(review): presumably intended for demo/vacuum experiments only - confirm before reuse.
    .config("spark.databricks.delta.retentionDurationCheck.enabled", "false")
    .getOrCreate()
)

# Keep cell output readable: only ERROR-level log messages are emitted.
spark.sparkContext.setLogLevel("ERROR")
spark

## Warehouse check

In [2]:
# List the catalogs visible to this Spark session (catalog API).
spark.catalog.listCatalogs()

[CatalogMetadata(name='spark_catalog', description=None)]

In [3]:
# Same catalog listing, this time issued as SQL and rendered as a pandas DataFrame.
spark.sql("show catalogs").toPandas()

Unnamed: 0,catalog
0,spark_catalog


In [4]:
# Name of the catalog the session is currently using.
spark.catalog.currentCatalog()

'spark_catalog'

In [5]:
# List the databases (namespaces) available in the current catalog.
spark.sql("""
    SHOW databases
""").toPandas()

Unnamed: 0,namespace
0,default


In [6]:
# List the tables of the current database through the catalog API.
spark.catalog.listTables()

[Table(name='employees', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='users', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False)]

In [46]:
# List the tables of the current database through SQL
# (same information as spark.catalog.listTables(), shown as a pandas DataFrame).
spark.sql("""
    SHOW tables
""").toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,default,employees,False
1,default,users,False


In [45]:
# Same table listing with the catalog and database spelled out explicitly.
spark.sql("SHOW TABLES IN spark_catalog.default").toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,default,employees,False
1,default,users,False


In [12]:
# Show the column definitions of the `users` table followed by its detailed
# metadata (name, type, location, provider, table properties).
spark.sql("""
    describe table extended users
""").toPandas()

Unnamed: 0,col_name,data_type,comment
0,id,int,
1,name,string,
2,updated_at,date,
3,,,
4,# Detailed Table Information,,
5,Name,spark_catalog.default.users,
6,Type,MANAGED,
7,Location,file:/opt/apps/spark-warehouse/users,
8,Provider,delta,
9,Table Properties,"[delta.minReaderVersion=1,delta.minWriterVersi...",


## Create users data frame

In [7]:
from datetime import datetime

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql import functions as F
from delta.tables import DeltaTable

In [8]:
# Sample user rows; each dict key matches a field name in `schema` below.
# `updated_at` is given as a datetime but stored through a DateType column,
# so only the calendar date is kept.
data = [
    {'id': 1, 'name': 'Alice', 'updated_at': datetime(2022, 1, 1)},
    {'id': 2, 'name': 'Braga', 'updated_at': datetime(2022, 2, 2)},
    {'id': 3, 'name': 'Steve', 'updated_at': datetime(2022, 3, 3)},
]

# Explicit schema for the users DataFrame; every column is nullable.
schema = StructType([
    StructField('id', IntegerType(), nullable=True),
    StructField('name', StringType(), nullable=True),
    StructField('updated_at', DateType(), nullable=True),
])

# Build the Spark DataFrame and display it as a pandas DataFrame.
df = spark.createDataFrame(data, schema=schema)
df.toPandas()

                                                                                

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-01
1,2,Braga,2022-02-02
2,3,Steve,2022-03-03


## Create Delta Table

In [9]:
# Write df as a Delta table at an explicit storage path, replacing any data
# already stored there. No partitioning is applied. The userMetadata text is
# attached to the commit and appears in the table history.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .option("userMetadata", "creating table")
    .save("/delta-lake/users")
)

# Write the same data as a Delta table registered under the name "users"
# in the current catalog, so it can also be queried by name.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("users")
)

                                                                                

### Create table without data

In [10]:
# Define an empty Delta table (schema only, no rows) at an explicit path.
# The builder calls only describe the table; execute() performs the
# create-or-replace and returns the resulting DeltaTable.
my_table = (
    DeltaTable.createOrReplace(spark)
    .addColumn("id", "INT")
    .addColumn("firstName", "STRING")
    .addColumn("middleName", "STRING")
    .addColumn("lastName", "STRING", comment="surname")
    .addColumn("gender", "STRING")
    .addColumn("birthDate", "TIMESTAMP")
    .addColumn("ssn", "STRING")
    .addColumn("salary", "INT")
    .property("description", "table with people data")
    .location("/delta-lake/my-table")
    .execute()
)

In [12]:
# Define an empty Delta table registered in the catalog as "employees"
# (no explicit location is given). Compared with the path-based table:
#   - `id` is declared NOT NULL,
#   - `ssn` is a generated INT column computed as `id + 1`,
#   - the table is partitioned by `gender`.
# Note that this rebinds the Python name `my_table` to the new table.
my_table = (
    DeltaTable.createOrReplace(spark)
    .tableName("employees")
    .addColumn("id", "INT", nullable=False)
    .addColumn("firstName", "STRING")
    .addColumn("middleName", "STRING")
    .addColumn("lastName", "STRING", comment="surname")
    .addColumn("gender", "STRING")
    .addColumn("birthDate", "TIMESTAMP")
    .addColumn("ssn", dataType="INT", generatedAlwaysAs="id + 1")
    .addColumn("salary", "INT")
    .partitionedBy("gender")
    .property("description", "table with people data")
    .execute()
)

In [13]:
# List the tables registered in the current database. Only tables created
# by name appear here; tables written purely by path are not listed.
spark.sql("""
    SHOW tables
""").toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,default,employees,False
1,default,users,False


## Read Delta Table

In [14]:
# Load the path-based Delta table with the DataFrame reader and display it
# as a pandas DataFrame.
(
    spark.read
    .format("delta")
    .load("/delta-lake/users")
    .toPandas()
)

                                                                                

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


In [15]:
# Query the Delta table directly by its storage path using the
# delta.`<path>` SQL syntax (no catalog entry required).
spark.sql("""
    SELECT * FROM delta.`/delta-lake/users` -- query table by path
""").toPandas()

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


In [16]:
# Query the catalog-registered table by its database-qualified name.
spark.sql("""
    SELECT * FROM default.users
""").toPandas()

                                                                                

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


In [17]:
# Read the catalog table `users` through the DataFrame reader API.
spark.read.table("users").toPandas()

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


## Describe tables

In [29]:
# Get a DeltaTable handle for the path-based table and display its commit
# history. The `timestamp` column is passed through unix_timestamp and
# from_unixtime, so it is shown as a formatted string.
users_table = DeltaTable.forPath(spark, "/delta-lake/users")
(
    users_table.history()
    .withColumn('timestamp', F.from_unixtime(F.unix_timestamp('timestamp')))
    .toPandas()
)

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,0,2024-08-02 01:24:27,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,Serializable,False,"{'numFiles': '2', 'numOutputRows': '3', 'numOu...",creating table,Apache-Spark/3.5.1 Delta-Lake/3.2.0


In [30]:
# Print table-level details (format, id, location, file count, size,
# protocol versions, ...) for the path-based table.
users_table.detail().show()

+------+--------------------+----+-----------+--------------------+--------------------+--------------------+----------------+-----------------+--------+-----------+----------+----------------+----------------+--------------------+
|format|                  id|name|description|            location|           createdAt|        lastModified|partitionColumns|clusteringColumns|numFiles|sizeInBytes|properties|minReaderVersion|minWriterVersion|       tableFeatures|
+------+--------------------+----+-----------+--------------------+--------------------+--------------------+----------------+-----------------+--------+-----------+----------+----------------+----------------+--------------------+
| delta|20979595-c9bb-405...|NULL|       NULL|hdfs://localhost:...|2024-08-02 01:24:...|2024-08-02 01:24:...|              []|               []|       2|       1880|        {}|               1|               2|[appendOnly, inva...|
+------+--------------------+----+-----------+--------------------+-----

In [31]:
# DESCRIBE DETAIL for the path-based table, issued as SQL.
# createdAt / lastModified are passed through unix_timestamp and
# from_unixtime, so they are shown as formatted strings in the pandas output.
spark.sql("""
    DESCRIBE DETAIL delta.`/delta-lake/users` -- query table by path
""").withColumn('createdAt', F.from_unixtime(F.unix_timestamp('createdAt'))) \
    .withColumn('lastModified', F.from_unixtime(F.unix_timestamp('lastModified'))) \
    .toPandas()

Unnamed: 0,format,id,name,description,location,createdAt,lastModified,partitionColumns,clusteringColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures
0,delta,20979595-c9bb-4051-9002-3453dae3cc02,,,hdfs://localhost:9000/delta-lake/users,2024-08-02 01:24:25,2024-08-02 01:24:27,[],[],2,1880,{},1,2,"[appendOnly, invariants]"


In [19]:
# DESCRIBE DETAIL by table name. This form only works when the Delta table
# was registered in the catalog (e.g. written with saveAsTable).
# createdAt / lastModified are passed through unix_timestamp and
# from_unixtime, so they are shown as formatted strings in the pandas output.
spark.sql("""
    DESCRIBE DETAIL users
""").withColumn('createdAt', F.from_unixtime(F.unix_timestamp('createdAt'))) \
    .withColumn('lastModified', F.from_unixtime(F.unix_timestamp('lastModified'))) \
    .toPandas()

Unnamed: 0,format,id,name,description,location,createdAt,lastModified,partitionColumns,clusteringColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures
0,delta,6e13e427-eac3-4930-ae17-40dd7d6cfe04,spark_catalog.default.users,,file:/opt/apps/spark-warehouse/users,2024-08-02 01:12:51,2024-08-02 01:12:55,[],[],2,1880,{},1,2,"[appendOnly, invariants]"


In [18]:
# Query the table by its unqualified name. This only works when the Delta
# table was registered in the catalog (e.g. written with saveAsTable).
spark.sql("""
    SELECT * FROM users
    """).toPandas()

                                                                                

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


In [None]:
# Drop the demo table if it is left over from a previous run.
# IF EXISTS keeps this cell from raising on a fresh environment where
# `atable` has not been created yet, so the notebook survives
# Restart Kernel -> Run All.
spark.sql("""
    DROP TABLE IF EXISTS atable
""").toPandas()

In [46]:
# Write df as the catalog table `atable`, partitioned by `updated_at`,
# replacing any existing contents.
# NOTE(review): no .format(...) is given here, so the session's default data
# source is used rather than an explicit "delta" - confirm this is intended.
df.write.mode('overwrite').partitionBy("updated_at").saveAsTable("atable")

In [47]:
# List the partitions of `atable` (one entry per distinct `updated_at` value).
spark.sql("""
    show partitions default.atable
""").toPandas()

Unnamed: 0,partition
0,updated_at=2022-01-01
1,updated_at=2022-02-02
2,updated_at=2022-03-03


In [48]:
# Register df as a temporary view named "my_table" so it can be queried with SQL.
# The view name is independent of the Python variable `my_table` assigned in earlier cells.
df.createOrReplaceTempView("my_table")

## Restore

In [30]:
# Roll the catalog table `default.users` back to version 0 of its history.
# The commented-out SQL line shows the timestamp-based variant for the
# path-based table.
spark.sql("""
    RESTORE TABLE default.users TO VERSION AS OF 0
    -- RESTORE TABLE delta.`/delta-lake/users` TO TIMESTAMP AS OF <timestamp>
""").toPandas()

In [None]:
# Shut down the Spark session at the end of the notebook.
spark.stop()