In [51]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Spark configuration for this notebook: application name, memory limits,
# the extra packages resolved at start-up, and the settings that register the
# Delta Lake SQL extension and catalog.
config = (
    SparkConf()
    .setAppName('Delta Lake - HDFS')
    .setAll([('spark.executor.memory', '2G'),
             ('spark.driver.memory', '2G'),
             ('spark.driver.maxResultSize', '1G')])
    .set("spark.jars.packages", "com.amazonaws:aws-java-sdk-s3:1.12.533,org.apache.hadoop:hadoop-aws:3.3.6,io.delta:delta-core_2.12:2.4.0")
    .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    # NOTE(review): going by its name, this turns off Delta's retention-duration
    # safety check -- confirm that is intended outside of a demo environment.
    .set("spark.databricks.delta.retentionDurationCheck.enabled", "false")
    .set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# getOrCreate() reuses an already-running session, so this cell can be re-run
# in the same kernel. Constructing SparkContext directly raises an error when
# a context is already active.
spark = SparkSession.builder.config(conf=config).getOrCreate()
# Keep `sc` available for any cell that uses the raw SparkContext.
sc = spark.sparkContext

spark

2023-09-15 00:33:07,821 WARN util.Utils: Your hostname, localhost resolves to a loopback address: 127.0.0.1; using 172.19.0.2 instead (on interface eth0)
2023-09-15 00:33:07,821 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.amazonaws#aws-java-sdk-s3 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-76ea70e0-1b01-4a69-8a06-e1b25e371e26;1.0
	confs: [default]


:: loading settings :: url = jar:file:/opt/spark-3.4.1-bin-without-hadoop/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found com.amazonaws#aws-java-sdk-s3;1.12.533 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.533 in central
	found com.amazonaws#aws-java-sdk-core;1.12.533 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found com.fasterxml.jackson.core#jackson-databind;2.12.7.1 in central
	found com.fasterxml.jackson.core#jackson-annotations;2.12.7 in central
	found com.fasterxml.jackson.core#jackson-core;2.12.7 in central
	found com.fasterxml.jackson.dataformat#jackson-dataformat-cbor;2.12.6 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#jmespath-java;1.12.533 in central
	found org.apache.hadoop#hadoop-aws;3.3.6 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.367 in central
	found org.wildfly.openssl#wildfly-openss

In [34]:
from datetime import datetime

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql import functions as F
from delta.tables import DeltaTable

In [35]:
# Three sample user rows; each dict is keyed by the column names used in `schema`.
data = [
    dict(id=1, name='Alice', updated_at=datetime(2022, 1, 1)),
    dict(id=2, name='Braga', updated_at=datetime(2022, 2, 2)),
    dict(id=3, name='Steve', updated_at=datetime(2022, 3, 3)),
]

# Explicit schema: all three columns are declared nullable.
schema = StructType([
    StructField(column_name, column_type, nullable=True)
    for column_name, column_type in (('id', IntegerType()),
                                     ('name', StringType()),
                                     ('updated_at', DateType()))
])

# Build the Spark DataFrame and display it as a pandas frame.
df = spark.createDataFrame(data, schema=schema)
df.toPandas()

2023-09-15 00:33:31,117 INFO codegen.CodeGenerator: Code generated in 118.378583 ms
2023-09-15 00:33:31,177 INFO spark.SparkContext: Starting job: toPandas at /tmp/ipykernel_43643/2765826206.py:10
2023-09-15 00:33:31,188 INFO scheduler.DAGScheduler: Got job 0 (toPandas at /tmp/ipykernel_43643/2765826206.py:10) with 2 output partitions
2023-09-15 00:33:31,189 INFO scheduler.DAGScheduler: Final stage: ResultStage 0 (toPandas at /tmp/ipykernel_43643/2765826206.py:10)
2023-09-15 00:33:31,189 INFO scheduler.DAGScheduler: Parents of final stage: List()
2023-09-15 00:33:31,190 INFO scheduler.DAGScheduler: Missing parents: List()
2023-09-15 00:33:31,192 INFO scheduler.DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[6] at toPandas at /tmp/ipykernel_43643/2765826206.py:10), which has no missing parents
2023-09-15 00:33:31,238 INFO memory.MemoryStore: Block broadcast_0 stored as values in memory (estimated size 13.4 KiB, free 1048.8 MiB)
2023-09-15 00:33:31,262 INFO memory.MemoryStore: B

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-01
1,2,Braga,2022-02-02
2,3,Steve,2022-03-03


## Create Delta Table

In [6]:
# Write the DataFrame as a Delta table at an explicit path, overwriting any
# existing data. The cells further down that read `/delta-lake/users` by path
# (spark.read...load, delta.`/delta-lake/users`, DeltaTable.forPath) need this
# table to exist, so this write must run on a fresh kernel too.
(
    df.write.format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .option("userMetadata", "creating table")
    .save("/delta-lake/users")
)

# Create table in the metastore using DataFrame's schema and write data to it
(
    df.write.format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("users")
)

2023-09-15 00:33:50,134 INFO storage.BlockManagerInfo: Removed broadcast_0_piece0 on localhost:39707 in memory (size: 7.1 KiB, free: 1048.8 MiB)
2023-09-15 00:33:50,149 INFO storage.BlockManagerInfo: Removed broadcast_0_piece0 on localhost:43135 in memory (size: 7.1 KiB, free: 1048.8 MiB)
2023-09-15 00:33:50,152 INFO storage.BlockManagerInfo: Removed broadcast_0_piece0 on localhost:44241 in memory (size: 7.1 KiB, free: 1048.8 MiB)
2023-09-15 00:33:50,335 INFO storage.DelegatingLogStore: LogStore `LogStoreAdapter(io.delta.storage.HDFSLogStore)` is used for scheme `file`
2023-09-15 00:33:50,344 INFO delta.DeltaLog: Creating initial snapshot without metadata, because the directory is empty
2023-09-15 00:33:50,354 INFO delta.InitialSnapshot: [tableId=317f7f63-08c6-4443-952d-80ff78690150] Created snapshot InitialSnapshot(path=file:/opt/apps/spark-warehouse/users/_delta_log, version=-1, metadata=Metadata(6e0c0a10-4754-47d4-94f9-3fc9899dd193,null,null,Format(parquet,Map()),null,List(),Map(),S

### Create table without data

In [29]:
# Create (or replace) an empty Delta table at a path: declare the columns,
# attach a description property, and point it at /delta-lake/my-table.
my_table = (
    DeltaTable.createOrReplace(spark)
    .addColumn("id", "INT")
    .addColumn("firstName", "STRING")
    .addColumn("middleName", "STRING")
    .addColumn("lastName", "STRING", comment="surname")
    .addColumn("gender", "STRING")
    .addColumn("birthDate", "TIMESTAMP")
    .addColumn("ssn", "STRING")
    .addColumn("salary", "INT")
    .property("description", "table with people data")
    .location("/delta-lake/my-table")
    .execute()
)

2023-09-14 01:31:16,455 INFO storage.BlockManagerInfo: Removed broadcast_8_piece0 on localhost:38093 in memory (size: 109.9 KiB, free: 1048.6 MiB)
2023-09-14 01:31:16,461 INFO storage.BlockManagerInfo: Removed broadcast_8_piece0 on localhost:45179 in memory (size: 109.9 KiB, free: 1048.7 MiB)
2023-09-14 01:31:16,471 INFO storage.BlockManagerInfo: Removed broadcast_7_piece0 on localhost:38093 in memory (size: 124.2 KiB, free: 1048.7 MiB)
2023-09-14 01:31:16,475 INFO storage.BlockManagerInfo: Removed broadcast_7_piece0 on localhost:45179 in memory (size: 124.2 KiB, free: 1048.8 MiB)
2023-09-14 01:31:16,475 INFO storage.BlockManagerInfo: Removed broadcast_7_piece0 on localhost:38079 in memory (size: 124.2 KiB, free: 1048.7 MiB)
2023-09-14 01:31:16,502 INFO storage.DelegatingLogStore: LogStore `LogStoreAdapter(io.delta.storage.HDFSLogStore)` is used for scheme `hdfs`
2023-09-14 01:31:16,555 INFO delta.DeltaLog: Creating initial snapshot without metadata, because the directory is empty
2023

In [36]:
# Create (or replace) an empty `employees` table registered by name in the
# metastore, partitioned by `gender`. `id` is non-nullable and `ssn` is a
# generated column defined by the expression "id + 1".
my_table = (
    DeltaTable.createOrReplace(spark)
    .tableName("employees")
    .addColumn("id", "INT", nullable=False)
    .addColumn("firstName", "STRING")
    .addColumn("middleName", "STRING")
    .addColumn("lastName", "STRING", comment="surname")
    .addColumn("gender", "STRING")
    .addColumn("birthDate", "TIMESTAMP")
    .addColumn("ssn", dataType="INT", generatedAlwaysAs="id + 1")
    .addColumn("salary", "INT")
    .partitionedBy("gender")
    .property("description", "table with people data")
    .execute()
)

2023-09-15 00:38:46,629 INFO delta.DeltaLog: No delta log found for the Delta table at file:/opt/apps/spark-warehouse/employees/_delta_log
2023-09-15 00:38:46,631 INFO delta.InitialSnapshot: [tableId=84ea5dfb-cb22-4224-a77b-4a60faf17319] Created snapshot InitialSnapshot(path=file:/opt/apps/spark-warehouse/employees/_delta_log, version=-1, metadata=Metadata(02c2725b-dcb2-4767-80ca-fa34bfe56519,null,null,Format(parquet,Map()),null,List(),Map(),Some(1694738326630)), logSegment=LogSegment(file:/opt/apps/spark-warehouse/employees/_delta_log,-1,List(),None,-1), checksumOpt=None)
2023-09-15 00:38:46,649 INFO delta.OptimisticTransaction: [tableId=02c2725b,txnId=10c15b31] Updated metadata from - to Metadata(4991a6e2-adb3-4aa2-aae9-5baf1c2c0b87,null,null,Format(parquet,Map()),{"type":"struct","fields":[{"name":"id","type":"integer","nullable":false,"metadata":{}},{"name":"firstName","type":"string","nullable":true,"metadata":{}},{"name":"middleName","type":"string","nullable":true,"metadata":{}}

In [10]:
# List the tables known to the metastore and display them as a pandas frame.
tables_listing = spark.sql("""
    SHOW tables
""")
tables_listing.toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,default,employees,False
1,default,users,False


## Read Delta Table

In [7]:
# Load the Delta table stored at the given path and display its rows.
users_by_path = spark.read.format("delta").load("/delta-lake/users")
users_by_path.toPandas()

2023-09-15 00:10:58,788 INFO stats.PrepareDeltaScan: DELTA: Filtering files for query
2023-09-15 00:10:59,000 INFO storage.BlockManagerInfo: Removed broadcast_15_piece0 on localhost:38829 in memory (size: 124.0 KiB, free: 1048.5 MiB)
2023-09-15 00:10:59,003 INFO storage.BlockManagerInfo: Removed broadcast_15_piece0 on localhost:38219 in memory (size: 124.0 KiB, free: 1048.6 MiB)
2023-09-15 00:10:59,018 INFO storage.BlockManagerInfo: Removed broadcast_15_piece0 on localhost:36691 in memory (size: 124.0 KiB, free: 1048.7 MiB)
2023-09-15 00:10:59,021 INFO spark.SparkContext: Starting job: toPandas at /tmp/ipykernel_2236/17769786.py:2
2023-09-15 00:10:59,023 INFO scheduler.DAGScheduler: Got job 13 (toPandas at /tmp/ipykernel_2236/17769786.py:2) with 50 output partitions
2023-09-15 00:10:59,024 INFO scheduler.DAGScheduler: Final stage: ResultStage 18 (toPandas at /tmp/ipykernel_2236/17769786.py:2)
2023-09-15 00:10:59,024 INFO scheduler.DAGScheduler: Parents of final stage: List(ShuffleMapSt

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


In [8]:
# Same read as a SQL query: the delta.`<path>` form addresses the table by path.
users_by_path_sql = spark.sql("""
    SELECT * FROM delta.`/delta-lake/users` -- query table by path
""")
users_by_path_sql.toPandas()

2023-09-15 00:11:16,086 INFO stats.PrepareDeltaScan: DELTA: Filtering files for query
2023-09-15 00:11:16,270 INFO spark.SparkContext: Starting job: toPandas at /tmp/ipykernel_2236/1023315069.py:1
2023-09-15 00:11:16,272 INFO scheduler.DAGScheduler: Got job 15 (toPandas at /tmp/ipykernel_2236/1023315069.py:1) with 50 output partitions
2023-09-15 00:11:16,272 INFO scheduler.DAGScheduler: Final stage: ResultStage 21 (toPandas at /tmp/ipykernel_2236/1023315069.py:1)
2023-09-15 00:11:16,272 INFO scheduler.DAGScheduler: Parents of final stage: List(ShuffleMapStage 20)
2023-09-15 00:11:16,273 INFO scheduler.DAGScheduler: Missing parents: List()
2023-09-15 00:11:16,275 INFO scheduler.DAGScheduler: Submitting ResultStage 21 (MapPartitionsRDD[66] at toPandas at /tmp/ipykernel_2236/1023315069.py:1), which has no missing parents
2023-09-15 00:11:16,300 INFO memory.MemoryStore: Block broadcast_20 stored as values in memory (estimated size 598.3 KiB, free 1045.7 MiB)
2023-09-15 00:11:16,305 INFO me

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


In [11]:
# Query the metastore table through its qualified name (database.table).
default_users = spark.sql("""
    SELECT * FROM default.users
""")
default_users.toPandas()

2023-09-15 00:39:45,656 INFO stats.PrepareDeltaScan: DELTA: Filtering files for query
2023-09-15 00:39:45,932 INFO storage.BlockManagerInfo: Removed broadcast_14_piece0 on localhost:39707 in memory (size: 123.9 KiB, free: 1048.4 MiB)
2023-09-15 00:39:45,949 INFO spark.SparkContext: Starting job: toPandas at /tmp/ipykernel_43643/2005388304.py:1
2023-09-15 00:39:45,961 INFO storage.BlockManagerInfo: Removed broadcast_14_piece0 on localhost:43135 in memory (size: 123.9 KiB, free: 1048.6 MiB)
2023-09-15 00:39:45,961 INFO scheduler.DAGScheduler: Got job 11 (toPandas at /tmp/ipykernel_43643/2005388304.py:1) with 50 output partitions
2023-09-15 00:39:45,961 INFO scheduler.DAGScheduler: Final stage: ResultStage 17 (toPandas at /tmp/ipykernel_43643/2005388304.py:1)
2023-09-15 00:39:45,961 INFO scheduler.DAGScheduler: Parents of final stage: List(ShuffleMapStage 16)
2023-09-15 00:39:45,965 INFO scheduler.DAGScheduler: Missing parents: List()
2023-09-15 00:39:45,968 INFO scheduler.DAGScheduler: S

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


In [12]:
# Read the metastore table by name through the DataFrame reader.
users_from_catalog = spark.read.table("users")
users_from_catalog.toPandas()

2023-09-15 00:39:54,240 INFO storage.BlockManagerInfo: Removed broadcast_18_piece0 on localhost:39707 in memory (size: 7.3 KiB, free: 1048.4 MiB)
2023-09-15 00:39:54,286 INFO storage.BlockManagerInfo: Removed broadcast_18_piece0 on localhost:43135 in memory (size: 7.3 KiB, free: 1048.5 MiB)
2023-09-15 00:39:54,286 INFO storage.BlockManagerInfo: Removed broadcast_18_piece0 on localhost:44241 in memory (size: 7.3 KiB, free: 1048.6 MiB)
2023-09-15 00:39:54,306 INFO storage.BlockManagerInfo: Removed broadcast_16_piece0 on localhost:44241 in memory (size: 139.7 KiB, free: 1048.7 MiB)
2023-09-15 00:39:54,311 INFO storage.BlockManagerInfo: Removed broadcast_16_piece0 on localhost:43135 in memory (size: 139.7 KiB, free: 1048.6 MiB)
2023-09-15 00:39:54,451 INFO stats.PrepareDeltaScan: DELTA: Filtering files for query
2023-09-15 00:39:54,466 INFO storage.BlockManagerInfo: Removed broadcast_16_piece0 on localhost:39707 in memory (size: 139.7 KiB, free: 1048.5 MiB)
2023-09-15 00:39:54,598 INFO spa

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


## Describe tables

In [14]:
# Open the path-based Delta table and display its commit history.
users_table = DeltaTable.forPath(spark, "/delta-lake/users")
users_history = users_table.history()
# Replace `timestamp` with its value round-tripped through epoch seconds
# (unix_timestamp -> from_unixtime) before converting to pandas.
timestamp_as_text = F.from_unixtime(F.unix_timestamp('timestamp'))
users_history.withColumn('timestamp', timestamp_as_text).toPandas()

2023-09-15 00:13:51,704 INFO spark.SparkContext: Starting job: getHistory at DeltaTableOperations.scala:54
2023-09-15 00:13:51,707 INFO scheduler.DAGScheduler: Got job 22 (getHistory at DeltaTableOperations.scala:54) with 2 output partitions
2023-09-15 00:13:51,707 INFO scheduler.DAGScheduler: Final stage: ResultStage 28 (getHistory at DeltaTableOperations.scala:54)
2023-09-15 00:13:51,707 INFO scheduler.DAGScheduler: Parents of final stage: List()
2023-09-15 00:13:51,707 INFO scheduler.DAGScheduler: Missing parents: List()
2023-09-15 00:13:51,708 INFO scheduler.DAGScheduler: Submitting ResultStage 28 (MapPartitionsRDD[104] at getHistory at DeltaTableOperations.scala:54), which has no missing parents
2023-09-15 00:13:51,720 INFO memory.MemoryStore: Block broadcast_29 stored as values in memory (estimated size 230.3 KiB, free 1046.5 MiB)
2023-09-15 00:13:51,723 INFO memory.MemoryStore: Block broadcast_29_piece0 stored as bytes in memory (estimated size 77.3 KiB, free 1046.5 MiB)
2023-09

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,0,2023-09-15 00:09:23,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,Serializable,False,"{'numOutputRows': '3', 'numOutputBytes': '1880...",creating table,Apache-Spark/3.4.1 Delta-Lake/2.4.0


In [19]:
# Print the detail row of the Delta table opened in `users_table`.
users_detail = users_table.detail()
users_detail.show()

+------+--------------------+----+-----------+--------------------+--------------------+--------------------+----------------+--------+-----------+----------+----------------+----------------+--------------------+
|format|                  id|name|description|            location|           createdAt|        lastModified|partitionColumns|numFiles|sizeInBytes|properties|minReaderVersion|minWriterVersion|       tableFeatures|
+------+--------------------+----+-----------+--------------------+--------------------+--------------------+----------------+--------+-----------+----------+----------------+----------------+--------------------+
| delta|ce6fe193-b6fa-46b...|null|       null|hdfs://localhost:...|2023-09-15 00:09:...|2023-09-15 00:09:...|              []|       2|       1880|        {}|               1|               2|[appendOnly, inva...|
+------+--------------------+----+-----------+--------------------+--------------------+--------------------+----------------+--------+---------

2023-09-15 00:15:10,300 INFO codegen.CodeGenerator: Code generated in 28.461875 ms
2023-09-15 00:15:10,329 INFO spark.SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
2023-09-15 00:15:10,331 INFO scheduler.DAGScheduler: Got job 24 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
2023-09-15 00:15:10,331 INFO scheduler.DAGScheduler: Final stage: ResultStage 30 (showString at NativeMethodAccessorImpl.java:0)
2023-09-15 00:15:10,331 INFO scheduler.DAGScheduler: Parents of final stage: List()
2023-09-15 00:15:10,331 INFO scheduler.DAGScheduler: Missing parents: List()
2023-09-15 00:15:10,332 INFO scheduler.DAGScheduler: Submitting ResultStage 30 (MapPartitionsRDD[110] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
2023-09-15 00:15:10,339 INFO memory.MemoryStore: Block broadcast_31 stored as values in memory (estimated size 17.3 KiB, free 1047.3 MiB)
2023-09-15 00:15:10,343 INFO memory.MemoryStore: Block broadc

In [20]:
# DESCRIBE DETAIL for the path-based table; both time columns are rewritten
# via unix_timestamp -> from_unixtime before converting to pandas.
path_detail = spark.sql("""
    DESCRIBE DETAIL delta.`/delta-lake/users` -- query table by path
""")
for time_column in ('createdAt', 'lastModified'):
    path_detail = path_detail.withColumn(
        time_column, F.from_unixtime(F.unix_timestamp(time_column))
    )
path_detail.toPandas()

2023-09-15 00:15:38,962 INFO codegen.CodeGenerator: Code generated in 30.030208 ms
2023-09-15 00:15:38,984 INFO spark.SparkContext: Starting job: toPandas at /tmp/ipykernel_2236/2825531411.py:1
2023-09-15 00:15:38,986 INFO scheduler.DAGScheduler: Got job 25 (toPandas at /tmp/ipykernel_2236/2825531411.py:1) with 1 output partitions
2023-09-15 00:15:38,986 INFO scheduler.DAGScheduler: Final stage: ResultStage 31 (toPandas at /tmp/ipykernel_2236/2825531411.py:1)
2023-09-15 00:15:38,986 INFO scheduler.DAGScheduler: Parents of final stage: List()
2023-09-15 00:15:38,987 INFO scheduler.DAGScheduler: Missing parents: List()
2023-09-15 00:15:38,990 INFO scheduler.DAGScheduler: Submitting ResultStage 31 (MapPartitionsRDD[113] at toPandas at /tmp/ipykernel_2236/2825531411.py:1), which has no missing parents
2023-09-15 00:15:39,004 INFO memory.MemoryStore: Block broadcast_32 stored as values in memory (estimated size 17.7 KiB, free 1047.3 MiB)
2023-09-15 00:15:39,009 INFO memory.MemoryStore: Bloc

Unnamed: 0,format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures
0,delta,ce6fe193-b6fa-46bb-9458-205a4373c0c0,,,hdfs://localhost:9000/delta-lake/users,2023-09-15 00:09:17,2023-09-15 00:09:23,[],2,1880,{},1,2,"[appendOnly, invariants]"


In [13]:
# Works only when the Delta table was saved as a named table in the metastore.
# The two time columns are rewritten via unix_timestamp -> from_unixtime
# before converting to pandas.
created_at_text = F.from_unixtime(F.unix_timestamp('createdAt'))
last_modified_text = F.from_unixtime(F.unix_timestamp('lastModified'))
named_detail = spark.sql("""
    DESCRIBE DETAIL users
""")
named_detail = named_detail.withColumn('createdAt', created_at_text)
named_detail = named_detail.withColumn('lastModified', last_modified_text)
named_detail.toPandas()

2023-09-15 00:40:18,967 INFO codegen.CodeGenerator: Code generated in 20.090458 ms
2023-09-15 00:40:19,037 INFO codegen.CodeGenerator: Code generated in 14.836791 ms
2023-09-15 00:40:19,052 INFO spark.SparkContext: Starting job: toPandas at /tmp/ipykernel_43643/3718254780.py:2
2023-09-15 00:40:19,053 INFO scheduler.DAGScheduler: Got job 15 (toPandas at /tmp/ipykernel_43643/3718254780.py:2) with 1 output partitions
2023-09-15 00:40:19,053 INFO scheduler.DAGScheduler: Final stage: ResultStage 22 (toPandas at /tmp/ipykernel_43643/3718254780.py:2)
2023-09-15 00:40:19,053 INFO scheduler.DAGScheduler: Parents of final stage: List()
2023-09-15 00:40:19,053 INFO scheduler.DAGScheduler: Missing parents: List()
2023-09-15 00:40:19,054 INFO scheduler.DAGScheduler: Submitting ResultStage 22 (MapPartitionsRDD[71] at toPandas at /tmp/ipykernel_43643/3718254780.py:2), which has no missing parents
2023-09-15 00:40:19,057 INFO memory.MemoryStore: Block broadcast_22 stored as values in memory (estimated

Unnamed: 0,format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures
0,delta,80d0e580-1432-462d-b8cf-8bad3cdb9d49,spark_catalog.default.users,,file:/opt/apps/spark-warehouse/users,2023-09-15 00:33:50,2023-09-15 00:33:55,[],2,1880,{},1,2,"[appendOnly, invariants]"


In [15]:
# Works only when the Delta table was saved as a named table in the metastore:
# query it by its bare table name.
users_by_name = spark.sql("""
    SELECT * FROM users
    """)
users_by_name.toPandas()

2023-09-15 00:40:39,437 INFO stats.PrepareDeltaScan: DELTA: Filtering files for query
2023-09-15 00:40:39,635 INFO storage.BlockManagerInfo: Removed broadcast_22_piece0 on localhost:39707 in memory (size: 6.6 KiB, free: 1048.1 MiB)
2023-09-15 00:40:39,639 INFO storage.BlockManagerInfo: Removed broadcast_22_piece0 on localhost:43135 in memory (size: 6.6 KiB, free: 1048.3 MiB)
2023-09-15 00:40:39,657 INFO storage.BlockManagerInfo: Removed broadcast_23_piece0 on localhost:39707 in memory (size: 139.7 KiB, free: 1048.3 MiB)
2023-09-15 00:40:39,659 INFO storage.BlockManagerInfo: Removed broadcast_23_piece0 on localhost:43135 in memory (size: 139.7 KiB, free: 1048.4 MiB)
2023-09-15 00:40:39,664 INFO storage.BlockManagerInfo: Removed broadcast_23_piece0 on localhost:44241 in memory (size: 139.7 KiB, free: 1048.5 MiB)
2023-09-15 00:40:39,684 INFO spark.SparkContext: Starting job: toPandas at /tmp/ipykernel_43643/3276025551.py:2
2023-09-15 00:40:39,686 INFO scheduler.DAGScheduler: Got job 18 (t

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


In [49]:
# List the tables (and views) currently visible to the session.
visible_tables = spark.sql("""
    SHOW tables
""")
visible_tables.toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,default,atable,False
1,default,employees,False
2,,my_table,False


In [17]:
# List the databases (namespaces) registered in the metastore.
databases_listing = spark.sql("""
    SHOW databases
""")
databases_listing.toPandas()

2023-09-15 00:40:59,872 INFO codegen.CodeGenerator: Code generated in 30.678458 ms
2023-09-15 00:40:59,886 INFO codegen.CodeGenerator: Code generated in 4.072375 ms


Unnamed: 0,namespace
0,default


In [45]:
# Drop table. IF EXISTS keeps this cell from failing when `atable` has not
# been created yet (in document order it is only written in the next cell),
# so the notebook survives a fresh Restart & Run All.
spark.sql("""
    DROP TABLE IF EXISTS atable
""").toPandas()

2023-09-15 00:56:55,844 INFO datasources.InMemoryFileIndex: It took 6 ms to list leaf files for 1 paths.


In [46]:
# Overwrite the `atable` metastore table with `df`, partitioned by `updated_at`.
atable_writer = df.write.mode('overwrite').partitionBy("updated_at")
atable_writer.saveAsTable("atable")

In [47]:
# List the partitions of default.atable.
atable_partitions = spark.sql("""
    show partitions default.atable
""")
atable_partitions.toPandas()

Unnamed: 0,partition
0,updated_at=2022-01-01
1,updated_at=2022-02-02
2,updated_at=2022-03-03


In [58]:
# Name of the catalog the session is currently using.
session_catalog = spark.catalog
session_catalog.currentCatalog()

'spark_catalog'

In [41]:
# List tables through the catalog API rather than SQL.
catalog_api = spark.catalog
catalog_api.listTables()

2023-09-15 00:52:22,728 INFO spark.SparkContext: Starting job: listTables at NativeMethodAccessorImpl.java:0
2023-09-15 00:52:22,729 INFO scheduler.DAGScheduler: Got job 9 (listTables at NativeMethodAccessorImpl.java:0) with 1 output partitions
2023-09-15 00:52:22,729 INFO scheduler.DAGScheduler: Final stage: ResultStage 12 (listTables at NativeMethodAccessorImpl.java:0)
2023-09-15 00:52:22,729 INFO scheduler.DAGScheduler: Parents of final stage: List()
2023-09-15 00:52:22,729 INFO scheduler.DAGScheduler: Missing parents: List()
2023-09-15 00:52:22,730 INFO scheduler.DAGScheduler: Submitting ResultStage 12 (SQLExecutionRDD[47] at listTables at NativeMethodAccessorImpl.java:0), which has no missing parents
2023-09-15 00:52:22,736 INFO memory.MemoryStore: Block broadcast_12 stored as values in memory (estimated size 16.0 KiB, free 1046.8 MiB)
2023-09-15 00:52:22,746 INFO memory.MemoryStore: Block broadcast_12_piece0 stored as bytes in memory (estimated size 5.4 KiB, free 1046.8 MiB)
2023

[Table(name='employees', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False)]

In [48]:
# Register `df` as a temporary view so it can be queried by name from SQL.
temp_view_name = "my_table"
df.createOrReplaceTempView(temp_view_name)

## Restore

In [30]:
# Restore default.users to version 0; the timestamp-based form is kept as a
# SQL comment inside the statement for reference.
restore_result = spark.sql("""
    RESTORE TABLE default.users TO VERSION AS OF 0
    -- RESTORE TABLE delta.`/delta-lake/users` TO TIMESTAMP AS OF <timestamp>
""")
restore_result.toPandas()

In [50]:
# Shut down the Spark session once the notebook is finished.
spark.stop()

2023-09-15 00:41:18,447 INFO spark.SparkContext: SparkContext is stopping with exitCode 0.
2023-09-15 00:41:18,493 INFO server.AbstractConnector: Stopped Spark@5a17df0a{HTTP/1.1, (http/1.1)}{0.0.0.0:4040}
2023-09-15 00:41:18,507 INFO ui.SparkUI: Stopped Spark web UI at http://localhost:4040
2023-09-15 00:41:18,537 INFO cluster.YarnClientSchedulerBackend: Interrupting monitor thread
2023-09-15 00:41:18,603 INFO cluster.YarnClientSchedulerBackend: Shutting down all executors
2023-09-15 00:41:18,603 INFO cluster.YarnSchedulerBackend$YarnDriverEndpoint: Asking each executor to shut down
2023-09-15 00:41:18,609 INFO cluster.YarnClientSchedulerBackend: YARN client scheduler backend Stopped
2023-09-15 00:41:18,812 INFO spark.MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
2023-09-15 00:41:18,890 INFO memory.MemoryStore: MemoryStore cleared
2023-09-15 00:41:18,891 INFO storage.BlockManager: BlockManager stopped
2023-09-15 00:41:18,894 INFO storage.BlockManagerMaster: Bl