In [1]:
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# MinIO connection settings. They are read from the environment so that real
# credentials never have to be committed with the notebook; the fallbacks are
# the local demo values this notebook originally hard-coded.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "myuserserviceaccount")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "myuserserviceaccountpassword")

# Build (or reuse) the Spark session with Delta Lake and S3A support.
spark = (
    SparkSession.builder
    .appName("Delta Lake - minio")
    # Driver / executor resources.
    .config("spark.executor.memory", "1G")
    .config("spark.driver.memory", "1G")
    .config("spark.driver.maxResultSize", "1G")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # Extra jars: AWS SDK, the Hadoop S3A connector and Delta Lake.
    .config("spark.jars.packages", "com.amazonaws:aws-java-sdk-s3:1.12.765,org.apache.hadoop:hadoop-aws:3.4.0,io.delta:delta-spark_2.12:3.2.0")
    # S3A filesystem pointed at MinIO (path-style access was previously set twice).
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # S3A committer settings.
    .config("spark.hadoop.fs.s3a.committer.name", "directory")
    .config("spark.hadoop.fs.s3a.committer.staging.conflict-mode", "replace")
    .config("spark.hadoop.fs.s3a.committer.staging.tmp.path", "/tmp/staging")
    # Delta Lake SQL extension and catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Retention-duration check is disabled; the vacuum cells later in this
    # notebook use retention windows of 0.3 and 1 hours.
    .config("spark.databricks.delta.retentionDurationCheck.enabled", "false")
    .getOrCreate()
)

# Keep Spark's own logging down to errors only.
spark.sparkContext.setLogLevel("ERROR")
spark

In [2]:
from datetime import datetime

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import lit, col
from delta.tables import DeltaTable

In [3]:
# Three sample users; every row is a dict keyed by column name.
data = [
    {'id': user_id, 'name': user_name, 'updated_at': changed_on}
    for user_id, user_name, changed_on in [
        (1, 'Alice', datetime(2022, 1, 1)),
        (2, 'Braga', datetime(2022, 2, 2)),
        (3, 'Steve', datetime(2022, 3, 3)),
    ]
]

# Explicit schema: all three columns are nullable, and updated_at is a date column.
schema = StructType([
    StructField(field_name, field_type, nullable=True)
    for field_name, field_type in [
        ('id', IntegerType()),
        ('name', StringType()),
        ('updated_at', DateType()),
    ]
])

df = spark.createDataFrame(data, schema=schema)
df.toPandas()

                                                                                

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-01
1,2,Braga,2022-02-02
2,3,Steve,2022-03-03


In [4]:
# Name of the bucket that every s3a:// Delta table path in this notebook is built from.
bucket = "my-bucket"

## Create Delta Table

### Create delta table from DF

In [5]:
# Write the sample DataFrame as the "users" Delta table, replacing existing data.
# The path uses a single "/" after the bucket so it is spelled exactly like the
# path every other cell in this notebook reads from.
(
    df.write.format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(f"s3a://{bucket}/delta-lake/users")
)

                                                                                

### Create table without data

In [7]:
# Define the "employees" Delta table from column definitions only (no rows),
# replacing any table that already exists at that location.
my_table = (
    DeltaTable.createOrReplace(spark)
    .addColumn("id", "INT")
    .addColumn("firstName", "STRING")
    .addColumn("middleName", "STRING")
    .addColumn("lastName", "STRING", comment="surname")
    .addColumn("gender", "STRING")
    .addColumn("birthDate", "TIMESTAMP")
    .addColumn("ssn", "STRING")
    .addColumn("salary", "INT")
    .property("description", "table with people info")
    .location(f"s3a://{bucket}/delta-lake/employees")
    .execute()
)

## Read Delta Table

### Read as Delta Table

In [8]:
from pyspark.sql import functions as F

# Open the employees table through the DeltaTable API.
my_table = DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/employees")

# Converting the timestamp column straight to pandas raised a
# "dtype 'datetime64' is not supported" casting error, so birthDate is first
# rendered as a formatted string.
(
    my_table.toDF()
    .withColumn('birthDate', F.from_unixtime(F.unix_timestamp('birthDate')))
    .toPandas()
)

# Alternative that avoids pandas altogether:
# my_table.toDF().show()

                                                                                

Unnamed: 0,id,firstName,middleName,lastName,gender,birthDate,ssn,salary


### Read as DF

In [9]:
# Load the users table with the DataFrame reader and show it as a pandas frame.
(
    spark.read.format("delta")
    .load(f"s3a://{bucket}/delta-lake/users")
    .toPandas()
)

# spark.read.format("parquet") \
#      .load(f"s3a://{bucket}//delta-lake/users") \
#      .show()

                                                                                

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


### Read as DF using SQL

In [10]:
# Query the users Delta table by its storage path using the delta.`<path>` SQL form.
spark.sql(f"""
    SELECT * FROM delta.`s3a://{bucket}/delta-lake/users`
""").toPandas()

                                                                                

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


## Describe tables

In [11]:
from pyspark.sql import functions as F

# Open the (still empty) employees table and fetch its commit history.
my_table = DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/employees")
history_df = my_table.history()

# Calling history_df.toPandas() directly failed with:
#   Casting to unit-less dtype 'datetime64' is not supported. Pass e.g. 'datetime64[ns]' instead.
# Rendering the timestamp column as a formatted string first avoids that error.
(
    history_df
    .withColumn('timestamp', F.from_unixtime(F.unix_timestamp('timestamp')))
    .toPandas()
)

# history_df.withColumn('timestamp', F.from_unixtime(F.unix_timestamp('timestamp'))).dtypes

# history_df.show(truncate=False)

                                                                                

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,0,2024-08-02 01:51:11,,,CREATE OR REPLACE TABLE,"{'partitionBy': '[]', 'clusterBy': '[]', 'desc...",,,,,Serializable,True,{},,Apache-Spark/3.5.1 Delta-Lake/3.2.0


In [12]:
# Table-level metadata for employees; both timestamp columns are rendered as
# strings before the conversion to pandas.
(
    my_table.detail()
    .withColumn('createdAt', F.from_unixtime(F.unix_timestamp('createdAt')))
    .withColumn('lastModified', F.from_unixtime(F.unix_timestamp('lastModified')))
    .toPandas()
)

# my_table.detail().show()

Unnamed: 0,format,id,name,description,location,createdAt,lastModified,partitionColumns,clusteringColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures
0,delta,b7de3eb1-a632-40f4-af01-57763de6e772,,,s3a://my-bucket/delta-lake/employees,2024-08-02 01:51:11,2024-08-02 01:51:11,[],[],0,0,{'description': 'table with people info'},1,2,"[appendOnly, invariants]"


In [13]:
# Commit history of the users table, with the timestamp shown as a string.
users_table = DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")

(
    users_table.history()
    .withColumn('timestamp', F.from_unixtime(F.unix_timestamp('timestamp')))
    .toPandas()
)

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,0,2024-08-02 01:51:10,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,Serializable,False,"{'numFiles': '2', 'numOutputRows': '3', 'numOu...",,Apache-Spark/3.5.1 Delta-Lake/3.2.0


## Upsert (Merge) new Data

In [14]:
from pyspark.sql import functions as F

# Current contents of the users table before any merge is applied.
users_table = DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")

(
    users_table.toDF()
    .withColumn('updated_at', F.from_unixtime(F.unix_timestamp('updated_at')))
    .toPandas()
)

                                                                                

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02 00:00:00
1,3,Steve,2022-03-03 00:00:00
2,1,Alice,2022-01-01 00:00:00


In [15]:
# Incoming batch: id 1 is already in the table, ids 10 and 4 are new.
new_data = [
    {'id': user_id, 'name': user_name, 'updated_at': changed_on}
    for user_id, user_name, changed_on in [
        (1, 'Alice', datetime(2022, 1, 2)),
        (10, 'Jose', datetime(2023, 1, 2)),
        (4, 'Carell', datetime(2022, 4, 4)),
    ]
]

new_data_df = spark.createDataFrame(new_data, schema=schema)
new_data_df.toPandas()

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-02
1,10,Jose,2023-01-02
2,4,Carell,2022-04-04


In [16]:
# Upsert keyed on id: matching rows only get a new updated_at, while rows that
# are not in the table yet are inserted with all three columns.
(
    users_table.alias("old_data")
    .merge(
        source=new_data_df.alias("new_data"),
        condition="old_data.id = new_data.id",
    )
    .whenMatchedUpdate(set={"updated_at": "new_data.updated_at"})
    .whenNotMatchedInsert(
        values={
            "id": "new_data.id",
            "name": "new_data.name",
            "updated_at": "new_data.updated_at",
        }
    )
    .execute()
)

users_table.toDF().toPandas()

                                                                                

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-02
1,4,Carell,2022-04-04
2,10,Jose,2023-01-02
3,2,Braga,2022-02-02
4,3,Steve,2022-03-03


In [17]:
# Same upsert, this time copying every column for both matched and new rows.
(
    users_table.alias("old_data")
    .merge(
        source=new_data_df.alias("new_data"),
        condition="old_data.id = new_data.id",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

users_table.toDF().toPandas()

                                                                                

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-02
1,4,Carell,2022-04-04
2,10,Jose,2023-01-02
3,2,Braga,2022-02-02
4,3,Steve,2022-03-03


## Read new/updated records only

In [18]:
# Turn on the delta.enableChangeDataFeed table property for the users table.
# NOTE(review): change data is presumably only recorded for commits made after
# this property is set — confirm against the Delta change data feed docs.
spark.sql(f"""
    ALTER TABLE delta.`s3a://{bucket}/delta-lake/users` SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
""")

DataFrame[]

In [19]:
# Re-open the users table and list its commits, including the property change.
users_table = DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")

(
    users_table.history()
    .withColumn('timestamp', F.from_unixtime(F.unix_timestamp('timestamp')))
    .toPandas()
)

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,3,2024-08-02 01:52:10,,,SET TBLPROPERTIES,"{'properties': '{""delta.enableChangeDataFeed"":...",,,,2.0,Serializable,True,{},,Apache-Spark/3.5.1 Delta-Lake/3.2.0
1,2,2024-08-02 01:52:09,,,MERGE,"{'predicate': '[""(id#1756 = id#1873)""]', 'matc...",,,,1.0,Serializable,False,"{'numTargetRowsCopied': '0', 'numTargetRowsDel...",,Apache-Spark/3.5.1 Delta-Lake/3.2.0
2,1,2024-08-02 01:52:03,,,MERGE,"{'predicate': '[""(id#1756 = id#1873)""]', 'matc...",,,,0.0,Serializable,False,"{'numTargetRowsCopied': '0', 'numTargetRowsDel...",,Apache-Spark/3.5.1 Delta-Lake/3.2.0
3,0,2024-08-02 01:51:10,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,Serializable,False,"{'numFiles': '2', 'numOutputRows': '3', 'numOu...",,Apache-Spark/3.5.1 Delta-Lake/3.2.0


In [20]:
# get latest version
# history(1) asks Delta for only the most recent commit instead of collecting
# the entire table history onto the driver.
latest_version = users_table.history(1).select("version").first().version
print("latest version:", latest_version)

# Most recent commit that wrote data (MERGE or WRITE). The rows are sorted
# explicitly so the result does not depend on the order history rows arrive in.
latest_version_merge = (
    users_table.history()
    .filter("operation in ('MERGE', 'WRITE')")
    .orderBy("version", ascending=False)
    .select("version")
    .first()
    .version
)
print("latest version with merge:", latest_version_merge)

latest version: 3
latest version with merge: 2


In [21]:
# Read the change feed starting at the latest commit and keep only the
# post-update row images; both timestamp-like columns are shown as strings.
(
    spark.read.format("delta")
    .option("readChangeFeed", "true")
    .option("startingVersion", latest_version)
    .load(f"s3a://{bucket}/delta-lake/users")
    .filter("_change_type = 'update_postimage'")
    .withColumn('updated_at', F.from_unixtime(F.unix_timestamp('updated_at')))
    .withColumn('_commit_timestamp', F.from_unixtime(F.unix_timestamp('_commit_timestamp')))
    .toPandas()
)


# spark.read.format("delta") \
#     .option("readChangeFeed", "true") \
#     .option("startingVersion", latest_version - 4) \
#     .option("endingVersion", latest_version) \
#     .load(f"s3a://{bucket}/delta-lake/users") \
#     .withColumn('updated_at', F.from_unixtime(F.unix_timestamp('updated_at'))).toPandas()


Unnamed: 0,id,name,updated_at,_change_type,_commit_version,_commit_timestamp


In [None]:
# Read the change feed starting at the latest MERGE/WRITE commit and keep only
# the post-update row images.
# NOTE(review): in the recorded run the change feed was enabled in commit 3
# (SET TBLPROPERTIES) while latest_version_merge printed 2. Delta presumably
# cannot serve change data for commits made before the feed was enabled, so
# this read may fail in that state — confirm.
(
    spark.read.format("delta")
    .option("readChangeFeed", "true")
    .option("startingVersion", latest_version_merge)
    .load(f"s3a://{bucket}/delta-lake/users")
    .filter("_change_type = 'update_postimage'")
    .withColumn('updated_at', F.from_unixtime(F.unix_timestamp('updated_at')))
    .withColumn('_commit_timestamp', F.from_unixtime(F.unix_timestamp('_commit_timestamp')))
    .toPandas()
)

In [None]:
# Full current contents of the users table, with updated_at shown as a string.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .toDF()
    .withColumn('updated_at', F.from_unixtime(F.unix_timestamp('updated_at')))
    .toPandas()
)

### Upsert with Missing columns

In [None]:
# This row deliberately leaves out the 'updated_at' column.
missing_rows = [{"id": 5, 'name': 'Joao'}]

# Build the DataFrame against the full users schema anyway.
data_missing_fields = spark.createDataFrame(missing_rows, schema=schema)
data_missing_fields.toPandas()

In [None]:
# Upsert the row that has no updated_at value, copying all columns.
(
    users_table.alias("old_data")
    .merge(
        source=data_missing_fields.alias("new_data"),
        condition="old_data.id = new_data.id",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

users_table.toDF().toPandas()

## Update Schema adding column

In [None]:
# Load the users table as a plain DataFrame; the next cells add a column to it.
users_df = (
    spark.read
    .format("delta")
    .load(f"s3a://{bucket}/delta-lake/users")
)

users_df.toPandas()

In [None]:
# Add an "age" column filled with nulls. It is typed as an integer (not a
# string) because the "Update values" section later assigns it lit(30).
new_users_df = users_df.withColumn("age", lit(None).cast(IntegerType()))
new_users_df.printSchema()

In [None]:
# Preview the DataFrame that now carries the extra "age" column.
new_users_df.toPandas()

In [None]:
# Overwrite the table with the widened DataFrame and let Delta merge the new
# column into the schema ("overwriteSchema" is the other available option).
# The userMetadata text is recorded with the commit.
(
    new_users_df.write.format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .option("userMetadata", "add age column")
    .save(f"s3a://{bucket}/delta-lake/users")
)

In [None]:
# Plain-text alternative:
# DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users").history().show(vertical=True, truncate=False)

# Commit history after the schema change, with the timestamp shown as a string.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .history()
    .withColumn('timestamp', F.from_unixtime(F.unix_timestamp('timestamp')))
    .toPandas()
)

In [None]:
# Current rows of the users table.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .toDF()
    .toPandas()
)

## Update values

In [None]:
# Re-open the users table and show its rows before the update.
users_table = DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")

(
    users_table.toDF()
    .withColumn('updated_at', F.from_unixtime(F.unix_timestamp('updated_at')))
    .toPandas()
)

In [None]:
# Set age to 30 on the row whose id is 5.
users_table.update(condition=col("id") == 5, set={"age": lit(30)})

In [None]:
# Rows of the users table after the update.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .toDF()
    .withColumn('updated_at', F.from_unixtime(F.unix_timestamp('updated_at')))
    .toPandas()
)

## Read specific version

In [None]:
# Read the table as it was at version 0.
(
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load(f"s3a://{bucket}/delta-lake/users")
    .toPandas()
)

In [None]:
# Read the table as it was at version 9.
# NOTE(review): the version number is hard-coded; presumably it must appear in
# the table's history for this read to succeed — confirm before running.
(
    spark.read
    .format("delta")
    .option("versionAsOf", 9)
    .load(f"s3a://{bucket}/delta-lake/users")
    .toPandas()
)

In [None]:
# List the table's commits so the version numbers usable with "versionAsOf"
# are visible. This must go through history(): the 'timestamp' column formatted
# below belongs to the history listing, whereas the table's own data (toDF())
# has no such column.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .history()
    .withColumn('timestamp', F.from_unixtime(F.unix_timestamp('timestamp')))
    .toPandas()
)

## Delete rows

In [None]:
# Re-open the users table and show its rows before deleting anything.
users_table = DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")

(
    users_table.toDF()
    .withColumn('updated_at', F.from_unixtime(F.unix_timestamp('updated_at')))
    .toPandas()
)

In [None]:
# Remove every row whose name is "Alice".
users_table.delete(condition=col("name") == "Alice")

In [None]:
# Rows of the users table after the delete.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .toDF()
    .withColumn('updated_at', F.from_unixtime(F.unix_timestamp('updated_at')))
    .toPandas()
)

## Time Travel

In [23]:
# Versions available for time travel, with the columns that describe each commit.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .history()
    .select("version", "timestamp", "operation", "operationParameters", "operationMetrics", "userMetadata")
    .withColumn('timestamp', F.from_unixtime(F.unix_timestamp('timestamp')))
    .toPandas()
)

Unnamed: 0,version,timestamp,operation,operationParameters,operationMetrics,userMetadata
0,3,2024-08-02 01:52:10,SET TBLPROPERTIES,"{'properties': '{""delta.enableChangeDataFeed"":...",{},
1,2,2024-08-02 01:52:09,MERGE,"{'predicate': '[""(id#1756 = id#1873)""]', 'matc...","{'numTargetRowsCopied': '0', 'numTargetRowsDel...",
2,1,2024-08-02 01:52:03,MERGE,"{'predicate': '[""(id#1756 = id#1873)""]', 'matc...","{'numTargetRowsCopied': '0', 'numTargetRowsDel...",
3,0,2024-08-02 01:51:10,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}","{'numFiles': '2', 'numOutputRows': '3', 'numOu...",


In [24]:
# Current snapshot of the users table.
(
    spark.read
    .format("delta")
    .load(f"s3a://{bucket}/delta-lake/users")
    .toPandas()
)

                                                                                

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-02
1,4,Carell,2022-04-04
2,10,Jose,2023-01-02
3,2,Braga,2022-02-02
4,3,Steve,2022-03-03


In [None]:
# Roll the table back to version 9 and display what the restore call returns.
# NOTE(review): the version number is hard-coded; presumably it must appear in
# the table's history for the restore to succeed — confirm before running.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .restoreToVersion(9)
    .toPandas()
)

In [None]:
# Snapshot of the users table after the restore.
(
    spark.read
    .format("delta")
    .load(f"s3a://{bucket}/delta-lake/users")
    .toPandas()
)

### Repartition and time travel

In [26]:
# Rewrite the users table onto itself, this time partitioned by updated_at.
# overwriteSchema is set on the write, and the userMetadata text is recorded
# with the commit.
(
    spark.read
    .format("delta")
    .load(f"s3a://{bucket}/delta-lake/users")
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("userMetadata", "partitioning by date")
    .partitionBy("updated_at")
    .save(f"s3a://{bucket}/delta-lake/users")
)

In [27]:
# Number of RDD partitions Spark uses when reading the rewritten table.
(
    spark.read
    .format("delta")
    .load(f"s3a://{bucket}/delta-lake/users")
    .rdd
    .getNumPartitions()
)

                                                                                

2

In [28]:
# Versions available after the partitioned rewrite.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .history()
    .select("version", "timestamp", "operation", "operationParameters", "operationMetrics", "userMetadata")
    .withColumn('timestamp', F.from_unixtime(F.unix_timestamp('timestamp')))
    .toPandas()
)

Unnamed: 0,version,timestamp,operation,operationParameters,operationMetrics,userMetadata
0,4,2024-08-02 01:53:10,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[""update...","{'numFiles': '5', 'numOutputRows': '5', 'numOu...",partitioning by date
1,3,2024-08-02 01:52:10,SET TBLPROPERTIES,"{'properties': '{""delta.enableChangeDataFeed"":...",{},
2,2,2024-08-02 01:52:09,MERGE,"{'predicate': '[""(id#1756 = id#1873)""]', 'matc...","{'numTargetRowsCopied': '0', 'numTargetRowsDel...",
3,1,2024-08-02 01:52:03,MERGE,"{'predicate': '[""(id#1756 = id#1873)""]', 'matc...","{'numTargetRowsCopied': '0', 'numTargetRowsDel...",
4,0,2024-08-02 01:51:10,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}","{'numFiles': '2', 'numOutputRows': '3', 'numOu...",


In [30]:
# Restore the table to version 4 and display what the restore call returns.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .restoreToVersion(4)
    .toPandas()
)

In [31]:
# Number of RDD partitions Spark uses when reading the table after the restore.
(
    spark.read
    .format("delta")
    .load(f"s3a://{bucket}/delta-lake/users")
    .rdd
    .getNumPartitions()
)

                                                                                

2

In [33]:
# Versions available after the restore call.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .history()
    .select("version", "timestamp", "operation", "operationParameters", "operationMetrics", "userMetadata")
    .withColumn('timestamp', F.from_unixtime(F.unix_timestamp('timestamp')))
    .toPandas()
)

Unnamed: 0,version,timestamp,operation,operationParameters,operationMetrics,userMetadata
0,4,2024-08-02 01:53:10,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[""update...","{'numFiles': '5', 'numOutputRows': '5', 'numOu...",partitioning by date
1,3,2024-08-02 01:52:10,SET TBLPROPERTIES,"{'properties': '{""delta.enableChangeDataFeed"":...",{},
2,2,2024-08-02 01:52:09,MERGE,"{'predicate': '[""(id#1756 = id#1873)""]', 'matc...","{'numTargetRowsCopied': '0', 'numTargetRowsDel...",
3,1,2024-08-02 01:52:03,MERGE,"{'predicate': '[""(id#1756 = id#1873)""]', 'matc...","{'numTargetRowsCopied': '0', 'numTargetRowsDel...",
4,0,2024-08-02 01:51:10,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}","{'numFiles': '2', 'numOutputRows': '3', 'numOu...",


In [34]:
# Vacuum the users table with a retention argument of 0.3 (hours, going by the
# "1 hour" comment on the later vacuum(1) cell).
# NOTE(review): presumably such a short window is only accepted because the
# session sets spark.databricks.delta.retentionDurationCheck.enabled to
# "false" — confirm.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .vacuum(0.3)
    .toPandas()
)

                                                                                

Deleted 4 files and directories in a total of 6 directories.


In [35]:
# Versions still listed in the history after the vacuum.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .history()
    .select("version", "timestamp", "operation", "operationParameters", "operationMetrics", "userMetadata")
    .withColumn('timestamp', F.from_unixtime(F.unix_timestamp('timestamp')))
    .toPandas()
)

Unnamed: 0,version,timestamp,operation,operationParameters,operationMetrics,userMetadata
0,4,2024-08-02 01:53:10,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[""update...","{'numFiles': '5', 'numOutputRows': '5', 'numOu...",partitioning by date
1,3,2024-08-02 01:52:10,SET TBLPROPERTIES,"{'properties': '{""delta.enableChangeDataFeed"":...",{},
2,2,2024-08-02 01:52:09,MERGE,"{'predicate': '[""(id#1756 = id#1873)""]', 'matc...","{'numTargetRowsCopied': '0', 'numTargetRowsDel...",
3,1,2024-08-02 01:52:03,MERGE,"{'predicate': '[""(id#1756 = id#1873)""]', 'matc...","{'numTargetRowsCopied': '0', 'numTargetRowsDel...",
4,0,2024-08-02 01:51:10,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}","{'numFiles': '2', 'numOutputRows': '3', 'numOu...",


## Optimizations

In [36]:
# Compaction: coalesce small data files into larger ones and show the metrics.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .optimize()
    .executeCompaction()
    .toPandas()
)

                                                                                

Unnamed: 0,path,metrics
0,s3a://my-bucket/delta-lake/users,"{'numFilesAdded': 0, 'numFilesRemoved': 0, 'fi..."


In [37]:
# Vacuum removes files that are no longer referenced by the table; here it
# targets files not required by versions more than 1 hour old.
(
    DeltaTable.forPath(spark, f"s3a://{bucket}/delta-lake/users")
    .vacuum(1)
    .toPandas()
)



Deleted 0 files and directories in a total of 6 directories.


                                                                                

## Using SQL commands on a delta table from S3

In [38]:
# Set the delta.enableChangeDataFeed table property on the users table through SQL,
# addressing the table by its storage path.
spark.sql(f"""
    ALTER TABLE delta.`s3a://{bucket}/delta-lake/users` SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
""")

DataFrame[]

In [39]:
# Select every row of the users table through SQL, addressing it by storage path.
spark.sql(f"""
    SELECT * FROM delta.`s3a://{bucket}/delta-lake/users`
""").toPandas()

                                                                                

Unnamed: 0,id,name,updated_at
0,4,Carell,2022-04-04
1,2,Braga,2022-02-02
2,3,Steve,2022-03-03
3,1,Alice,2022-01-02
4,10,Jose,2023-01-02


In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()