In [1]:
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# MinIO (S3-compatible) connection settings. The defaults are the values this
# notebook has always used; export MINIO_ENDPOINT / MINIO_ACCESS_KEY /
# MINIO_SECRET_KEY to target another deployment without writing credentials
# into the notebook.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minio")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minio123")

config = (
    SparkConf()
    .setAppName('Delta Lake')
    .setAll([
        # Driver / executor sizing
        ('spark.executor.memory', '4G'),
        ('spark.driver.memory', '2G'),
        ('spark.driver.maxResultSize', '1G'),
        # Extra jars resolved through Ivy at start-up: S3 SDK, hadoop-aws and Delta Lake
        ("spark.jars.packages", "com.amazonaws:aws-java-sdk-s3:1.12.161,org.apache.hadoop:hadoop-aws:3.3.1,io.delta:delta-core_2.12:1.1.0"),
        # s3a connector pointed at MinIO (each option is set exactly once)
        ("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT),
        ("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY),
        ("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY),
        ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
        ("spark.hadoop.fs.s3a.path.style.access", "true"),
        # s3a committer settings
        ("spark.hadoop.fs.s3a.committer.name", "directory"),
        ("spark.hadoop.fs.s3a.committer.staging.conflict-mode", "replace"),
        ("spark.hadoop.fs.s3a.committer.staging.tmp.path", "/tmp/staging"),
        # Delta Lake SQL extension and catalog
        ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
        ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
    ])
)

# getOrCreate() reuses an already running session, so re-executing this cell in
# a live kernel does not fail the way constructing a second SparkContext does.
spark = SparkSession.builder.config(conf=config).getOrCreate()
sc = spark.sparkContext  # kept for code that still expects the `sc` handle

spark

2022-02-20 22:00:53,669 WARN util.Utils: Your hostname, localhost resolves to a loopback address: 127.0.0.1; using 192.168.32.2 instead (on interface eth0)
2022-02-20 22:00:53,669 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.amazonaws#aws-java-sdk-s3 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-daa305bd-4090-4164-9650-232de12841e8;1.0
	confs: [default]


:: loading settings :: url = jar:file:/opt/spark-3.2.1-bin-without-hadoop/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found com.amazonaws#aws-java-sdk-s3;1.12.161 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.161 in central
	found com.amazonaws#aws-java-sdk-core;1.12.161 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found com.fasterxml.jackson.core#jackson-databind;2.12.6 in central
	found com.fasterxml.jackson.core#jackson-annotations;2.12.6 in central
	found com.fasterxml.jackson.core#jackson-core;2.12.6 in central
	found com.fasterxml.jackson.dataformat#jackson-dataformat-cbor;2.12.6 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#jmespath-java;1.12.161 in central
	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found org.wildfly.openssl#wildfly-openssl;

In [67]:
from datetime import datetime

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import lit
from delta.tables import DeltaTable

In [86]:
# Three seed users; each record carries the columns declared in `schema` below.
data = [
    dict(id=1, name='Alice', updated_at=datetime(2022, 1, 1)),
    dict(id=2, name='Braga', updated_at=datetime(2022, 2, 2)),
    dict(id=3, name='Steve', updated_at=datetime(2022, 3, 3)),
]

# Explicit schema (all columns nullable); it is reused by the later cells that
# build the upsert DataFrames.
schema = (
    StructType()
    .add('id', IntegerType(), nullable=True)
    .add('name', StringType(), nullable=True)
    .add('updated_at', DateType(), nullable=True)
)

df = spark.createDataFrame(data, schema=schema)
df.toPandas()

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-01
1,2,Braga,2022-02-02
2,3,Steve,2022-03-03


## Create Delta Table

### HDFS

In [87]:
# Write the DataFrame to /delta-lake/users as a Delta table, overwriting any
# data already there. No partition columns are used; the userMetadata text is
# stored with the commit and shows up in the table history.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .option("userMetadata", "some comments")
    .save("/delta-lake/users")
)

# Alternative: create the table in the metastore (as "users") using the
# DataFrame's schema and write the data to it.
# df.write.format("delta") \
#   .mode("overwrite") \
#   .option("mergeSchema", "true") \
#   .saveAsTable("users")

                                                                                

### Minio (S3)

In [None]:
# Write the same DataFrame as a Delta table into the MinIO bucket through the
# s3a connector. The object path is identical to the one loaded in the
# "Read Delta Table" section, so the write and the read address the same table.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save("s3a://my-bucket/delta-lake/users")
)

### Create table without data

In [7]:
# Define a Delta table at /delta-lake/my-table from an explicit column list
# (no DataFrame involved), replacing any table definition already stored there.
my_table = (
    DeltaTable.createOrReplace(spark)
    .addColumn("id", "INT")
    .addColumn("firstName", "STRING")
    .addColumn("middleName", "STRING")
    .addColumn("lastName", "STRING", comment="surname")
    .addColumn("gender", "STRING")
    .addColumn("birthDate", "TIMESTAMP")
    .addColumn("ssn", "STRING")
    .addColumn("salary", "INT")
    .property("description", "table with people data")
    .location("/delta-lake/my-table")
    .execute()
)

                                                                                

In [23]:
# Re-open the table by its storage path and display its commit history
# (one row per table version, newest first in the output below).
my_table = DeltaTable.forPath(spark, "/delta-lake/my-table")
my_table.history().toPandas()

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,0,2022-02-20 22:04:09.548,,,CREATE OR REPLACE TABLE,"{'description': None, 'partitionBy': '[]', 'pr...",,,,,SnapshotIsolation,True,{},,Apache-Spark/3.2.1 Delta-Lake/1.1.0


In [24]:
# Pull the table contents to pandas; only the column headers appear because
# the table was created without data.
my_table.toDF().toPandas()

Unnamed: 0,id,firstName,middleName,lastName,gender,birthDate,ssn,salary


## Read Delta Table

### HDFS

In [88]:
# Load the table through the Delta reader, which returns the rows of the
# current table version.
(
    spark.read
    .format("delta")
    .load("/delta-lake/users")
    .toPandas()
)

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


In [11]:
# Same read expressed in SQL, addressing the table by its storage path
# instead of a metastore table name.
spark.sql("""
    SELECT * FROM delta.`/delta-lake/users` -- query table by path
""").toPandas()

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


In [28]:
# Read the same directory with the plain parquet reader. This bypasses the
# Delta transaction log, so the result can mix rows from several table
# versions (note both 'Alice' rows in the output below).
(
    spark.read
    .format("parquet")
    .load("/delta-lake/users")
    .toPandas()
)

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-02
1,4,Carell,2022-04-04
2,2,Braga,2022-02-02
3,3,Steve,2022-03-03
4,1,Alice,2022-01-01


### Minio (S3)

In [None]:
# Load the Delta table stored in the MinIO bucket through the s3a connector.
(
    spark.read
    .format("delta")
    .load("s3a://my-bucket/delta-lake/users")
    .toPandas()
)

# Alternative: read the same location with the plain parquet reader.
# spark.read.format("parquet") \
#      .load("s3a://my-bucket/delta-lake/users") \
#      .show()

## Describe tables

In [29]:
# Open the users table by path and display its commit history: the operation,
# its parameters and metrics, and any userMetadata recorded with each version.
users_table = DeltaTable.forPath(spark, "/delta-lake/users")
users_table.history().toPandas()

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,1,2022-02-20 22:11:18.529,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,0.0,Serializable,False,"{'numOutputRows': '2', 'numTargetRowsInserted'...",,Apache-Spark/3.2.1 Delta-Lake/1.1.0
1,0,2022-02-20 22:03:47.787,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,Serializable,False,"{'numOutputRows': '3', 'numOutputBytes': '1880...",some comments,Apache-Spark/3.2.1 Delta-Lake/1.1.0


In [14]:
# Table-level details (format, location, file count, size, protocol versions)
# for the table addressed by its storage path.
spark.sql("""
    DESCRIBE DETAIL delta.`/delta-lake/users` -- query table by path
""").toPandas()

Unnamed: 0,format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion
0,delta,0b8f1ca3-c7bf-4740-89ea-678beefb4baf,,,hdfs://localhost:9000/delta-lake/users,2022-02-20 22:03:45.023,2022-02-20 22:03:47.787,[],2,1880,{},1,2


In [15]:
# Only works if the DataFrame was also saved as a metastore table named
# "users" (the commented-out saveAsTable alternative in the create section).
spark.sql("""
    DESCRIBE DETAIL users
""").toPandas()

Unnamed: 0,format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion
0,delta,931b0889-4c4f-4f45-98aa-d64052cb5fbd,default.users,,file:/opt/apps/spark-warehouse/users,2022-02-20 18:40:31.589,2022-02-20 22:03:55.757,[],2,1880,{},1,2


In [16]:
# Only works if the DataFrame was also saved as a metastore table named
# "users"; queries it by name rather than by path.
spark.sql("""
    SELECT * FROM users
    """).toPandas()

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01


In [17]:
# List the tables currently registered in the metastore.
spark.sql("""
    SHOW tables
""").toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,default,users,False


## Upsert (Merge) new Data

In [115]:
# Handle on the path-based users table; it is the merge target in the cells below.
users_table = DeltaTable.forPath(spark, "/delta-lake/users")

In [90]:
# Incoming batch for the upsert: id 1 already exists in the seed data (with a
# newer updated_at here), id 4 is a new user.
new_data = [
    dict(id=1, name='Alice', updated_at=datetime(2022, 1, 2)),
    dict(id=4, name='Carell', updated_at=datetime(2022, 4, 4)),
]

new_data_df = spark.createDataFrame(new_data, schema=schema)
new_data_df.toPandas()

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-02
1,4,Carell,2022-04-04


In [None]:
# Upsert keyed on id with explicit column mappings: a matched row only has its
# updated_at refreshed, an unmatched source row is inserted with all three columns.
(
    users_table.alias("old_data")
    .merge(
        source=new_data_df.alias("new_data"),
        condition="old_data.id = new_data.id",
    )
    .whenMatchedUpdate(set={"updated_at": "new_data.updated_at"})
    .whenNotMatchedInsert(
        values={
            "id": "new_data.id",
            "name": "new_data.name",
            "updated_at": "new_data.updated_at",
        }
    )
    .execute()
)

users_table.toDF().toPandas()

In [91]:
# Upsert keyed on id using the "all columns" shortcuts: matched rows are
# updated from the source row, unmatched source rows are inserted.
(
    users_table.alias("old_data")
    .merge(
        source=new_data_df.alias("new_data"),
        condition="old_data.id = new_data.id",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

users_table.toDF().toPandas()

                                                                                

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-02
1,4,Carell,2022-04-04
2,2,Braga,2022-02-02
3,3,Steve,2022-03-03


### Upsert with Missing fields

In [92]:
# A record that omits 'updated_at'; building the DataFrame against the full
# schema leaves that column empty for the row (see the output below).
data_missing_fields = spark.createDataFrame(
    [{"id": 5, 'name': 'Joao'}],
    schema=schema,
)
data_missing_fields.toPandas()

Unnamed: 0,id,name,updated_at
0,5,Joao,


In [93]:
# Upsert the incomplete record with the same id-keyed merge; the row is stored
# with an empty updated_at.
(
    users_table.alias("old_data")
    .merge(
        source=data_missing_fields.alias("new_data"),
        condition="old_data.id = new_data.id",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

users_table.toDF().limit(10).toPandas()

                                                                                

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-02
1,4,Carell,2022-04-04
2,2,Braga,2022-02-02
3,3,Steve,2022-03-03
4,5,Joao,


## Update Schema adding column

In [100]:
# Load the current users table as a DataFrame; the new column is added to this
# frame in the next cell.
update_schema = spark.read.format("delta").load("/delta-lake/users")

In [101]:
# Add an 'age' column filled with nulls and typed as string, then print the
# resulting schema to confirm the new field.
update_schema = update_schema.withColumn("age", lit(None).cast(StringType()))
update_schema.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- updated_at: date (nullable = true)
 |-- age: string (nullable = true)



In [102]:
# Preview up to 10 rows of the widened frame before writing it back.
update_schema.limit(10).toPandas()

Unnamed: 0,id,name,updated_at,age
0,1,Alice,2022-01-02,
1,4,Carell,2022-04-04,
2,2,Braga,2022-02-02,
3,3,Steve,2022-03-03,
4,5,Joao,,


In [106]:
# Overwrite the users table with the widened frame. mergeSchema lets the write
# add the new column to the table schema (overwriteSchema is the other option);
# the userMetadata text labels this commit in the table history.
(
    update_schema.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .option("userMetadata", "add age column")
    .save("/delta-lake/users")
)

                                                                                

In [119]:
# Show the commit history again; the schema-changing writes carry the
# "add age column" userMetadata.
DeltaTable.forPath(spark, "/delta-lake/users").history().toPandas()

Unnamed: 0,version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,5,2022-02-20 22:56:30.573,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,4.0,Serializable,False,"{'numOutputRows': '7', 'numTargetRowsInserted'...",,Apache-Spark/3.2.1 Delta-Lake/1.1.0
1,4,2022-02-20 22:50:27.894,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,3.0,Serializable,False,"{'numOutputRows': '10', 'numOutputBytes': '243...",add age column,Apache-Spark/3.2.1 Delta-Lake/1.1.0
2,3,2022-02-20 22:48:36.639,,,WRITE,"{'mode': 'Append', 'partitionBy': '[]'}",,,,2.0,Serializable,False,"{'numOutputRows': '5', 'numOutputBytes': '2254...",add age column,Apache-Spark/3.2.1 Delta-Lake/1.1.0
3,2,2022-02-20 22:46:07.823,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,1.0,Serializable,False,"{'numOutputRows': '1', 'numTargetRowsInserted'...",,Apache-Spark/3.2.1 Delta-Lake/1.1.0
4,1,2022-02-20 22:45:53.184,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""...",,,,0.0,Serializable,False,"{'numOutputRows': '2', 'numTargetRowsInserted'...",,Apache-Spark/3.2.1 Delta-Lake/1.1.0
5,0,2022-02-20 22:45:34.193,,,WRITE,"{'mode': 'Overwrite', 'partitionBy': '[]'}",,,,,Serializable,False,"{'numOutputRows': '3', 'numOutputBytes': '1880...",some comments,Apache-Spark/3.2.1 Delta-Lake/1.1.0


In [118]:
# Display the current table contents, now including the 'age' column.
DeltaTable.forPath(spark, "/delta-lake/users").toDF().toPandas()

Unnamed: 0,id,name,updated_at,age
0,1,Alice,2022-01-02,30.0
1,1,Alice,2022-01-02,30.0
2,2,Braga,2022-02-02,
3,3,Steve,2022-03-03,
4,4,Carell,2022-04-04,
5,4,Carell,2022-04-04,
6,5,Joao,,
7,2,Braga,2022-02-02,
8,3,Steve,2022-03-03,
9,5,Joao,,


In [None]:
# Stop the Spark session now that the walkthrough is finished.
spark.stop()