In [6]:
from pyspark.sql import SparkSession
from delta import *

builder = SparkSession.builder.appName("MyApp") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()


#### Create Table

In [7]:
data = spark.range(0, 5)
data.write.format("delta").mode("overwrite").save("./tmp/delta-table")

#### Read Data

In [8]:
df = spark.read.format("delta").load("./tmp/delta-table")
df.show()

+---+
| id|
+---+
|  4|
|  3|
|  2|
|  0|
|  1|
+---+



#### Overwrite

In [9]:
data = spark.range(5, 10)
data.write.format("delta").mode("overwrite").save("/tmp/delta-table")

                                                                                

In [10]:
df2 = spark.read.format("delta").load("./tmp/delta-table")
df2.show()

+---+
| id|
+---+
|  4|
|  3|
|  2|
|  0|
|  1|
+---+



#### Conditional update without overwrite

In [11]:
from delta.tables import *
from pyspark.sql.functions import *

deltaTable = DeltaTable.forPath(spark, "./tmp/delta-table")

# Update every even value by adding 100 to it
deltaTable.update( condition = expr("id % 2 == 0"),
  set = { "id": expr("id + 100") })

# Delete every even value
deltaTable.delete(condition = expr("id % 2 == 0"))

# Upsert (merge) new data
newData = spark.range(0, 20)

deltaTable.alias("oldData") \
  .merge(newData.alias("newData"), "oldData.id = newData.id") \
  .whenMatchedUpdate(set = { "id": col("newData.id") }) \
  .whenNotMatchedInsert(values = { "id": col("newData.id") }) \
  .execute()

deltaTable.toDF().show()

25/08/07 20:20:49 WARN UpdateCommand: Could not validate number of records due to missing statistics.
25/08/07 20:20:52 WARN DeleteCommand: Could not validate number of records due to missing statistics.
25/08/07 20:20:55 WARN MapPartitionsRDD: RDD 198 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



#### Read older versions of data using time travel

In [12]:
df = spark.read.format("delta").option("versionAsOf", 0).load("./tmp/delta-table")
df.show()

+---+
| id|
+---+
|  3|
|  2|
|  4|
|  0|
|  1|
+---+

