In [1]:
# For Google Colaboratory
!pip install pyspark==3.5.1 delta-spark

Collecting delta-spark
  Downloading delta_spark-4.0.0-py3-none-any.whl.metadata (1.9 kB)
INFO: pip is looking at multiple versions of delta-spark to determine which version is compatible with other requirements. This could take a while.
  Downloading delta_spark-3.3.2-py3-none-any.whl.metadata (2.2 kB)
  Downloading delta_spark-3.3.1-py3-none-any.whl.metadata (1.9 kB)
  Downloading delta_spark-3.3.0-py3-none-any.whl.metadata (2.0 kB)
  Downloading delta_spark-3.2.1-py3-none-any.whl.metadata (1.9 kB)
  Downloading delta_spark-3.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading delta_spark-3.2.0-py3-none-any.whl (21 kB)
Installing collected packages: delta-spark
Successfully installed delta-spark-3.2.0


In [2]:
from pyspark.sql import SparkSession
from delta.tables import DeltaTable

def _create_delta_spark():
    """Build (or fetch) the SparkSession used by this notebook, with Delta Lake enabled.

    Registers the Delta SQL extension and the Delta catalog on the builder,
    then hands the builder to ``configure_spark_with_delta_pip``, which sets
    ``spark.jars.packages`` to the Delta artifact matching the pip-installed
    ``delta-spark`` package.

    Returns:
        The session produced by ``getOrCreate()``.
    """
    from delta import configure_spark_with_delta_pip

    # No manual "spark.jars.packages" entry: configure_spark_with_delta_pip
    # sets that key itself, so the former "io.delta:delta-core_2.12:2.0.0"
    # value was overwritten before the session started and only misled readers
    # about which Delta artifact is actually loaded.
    builder = (
        SparkSession.builder.appName("DeltaLakeApp")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    )
    return configure_spark_with_delta_pip(builder).getOrCreate()

# Session handle used by the Delta read/write cells in this notebook.
spark = _create_delta_spark()

In [3]:
def _enable_sparkui(port=4040):
    """Expose the Spark UI jobs page through Colab's kernel-port proxy.

    Only usable inside Google Colab, because it imports ``google.colab.output``.

    Args:
        port: Kernel port to serve (default 4040).

    Returns:
        Whatever ``serve_kernel_port_as_window`` returns.
    """
    from google.colab import output as colab_output

    jobs_page = '/jobs/index.html'
    return colab_output.serve_kernel_port_as_window(port, path=jobs_page)

# Serve the Spark UI jobs page on the default port (requires google.colab).
_enable_sparkui()

Try `serve_kernel_port_as_iframe` instead. [0m


<IPython.core.display.Javascript object>

In [4]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [5]:
# For Google Colaboratory
import sys, os
# Run the Drive setup only when the google.colab module has been loaded.
if 'google.colab' in sys.modules:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/MyDrive/Big_Data/Practicals' # Please adjust the path accordingly
    # Make the practicals folder the working directory, then print it to confirm.
    os.chdir(path_to_file)
    !pwd

Mounted at /content/gdrive
/content/gdrive/MyDrive/Big_Data/Practicals


## Practical 5b: Delta Lake

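This notebook demonstrates Delta Lake checkpoint files: it commits to a table repeatedly and inspects the `_delta_log` directory to see when a checkpoint file is written.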

Source: Bennie Haelen, “Delta Lake: Up & Running”

In [6]:
# Seed the demo table with ids 0-9, written as a single data file (coalesce(1)).
# NOTE(review): the log versions inspected further down presumably assume that
# deltaPath2 does not exist yet — confirm before re-running on an existing table.
data = spark.range(0, 10)
(
    data.coalesce(1)
    .write.format("delta")
    .mode("overwrite")
    .save('/content/gdrive/MyDrive/Big_Data/Practicals/deltaPath2')
)


In [7]:
# Read the table back through the Delta reader to confirm the rows landed.
(
    spark.read.format("delta")
    .load("/content/gdrive/MyDrive/Big_Data/Practicals/deltaPath2")
    .show()
)

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



In [8]:
# Inspect the first commit file of the transaction log as plain JSON.
(
    spark.read.format("json")
    .load("/content/gdrive/MyDrive/Big_Data/Practicals/deltaPath2/_delta_log/00000000000000000000.json")
    .show()
)

+--------------------+--------------------+--------------------+--------+
|                 add|          commitInfo|            metaData|protocol|
+--------------------+--------------------+--------------------+--------+
|                NULL|{Apache-Spark/3.5...|                NULL|    NULL|
|                NULL|                NULL|{1756864740079, {...|    NULL|
|                NULL|                NULL|                NULL|  {1, 2}|
|{true, 1756864744...|                NULL|                NULL|    NULL|
+--------------------+--------------------+--------------------+--------+



In [9]:
# Nine more single-file appends of ids 0-9; each save() is its own commit.
for index in range(9):
    data = spark.range(0, 10)
    (
        data.coalesce(1)
        .write.format("delta")
        .mode("append")
        .save('/content/gdrive/MyDrive/Big_Data/Practicals/deltaPath2')
    )

In [12]:
# Total rows after the initial write plus the nine appends of ten rows each.
(
    spark.read.format("delta")
    .load("/content/gdrive/MyDrive/Big_Data/Practicals/deltaPath2")
    .count()
)

100

In [14]:
# List the transaction log directory after the initial write and nine appends.
os.listdir("/content/gdrive/MyDrive/Big_Data/Practicals/deltaPath2/_delta_log")

['_commits',
 '00000000000000000000.json',
 '.00000000000000000000.json.crc',
 '00000000000000000001.json',
 '.00000000000000000001.json.crc',
 '00000000000000000002.json',
 '.00000000000000000002.json.crc',
 '00000000000000000003.json',
 '.00000000000000000003.json.crc',
 '00000000000000000004.json',
 '.00000000000000000004.json.crc',
 '00000000000000000005.json',
 '.00000000000000000005.json.crc',
 '00000000000000000006.json',
 '.00000000000000000006.json.crc',
 '00000000000000000007.json',
 '.00000000000000000007.json.crc',
 '00000000000000000008.json',
 '.00000000000000000008.json.crc',
 '00000000000000000009.json',
 '.00000000000000000009.json.crc']

In [15]:
# One further append; the next directory listing is taken right after this commit.
data = spark.range(0, 10)
(
    data.coalesce(1)
    .write.format("delta")
    .mode("append")
    .save('/content/gdrive/MyDrive/Big_Data/Practicals/deltaPath2')
)

In [16]:
# List the log directory again to see which files the extra commit added.
os.listdir("/content/gdrive/MyDrive/Big_Data/Practicals/deltaPath2/_delta_log")

['_commits',
 '00000000000000000000.json',
 '.00000000000000000000.json.crc',
 '00000000000000000001.json',
 '.00000000000000000001.json.crc',
 '00000000000000000002.json',
 '.00000000000000000002.json.crc',
 '00000000000000000003.json',
 '.00000000000000000003.json.crc',
 '00000000000000000004.json',
 '.00000000000000000004.json.crc',
 '00000000000000000005.json',
 '.00000000000000000005.json.crc',
 '00000000000000000006.json',
 '.00000000000000000006.json.crc',
 '00000000000000000007.json',
 '.00000000000000000007.json.crc',
 '00000000000000000008.json',
 '.00000000000000000008.json.crc',
 '00000000000000000009.json',
 '.00000000000000000009.json.crc',
 '00000000000000000010.json',
 '.00000000000000000010.json.crc',
 '00000000000000000010.checkpoint.parquet',
 '.00000000000000000010.checkpoint.parquet.crc',
 '_last_checkpoint',
 '._last_checkpoint.crc']

In [17]:
# Open the version-10 checkpoint file directly as Parquet to view its contents.
(
    spark.read.format("parquet")
    .load("/content/gdrive/MyDrive/Big_Data/Practicals/deltaPath2/_delta_log/00000000000000000010.checkpoint.parquet")
    .show()
)

+----+--------------------+------+--------------------+------------------+--------------+
| txn|                 add|remove|            metaData|          protocol|domainMetadata|
+----+--------------------+------+--------------------+------------------+--------------+
|NULL|{part-00000-cb273...|  NULL|                NULL|              NULL|          NULL|
|NULL|{part-00000-6b2c0...|  NULL|                NULL|              NULL|          NULL|
|NULL|{part-00000-48eed...|  NULL|                NULL|              NULL|          NULL|
|NULL|{part-00000-4119c...|  NULL|                NULL|              NULL|          NULL|
|NULL|{part-00000-dac52...|  NULL|                NULL|              NULL|          NULL|
|NULL|{part-00000-8e3f6...|  NULL|                NULL|              NULL|          NULL|
|NULL|{part-00000-e7398...|  NULL|                NULL|              NULL|          NULL|
|NULL|{part-00000-1af2d...|  NULL|                NULL|              NULL|          NULL|
|NULL|{par

In [18]:
# Eleven more single-file appends, one commit each, so the log grows well past
# the version-10 checkpoint before the next inspection.
for index in range(11):
    data = spark.range(0, 10)
    (
        data.coalesce(1)
        .write.format("delta")
        .mode("append")
        .save('/content/gdrive/MyDrive/Big_Data/Practicals/deltaPath2')
    )

In [19]:
# List the log directory once more after the additional eleven appends.
os.listdir("/content/gdrive/MyDrive/Big_Data/Practicals/deltaPath2/_delta_log")

['_commits',
 '00000000000000000000.json',
 '.00000000000000000000.json.crc',
 '00000000000000000001.json',
 '.00000000000000000001.json.crc',
 '00000000000000000002.json',
 '.00000000000000000002.json.crc',
 '00000000000000000003.json',
 '.00000000000000000003.json.crc',
 '00000000000000000004.json',
 '.00000000000000000004.json.crc',
 '00000000000000000005.json',
 '.00000000000000000005.json.crc',
 '00000000000000000006.json',
 '.00000000000000000006.json.crc',
 '00000000000000000007.json',
 '.00000000000000000007.json.crc',
 '00000000000000000008.json',
 '.00000000000000000008.json.crc',
 '00000000000000000009.json',
 '.00000000000000000009.json.crc',
 '00000000000000000010.json',
 '.00000000000000000010.json.crc',
 '00000000000000000010.checkpoint.parquet',
 '.00000000000000000010.checkpoint.parquet.crc',
 '_last_checkpoint',
 '._last_checkpoint.crc',
 '00000000000000000011.json',
 '.00000000000000000011.json.crc',
 '00000000000000000012.json',
 '.00000000000000000012.json.crc',
 '

In [20]:
# Open the version-20 checkpoint file as Parquet; show up to 40 rows so the
# listing is not cut off at show()'s default row limit.
(
    spark.read.format("parquet")
    .load("/content/gdrive/MyDrive/Big_Data/Practicals/deltaPath2/_delta_log/00000000000000000020.checkpoint.parquet")
    .show(40)
)

+----+--------------------+------+--------------------+------------------+--------------+
| txn|                 add|remove|            metaData|          protocol|domainMetadata|
+----+--------------------+------+--------------------+------------------+--------------+
|NULL|{part-00000-5672d...|  NULL|                NULL|              NULL|          NULL|
|NULL|{part-00000-cb273...|  NULL|                NULL|              NULL|          NULL|
|NULL|{part-00000-62fb1...|  NULL|                NULL|              NULL|          NULL|
|NULL|{part-00000-7b44a...|  NULL|                NULL|              NULL|          NULL|
|NULL|{part-00000-46cdb...|  NULL|                NULL|              NULL|          NULL|
|NULL|{part-00000-6b2c0...|  NULL|                NULL|              NULL|          NULL|
|NULL|{part-00000-e44ed...|  NULL|                NULL|              NULL|          NULL|
|NULL|{part-00000-48eed...|  NULL|                NULL|              NULL|          NULL|
|NULL|{par