In [1]:
import os

from pyspark.sql import SparkSession

# MinIO connection settings. Each one can be overridden with an environment
# variable so that real credentials never need to be written into the notebook;
# the fallbacks are the local demo values this notebook was originally run with.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "myuserserviceaccount")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "myuserserviceaccountpassword")

spark = (
    SparkSession.builder
    .appName('Iceberg - minio')
    # Resource limits for the driver and executors.
    .config("spark.executor.memory", "1G")
    .config("spark.driver.memory", "1G")
    .config("spark.driver.maxResultSize", "1G")
    # Extra jars resolved at start-up: the S3A filesystem and the Iceberg Spark runtime.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.4.0,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.0")
    # S3A filesystem pointed at the MinIO endpoint, with path-style access enabled.
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # Iceberg SQL extensions plus a Hadoop-type catalog named iceberg_catalog whose
    # warehouse lives in the bucket; it is also made the session's default catalog.
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.iceberg_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg_catalog.type", "hadoop")
    .config("spark.sql.catalog.iceberg_catalog.warehouse", "s3a://my-bucket/iceberg")
    .config("spark.sql.defaultCatalog", "iceberg_catalog")
    .getOrCreate()
)

# Only log errors, to keep the notebook output readable.
spark.sparkContext.setLogLevel("ERROR")
spark

:: loading settings :: url = jar:file:/opt/spark-3.5.1-bin-without-hadoop/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-53e0c844-0fe1-436e-8f5c-2035cd260988;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.4.0 in central
	found software.amazon.awssdk#bundle;2.23.19 in central
	found org.wildfly.openssl#wildfly-openssl;1.1.3.Final in central
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.6.0 in central
:: resolution report :: resolve 127ms :: artifacts dl 3ms
	:: modules in use:
	org.apache.hadoop#hadoop-aws;3.4.0 from central in [default]
	org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.6.0 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.1.3.Final from central in [default]
	software.amazon.awssdk#bundle;2.23.19 from central in [default]
	-----------------------------------------

In [2]:
# Name of the MinIO bucket that holds the Iceberg warehouse (it matches the
# "s3a://my-bucket/iceberg" warehouse path configured on the Spark session).
bucket = "my-bucket"

## DataFrame creation

In [3]:
from datetime import datetime

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import lit, col
from delta.tables import DeltaTable

In [4]:
# Three sample user records; each dict becomes one row of the DataFrame.
data = [
    dict(id=1, name='Alice', updated_at=datetime(2022, 1, 1)),
    dict(id=2, name='Braga', updated_at=datetime(2022, 2, 2)),
    dict(id=3, name='Steve', updated_at=datetime(2022, 3, 3)),
]

# Explicit schema: all three columns are nullable and updated_at is a DATE column.
schema = StructType(
    [
        StructField('id', IntegerType(), nullable=True),
        StructField('name', StringType(), nullable=True),
        StructField('updated_at', DateType(), nullable=True),
    ]
)

df = spark.createDataFrame(data, schema=schema)
df.toPandas()

                                                                                

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-01
1,2,Braga,2022-02-02
2,3,Steve,2022-03-03


# Create an Iceberg table from the DataFrame

In [5]:
# List the catalogs known to this Spark session.
(
    spark
    .sql("show catalogs")
    .toPandas()
)

Unnamed: 0,catalog
0,iceberg_catalog
1,spark_catalog


In [6]:
# Same catalog listing as the SQL query, obtained through the Catalog API.
spark.catalog.listCatalogs()

[CatalogMetadata(name='iceberg_catalog', description=None),
 CatalogMetadata(name='spark_catalog', description=None)]

In [7]:
# Check the active catalog; the session sets spark.sql.defaultCatalog to iceberg_catalog.
spark.catalog.currentCatalog()

'iceberg_catalog'

In [8]:
# Create the mydb namespace (database) unless it already exists.
(
    spark
    .sql("CREATE NAMESPACE IF NOT EXISTS mydb")
    .toPandas()
)

In [9]:
# List the namespaces (databases) of the current catalog.
(
    spark
    .sql("show databases")
    .toPandas()
)

Unnamed: 0,namespace
0,mydb


In [10]:
# Same namespace listing through the Catalog API; it also reports each database's location URI.
spark.catalog.listDatabases()

[Database(name='mydb', catalog='iceberg_catalog', description=None, locationUri='s3a://my-bucket/iceberg/mydb')]

In [11]:
# Show the catalog, namespace name and storage location of mydb.
(
    spark
    .sql("describe database mydb")
    .toPandas()
)

Unnamed: 0,info_name,info_value
0,Catalog Name,iceberg_catalog
1,Namespace Name,mydb
2,Location,s3a://my-bucket/iceberg/mydb


In [12]:
# Create the Iceberg table mydb.users (a no-op when it already exists).
# No explicit LOCATION 's3a://<bucket>/iceberg/...' clause is needed: the table is
# stored under the catalog warehouse, next to its namespace (s3a://my-bucket/iceberg/mydb).
# The SQL contains no placeholders, so a plain string is used instead of an f-string.
spark.sql("""
CREATE TABLE IF NOT EXISTS mydb.users (
    id INT COMMENT 'Identificador único',
    name STRING COMMENT 'Nome do indivíduo',
    updated_at date COMMENT 'Data de update'
)
USING iceberg
""")

DataFrame[]

In [13]:
# List the tables that exist in the mydb namespace.
(
    spark
    .sql("show tables in mydb")
    .toPandas()
)

Unnamed: 0,namespace,tableName,isTemporary
0,mydb,users,False


In [14]:
# Show the table's columns and their comments; for this Iceberg table the listing
# also includes the metadata columns (_spec_id, _partition, _file, _pos, _deleted).
spark.sql("""
    describe table extended mydb.users
""").toPandas()

Unnamed: 0,col_name,data_type,comment
0,id,int,Identificador único
1,name,string,Nome do indivíduo
2,updated_at,date,Data de update
3,,,
4,# Metadata Columns,,
5,_spec_id,int,
6,_partition,struct<>,
7,_file,string,
8,_pos,bigint,
9,_deleted,boolean,


In [15]:
# Append the DataFrame rows to mydb.users.
# Note: this is an append, so re-running the cell adds the same rows again.
(
    df
    .writeTo("mydb.users")
    .append()
)

                                                                                

In [16]:
# Read the table back to check the rows that were just written.
(
    spark.read
    .table("mydb.users")
    .toPandas()
)

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-01
1,2,Braga,2022-02-02
2,3,Steve,2022-03-03


In [20]:
# Second append of the same rows, this time asking for Avro data files instead of
# Parquet and attaching a custom key/value property to the resulting snapshot.
(
    df.write
    .option("write-format", "avro")
    .option("snapshot-property.key", "value")
    .insertInto("mydb.users")
)

In [27]:
# Table history: one row per snapshot, with the time it became current and its parent.
(
    spark
    .sql("SELECT * FROM mydb.users.history")
    .toPandas()
)

Unnamed: 0,made_current_at,snapshot_id,parent_id,is_current_ancestor
0,2024-08-13 18:34:14.586,3711599203814133704,,True
1,2024-08-13 18:35:13.989,4735134423970272370,3.711599e+18,True


In [28]:
# Metadata log: every metadata file written for the table and the snapshot it points to.
(
    spark
    .sql("SELECT * FROM mydb.users.metadata_log_entries")
    .toPandas()
)

Unnamed: 0,timestamp,file,latest_snapshot_id,latest_schema_id,latest_sequence_number
0,2024-08-13 18:34:04.153,s3a://my-bucket/iceberg/mydb/users/metadata/v1...,,,
1,2024-08-13 18:34:14.586,s3a://my-bucket/iceberg/mydb/users/metadata/v2...,3.711599e+18,0.0,1.0
2,2024-08-13 18:35:13.989,s3a://my-bucket/iceberg/mydb/users/metadata/v3...,4.735134e+18,0.0,2.0


In [29]:
# Valid snapshots of the table, including the operation and summary of each commit.
(
    spark
    .sql("SELECT * FROM mydb.users.snapshots")
    .toPandas()
)

Unnamed: 0,committed_at,snapshot_id,parent_id,operation,manifest_list,summary
0,2024-08-13 18:34:14.586,3711599203814133704,,append,s3a://my-bucket/iceberg/mydb/users/metadata/sn...,"{'engine-version': '3.5.1', 'added-data-files'..."
1,2024-08-13 18:35:13.989,4735134423970272370,3.711599e+18,append,s3a://my-bucket/iceberg/mydb/users/metadata/sn...,"{'engine-version': '3.5.1', 'added-data-files'..."


In [30]:
# Current manifest entries of the table, covering both data files and delete files.
(
    spark
    .sql("SELECT * FROM mydb.users.entries")
    .toPandas()
)

Unnamed: 0,status,snapshot_id,sequence_number,file_sequence_number,data_file,readable_metrics
0,1,4735134423970272370,2,2,"(0, s3a://my-bucket/iceberg/mydb/users/data/00...","((None, None, None, None, None, None), (None, ..."
1,1,4735134423970272370,2,2,"(0, s3a://my-bucket/iceberg/mydb/users/data/00...","((None, None, None, None, None, None), (None, ..."
2,1,3711599203814133704,1,1,"(0, s3a://my-bucket/iceberg/mydb/users/data/00...","((42, 1, 0, None, 1, 1), (47, 1, 0, None, Alic..."
3,1,3711599203814133704,1,1,"(0, s3a://my-bucket/iceberg/mydb/users/data/00...","((46, 2, 0, None, 2, 3), (56, 2, 0, None, Brag..."


In [22]:
# Files that currently make up the table (path, format, record count, column metrics).
(
    spark
    .sql("SELECT * FROM mydb.users.files")
    .toPandas()
)

Unnamed: 0,content,file_path,file_format,spec_id,record_count,file_size_in_bytes,column_sizes,value_counts,null_value_counts,nan_value_counts,lower_bounds,upper_bounds,key_metadata,split_offsets,equality_ids,sort_order_id,readable_metrics
0,0,s3a://my-bucket/iceberg/mydb/users/data/00000-...,AVRO,0,1,592,,,,,,,,,,0,"((None, None, None, None, None, None), (None, ..."
1,0,s3a://my-bucket/iceberg/mydb/users/data/00001-...,AVRO,0,2,605,,,,,,,,,,0,"((None, None, None, None, None, None), (None, ..."
2,0,s3a://my-bucket/iceberg/mydb/users/data/00000-...,PARQUET,0,1,890,"{1: 42, 2: 47, 3: 42}","{1: 1, 2: 1, 3: 1}","{1: 0, 2: 0, 3: 0}",{},"{1: [1, 0, 0, 0], 2: [65, 108, 105, 99, 101], ...","{1: [1, 0, 0, 0], 2: [65, 108, 105, 99, 101], ...",,[4],,0,"((42, 1, 0, None, 1, 1), (47, 1, 0, None, Alic..."
3,0,s3a://my-bucket/iceberg/mydb/users/data/00001-...,PARQUET,0,2,893,"{1: 46, 2: 56, 3: 46}","{1: 2, 2: 2, 3: 2}","{1: 0, 2: 0, 3: 0}",{},"{1: [2, 0, 0, 0], 2: [66, 114, 97, 103, 97], 3...","{1: [3, 0, 0, 0], 2: [83, 116, 101, 118, 101],...",,[4],,0,"((46, 2, 0, None, 2, 3), (56, 2, 0, None, Brag..."


In [32]:
# Manifest files currently referenced by the table.
(
    spark
    .sql("SELECT * FROM mydb.users.manifests")
    .toPandas()
)

Unnamed: 0,content,path,length,partition_spec_id,added_snapshot_id,added_data_files_count,existing_data_files_count,deleted_data_files_count,added_delete_files_count,existing_delete_files_count,deleted_delete_files_count,partition_summaries
0,0,s3a://my-bucket/iceberg/mydb/users/metadata/4e...,6788,0,4735134423970272370,2,0,0,0,0,0,[]
1,0,s3a://my-bucket/iceberg/mydb/users/metadata/6a...,6872,0,3711599203814133704,2,0,0,0,0,0,[]


In [25]:
# Snapshot references (branches and tags) known to the table.
(
    spark
    .sql("SELECT * FROM mydb.users.refs")
    .toPandas()
)

Unnamed: 0,name,type,snapshot_id,max_reference_age_in_ms,min_snapshots_to_keep,max_snapshot_age_in_ms
0,main,BRANCH,4735134423970272370,,,


### Time Travel

In [None]:
# DataFrameReader time travel. The original snippet used Scala-style line
# continuations (an IndentationError in Python) and a placeholder table path;
# the chains are now parenthesised and read the notebook's own table.

# time travel to snapshot with ID 4735134423970272370
users_at_snapshot = (
    spark.read
    .option("snapshot-id", 4735134423970272370)
    .format("iceberg")
    .load("mydb.users")
)

# time travel to 2024-08-13 18:35:00, a moment between the two commits listed in
# the table history; "as-of-timestamp" is given in milliseconds since the epoch.
# NOTE(review): datetime(...).timestamp() interprets the value in the Python
# process's local time zone - confirm it matches the Spark session time zone.
as_of_ms = int(datetime(2024, 8, 13, 18, 35).timestamp() * 1000)
users_at_timestamp = (
    spark.read
    .option("as-of-timestamp", str(as_of_ms))
    .format("iceberg")
    .load("mydb.users")
)

In [33]:
# Read the table as of the second snapshot (taken after the Avro append): six rows.
(
    spark.read
    .option("snapshot-id", 4735134423970272370)
    .table("mydb.users")
    .toPandas()
)

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-01
1,1,Alice,2022-01-01
2,2,Braga,2022-02-02
3,3,Steve,2022-03-03
4,2,Braga,2022-02-02
5,3,Steve,2022-03-03


In [35]:
# Read the table as of the first snapshot (only the initial append): three rows.
(
    spark.read
    .option("snapshot-id", 3711599203814133704)
    .table("mydb.users")
    .toPandas()
)

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-01
1,2,Braga,2022-02-02
2,3,Steve,2022-03-03


In [None]:
# Reference only: Spark SQL time-travel syntax, kept inside a string literal so that
# nothing is executed against the placeholder table prod.db.table.
"""
-- time travel to October 26, 1986 at 01:21:00
SELECT * FROM prod.db.table TIMESTAMP AS OF '1986-10-26 01:21:00';

-- time travel to snapshot with id 10963874102873L
SELECT * FROM prod.db.table VERSION AS OF 10963874102873;

-- time travel to the head snapshot of audit-branch
SELECT * FROM prod.db.table VERSION AS OF 'audit-branch';

-- time travel to the snapshot referenced by the tag historical-snapshot
SELECT * FROM prod.db.table VERSION AS OF 'historical-snapshot';

"""

In [43]:
# SQL time travel by timestamp: 18:35:00 falls between the two commits shown in the
# table history, so the query returns the state after the first append only.
(
    spark
    .sql("SELECT * FROM mydb.users TIMESTAMP AS OF '2024-08-13 18:35:00'")
    .toPandas()
)

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-01
1,2,Braga,2022-02-02
2,3,Steve,2022-03-03


In [45]:
# SQL time travel by snapshot id: query the table as of the first snapshot.
(
    spark
    .sql("SELECT * FROM mydb.users VERSION AS OF 3711599203814133704")
    .toPandas()
)

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-01
1,2,Braga,2022-02-02
2,3,Steve,2022-03-03


In [49]:
# Current state of the table: the rows from both appends.
(
    spark
    .table("mydb.users")
    .toPandas()
)

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-01
1,2,Braga,2022-02-02
2,3,Steve,2022-03-03
3,1,Alice,2022-01-01
4,2,Braga,2022-02-02
5,3,Steve,2022-03-03


In [47]:
# Same current-state read, going through the "iceberg" data source explicitly.
(
    spark.read
    .format("iceberg")
    .load("mydb.users")
    .toPandas()
)

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-01
1,2,Braga,2022-02-02
2,3,Steve,2022-03-03
3,1,Alice,2022-01-01
4,2,Braga,2022-02-02
5,3,Steve,2022-03-03
