In [7]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType


# Expire old Iceberg snapshots of one table through a Nessie-backed Spark catalog.
#
# Steps: build a Spark session wired to the `nessie` catalog, enable Iceberg GC on
# the table, run `expire_snapshots` twice (default cutoff, then explicit cutoff),
# list the snapshots that remain, and always stop the session at the end.

# --- Connection settings ---
CATALOG_URI = "http://nessie:19120/api/v1"
WAREHOUSE = "s3://warehouse/"
# NOTE(review): hard-coded IP address — presumably a container-network address;
# confirm it is stable across restarts.
STORAGE_URI = "http://172.18.0.5:9000"

# --- Maintenance parameters (edit here rather than inside the SQL below) ---
TABLE = "nessie.customers"
EXPIRE_OLDER_THAN = "2025-07-29 09:00:00"

conf = (
    pyspark.SparkConf()
        .setAppName('sales_data_app')
        .set('spark.jars.packages', 'org.postgresql:postgresql:42.7.3,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1,software.amazon.awssdk:bundle:2.24.8,software.amazon.awssdk:url-connection-client:2.24.8')
        .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions')
        .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')
        # NOTE(review): catalog-level gc.enabled — confirm the Nessie catalog
        # actually honours this key; the table property is also set below.
        .set("spark.sql.catalog.nessie.gc.enabled", "true")
        .set('spark.sql.catalog.nessie.uri', CATALOG_URI)
        .set('spark.sql.catalog.nessie.ref', 'main')
        .set('spark.sql.catalog.nessie.authentication.type', 'NONE')
        .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
        .set('spark.sql.catalog.nessie.s3.endpoint', STORAGE_URI)
        .set('spark.sql.catalog.nessie.warehouse', WAREHOUSE)
        .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
)

# getOrCreate() hands back an already-running session when one exists, so a
# session leaked by a failed run would be reused by the next run. The
# try/finally below guarantees the session is stopped even if a statement fails.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print("Spark Session Started")

try:
    # Presumably required because expire_snapshots refuses to run while GC is
    # disabled — TODO confirm. Nessie logs a warning when this property is on:
    # data in other branches/tags and historical commits may become
    # inaccessible, and the reference-aware 'nessie-gc' tool is recommended.
    spark.sql(f"""
        ALTER TABLE {TABLE}
        SET TBLPROPERTIES ('gc.enabled'='true')
    """)

    # First pass: no older_than given, so the procedure's default cutoff
    # applies (5 days ago per the Iceberg docs); retain_last keeps at least
    # one snapshot regardless of the cutoff.
    spark.sql(f"""
        CALL nessie.system.expire_snapshots(
            table => '{TABLE}',
            retain_last => 1
        )
    """).show()

    # Second pass: explicit cutoff timestamp.
    spark.sql(f"""
        CALL nessie.system.expire_snapshots(
            table => '{TABLE}',
            older_than => TIMESTAMP '{EXPIRE_OLDER_THAN}',
            retain_last => 1
        )
    """).show()

    # Show what is left in the table's snapshots metadata table.
    spark.sql(f"SELECT * FROM {TABLE}.snapshots").show()
finally:
    # Always release the session, including on failure.
    spark.stop()



25/07/29 11:57:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Spark Session Started


25/07/29 11:57:18 WARN NessieUtil: The Iceberg property 'gc.enabled' and/or 'write.metadata.delete-after-commit.enabled' is enabled on table 'customers' in NessieCatalog. This will likely make data in other Nessie branches and tags and in earlier, historical Nessie commits inaccessible. The recommended setting for those properties is 'false'. Use the 'nessie-gc' tool for Nessie reference-aware garbage collection.
                                                                                

+------------------------+-----------------------------------+-----------------------------------+----------------------------+----------------------------+------------------------------+
|deleted_data_files_count|deleted_position_delete_files_count|deleted_equality_delete_files_count|deleted_manifest_files_count|deleted_manifest_lists_count|deleted_statistics_files_count|
+------------------------+-----------------------------------+-----------------------------------+----------------------------+----------------------------+------------------------------+
|                       0|                                  0|                                  0|                           2|                           5|                             0|
+------------------------+-----------------------------------+-----------------------------------+----------------------------+----------------------------+------------------------------+

+--------------------+-------------------+-----------------

In [8]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType


# Expire old Iceberg snapshots of one table through a Nessie-backed Spark catalog.
#
# Steps: build a Spark session wired to the `nessie` catalog, enable Iceberg GC on
# the table, expire snapshots older than a fixed cutoff, list the snapshots that
# remain, and always stop the session at the end.

# --- Connection settings ---
CATALOG_URI = "http://nessie:19120/api/v1"
WAREHOUSE = "s3://warehouse/"
# NOTE(review): hard-coded IP address — presumably a container-network address;
# confirm it is stable across restarts.
STORAGE_URI = "http://172.18.0.5:9000"

# --- Maintenance parameters (edit here rather than inside the SQL below) ---
TABLE = "nessie.customers"
EXPIRE_OLDER_THAN = "2025-07-29 09:00:00"

conf = (
    pyspark.SparkConf()
        .setAppName('sales_data_app')
        .set('spark.jars.packages', 'org.postgresql:postgresql:42.7.3,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1,software.amazon.awssdk:bundle:2.24.8,software.amazon.awssdk:url-connection-client:2.24.8')
        .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions')
        .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')
        .set('spark.sql.catalog.nessie.uri', CATALOG_URI)
        .set('spark.sql.catalog.nessie.ref', 'main')
        .set('spark.sql.catalog.nessie.authentication.type', 'NONE')
        .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
        .set('spark.sql.catalog.nessie.s3.endpoint', STORAGE_URI)
        .set('spark.sql.catalog.nessie.warehouse', WAREHOUSE)
        .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
)

# getOrCreate() hands back an already-running session when one exists, so a
# session leaked by a failed run would be reused by the next run. The
# try/finally below guarantees the session is stopped even if a statement fails.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print("Spark Session Started")

try:
    # Presumably required because expire_snapshots refuses to run while GC is
    # disabled — TODO confirm. Nessie logs a warning when this property is on:
    # data in other branches/tags and historical commits may become
    # inaccessible, and the reference-aware 'nessie-gc' tool is recommended.
    spark.sql(f"""
        ALTER TABLE {TABLE}
        SET TBLPROPERTIES ('gc.enabled'='true')
    """)

    # Expire everything older than the cutoff; retain_last keeps at least one
    # snapshot regardless of the cutoff.
    spark.sql(f"""
        CALL nessie.system.expire_snapshots(
            table => '{TABLE}',
            older_than => TIMESTAMP '{EXPIRE_OLDER_THAN}',
            retain_last => 1
        )
    """).show()

    # Show what is left in the table's snapshots metadata table.
    spark.sql(f"SELECT * FROM {TABLE}.snapshots").show()
finally:
    # Always release the session, including on failure.
    spark.stop()



Spark Session Started


                                                                                

+-------------------+--------------------+---------+-------------+-------------+-----------+
|        snapshot_id|        committed_at|operation|added_records|total_records|added_files|
+-------------------+--------------------+---------+-------------+-------------+-----------+
|4691373338720129280|2025-07-29 10:14:...|   append|            2|           15|          2|
+-------------------+--------------------+---------+-------------+-------------+-----------+

+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|        committed_at|        snapshot_id|          parent_id|operation|       manifest_list|             summary|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|2025-07-29 10:14:...|4691373338720129280|8626986426246576127|   append|s3://warehouse/cu...|{dremio-job-id ->...|
+--------------------+-------------------+-------------------+---------+--

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType


# Expire old Iceberg snapshots of one table through a Nessie-backed Spark catalog.
#
# Steps: build a Spark session wired to the `nessie` catalog, enable Iceberg GC on
# the table, expire snapshots older than a fixed cutoff, list the snapshots that
# remain, and always stop the session at the end.

# --- Connection settings ---
CATALOG_URI = "http://nessie:19120/api/v1"
WAREHOUSE = "s3://warehouse/"
# NOTE(review): hard-coded IP address — presumably a container-network address;
# confirm it is stable across restarts.
STORAGE_URI = "http://172.18.0.5:9000"

# --- Maintenance parameters (edit here rather than inside the SQL below) ---
TABLE = "nessie.customers"
EXPIRE_OLDER_THAN = "2025-07-29 09:00:00"

conf = (
    pyspark.SparkConf()
        .setAppName('sales_data_app')
        .set('spark.jars.packages', 'org.postgresql:postgresql:42.7.3,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1,software.amazon.awssdk:bundle:2.24.8,software.amazon.awssdk:url-connection-client:2.24.8')
        .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions')
        .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')
        .set('spark.sql.catalog.nessie.uri', CATALOG_URI)
        .set('spark.sql.catalog.nessie.ref', 'main')
        .set('spark.sql.catalog.nessie.authentication.type', 'NONE')
        .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
        .set('spark.sql.catalog.nessie.s3.endpoint', STORAGE_URI)
        .set('spark.sql.catalog.nessie.warehouse', WAREHOUSE)
        .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
)

# getOrCreate() hands back an already-running session when one exists, so a
# session leaked by a failed run would be reused by the next run. The
# try/finally below guarantees the session is stopped even if a statement fails.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print("Spark Session Started")

try:
    # Presumably required because expire_snapshots refuses to run while GC is
    # disabled — TODO confirm. Nessie logs a warning when this property is on:
    # data in other branches/tags and historical commits may become
    # inaccessible, and the reference-aware 'nessie-gc' tool is recommended.
    spark.sql(f"""
        ALTER TABLE {TABLE}
        SET TBLPROPERTIES ('gc.enabled'='true')
    """)

    # Expire everything older than the cutoff; retain_last keeps at least one
    # snapshot regardless of the cutoff.
    spark.sql(f"""
        CALL nessie.system.expire_snapshots(
            table => '{TABLE}',
            older_than => TIMESTAMP '{EXPIRE_OLDER_THAN}',
            retain_last => 1
        )
    """).show()

    # Show what is left in the table's snapshots metadata table.
    spark.sql(f"SELECT * FROM {TABLE}.snapshots").show()
finally:
    # Always release the session, including on failure.
    spark.stop()



Spark Session Started


                                                                                

+--------------------------+----------------------+---------------------+-----------------------+
|rewritten_data_files_count|added_data_files_count|rewritten_bytes_count|failed_data_files_count|
+--------------------------+----------------------+---------------------+-----------------------+
|                         7|                     5|                11260|                      0|
+--------------------------+----------------------+---------------------+-----------------------+

