In [1]:
from pyspark.sql import SparkSession

# Note: Using version 3.5_2.12 to match your new Spark installation
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("NessieSpark35") \
    .config("spark.eventLog.enabled", "false") \
    .config("spark.eventLog.dir", "file:///tmp/spark-events") \
    .config("spark.history.fs.logDirectory", "file:///tmp/spark-events") \
    .config("spark.jars.packages", 
            "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,"
            "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1,"
            "org.apache.hadoop:hadoop-aws:3.3.4,"
            "com.amazonaws:aws-java-sdk-bundle:1.12.262") \
    .config("spark.sql.extensions", 
            "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,"
            "org.projectnessie.spark.extensions.NessieSparkSessionExtensions") \
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.iceberg.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog") \
    .config("spark.sql.catalog.iceberg.uri", "http://nessie:19101/api/v1") \
    .config("spark.sql.catalog.iceberg.ref", "main") \
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://warehouse/") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "admin") \
    .config("spark.hadoop.fs.s3a.secret.key", "password") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .getOrCreate()

print("Spark 3.5.0 successfully initialized with Iceberg 1.5.0 and Nessie!")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jupyter/.ivy2/cache
The jars for the packages stored in: /home/jupyter/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8ba0dee3-3375-400d-b7d4-8c595c3a8bd8;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.0 in central
	found org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.77.1 in central
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
downloading https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.5.0/iceberg-spark-runtime-3.5_2.12-1.5.0.jar

Spark 3.5.0 successfully initialized with Iceberg 1.5.0 and Nessie!


In [2]:
spark.sql("SELECT * FROM iceberg.nessie_test.users").show()

25/12/18 11:22:04 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

+---+--------+------+
| id|    name|active|
+---+--------+------+
|  1|   Alice|  true|
|  2|     Bob| false|
|  1|   Alice|  true|
|  3|Haranesh|  true|
|  1|   Alice|  true|
|  2|     Bob| false|
|  2|     Bob| false|
+---+--------+------+



In [3]:
spark.table("iceberg.nessie_test.users").show()

+---+--------+------+
| id|    name|active|
+---+--------+------+
|  1|   Alice|  true|
|  2|     Bob| false|
|  1|   Alice|  true|
|  2|     Bob| false|
|  3|Haranesh|  true|
|  1|   Alice|  true|
|  2|     Bob| false|
+---+--------+------+



In [4]:
import pandas as pd

pdf = pd.DataFrame([
    (1, "Alice", True),
    (2, "Bob", False)
], columns=["id", "name", "active"])

# Convert to list of dicts to bypass the Spark/Pandas compatibility bug
df = spark.createDataFrame(pdf.to_dict('records'))

df.writeTo("iceberg.nessie_test.users").append()
df.show()

                                                                                

+------+---+-----+
|active| id| name|
+------+---+-----+
|  true|  1|Alice|
| false|  2|  Bob|
+------+---+-----+



In [5]:
!pip list | grep -E "pandas|numpy|pyarrow|pyspark|cloudpickle"

numpy                     2.3.2
pandas                    2.3.3
pyspark                   3.5.0
