In [16]:
import os

import pyspark
from pyspark.sql import SparkSession

In [17]:
%load_ext sparksql_magic

The sparksql_magic extension is already loaded. To reload it, use:
  %reload_ext sparksql_magic


#### The following code initializes the Spark session. The Apache Iceberg runtime package is loaded when the session is created, which allows Spark DataFrames to be saved as Iceberg tables. Nessie is registered as the catalog (named `nessie`) and MinIO is used as the object storage.

In [20]:
################### Initialize Spark Session #########################

# Each endpoint below can be overridden through an environment variable, so the
# notebook does not have to be edited when the Docker network is recreated.
# The defaults are the values this notebook was originally written with.

# MinIO location; create the bucket ("nessiebucket") in MinIO first.
v_minioWarehouse = os.environ.get("MINIO_WAREHOUSE", "s3://nessiebucket/")

# MinIO URI; use the container IP address in the URI, not the docker service name "minio".
#   - Using the docker service name in the URI, like "http://minio:9000", gives the
#     error "connection closed quietly".
#   - Adding the MinIO S3 endpoint in Nessie also requires the IP in the URI, not the
#     docker service name.
#   - If an AWS SDK error stating "connection closed quietly" occurs, the MinIO
#     connection was not successful.
# Use `docker inspect <Minio-Container_id>` to get the IP.
v_minioStorageUri = os.environ.get("MINIO_STORAGE_URI", "http://172.19.0.2:9000")

# Nessie server URI
v_nessieCatalogUri = os.environ.get("NESSIE_CATALOG_URI", "http://nessie:19120/api/v1")

# Jars resolved at session start-up: PostgreSQL JDBC driver, Iceberg Spark runtime,
# Nessie Spark extensions, and the AWS SDK bundle + URL connection client used to
# reach MinIO. Joined with "," into the single string Spark expects.
v_sparkPackages = [
    'org.postgresql:postgresql:42.7.3',
    'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0',
    'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1',
    'software.amazon.awssdk:bundle:2.20.147',
    'software.amazon.awssdk:url-connection-client:2.20.147',
]

# SQL extensions for Iceberg and Nessie
v_sparkExtensions = [
    'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions',
    'org.projectnessie.spark.extensions.NessieSparkSessionExtensions',
]

# Spark configuration
conf = (
    pyspark.SparkConf()
        .setAppName('nessie-app')
        .setMaster("spark://spark-master:7077")
        # Add jars and SQL extensions
        .set('spark.jars.packages', ','.join(v_sparkPackages))
        .set('spark.sql.extensions', ','.join(v_sparkExtensions))
        # Configure the catalog named "nessie"
        .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')
        .set('spark.sql.catalog.nessie.uri', v_nessieCatalogUri)
        .set('spark.sql.catalog.nessie.ref', 'main')
        .set('spark.sql.catalog.nessie.authentication.type', 'NONE')
        .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
        # Configure MinIO storage. The credentials and region required to connect to
        # MinIO are expected in the environment variables
        # AWS_REGION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY.
        .set('spark.sql.catalog.nessie.s3.endpoint', v_minioStorageUri)
        .set('spark.sql.catalog.nessie.warehouse', v_minioWarehouse)
        .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
)

# Create (or reuse) the Spark session
spark = SparkSession.builder.config(conf=conf).getOrCreate()


In [19]:
# To stop SparkSession.
# The call is left disabled on purpose: this cell sits between the cell that
# creates the session and the cells that use `spark`, so running the notebook
# top to bottom with the call active would stop the session before it is used.
# Uncomment and run it manually when you are finished, or before re-running the
# initialization cell with a changed configuration.
# spark.stop()

In [4]:
# Get spark session config details
# spark.sparkContext.getConf().getAll()

In [21]:
# Build a small sample DataFrame: three (Name, Age) rows
columns = ["Name", "Age"]
data = [
    ("Jerin", 29),
    ("Aayush", 35),
    ("Neeraj", 28),
]
df = spark.createDataFrame(data, schema=columns)

In [22]:
# Print the rows of the sample DataFrame as a text table
df.show()

                                                                                

+------+---+
|  Name|Age|
+------+---+
| Jerin| 29|
|Aayush| 35|
|Neeraj| 28|
+------+---+



In [23]:
# Create the "nessieNamespace" namespace in the "nessie" catalog.
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE NAMESPACE raises an
# error once the namespace already exists.
spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.nessieNamespace;").show()

++
||
++
++



In [24]:
# Save the sample DataFrame as the table "test_iceberg_table1" in the
# "nessieNamespace" namespace of the "nessie" catalog.
# createOrReplace() keeps this cell re-runnable: create() raises an error when
# the table already exists, whereas createOrReplace() replaces it.
df.writeTo("nessie.nessieNamespace.test_iceberg_table1").createOrReplace()

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".                
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
