In [2]:
import os

In [4]:
# Temporary AWS credentials taken from the process environment so that no
# secret is written into the notebook. A missing variable raises KeyError.
credentials = dict(
    AccessKeyId=os.environ['AWS_ACCESS_KEY_ID'],
    SecretAccessKey=os.environ['AWS_SECRET_ACCESS_KEY'],
    SessionToken=os.environ['AWS_SESSION_TOKEN'],
)

In [5]:
# S3 location passed to the Glue catalog as its Iceberg warehouse.
warehouse_dir = "s3a://icevogel/"

# Maven coordinates handed to Spark through `spark.jars.packages`, which takes
# a single comma-separated string.
jars_packages = ",".join([
    "org.apache.hadoop:hadoop-aws:3.3.6",
    "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.14.0",
    "software.amazon.awssdk:url-connection-client:2.17.178",
    "software.amazon.awssdk:bundle:2.17.178",
])

In [8]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by the rest of the notebook.
# Two Iceberg catalogs are registered:
#   * glue_catalog - backed by AWS Glue, using S3FileIO for file access
#   * local        - a Hadoop-type catalog rooted at the same bucket
#
# The previous `spark.sql.catalog.my_catalog.io-impl` setting was removed:
# no catalog named `my_catalog` is registered in this session, so the entry
# was a copy-paste leftover duplicating the glue_catalog io-impl line.
spark = (
    SparkSession.builder
    .appName("Iceberg with Jupyter")
    .master("spark://spark-master:7077")
    # Resources
    .config("spark.driver.memory", "10g")
    .config("spark.executor.memory", "5g")
    .config("spark.executor.cores", "8")
    # Extra jars (Hadoop S3A, Iceberg runtime, AWS SDK v2)
    .config("spark.jars.packages", jars_packages)
    # Iceberg catalog backed by AWS Glue
    .config("spark.sql.catalog.glue_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.glue_catalog.warehouse", warehouse_dir)
    .config("spark.sql.catalog.glue_catalog.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
    .config("spark.sql.catalog.glue_catalog.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    # Iceberg Hadoop-type catalog
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", "s3a://icevogel")
    # Hadoop S3A filesystem settings and temporary (session-token) credentials.
    # NOTE(review): these fs.s3a.* keys configure the Hadoop S3A filesystem;
    # S3FileIO presumably resolves credentials through the AWS SDK on its own -
    # confirm the executors can obtain credentials as well.
    .config("spark.hadoop.fs.s3a.fast.upload", "true")
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.access.key", credentials['AccessKeyId'])
    .config("spark.hadoop.fs.s3a.secret.key", credentials['SecretAccessKey'])
    .config("spark.hadoop.fs.s3a.session.token", credentials['SessionToken'])
    .getOrCreate()
)

In [9]:
# Sanity check: list the namespaces visible through the Glue-backed catalog.
glue_databases = spark.sql("SHOW DATABASES IN glue_catalog")
glue_databases.show()

+---------+
|namespace|
+---------+
|dashboard|
|     vpts|
+---------+



Tell Spark where to read from (in this case, the metadata store in the AWS Glue catalog).

In [None]:
# DataFrame over the Iceberg table `dashboard.live_data`, addressed through
# the `glue_catalog` catalog.
df_live = (
    spark.read
    .format("iceberg")
    .load("glue_catalog.dashboard.live_data")
)
