In [5]:
%session_id_prefix native-iceberg-sql-
%glue_version 3.0
%idle_timeout 60
%%configure 
{
  "--conf": "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
  "--datalake-formats": "iceberg"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.37.0 
Setting session ID prefix to native-iceberg-sql-
Setting Glue version to: 3.0
Current idle_timeout is 2880 minutes.
idle_timeout has been set to 60 minutes.
The following configurations have been updated: {'--conf': 'spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions', '--datalake-formats': 'iceberg'}


In [1]:
catalog_name = "glue_catalog"
bucket_name = "646297494209-singapore-datalake"
bucket_prefix = "iceberg"
database_name = "iceberg_sql"
table_name = "product"
warehouse_path = f"s3://{bucket_name}/{bucket_prefix}"

Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::646297494209:role/AWSGlueServiceRoleDefault
Trying to create a Glue session for the kernel.
Worker Type: G.1X
Number of Workers: 5
Session ID: f76f51e2-476b-4439-99a8-5ffdcdb52d72
Job Type: glueetl
Applying the following default arguments:
--glue_kernel_version 0.37.0
--enable-glue-datacatalog true
--conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
--datalake-formats iceberg
Waiting for session f76f51e2-476b-4439-99a8-5ffdcdb52d72 to get into ready status...
Session f76f51e2-476b-4439-99a8-5ffdcdb52d72 has been created.



## Initialize SparkSession

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .config("spark.sql.warehouse.dir", warehouse_path) \
    .config(f"spark.sql.catalog.{catalog_name}", "org.apache.iceberg.spark.SparkCatalog") \
    .config(f"spark.sql.catalog.{catalog_name}.warehouse", warehouse_path) \
    .config(f"spark.sql.catalog.{catalog_name}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog") \
    .config(f"spark.sql.catalog.{catalog_name}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO") \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .getOrCreate()




## Clean up existing resources

In [3]:
query = f"""
DROP TABLE IF EXISTS {catalog_name}.{database_name}.{table_name}
"""
spark.sql(query)

DataFrame[]


## Create Iceberg table with sample data

In [4]:
from pyspark.sql import Row
import time

ut = time.time()

product = [
    {'product_id': '00001', 'product_name': 'Heater', 'price': 250, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00002', 'product_name': 'Thermostat', 'price': 400, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00003', 'product_name': 'Television', 'price': 600, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00004', 'product_name': 'Blender', 'price': 100, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00005', 'product_name': 'USB charger', 'price': 50, 'category': 'Electronics', 'updated_at': ut}
]

df_products = spark.createDataFrame(Row(**x) for x in product)




In [5]:
df_products.createOrReplaceTempView(f"tmp_{table_name}")




In [6]:
query = f"""
CREATE DATABASE IF NOT EXISTS {database_name}
"""
spark.sql(query)

DataFrame[]


In [7]:
query = f"""
CREATE TABLE {catalog_name}.{database_name}.{table_name}
USING iceberg
AS SELECT * FROM tmp_{table_name}
"""
spark.sql(query)

DataFrame[]


In [8]:
%%sql
USE iceberg_sql

++
||
++
++


In [9]:
%%sql
SHOW TABLES

+-----------+-----------+-----------+
|   database|  tableName|isTemporary|
+-----------+-----------+-----------+
|iceberg_sql|    product|      false|
|           |tmp_product|       true|
+-----------+-----------+-----------+


## Read from Iceberg table

In [10]:
%%sql
SELECT * FROM glue_catalog.iceberg_sql.product

+----------+------------+-----+-----------+-------------------+
|product_id|product_name|price|   category|         updated_at|
+----------+------------+-----+-----------+-------------------+
|     00001|      Heater|  250|Electronics|1.675401749026384E9|
|     00002|  Thermostat|  400|Electronics|1.675401749026384E9|
|     00003|  Television|  600|Electronics|1.675401749026384E9|
|     00004|     Blender|  100|Electronics|1.675401749026384E9|
|     00005| USB charger|   50|Electronics|1.675401749026384E9|
+----------+------------+-----+-----------+-------------------+


## Upsert records into Iceberg table

In [11]:
ut = time.time()

product_updates = [
    {'product_id': '00001', 'product_name': 'Heater', 'price': 400, 'category': 'Electronics', 'updated_at': ut}, # Update
    {'product_id': '00006', 'product_name': 'Chair', 'price': 50, 'category': 'Furniture', 'updated_at': ut} # Insert
]
df_product_updates = spark.createDataFrame(Row(**x) for x in product_updates)




In [12]:
df_product_updates.createOrReplaceTempView(f"tmp_{table_name}_updates")




In [13]:
query = f"""
MERGE INTO {catalog_name}.{database_name}.{table_name} AS t
USING (SELECT * FROM tmp_{table_name}_updates) AS u
ON t.product_id = u.product_id
WHEN MATCHED THEN UPDATE SET t.updated_at = u.updated_at
WHEN NOT MATCHED THEN INSERT *
"""
spark.sql(query)

DataFrame[]


In [14]:
%%sql
SELECT * FROM glue_catalog.iceberg_sql.product

+----------+------------+-----+-----------+--------------------+
|product_id|product_name|price|   category|          updated_at|
+----------+------------+-----+-----------+--------------------+
|     00002|  Thermostat|  400|Electronics| 1.675401749026384E9|
|     00003|  Television|  600|Electronics| 1.675401749026384E9|
|     00004|     Blender|  100|Electronics| 1.675401749026384E9|
|     00005| USB charger|   50|Electronics| 1.675401749026384E9|
|     00001|      Heater|  250|Electronics|1.6754018299467256E9|
|     00006|       Chair|   50|  Furniture|1.6754018299467256E9|
+----------+------------+-----+-----------+--------------------+


## Delete records

In [20]:
%%sql
DELETE FROM glue_catalog.iceberg_sql.product WHERE product_name = 'Blender'

++
||
++
++


In [21]:
%%sql
SELECT * FROM glue_catalog.iceberg_sql.product

+----------+------------+-----+-----------+--------------------+
|product_id|product_name|price|   category|          updated_at|
+----------+------------+-----+-----------+--------------------+
|     00002|  Thermostat|  400|Electronics| 1.675401749026384E9|
|     00003|  Television|  600|Electronics| 1.675401749026384E9|
|     00005| USB charger|   50|Electronics| 1.675401749026384E9|
|     00001|      Heater|  250|Electronics|1.6754018299467256E9|
|     00006|       Chair|   50|  Furniture|1.6754018299467256E9|
+----------+------------+-----+-----------+--------------------+


## View History and Snapshots

In [22]:
%%sql
SELECT * FROM glue_catalog.iceberg_sql.product.history

+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2023-02-03 05:23:...|2227512522147639299|               null|               true|
|2023-02-03 05:24:...|8151186603465942035|2227512522147639299|               true|
|2023-02-03 05:29:...| 720355281947583927|8151186603465942035|               true|
+--------------------+-------------------+-------------------+-------------------+


In [23]:
%%sql
SELECT * FROM glue_catalog.iceberg_sql.product.snapshots

+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|        committed_at|        snapshot_id|          parent_id|operation|       manifest_list|             summary|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|2023-02-03 05:23:...|2227512522147639299|               null|   append|s3://646297494209...|{spark.app.id -> ...|
|2023-02-03 05:24:...|8151186603465942035|2227512522147639299|overwrite|s3://646297494209...|{spark.app.id -> ...|
|2023-02-03 05:29:...| 720355281947583927|8151186603465942035|overwrite|s3://646297494209...|{spark.app.id -> ...|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+


In [24]:
%%sql
SELECT h.made_current_at, s.operation, h.snapshot_id, h.is_current_ancestor, s.summary["spark.app.id"] FROM glue_catalog.iceberg_sql.product.history h JOIN glue_catalog.iceberg_sql.product.snapshots s  ON h.snapshot_id = s.snapshot_id ORDER BY made_current_at

+--------------------+---------+-------------------+-------------------+---------------------+
|     made_current_at|operation|        snapshot_id|is_current_ancestor|summary[spark.app.id]|
+--------------------+---------+-------------------+-------------------+---------------------+
|2023-02-03 05:23:...|   append|2227512522147639299|               true| spark-application...|
|2023-02-03 05:24:...|overwrite|8151186603465942035|               true| spark-application...|
|2023-02-03 05:29:...|overwrite| 720355281947583927|               true| spark-application...|
+--------------------+---------+-------------------+-------------------+---------------------+


## Stop Session

In [None]:
%stop_session