In [1]:
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pandas as pd

# This CATALOG_URL works for the "docker compose" testing and development environment
# Change 'server' if you are not running on "docker compose" (f. ex. 'localhost' if TIP is running locally).
CATALOG_URL = "http://server:8080/catalog"

# In the provided "docker compose" testing and development environment,
# this warehouse already exists.
# See USER_GUIDE.MD and STORAGE.MD for configuration examples if you
# are running this notebook with your own setup.
# There is also an example in 'Multiple Warehouses.ipynb'.
WAREHOUSE = "demo"
CATALOG = "demo_catalog"
NAMESPACE = "demo_namespace"

SPARK_VERSION = pyspark.__version__
SPARK_MINOR_VERSION = '.'.join(SPARK_VERSION.split('.')[:2])
ICEBERG_VERSION = "1.6.1"

# Connect with Spark

In [2]:
config = {
    f"spark.sql.catalog.{CATALOG}": "org.apache.iceberg.spark.SparkCatalog",
    f"spark.sql.catalog.{CATALOG}.type": "rest",
    f"spark.sql.catalog.{CATALOG}.uri": CATALOG_URL,
    f"spark.sql.catalog.{CATALOG}.warehouse": WAREHOUSE,
    f"spark.sql.catalog.{CATALOG}.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.defaultCatalog": CATALOG,
    "spark.jars.packages": f"org.apache.iceberg:iceberg-spark-runtime-{SPARK_MINOR_VERSION}_2.12:{ICEBERG_VERSION},org.apache.iceberg:iceberg-aws-bundle:{ICEBERG_VERSION}",
}


In [3]:
spark_config = SparkConf().setMaster('local').setAppName("Iceberg-REST")
for k, v in config.items():
    spark_config = spark_config.set(k, v)

spark = SparkSession.builder.config(conf=spark_config).getOrCreate()

spark.sql(f"USE {CATALOG}")

DataFrame[]

## Read and Write Tables

In [4]:
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {NAMESPACE}")
spark.sql("SHOW NAMESPACES").toPandas()

Unnamed: 0,namespace
0,demo_namespace


In [5]:
for n in range(5):
    data = pd.DataFrame([[1, 'a-string', 2.2+n]], columns=['id', 'strings', 'floats'])
    sdf = spark.createDataFrame(data)
    sdf.writeTo(f"{NAMESPACE}.my_table_{n}").createOrReplace()

In [6]:
spark.sql(f"SELECT * FROM {NAMESPACE}.my_table_0").toPandas()

Unnamed: 0,id,strings,floats
0,1,a-string,2.2


In [7]:
spark.sql(f"CREATE VIEW {NAMESPACE}.my_view AS (SELECT id FROM {NAMESPACE}.my_table_0)")

AnalysisException: [VIEW_ALREADY_EXISTS] Cannot create view demo_namespace.my_view because it already exists.
Choose a different name, drop or replace the existing object, or add the IF NOT EXISTS clause to tolerate pre-existing objects.

In [8]:
import requests
import json

# Get the warehouse ID
response = requests.get('http://server:8080/management/v1/warehouse?project-id=00000000-0000-0000-0000-000000000000')
warehouse_id = response.json()['warehouses'][0]['id']

In [10]:
# Send the DELETE request
response = requests.delete(f'http://server:8080/catalog/v1/{warehouse_id}/namespaces/demo_namespace/tables/my_table_3?purgeRequested=True')

# Print the HTTP status code
print(response.status_code)

204
