In [1]:
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pandas as pd
import requests

# Endpoint of the Iceberg REST catalog; passed to Spark as the catalog URI.
CATALOG_URL = "http://server:8181/catalog"

# Iceberg release used when building the Maven coordinates for Spark.
ICEBERG_VERSION = "1.8.1"

# The Iceberg Spark runtime artifact is keyed by Spark's "major.minor" version,
# so keep only the first two dot-separated components of the PySpark version.
SPARK_VERSION = pyspark.__version__
SPARK_MINOR_VERSION = ".".join(SPARK_VERSION.split(".", 2)[:2])

In [2]:
# Register an HDFS-backed warehouse through the management endpoint.
warehouse_payload = {
    "warehouse-name": "hdfs6",
    "storage-profile": {
        "type": "hdfs",
        "key-prefix": "/user/hdfs6",
        "url": "hdfs://namenode:8020",
    },
}
warehouse_response = requests.post(
    "http://server:8181/management/v1/warehouse",
    json=warehouse_payload,
)
# Show the raw response body as the cell output.
warehouse_response.content

b'{"warehouse-id":"20cad55a-2438-11f0-b63b-17972a294a50"}'

# Connect with Spark

In [3]:
# Spark settings that register an Iceberg REST catalog named "lakekeeper"
# and make it the session's default catalog.
config = {
    "spark.sql.catalog.lakekeeper": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.lakekeeper.type": "rest",
    "spark.sql.catalog.lakekeeper.uri": CATALOG_URL,
    # Must match the "warehouse-name" registered through the management
    # endpoint earlier in this notebook; a different name only works if that
    # warehouse happens to exist already, which breaks Restart & Run All.
    "spark.sql.catalog.lakekeeper.warehouse": "hdfs6",
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.defaultCatalog": "lakekeeper",
    # Maven coordinates resolved at session start: the Iceberg Spark runtime for
    # this Spark minor version (Scala 2.12 build) plus the Iceberg AWS bundle.
    "spark.jars.packages": f"org.apache.iceberg:iceberg-spark-runtime-{SPARK_MINOR_VERSION}_2.12:{ICEBERG_VERSION},org.apache.iceberg:iceberg-aws-bundle:{ICEBERG_VERSION}",
}

In [4]:
# Build the Spark configuration in one chain: local master, application name,
# then every key/value pair from `config`.
spark_config = (
    SparkConf()
    .setMaster("local")
    .setAppName("Iceberg-REST")
    .setAll(list(config.items()))
)

# Reuse an existing session if one is already running, otherwise start one.
spark = SparkSession.builder.config(conf=spark_config).getOrCreate()
# NOTE: DEBUG logging is very verbose; kept as in the original setup.
spark.sparkContext.setLogLevel("DEBUG")

# Switch to the "lakekeeper" catalog for the SQL statements that follow.
spark.sql("USE lakekeeper")

DataFrame[]

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 56334)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates =

## Read and Write Tables

In [5]:
# IF NOT EXISTS keeps this cell safe to re-run against an existing namespace.
spark.sql("CREATE NAMESPACE IF NOT EXISTS my_namespace")
# List the catalog's namespaces as a pandas frame for rich display.
spark.sql("SHOW NAMESPACES").toPandas()

Unnamed: 0,namespace
0,my_namespace


In [6]:
# One sample row with an integer, a string, and a float column.
sample_columns = ["id", "strings", "floats"]
sample_rows = [[1, "a-string", 2.2]]
data = pd.DataFrame(sample_rows, columns=sample_columns)

# Convert the pandas frame into a Spark DataFrame for writing to the catalog.
sdf = spark.createDataFrame(data)

In [7]:
# Create the table, or replace it if it already exists, with the contents of `sdf`.
sdf.writeTo("my_namespace.my_table").createOrReplace()

In [8]:
# Read the table back and show it as a pandas frame.
spark.sql("SELECT * FROM my_namespace.my_table").toPandas()

Unnamed: 0,id,strings,floats
0,1,a-string,2.2
