In [1]:
import pandas as pd
import pyspark
import requests
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Endpoints and warehouse name used throughout the notebook.
CATALOG_URL = "http://server:8080/catalog"
MANAGEMENT_URL = "http://server:8080/management"
DEMO_WAREHOUSE = "demo"

# The Iceberg runtime artifact name embeds the Spark minor version ("X.Y"),
# so derive it from whichever pyspark is installed.
SPARK_VERSION = pyspark.__version__
SPARK_MINOR_VERSION = ".".join(SPARK_VERSION.split(".")[:2])
ICEBERG_VERSION = "1.5.2"

# Spark options: register the REST catalog as "demo-catalog", make it the
# default catalog, enable the Iceberg SQL extensions and pull the Iceberg jars.
config = {
    "spark.sql.catalog.demo-catalog": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.demo-catalog.type": "rest",
    "spark.sql.catalog.demo-catalog.uri": CATALOG_URL,
    "spark.sql.catalog.demo-catalog.warehouse": DEMO_WAREHOUSE,
    "spark.sql.catalog.demo-catalog.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.defaultCatalog": "demo-catalog",
    "spark.jars.packages": (
        f"org.apache.iceberg:iceberg-spark-runtime-{SPARK_MINOR_VERSION}_2.12:{ICEBERG_VERSION},"
        f"org.apache.iceberg:iceberg-aws-bundle:{ICEBERG_VERSION}"
    ),
}

# Build the SparkConf step by step; every setter mutates the conf in place.
spark_config = SparkConf()
spark_config.setMaster("local")
spark_config.setAppName("Iceberg-REST")
for option, value in config.items():
    spark_config.set(option, value)

# Reuses an already running session if there is one, otherwise starts a new one.
session_builder = SparkSession.builder.config(conf=spark_config)
spark = session_builder.getOrCreate()

## Read and Write Tables

In [2]:
# Ensure the demo namespace exists; IF NOT EXISTS makes the cell safe to re-run.
create_namespace_sql = "CREATE NAMESPACE IF NOT EXISTS spark_demo"
spark.sql(create_namespace_sql)

DataFrame[]

In [3]:
# List the namespaces visible to the session and render them as a pandas frame.
namespaces_sdf = spark.sql("SHOW NAMESPACES")
namespaces_sdf.toPandas()

Unnamed: 0,namespace
0,spark_demo


In [4]:
# One-row sample frame (int, string and float column), converted to a Spark DataFrame.
data = pd.DataFrame({"id": [1], "strings": ["a-string"], "floats": [2.2]})
sdf = spark.createDataFrame(data)

In [5]:
# Write the sample frame as spark_demo.my_table, replacing the table if it exists.
my_table_writer = sdf.writeTo("spark_demo.my_table")
my_table_writer.createOrReplace()

In [2]:
# Read the table back to confirm the write round-trips.
my_table_rows = spark.sql("SELECT * FROM spark_demo.my_table")
my_table_rows.toPandas()

Unnamed: 0,id,strings,floats
0,1,a-string,2.2


In [7]:
# Show the table-properties entry of DESCRIBE EXTENDED.
# The row is looked up by its label instead of by position: the row index of
# "Table Properties" shifts whenever the table schema changes, so a fixed
# `.iloc[17]` silently returns a different row for any other table.
table_description = spark.sql("DESCRIBE EXTENDED spark_demo.my_table").toPandas()
is_properties_row = table_description["col_name"] == "Table Properties"
table_description.loc[is_properties_row, "data_type"].iloc[0]

'[current-snapshot-id=5849667482925402855,format=iceberg/parquet,format-version=2]'

In [8]:
# Define a view over the demo table.
# CREATE OR REPLACE keeps the cell re-runnable: views persist in the catalog
# across kernel restarts, so a plain CREATE VIEW fails on the second run, and
# replacing guarantees the definition is the two-column one expected here.
spark.sql("create or replace view spark_demo.myview4 as select id, strings from spark_demo.my_table")

DataFrame[]

In [12]:
# Query the view to check it resolves against the underlying table.
myview4_rows = spark.sql("select * from  spark_demo.myview4")
myview4_rows.toPandas()

Unnamed: 0,id
0,1


In [10]:
# Switch the Spark log level to debug for the cells that follow.
spark_context = spark.sparkContext
spark_context.setLogLevel("debug")

In [11]:
# Redefine the view with a single, commented column.
replace_view_sql = "CREATE OR REPLACE VIEW spark_demo.myview4 (id COMMENT 'id of thing') as select id from spark_demo.my_table"
spark.sql(replace_view_sql)

DataFrame[]

In [17]:
# List the views registered in the demo namespace.
views_sdf = spark.sql("SHOW VIEWS IN spark_demo")
views_sdf.toPandas()

Unnamed: 0,namespace,viewName,isTemporary
0,spark_demo,myview4,False


In [19]:
# Remove the demo table from the catalog.
drop_table_sql = "DROP table spark_demo.my_table"
spark.sql(drop_table_sql)

DataFrame[]

In [16]:
# NOTE(review): `spark_demo.myview3` is not created anywhere in the visible part
# of this notebook (only `myview4` is), and the preceding cell drops
# `spark_demo.my_table`. The recorded output (id, strings) suggests this cell
# was run out of order against a view left over in the catalog — confirm the
# intended view name and cell position; as written it likely fails on
# Restart & Run All.
spark.sql("select * from  spark_demo.myview3").toPandas()

Unnamed: 0,id,strings
0,1,a-string


In [21]:
# Create a second namespace with a small table and a filtered view on it.
spark.sql("CREATE NAMESPACE IF NOT EXISTS myns")
data = pd.DataFrame([[1, 'spring', 2.2]], columns=['id', 'name', 'size'])
sdf = spark.createDataFrame(data)
sdf.writeTo("myns.parts").createOrReplace()
spark.sql("create view if not exists myns.view_springs as select id, size from myns.parts where name = 'spring'")

# Ask the catalog which URL prefix belongs to the demo warehouse instead of
# hard-coding a warehouse id: the id is specific to one deployment, so a fixed
# value breaks as soon as the warehouse is recreated. Overrides take
# precedence over defaults, mirroring how REST catalog clients merge them.
config_response = requests.get(
    f"{CATALOG_URL}/v1/config", params={"warehouse": DEMO_WAREHOUSE}, timeout=30
)
config_response.raise_for_status()
catalog_settings = config_response.json()
warehouse_prefix = {
    **catalog_settings.get("defaults", {}),
    **catalog_settings.get("overrides", {}),
}.get("prefix")
# The prefix is optional in the REST protocol; without one, routes sit directly under /v1.
catalog_base = f"{CATALOG_URL}/v1/{warehouse_prefix}" if warehouse_prefix else f"{CATALOG_URL}/v1"

# Load the view metadata straight from the REST catalog; fail loudly on an
# HTTP error rather than trying to JSON-decode an error page.
rr = requests.get(f"{catalog_base}/namespaces/myns/views/view_springs", timeout=30)
rr.raise_for_status()
meta = rr.json()
meta

{'metadata-location': 's3://examples/initial-warehouse/27fe1f14-3521-11ef-89e3-236b78a91df0/01905dc6-5ef3-7ef3-a69b-7c802b2a7f5b/metadata/01905dc6-5ef3-7ef3-a69b-7c802b2a7f5b.gz.metadata.json',
 'metadata': {'format-version': 1,
  'view-uuid': '01905dc6-5ef3-7ef3-a69b-7c802b2a7f5b',
  'location': 's3://examples/initial-warehouse/27fe1f14-3521-11ef-89e3-236b78a91df0/01905dc6-5ef3-7ef3-a69b-7c802b2a7f5b/metadata/01905dc6-5ef3-7ef3-a69b-7c802b2a7f5b.gz.metadata.json',
  'current-version-id': 1,
  'versions': [{'version-id': 1,
    'schema-id': 0,
    'timestamp-ms': 1719560199920,
    'summary': {'engine-version': '3.5.1',
     'engine-name': 'spark',
     'iceberg-version': 'Apache Iceberg 1.5.2 (commit cbb853073e681b4075d7c8707610dceecbee3a82)',
     'app-id': 'local-1719559068458'},
    'representations': [{'type': 'sql',
      'sql': "select id, size from myns.parts where name = 'spring'",
      'dialect': 'spark'}],
    'default-namespace': []}],
  'version-log': [{'version-id': 1, '

In [22]:
# Replace the view so that it also exposes the `name` column; this should
# produce a new view version in the catalog.
replace_springs_result = spark.sql("create or replace view myns.view_springs as select id, name, size from myns.parts where name = 'spring'")
replace_springs_result.toPandas()

In [23]:
# Re-fetch the view metadata after the replace to inspect the new version.
# The warehouse prefix is resolved from the catalog's config endpoint rather
# than hard-coded, because the warehouse id differs between deployments.
# Overrides win over defaults, as in REST catalog clients.
config_response = requests.get(
    f"{CATALOG_URL}/v1/config", params={"warehouse": DEMO_WAREHOUSE}, timeout=30
)
config_response.raise_for_status()
catalog_settings = config_response.json()
warehouse_prefix = {
    **catalog_settings.get("defaults", {}),
    **catalog_settings.get("overrides", {}),
}.get("prefix")
# The prefix is optional in the REST protocol; without one, routes sit directly under /v1.
catalog_base = f"{CATALOG_URL}/v1/{warehouse_prefix}" if warehouse_prefix else f"{CATALOG_URL}/v1"

# Surface HTTP errors explicitly instead of JSON-decoding an error response.
rr = requests.get(f"{catalog_base}/namespaces/myns/views/view_springs", timeout=30)
rr.raise_for_status()
meta = rr.json()
meta

{'metadata-location': 's3://examples/initial-warehouse/27fe1f14-3521-11ef-89e3-236b78a91df0/01905dc6-5ef3-7ef3-a69b-7c802b2a7f5b/metadata/01905dc6-5ef3-7ef3-a69b-7c802b2a7f5b.gz.metadata.json',
 'metadata': {'format-version': 1,
  'view-uuid': '01905dc6-5ef3-7ef3-a69b-7c802b2a7f5b',
  'location': 's3://examples/initial-warehouse/27fe1f14-3521-11ef-89e3-236b78a91df0/01905dc6-5ef3-7ef3-a69b-7c802b2a7f5b/metadata/01905dc6-dffe-73c3-a2ff-d48002895883.gz.metadata.json',
  'current-version-id': 2,
  'versions': [{'version-id': 2,
    'schema-id': 1,
    'timestamp-ms': 1719560232954,
    'summary': {'engine-version': '3.5.1',
     'app-id': 'local-1719559068458',
     'iceberg-version': 'Apache Iceberg 1.5.2 (commit cbb853073e681b4075d7c8707610dceecbee3a82)',
     'engine-name': 'spark'},
    'representations': [{'type': 'sql',
      'sql': "select id, name, size from myns.parts where name = 'spring'",
      'dialect': 'spark'}],
    'default-namespace': []},
   {'version-id': 1,
    'schema