In [1]:
from pyspark.sql import SparkSession

In [2]:
# Load the sparksql_magic IPython extension; the %%sparksql cells further down rely on it.
%load_ext sparksql_magic

#### The following code initializes the Spark session. The Iceberg package is included when creating the session, which makes it possible to save a Spark DataFrame as an Iceberg table.

In [18]:
# Build the SparkSession (or reuse an existing one) with the Iceberg settings below.
spark = (
    SparkSession.builder
    .appName("add-iceberg-minio")
    .master("spark://spark-master:7077")
    # Extra packages: the Iceberg Spark runtime and Hive exec.
    .config(
        "spark.jars.packages",
        "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.1,org.apache.hive:hive-exec:3.1.3",
    )
    .config(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    )
    # Two catalogs are configured: `spark_catalog` (type "hive") and `local` (type "hadoop").
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    .config("spark.sql.catalog.local.type", "hadoop")
    # Warehouse location used by the `local` catalog.
    .config("spark.sql.catalog.local.warehouse", "/usr/spark/delta-lake/data")
    # `local` is the default catalog for this session.
    .config("spark.sql.defaultCatalog", "local")
    # DataNucleus / Hive metastore schema options: auto-create on, verification off.
    .config("spark.hadoop.datanucleus.autoCreateSchema", "true")
    .config("spark.hadoop.datanucleus.fixedDatastore", "false")
    .config("hive.metastore.schema.verification", "false")
    .config("hive.metastore.schema.verification.record.version", "false")
    .getOrCreate()
)


In [17]:
# Stop the SparkSession only on request.
# An unconditional spark.stop() in this position shuts the session down during a
# top-to-bottom "Run All", and every later cell that uses `spark` then fails.
# Set the flag to True and re-run this cell when you are done with the notebook.
STOP_SPARK_SESSION = False

if STOP_SPARK_SESSION:
    spark.stop()

In [None]:
# Debugging aid: uncomment the line below to inspect the configuration of the running Spark session.
# spark.sparkContext.getConf().getAll()

In [27]:
# Example DataFrame creation
data = [("Jerin", 29), ("Aayush", 35), ("Neeraj", 28),("Pritisinh", 28)]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, columns)

In [28]:
# Print the contents of the sample DataFrame as a text table.
df.show()

                                                                                

+---------+---+
|     Name|Age|
+---------+---+
|    Jerin| 29|
|   Aayush| 35|
|   Neeraj| 28|
|Pritisinh| 28|
+---------+---+



### Create Managed Iceberg Table.
- This managed table is registered in the Spark catalog, so it can be accessed by its table name in SQL.

In [39]:
# Write `df` to the table `local.test_iceberg_table`, creating it or replacing it if it already exists.
# NOTE: the first run usually reports the error "version-hint.text does not exist"; the table is
# still created, and reading it afterwards works without error.
# Re-executing this cell does not produce the error again.
df.writeTo("local.test_iceberg_table").createOrReplace() 

                                                                                

In [40]:
# Read the table back by name and print its rows to confirm the write.
spark.read.table("local.test_iceberg_table").show()

+---------+---+
|     Name|Age|
+---------+---+
|    Jerin| 29|
|   Aayush| 35|
|   Neeraj| 28|
|Pritisinh| 28|
+---------+---+



In [41]:
%%sparksql

-- IF EXISTS keeps this cleanup cell re-runnable: dropping an already-dropped table is a no-op, not an error.
DROP TABLE IF EXISTS local.test_iceberg_table;

In [42]:
# Creating the table with Spark SQL won't throw "version-hint.text does not exist".
# The table is dropped first (if present) so this cell can be re-run: a plain CREATE TABLE
# fails when the table is left over from a previous run of the notebook.
spark.sql("DROP TABLE IF EXISTS local.test_iceberg_table_sparksql")

spark.sql("""
    CREATE TABLE local.test_iceberg_table_sparksql (
        id INT,
        name STRING
    ) USING iceberg
""")

DataFrame[]

In [43]:
%%sparksql

-- Add two sample rows (id, name) to the table.
INSERT INTO local.test_iceberg_table_sparksql
VALUES
    (1, 'Jerin'),
    (2, 'Aayush')


                                                                                

In [44]:
%%sparksql

-- Show every row currently stored in the table.
SELECT *
FROM local.test_iceberg_table_sparksql

0,1
id,name
1,Jerin
2,Aayush
