In [1]:
import os

from pyspark.sql import SparkSession

In [2]:
%load_ext sparksql_magic

#### The following code initializes the Spark session. The Iceberg runtime package is included when the session is created so that Spark DataFrames can be saved as Iceberg tables.

In [8]:
# Initialize SparkSession
#
# Warehouse location ("spark.sql.catalog.local.warehouse") options:
#   "/usr/spark/delta-lake/data"            -> store data in the local file system
#   "s3a://iceberg-bucket/delta-lake/data"  -> store data in MinIO
#                                              (create the bucket "iceberg-bucket" if it does not exist)
#
# MinIO credentials are read from the environment so that real secrets do not
# have to be committed with the notebook. The fallbacks are the values this
# notebook originally hard-coded, so behaviour is unchanged when the
# variables are not set.
minio_access_key = os.environ.get("MINIO_ROOT_USER", "root")
minio_secret_key = os.environ.get("MINIO_ROOT_PASSWORD", "jerinminioserver")

spark = (
    SparkSession.builder
    .appName("iceberg-minio-spark-session")
    .master("spark://spark-master:7077")
    # Iceberg runtime, Hive exec and the S3A connector are resolved at session start-up.
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.1,org.apache.hive:hive-exec:3.1.3,org.apache.hadoop:hadoop-aws:3.3.4")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # Catalogs: `spark_catalog` (Hive-backed session catalog) and `local` (Hadoop catalog).
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    .config("spark.sql.catalog.local.type", "hadoop")
    # S3A settings pointing at the MinIO endpoint.
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.sql.catalog.local.warehouse", "s3a://iceberg-bucket/delta-lake/data")
    .config("spark.sql.defaultCatalog", "local")
    # Metastore schema settings.
    .config("spark.hadoop.datanucleus.autoCreateSchema", "true")
    .config("spark.hadoop.datanucleus.fixedDatastore", "false")
    .config("hive.metastore.schema.verification", "false")
    .config("hive.metastore.schema.verification.record.version", "false")
    .getOrCreate()
)




:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.apache.hive#hive-exec added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c775c80a-59c3-41ca-b221-5376ecdd4b76;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.7.1 in central
	found org.apache.hive#hive-exec;3.1.3 in central
	found org.apache.hive#hive-vector-code-gen;3.1.3 in central
	found commons-lang#commons-lang;2.6 in central
	found com.google.guava#guava;19.0 in central
	found org.apache.ant#ant;1.9.1 in central
	found org.apache.ant#ant-launcher;1.9.1 in central
	found org.slf4j#slf4j-api;1.7.10 in central
	found org.apache.hive#hive-upgrade-acid;3.1.3 in central
	found org.apache.hive#hive-llap-tez;3.1.3 in central
	found org.apache.hive#hive-common;3.1.3 in central
	found org.apache.hive#

In [13]:
# To stop SparkSession.
# This cell sits before the cells that still use `spark`, so an unconditional
# stop would end the session too early when the notebook is run top to bottom.
# Set STOP_SPARK_SESSION = True and run this cell once you are finished.
STOP_SPARK_SESSION = False
if STOP_SPARK_SESSION:
    spark.stop()

In [None]:
# Get spark session config details
# spark.sparkContext.getConf().getAll()

In [9]:
# Build a small example DataFrame of (Name, Age) records.
data = [
    ("Jerin", 29),
    ("Aayush", 35),
    ("Neeraj", 28),
    ("Pritisinh", 28),
]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, schema=columns)

In [10]:
# Print the DataFrame rows as a text table to check its contents.
df.show()

25/01/15 08:07:47 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

+---------+---+
|     Name|Age|
+---------+---+
|    Jerin| 29|
|   Aayush| 35|
|   Neeraj| 28|
|Pritisinh| 28|
+---------+---+



### Create Managed Iceberg Table.
- This managed table is registered in the Spark catalog, so it can be accessed through a SQL table reference.

In [12]:
# Write the DataFrame as the Iceberg table `local.test_iceberg_table`.
# Note: this call usually reports "version-hint.text does not exist", yet the
# table is still created and can be read afterwards without errors.
# Running the cell a second time does not produce the error.
(
    df.writeTo("local.test_iceberg_table")
    .createOrReplace()
)

In [13]:
# Read the table back through its catalog name and print its rows.
(
    spark.read
    .table("local.test_iceberg_table")
    .show()
)

[Stage 4:>                                                          (0 + 1) / 1]

+---------+---+
|     Name|Age|
+---------+---+
|    Jerin| 29|
|   Aayush| 35|
|   Neeraj| 28|
|Pritisinh| 28|
+---------+---+



                                                                                

In [14]:
%%sparksql

-- Remove the demo table; IF EXISTS keeps the cell safe to re-run after the table is gone.
DROP TABLE IF EXISTS local.test_iceberg_table;

In [15]:
# Creating the table with Spark SQL was observed not to raise
# "version-hint.text does not exist".
# IF NOT EXISTS keeps this cell re-runnable: the table persists in the
# warehouse between sessions, so a plain CREATE TABLE fails on the second run.

spark.sql("""
    CREATE TABLE IF NOT EXISTS local.test_iceberg_table_sparksql (
        id INT,
        name STRING
    ) USING iceberg
""")

DataFrame[]

In [16]:
%%sparksql

-- Append two sample rows to the Iceberg table.
INSERT INTO local.test_iceberg_table_sparksql
VALUES
    (1, 'Jerin'),
    (2, 'Aayush')


In [17]:
%%sparksql

-- Read back every row of the table.
SELECT *
FROM local.test_iceberg_table_sparksql

0,1
id,name
1,Jerin
2,Aayush
