## In Memory

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("In memory catalog") \
    .config("spark.jars.packages", "org.apache.spark:spark-hive_2.12:3.5.1") \
    .config("spark.sql.catalogImplementation", "hive") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")
spark

In [2]:
spark.sql("""
    SHOW catalogs
""").toPandas()

Unnamed: 0,catalog
0,spark_catalog


In [3]:
spark.sql("CREATE NAMESPACE IF NOT EXISTS mydb").toPandas()

In [4]:
spark.sql("""
    SHOW databases
""").toPandas()

Unnamed: 0,namespace
0,default
1,mydb


In [5]:
spark.sql("""
    SHOW tables
""").toPandas()

Unnamed: 0,namespace,tableName,isTemporary


In [7]:
spark.sql("""
CREATE TABLE mydb.local_table (
    id INT COMMENT 'Identificador único',
    name STRING COMMENT 'Nome do indivíduo'
) USING hive
""")

DataFrame[]

In [8]:
spark.sql("""
    describe table extended mydb.local_table
""").toPandas()

Unnamed: 0,col_name,data_type,comment
0,id,int,Identificador único
1,name,string,Nome do indivíduo
2,,,
3,# Detailed Table Information,,
4,Catalog,spark_catalog,
5,Database,mydb,
6,Table,local_table,
7,Owner,root,
8,Created Time,Tue Aug 13 18:14:55 UTC 2024,
9,Last Access,UNKNOWN,


In [9]:
spark.sql("""
    INSERT INTO mydb.local_table VALUES (1, "John"), (2, "Maria")
""").toPandas()

                                                                                

In [10]:
tuple_list_data = [('Alice', 1), ('Braga', 2), ('Steve', 3)]
spark.createDataFrame(tuple_list_data, ['name', 'id']).show()

                                                                                

+-----+---+
| name| id|
+-----+---+
|Alice|  1|
|Braga|  2|
|Steve|  3|
+-----+---+



In [11]:
spark.read.table("mydb.local_table").toPandas()

                                                                                

Unnamed: 0,id,name
0,1,John
1,2,Maria


## Derby Metastore

In [1]:
!hdfs dfs -rm -r /hive-warehouse

2024-08-13 18:05:11,380 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Deleted /hive-warehouse


In [None]:
from pyspark.sql import SparkSession

# this one uses spark default metastore based on derby

warehouse_path = "/hive-warehouse"  # path to warehouse in HDFS hdfs://localhost:9000/hive-warehouse

# Inicializar a SparkSession com suporte ao Hive
spark = SparkSession.builder \
    .appName("Spark Derby Metastore") \
    .config("spark.jars.packages", "org.apache.spark:spark-hive_2.12:3.5.1") \
    .config("spark.sql.warehouse.dir", warehouse_path) \
    .enableHiveSupport() \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")
spark

In [2]:
spark.sql("""
    SHOW catalogs
""").toPandas()

Unnamed: 0,catalog
0,spark_catalog


In [3]:
spark.sql("CREATE NAMESPACE IF NOT EXISTS mydb").toPandas()

In [4]:
spark.sql("""
    SHOW databases
""").toPandas()

Unnamed: 0,namespace
0,default
1,mydb


In [13]:
spark.sql("""
CREATE TABLE mydb.hive_table (
    id INT COMMENT 'Identificador único',
    name STRING COMMENT 'Nome do indivíduo'
) USING hive
""")

DataFrame[]

In [6]:
spark.sql("""
    SHOW tables
""").toPandas()

Unnamed: 0,namespace,tableName,isTemporary


In [14]:
spark.sql("""
    describe table extended mydb.hive_table
""").toPandas()

Unnamed: 0,col_name,data_type,comment
0,id,int,Identificador único
1,name,string,Nome do indivíduo
2,,,
3,# Detailed Table Information,,
4,Catalog,spark_catalog,
5,Database,mydb,
6,Table,hive_table,
7,Owner,root,
8,Created Time,Tue Aug 13 18:09:16 UTC 2024,
9,Last Access,UNKNOWN,


In [16]:
spark.sql("""
    INSERT INTO mydb.hive_table VALUES (1, "John"), (2, "Maria")
""").toPandas()

                                                                                

In [17]:
spark.read.table("mydb.hive_table").toPandas()

Unnamed: 0,id,name
0,1,John
1,2,Maria


## Delta

In [1]:
!hdfs dfs -rm -r /delta-warehouse

2024-08-04 20:55:15,492 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Deleted /delta-warehouse


In [None]:
from pyspark.sql import SparkSession

warehouse_path = "/delta-warehouse"  # path to warehouse in HDFS hdfs://localhost:9000/delta-warehouse
enable_schema_merge = "true"
type_compatibility_policy = "LEGACY"  # legacy enables more flexibility

# Inicializar a SparkSession com suporte ao Delta Lake
spark = SparkSession.builder \
    .appName("delta lake") \
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.sql.warehouse.dir", warehouse_path) \
    .config("spark.databricks.delta.schema.autoMerge.enabled", enable_schema_merge) \
    .config("spark.sql.storeAssignmentPolicy", type_compatibility_policy) \
    .getOrCreate()
    
spark.sparkContext.setLogLevel("ERROR")
spark

In [3]:
spark.sql("""
    SHOW catalogs
""").toPandas()

Unnamed: 0,catalog
0,spark_catalog


In [4]:
spark.catalog.currentCatalog()

'spark_catalog'

In [5]:
spark.sql("CREATE NAMESPACE IF NOT EXISTS mydb").toPandas()

In [6]:
spark.sql("CREATE NAMESPACE IF NOT EXISTS another_db").toPandas()

In [7]:
spark.sql("""
    SHOW databases
""").toPandas()

Unnamed: 0,namespace
0,another_db
1,default
2,mydb


In [8]:
# Criar uma tabela Delta Lake
spark.sql("""
CREATE TABLE mydb.delta_table (
    id INT COMMENT 'Identificador único',
    name STRING COMMENT 'Nome do indivíduo'
) USING delta
""")

                                                                                

DataFrame[]

In [18]:
from pyspark.sql.types import StringType
from delta.tables import DeltaTable

# Criar outra tabela delta
DeltaTable.createOrReplace(spark) \
    .tableName("mydb.delta_table_2") \
    .addColumn("id", dataType="INT", nullable=False, comment= "Identificador único") \
    .addColumn("name", dataType=StringType(), comment="Identificador único") \
    .execute() \
    .detail() \
    .toPandas()

                                                                                

Unnamed: 0,format,id,name,description,location,createdAt,lastModified,partitionColumns,clusteringColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures
0,delta,812b9bda-93d7-4344-ab48-7b85ceaa5ccf,spark_catalog.mydb.delta_table_2,,hdfs://localhost:9000/delta-warehouse/mydb.db/...,2024-08-04 20:58:39.577,2024-08-04 20:58:40.850,[],[],0,0,{},1,2,"[appendOnly, invariants]"


In [10]:
spark.sql("""
    SHOW tables in mydb
""").toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,mydb,delta_table,False
1,mydb,delta_table_2,False


In [21]:
spark.sql("""
    describe table extended mydb.delta_table
""").toPandas()

Unnamed: 0,col_name,data_type,comment
0,id,int,Identificador único
1,name,string,Nome do indivíduo
2,,,
3,# Detailed Table Information,,
4,Name,spark_catalog.mydb.delta_table,
5,Type,MANAGED,
6,Location,hdfs://localhost:9000/delta-warehouse/mydb.db/...,
7,Provider,delta,
8,Table Properties,"[delta.minReaderVersion=1,delta.minWriterVersi...",


In [12]:
spark.sql("""
    INSERT INTO mydb.delta_table VALUES (1, "John"), (2, "Maria")
""").toPandas()

                                                                                

In [37]:
from pyspark.sql.types import StructType, StructField, IntegerType
schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True)
])

tuple_list_data = [(1, 'Alice'), (2, 'John'), (3, 'Steve')]
df = spark.createDataFrame(tuple_list_data, schema=schema)

df.write \
    .option("mergeSchema", "true") \
    .saveAsTable("mydb.delta_table", format="delta", mode="overwrite")

# df.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable("mydb.delta_table")

# df.write \
#   .format("delta") \
#   .mode("overwrite") \
#   .saveAsTable("mydb.delta_table")

                                                                                

In [38]:
spark.read.table("mydb.delta_table").toPandas()

                                                                                

Unnamed: 0,id,name
0,1,Alice
1,2,John
2,3,Steve


## Iceberg

A catalog is created and named by adding a property spark.sql.catalog.(catalog-name) with an implementation class for its value.

Iceberg supplies two implementations:

- **org.apache.iceberg.spark.SparkCatalog** supports a Hive Metastore or a Hadoop warehouse as a catalog
- **org.apache.iceberg.spark.SparkSessionCatalog** adds support for Iceberg tables to Spark's built-in catalog, and delegates to the built-in catalog for non-Iceberg table

Reference: https://iceberg.apache.org/docs/latest/spark-configuration/#catalog-configuration

In [1]:
!hdfs dfs -rm -r /iceberg-warehouse

2024-08-04 13:46:17,773 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Deleted /iceberg-warehouse


In [3]:
from pyspark.sql import SparkSession

# catalog_type = "hive"  # tables from a Hive metastore
catalog_type = "hadoop"  # directory-based catalog in HDFS
warehouse_path = "/iceberg-warehouse"  # path to warehouse in HDFS hdfs://localhost:9000/iceberg-warehouse

# Inicializar a SparkSession com suporte ao Iceberg
spark = SparkSession.builder \
    .appName("iceberg") \
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.0") \
    .config("spark.sql.catalog.iceberg_catalog", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.iceberg_catalog.type", catalog_type) \
    .config("spark.sql.catalog.iceberg_catalog.warehouse", warehouse_path) \
    .config("spark.sql.defaultCatalog", "iceberg_catalog") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")
spark

# another approach
# .config("spark.sql.catalog.iceberg_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") \
# .config("spark.sql.catalog.iceberg_catalog.type", "hive")

# remote catalog
# .config("spark.sql.catalog.iceberg_catalog.uri", "thrift://localhost:9083")

:: loading settings :: url = jar:file:/opt/spark-3.5.1-bin-without-hadoop/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c68c5838-ae14-4ee0-8a5a-54c0a6d402c5;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.6.0 in central
:: resolution report :: resolve 52ms :: artifacts dl 3ms
	:: modules in use:
	org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.6.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   1   |   0   |   0   |   0   ||   1   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submi

In [2]:
# spark.catalog.setCurrentCatalog("iceberg_catalog")

# OR by setting up

# .config("spark.sql.defaultCatalog", "iceberg_catalog")

In [4]:
spark.sql("""
    SHOW catalogs
""").toPandas()

Unnamed: 0,catalog
0,iceberg_catalog
1,spark_catalog


In [5]:
spark.catalog.currentCatalog()

'iceberg_catalog'

In [6]:
spark.sql("CREATE NAMESPACE IF NOT EXISTS iceberg_catalog.mydb").toPandas()

In [7]:
spark.sql("""
    SHOW databases
""").toPandas()

Unnamed: 0,namespace
0,mydb


In [9]:
# Criar uma tabela Iceberg
spark.sql("""
CREATE OR REPLACE TABLE mydb.iceberg_table (
    id INT COMMENT 'Identificador único',
    name STRING COMMENT 'Nome do indivíduo'
) USING iceberg
""").toPandas()

In [10]:
spark.sql("""
    SHOW tables in mydb
""").toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,mydb,iceberg_table,False


In [11]:
spark.sql("""
    describe table extended mydb.iceberg_table
""").toPandas()

Unnamed: 0,col_name,data_type,comment
0,id,int,Identificador único
1,name,string,Nome do indivíduo
2,,,
3,# Metadata Columns,,
4,_spec_id,int,
5,_partition,struct<>,
6,_file,string,
7,_pos,bigint,
8,_deleted,boolean,
9,,,


In [12]:
spark.sql("""
    INSERT INTO mydb.iceberg_table VALUES (1, "John"), (2, "Maria")
""").toPandas()

                                                                                

In [13]:
spark.read.table("mydb.iceberg_table").toPandas()

                                                                                

Unnamed: 0,id,name
0,1,John
1,2,Maria


## Glue

In [None]:
from pyspark.sql import SparkSession

ACCESS_KEY = ""
SECRET_KEY = ""
# Inicializar a SparkSession com suporte ao Glue Catalog
spark = SparkSession.builder \
    .appName("Glue Catalog Example") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.4.0") \
    .config("spark.hadoop.fs.s3a.access.key", ACCESS_KEY) \
    .config("spark.hadoop.fs.s3a.secret.key", SECRET_KEY) \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .config("spark.sql.catalog.glue_catalog", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalogImplementation", "hive") \
    .config("hive.metastore.client.factory.class", "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory") \
    .config("spark.hadoop.hive.metastore.glue.catalogid", "667047950229") \
     .enableHiveSupport() \
    .getOrCreate()