In [None]:
# demo link: https://mybinder.org/v2/gh/projectnessie/nessie-demos/main?filepath=notebooks/nessie-iceberg-demo-nba.ipynb

## docker setting
# nessie-catalog:
#   image: ghcr.io/projectnessie/nessie:0.104.2
#   ports:
#     - "19120:19120"
#     - "9002:9000"

In [3]:
import os
# import findspark
from pyspark.sql import *
from pyspark import SparkConf
# import pynessie

# findspark.init()
# pynessie_version = pynessie.__version__

conf = SparkConf()
# we need iceberg libraries and the nessie sql extensions
conf.set(
    "spark.jars.packages",
    f"org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.1,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.2",
)
# ensure python <-> java interactions are w/ pyarrow
conf.set("spark.sql.execution.pyarrow.enabled", "true")
# create catalog dev_catalog as an iceberg catalog
conf.set("spark.sql.catalog.dev_catalog", "org.apache.iceberg.spark.SparkCatalog")
# tell the dev_catalog that its a Nessie catalog
conf.set("spark.sql.catalog.dev_catalog.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
# set the location for Nessie catalog to store data. Spark writes to this directory
conf.set("spark.sql.catalog.dev_catalog.warehouse", "s3://warehouse-dev")
# set the location of the nessie server. In this demo its running locally. There are many ways to run it (see https://projectnessie.org/try/)
conf.set("spark.sql.catalog.dev_catalog.uri", "http://nessie-catalog:19120/api/v1")
# default branch for Nessie catalog to work on
conf.set("spark.sql.catalog.dev_catalog.ref", "main")
# use no authorization. Options are NONE AWS BASIC and aws implies running Nessie on a lambda
conf.set("spark.sql.catalog.dev_catalog.auth_type", "NONE")
# enable the extensions for both Nessie and Iceberg
conf.set(
    "spark.sql.extensions",
    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
)
# finally, start up the Spark server
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print("Spark Running")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3fe062d3-6dcf-47ac-9d73-a36aaecf2a4e;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.9.1 in central
	found org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.104.2 in central
downloading https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.9.1/iceberg-spark-runtime-3.5_2.12-1.9.1.jar ...
	[SUCCESSFUL ] org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.9.1!iceberg-spark-runtime-3.5_2.12.jar (2491ms)
downloading https://repo1.maven.org/maven2/org/projectnessie/nessie-integrations/nessie-spark-extensions-3.5_2.12/0.104.2/nessie-spark-extensions-3.5_2.12-0.104.2.jar ...
	[SUCCESSFU

Spark Running


In [4]:
spark.sql("CREATE NAMESPACE dev_catalog.nba")

DataFrame[]

In [5]:
spark.sql("CREATE BRANCH dev IN dev_catalog FROM main").toPandas()

Unnamed: 0,refType,name,hash
0,Branch,dev,55e9d2a5fd9344ad3ad2a2ef4eb99bd7d8ba12bcdcaa29...


In [6]:
spark.sql("LIST REFERENCES IN dev_catalog").toPandas()

Unnamed: 0,refType,name,hash
0,Branch,dev,55e9d2a5fd9344ad3ad2a2ef4eb99bd7d8ba12bcdcaa29...
1,Branch,main,55e9d2a5fd9344ad3ad2a2ef4eb99bd7d8ba12bcdcaa29...


In [10]:
spark.sql("USE REFERENCE dev IN dev_catalog")

# Creating `salaries` table
spark.sql(
    """CREATE TABLE IF NOT EXISTS dev_catalog.nba.salaries
            (Season STRING, Team STRING, Salary STRING, Player STRING) USING iceberg"""
)

spark.sql(
    """CREATE OR REPLACE TEMPORARY VIEW salaries_table USING csv
            OPTIONS (path "s3://warehouse-dev/datasets/nba/salaries.csv", header true)"""
)
spark.sql("INSERT INTO dev_catalog.nba.salaries SELECT * FROM salaries_table")

# Creating `totals_stats` table
spark.sql(
    """CREATE TABLE IF NOT EXISTS dev_catalog.nba.totals_stats (
             Season STRING, Age STRING, Team STRING, ORB STRING, DRB STRING, TRB STRING, AST STRING, STL STRING,
             BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)
             USING iceberg"""
)
spark.sql(
    """CREATE OR REPLACE TEMPORARY VIEW stats_table USING csv
            OPTIONS (path "s3://warehouse-dev/datasets/nba/totals_stats.csv", header true)"""
)
spark.sql("INSERT INTO dev_catalog.nba.totals_stats SELECT * FROM stats_table").toPandas()

In [14]:
table_count = spark.sql("select count(*) from dev_catalog.nba.`salaries@dev`").toPandas().values[0][0]
csv_count = spark.sql("select count(*) from salaries_table").toPandas().values[0][0]
# assert table_count == csv_count
print(table_count, csv_count)

table_count = spark.sql("select count(*) from dev_catalog.nba.`totals_stats@dev`").toPandas().values[0][0]
csv_count = spark.sql("select count(*) from stats_table").toPandas().values[0][0]
# assert table_count == csv_count
print(table_count, csv_count)

150 50
92 92


# Check generated tables

In [16]:
spark.sql("USE REFERENCE main IN dev_catalog").toPandas()


Unnamed: 0,refType,name,hash
0,Branch,main,55e9d2a5fd9344ad3ad2a2ef4eb99bd7d8ba12bcdcaa29...


In [17]:
spark.sql("SHOW TABLES IN dev_catalog").toPandas()

Unnamed: 0,namespace,tableName,isTemporary


In [18]:
spark.sql("USE REFERENCE dev IN dev_catalog").toPandas()

Unnamed: 0,refType,name,hash
0,Branch,dev,cce1f98c97ccfc4da729cc519ba4f7fbfc06036ffcd6a9...


In [19]:
spark.sql("SHOW TABLES IN dev_catalog").toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,nba,salaries,False
1,nba,totals_stats,False


In [20]:
spark.sql("LIST REFERENCES IN dev_catalog").toPandas()

Unnamed: 0,refType,name,hash
0,Branch,dev,cce1f98c97ccfc4da729cc519ba4f7fbfc06036ffcd6a9...
1,Branch,main,55e9d2a5fd9344ad3ad2a2ef4eb99bd7d8ba12bcdcaa29...


# merge

In [21]:
spark.sql("MERGE BRANCH dev INTO main IN dev_catalog").toPandas()

Unnamed: 0,name,hash
0,main,bc0b2e56e9fe0192ab065a5d516cf7572d2f32038830c6...


In [22]:
spark.sql("LIST REFERENCES IN dev_catalog").toPandas()

Unnamed: 0,refType,name,hash
0,Branch,dev,cce1f98c97ccfc4da729cc519ba4f7fbfc06036ffcd6a9...
1,Branch,main,bc0b2e56e9fe0192ab065a5d516cf7572d2f32038830c6...


In [23]:
spark.sql("USE REFERENCE main IN dev_catalog").toPandas()

Unnamed: 0,refType,name,hash
0,Branch,main,bc0b2e56e9fe0192ab065a5d516cf7572d2f32038830c6...


In [24]:
spark.sql("SHOW TABLES IN dev_catalog").toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,nba,salaries,False
1,nba,totals_stats,False


# new branch

In [27]:
# spark.sql("CREATE BRANCH etl IN dev_catalog FROM main").toPandas()

In [28]:
# add some salaries for Kevin Durant
spark.sql("USE REFERENCE etl IN dev_catalog")
spark.sql(
    """INSERT INTO dev_catalog.nba.salaries VALUES
    ("2017-18", "Golden State Warriors", "$25000000", "Kevin Durant"),
    ("2018-19", "Golden State Warriors", "$30000000", "Kevin Durant"),
    ("2019-20", "Brooklyn Nets", "$37199000", "Kevin Durant"),
    ("2020-21", "Brooklyn Nets", "$39058950", "Kevin Durant")
    """
).toPandas()

In [29]:
# Dropping a column in the `totals_stats` table
spark.sql("ALTER TABLE dev_catalog.nba.totals_stats DROP COLUMN Age").toPandas()

In [30]:
spark.sql("USE REFERENCE main IN dev_catalog").toPandas()
spark.sql("SHOW TABLES IN dev_catalog").toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,nba,salaries,False
1,nba,totals_stats,False


In [31]:
spark.sql("USE REFERENCE etl IN dev_catalog").toPandas()
spark.sql("SHOW TABLES IN dev_catalog").toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,nba,salaries,False
1,nba,totals_stats,False
