In [None]:
import org.apache.spark.sql._
import scala.sys.process._

In [None]:
// Configure a local Spark session that knows about Iceberg.
// See https://iceberg.apache.org/docs/1.6.0/spark-configuration/ & https://iceberg.apache.org/docs/1.6.0/spark-getting-started/
// The catalog is registered under the name "local", uses the "hadoop" (aka file) catalog type,
// and keeps its tables under /high-performance-spark-examples/warehouse.
val spark = {
  // Iceberg SQL extensions plus the definition of the "local" catalog, applied in this order.
  val icebergSettings = Seq(
    "spark.sql.extensions" -> "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.catalog.local" -> "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.local.type" -> "hadoop",
    "spark.sql.catalog.local.warehouse" -> "/high-performance-spark-examples/warehouse"
  )
  // Thread every setting through the builder before asking for the session.
  val configuredBuilder = icebergSettings.foldLeft(SparkSession.builder.master("local[*]")) {
    case (builder, (key, value)) => builder.config(key, value)
  }
  configuredBuilder.getOrCreate()
}
import spark._

In [None]:
// Load the 2021 CSV export; the first row is treated as the header and column types are inferred.
val df = {
  val fetched2021 = "/high-performance-spark-examples/data/fetched/2021"
  spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(fetched2021)
}

In [None]:
// Drop the existing table if present so the write in the next cell starts from scratch
spark.sql("DROP TABLE IF EXISTS local.uk_gender_pay_data")

In [None]:
// Write the data out as a table named uk_gender_pay_data in the "local" catalog configured on the session
df.write.saveAsTable("local.uk_gender_pay_data")

In [None]:
// Shell out to list the table's metadata directory inside the warehouse; `!!` runs the command and returns its output
"ls /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/".!!

In [None]:
// Dump the v1 metadata JSON file so its contents can be read in the notebook
"cat  /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/v1.metadata.json".!!

In [None]:
// Java SDK time imports
import java.util.HashMap
import java.util.Map

import org.apache.iceberg.Table
import org.apache.iceberg.catalog.TableIdentifier
import org.apache.iceberg.hadoop.HadoopCatalog


// And to handle java types
import scala.jdk.CollectionConverters._

In [None]:
// Open the same warehouse directory directly through Iceberg's Java API, reusing Spark's Hadoop configuration
val catalog = new HadoopCatalog(spark.sparkContext.hadoopConfiguration, "/high-performance-spark-examples/warehouse")

In [None]:
// Identifier of the table written above (no namespace component is given)
val name = TableIdentifier.of("uk_gender_pay_data")

In [None]:
// Load a handle to the table through the Java-API catalog; later cells pass this handle to Iceberg actions
val table = catalog.loadTable(name)

In [None]:
// Materialise the table's snapshot history (a Java Iterable) as a Scala List and echo it.
val snapshots = table.snapshots().asScala.toList
snapshots

In [None]:
// Keep the id of the first listed snapshot so later cells can time travel back to it.
// Fail with a descriptive message rather than an index error if no snapshot is listed.
val snapshot = snapshots.headOption
  .map(_.snapshotId())
  .getOrElse(throw new IllegalStateException("uk_gender_pay_data has no snapshots to time travel to"))

In [None]:
// The same snapshot history is available through SQL via the table's "snapshots" metadata table.
val altSnapshotQuery = spark.sql("SELECT * FROM local.uk_gender_pay_data.snapshots")
altSnapshotQuery.show()

In [None]:
// Extract the snapshot_id column of the first returned row as a Long (rather than keeping the
// enclosing Row), so the value is directly comparable with the id obtained through the Java API.
val altSnapshotId = spark.sql("SELECT snapshot_id FROM local.uk_gender_pay_data.snapshots").first().getLong(0)

In [None]:
// Show up to five rows that have no responsibleperson value
spark.sql("SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5").show()

In [None]:
// We can also list snapshots with a select on the "snapshots" metadata table
spark.sql("SELECT * FROM local.uk_gender_pay_data.snapshots").show()

In [None]:
// And the files!
// The "files" metadata table can be queried with a select in the same way
spark.sql("SELECT * FROM local.uk_gender_pay_data.files").show()

In [None]:
// Let's take a quick look at the rows we are about to delete
spark.sql("SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5").show()

In [None]:
// Delete every row that has no responsibleperson value
spark.sql("DELETE FROM local.uk_gender_pay_data WHERE isnull(responsibleperson)")

In [None]:
// Make sure the data is gone
spark.sql("SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5").show()

In [None]:
// Yay! OK, now let's travel back in time: query the table as of the snapshot id captured before the delete.
// The plain `s` interpolator is used because no printf-style formatting is needed.
spark.sql(s"SELECT * FROM local.uk_gender_pay_data VERSION AS OF ${snapshot} WHERE isnull(responsibleperson) LIMIT 5").show()

In [None]:
// Or the same snapshot with option + DF syntax (this shows the table as of that snapshot, without the null filter)
spark.read.option("snapshot-id", snapshot.toString).table("local.uk_gender_pay_data").show()

In [None]:
// List the table's data files again
spark.sql("SELECT * FROM local.uk_gender_pay_data.files").show()

In [None]:
// Start from a clean slate for the postcode-partitioned copy of the table
spark.sql("DROP TABLE IF EXISTS local.uk_gender_pay_data_postcode")

In [None]:
// Write the data out
// Expose the DataFrame to SQL as a temporary view; createOrReplaceTempView is the
// non-deprecated replacement for registerTempTable.
df.createOrReplaceTempView("temp_table")
// We could use the table write semantics but we can't do truncate() on that
spark.sql("CREATE TABLE local.uk_gender_pay_data_postcode USING iceberg PARTITIONED BY (truncate(1, PostCode)) AS select * from temp_table")

In [None]:
// Inspect the data files of the partitioned table
spark.sql("SELECT * FROM local.uk_gender_pay_data_postcode.files").show()

In [None]:
// Read the CSV exports for 2022 and 2023, one DataFrame per year, with header and schema inference enabled.
val year_dfs = (2022 to 2023).map { year =>
  spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(s"/high-performance-spark-examples/data/fetched/${year}")
}

In [None]:
// Append every extra year's DataFrame to both the unpartitioned and the postcode-partitioned table.
// Tables are the outer loop and years the inner loop, so each table receives all years in turn;
// every append is its own write.
for {
  targetTable <- List("local.uk_gender_pay_data", "local.uk_gender_pay_data_postcode")
  yearDf <- year_dfs
} yearDf.write.mode("append").saveAsTable(targetTable)

In [None]:
// Record the data-file listing before compaction. The absolute warehouse path (the one the
// session's catalog is configured with) is used so the listing does not depend on the
// notebook kernel's working directory.
val uncompacted_file_list = "ls -alh /high-performance-spark-examples/warehouse/uk_gender_pay_data/data/".!!

In [None]:
// Same for the metadata directory before compaction
val uncompacted_metadata_file_list = "ls -alh /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/".!!

In [None]:
// Data files of the table after the appends
spark.sql("SELECT * FROM local.uk_gender_pay_data.files").show()

In [None]:
// Snapshot history of the table after the delete and the appends
spark.sql("SELECT * FROM local.uk_gender_pay_data.snapshots").show()

In [None]:
import org.apache.iceberg.spark.actions.SparkActions
// Iceberg actions
import org.apache.iceberg.actions.Action

In [None]:
// So far the logging has been verbose but interesting; for the next stages it is too much.
// Go through the session built in this notebook instead of relying on a kernel-provided `sc` binding.
spark.sparkContext.setLogLevel("ERROR")

In [None]:
// Compact the table's data files, targeting 512 MiB files and rewriting every file.
// `table` was loaded through a separate catalog instance before the DELETE and the appends were
// committed via Spark SQL, so reload its metadata first; otherwise the action may plan against
// an out-of-date snapshot (NOTE(review): confirm against the Iceberg version in use).
table.refresh()
SparkActions.get()
  .rewriteDataFiles(table)
  .option("target-file-size-bytes", (512L * 1024L * 1024L).toString)
  .option("rewrite-all", "true")
  .execute()

In [None]:
// Record the data-file listing after compaction, using the absolute warehouse path so the
// listing does not depend on the notebook kernel's working directory.
val compacted_file_list = "ls -alh /high-performance-spark-examples/warehouse/uk_gender_pay_data/data/".!!

In [None]:
// Remove the old snapshots
// Snapshots older than "now" are requested for expiry, while retainLast(4) asks to keep the most recent four
// This is really slow in local mode
SparkActions.get().expireSnapshots(table).expireOlderThan(System.currentTimeMillis()).retainLast(4).execute()

In [None]:
// Remove the orphaned files
// NOTE(review): run with default options; Iceberg presumably only considers files older than a default age threshold — confirm
SparkActions.get().deleteOrphanFiles(table).execute()

In [None]:
// Record the data-file listing after snapshot expiry and orphan-file cleanup, using the absolute
// warehouse path so the listing does not depend on the notebook kernel's working directory.
val cleaned_and_compacted_file_list = "ls /high-performance-spark-examples/warehouse/uk_gender_pay_data/data/".!!

In [None]:
// Remove old snapshots

// But we also need to rewrite the files


In [None]:
// Final look at the table's data files after compaction and cleanup
spark.sql("SELECT * FROM local.uk_gender_pay_data.files").show()

In [None]:
// Let's go take a look at a quick side-by-side test
//cd /high-performance-spark-examples/spark-upgrade/;./e2e_demo/scala/run_demo.sh