In [1]:

// Start from a clean slate so the cells below can be re-run.
// IF EXISTS makes this a no-op instead of an error when the table has not been created yet
// (e.g. the first run against a fresh catalog).
spark.sql("DROP TABLE IF EXISTS bootcamp.matches_bucketed")

Intitializing Scala interpreter ...

Spark Web UI available at http://2e8125773246:4041
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1734348854315)
SparkSession available as 'spark'


res0: org.apache.spark.sql.DataFrame = []


In [2]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col}
import org.apache.spark.storage.StorageLevel

// Build (or fetch) the session, applying each setting below to the builder in order.
// NOTE(review): this notebook already has a session bound to `spark` before this cell runs,
// so getOrCreate() presumably hands back that one; launch-time settings such as
// driver/executor memory and dynamic allocation may therefore not take effect here — confirm.
val spark = {
  val settings = Seq(
    "spark.executor.memory" -> "4g",
    "spark.driver.memory" -> "4g",
    // Number of partitions used for shuffles
    "spark.sql.shuffle.partitions" -> "200",
    // 134217728 bytes = 128 MB per file-scan partition
    "spark.sql.files.maxPartitionBytes" -> "134217728",
    // -1 turns automatic broadcast joins off
    "spark.sql.autoBroadcastJoinThreshold" -> "-1",
    // Let the executor count vary between 1 and 50
    "spark.dynamicAllocation.enabled" -> "true",
    "spark.dynamicAllocation.minExecutors" -> "1",
    "spark.dynamicAllocation.maxExecutors" -> "50"
  )

  settings
    .foldLeft(SparkSession.builder().appName("IcebergTableManagement")) {
      case (builder, (key, value)) => builder.config(key, value)
    }
    .getOrCreate()
}


// Load the raw matches CSV: the first line is the header and column types are inferred
val matchesBucketedselect = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/matches.csv")

// Pull every distinct completion_date back to the driver as an Array[Row]
val distinctDates = matchesBucketedselect
  .select(col("completion_date"))
  .distinct()
  .collect()

// Create the Iceberg table if it doesn't exist.
// The table declares four columns (match_id, is_team_game, playlist_id, completion_date),
// so data written to it has to be projected down to exactly these.
// Partitioning: by the completion_date value, and by a 16-way bucket transform on match_id.
val bucketedDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
    match_id STRING,
    is_team_game BOOLEAN,
    playlist_id STRING,
    completion_date TIMESTAMP
)
USING iceberg
PARTITIONED BY (completion_date, bucket(16, match_id))
"""
// Run the DDL; IF NOT EXISTS makes this a no-op when the table is already there
spark.sql(bucketedDDL)

// Process data in chunks based on completion_date: one append per distinct date.
distinctDates.foreach { row =>
  // getAs yields null when the source rows had no completion_date
  val date = row.getAs[java.sql.Timestamp]("completion_date")

  // <=> is null-safe equality. With ===, a null date compares as "unknown" against every
  // row, the filter keeps nothing, and rows without a completion_date would be silently
  // left out of the table.
  val matchesForDate = matchesBucketedselect
    .filter(col("completion_date") <=> date)
    // Keep only the columns declared on the target table
    .select($"match_id", $"is_team_game", $"playlist_id", $"completion_date")
    // Co-locate rows of the same match before writing
    .repartition(16, $"match_id")

  // Deliberately not persisted: the chunk is consumed exactly once by the write below,
  // and caching it without a matching unpersist() left cached data behind for every date.
  matchesForDate.write
    .mode("append")
    .bucketBy(16, "match_id")
    .partitionBy("completion_date")
    .saveAsTable("bootcamp.matches_bucketed")
}

// Verify the data in the table
val result = spark.sql("SELECT * FROM bootcamp.matches_bucketed")
result.show()


+--------------------+------------+--------------------+-------------------+
|            match_id|is_team_game|         playlist_id|    completion_date|
+--------------------+------------+--------------------+-------------------+
|4a7fcf11-1d90-4c9...|        true|2323b76a-db98-4e0...|2016-09-22 00:00:00|
|438ab2bf-8ee9-400...|        true|2323b76a-db98-4e0...|2016-09-22 00:00:00|
|c103c17f-955d-49b...|        true|892189e9-d712-4bd...|2016-09-21 00:00:00|
|c3f935c6-0a56-498...|        true|c98949ae-60a8-43d...|2016-09-28 00:00:00|
|800a835c-aac3-425...|        true|f72e0ef0-7c4a-430...|2016-07-16 00:00:00|
|91565f91-93cd-46d...|        true|f72e0ef0-7c4a-430...|2016-07-16 00:00:00|
|f53a0b04-ef68-442...|        true|892189e9-d712-4bd...|2016-09-21 00:00:00|
|9ac645d1-4eb0-424...|        true|2323b76a-db98-4e0...|2016-09-22 00:00:00|
|de780c26-bb7a-48b...|        true|892189e9-d712-4bd...|2016-09-28 00:00:00|
|03906291-1ac6-40e...|        true|892189e9-d712-4bd...|2016-09-28 00:00:00|

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.storage.StorageLevel
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@625e59c9
matchesBucketedselect: org.apache.spark.sql.DataFrame = [match_id: string, mapid: string ... 8 more fields]
distinctDates: Array[org.apache.spark.sql.Row] = Array([2016-03-13 00:00:00.0], [2016-03-11 00:00:00.0], [2016-03-10 00:00:00.0], [2016-01-30 00:00:00.0], [2016-03-27 00:00:00.0], [2016-04-10 00:00:00.0], [2016-01-18 00:00:00.0], [2016-02-01 00:00:00.0], [2015-12-14 00:00:00.0], [2016-02-03 00:00:00.0], [2016-04-30 00:00:00.0], [2016-03-05 00:00:00.0], [2016-04-15 00:00:00.0], [2016-05-21 00:00:00.0], [2015-10-31 00:00:00.0], [2016-01-22 00:00:00.0], [2016-02-09 00:00:00...


In [3]:
spark.sql("SELECT COUNT(1) as num_files FROM bootcamp.matches_bucketed.files").show()

+---------+
|num_files|
+---------+
|     3665|
+---------+



In [4]:
// Create the Iceberg table holding per-player match details if it doesn't exist.
// It is partitioned by a 16-way bucket transform on match_id — the same bucket count
// that bootcamp.matches_bucketed uses for match_id.
val bucketedDetailsDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
    match_id STRING,
    player_gamertag STRING,
    player_total_kills INTEGER,
    player_total_deaths INTEGER
)
USING iceberg
PARTITIONED BY (bucket(16, match_id));
"""
// Run the DDL; IF NOT EXISTS makes this a no-op when the table is already there
spark.sql(bucketedDetailsDDL)

bucketedDetailsDDL: String =
"
CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
    match_id STRING,
    player_gamertag STRING,
    player_total_kills INTEGER,
    player_total_deaths INTEGER
)
USING iceberg
PARTITIONED BY (bucket(16, match_id));
"
res3: org.apache.spark.sql.DataFrame = []


In [6]:
// Load the raw match_details CSV: the first line is the header and column types are inferred
val matchDetailsBucketed = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/match_details.csv")

// Project down to the four columns declared on the target table, then append them to it,
// bucketed 16 ways on match_id
matchDetailsBucketed
  .select("match_id", "player_gamertag", "player_total_kills", "player_total_deaths")
  .write
  .mode("append")
  .bucketBy(16, "match_id")
  .saveAsTable("bootcamp.match_details_bucketed")

matchDetailsBucketed: org.apache.spark.sql.DataFrame = [match_id: string, player_gamertag: string ... 34 more fields]


In [8]:
// Disable broadcast joins
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

// Raw matches CSV, exposed to SQL as the temp view `matches`
val matchesBucketed = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/matches.csv")
matchesBucketed.createOrReplaceTempView("matches")

// Raw match_details CSV, exposed to SQL as the temp view `match_details`
val matchDetailsBucketed = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/match_details.csv")
matchDetailsBucketed.createOrReplaceTempView("match_details")

// Print the physical plan of each join, in order:
//   1. the two bucketed Iceberg tables, restricted to a single completion_date
//   2. the same join over the un-bucketed CSV temp views
// NOTE(review): the plan recorded for query 1 still contains an
// `Exchange hashpartitioning(match_id, 200)` on both sides, i.e. both tables are shuffled.
// Iceberg's storage-partitioned-join settings may be needed to avoid that — confirm against
// the Iceberg Spark configuration docs.
Seq(
  """
    SELECT * FROM bootcamp.match_details_bucketed mdb JOIN bootcamp.matches_bucketed md 
    ON mdb.match_id = md.match_id
    AND md.completion_date = DATE('2016-01-01')
        
""",
  """
    SELECT * FROM match_details mdb JOIN matches md ON mdb.match_id = md.match_id    
"""
).foreach { query =>
  spark.sql(query).explain()
}

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [match_id#37190], [match_id#37194], Inner
   :- Sort [match_id#37190 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(match_id#37190, 200), ENSURE_REQUIREMENTS, [plan_id=16081]
   :     +- BatchScan demo.bootcamp.match_details_bucketed[match_id#37190, player_gamertag#37191, player_total_kills#37192, player_total_deaths#37193] demo.bootcamp.match_details_bucketed (branch=null) [filters=match_id IS NOT NULL, groupedBy=] RuntimeFilters: []
   +- Sort [match_id#37194 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(match_id#37194, 200), ENSURE_REQUIREMENTS, [plan_id=16082]
         +- BatchScan demo.bootcamp.matches_bucketed[match_id#37194, is_team_game#37195, playlist_id#37196, completion_date#37197] demo.bootcamp.matches_bucketed (branch=null) [filters=completion_date IS NOT NULL, completion_date = 1451606400000000, match_id IS NOT NULL, groupedBy=] RuntimeFilters: []


== Physical Plan ==


matchesBucketed: org.apache.spark.sql.DataFrame = [match_id: string, mapid: string ... 8 more fields]
matchDetailsBucketed: org.apache.spark.sql.DataFrame = [match_id: string, player_gamertag: string ... 34 more fields]
