In [None]:

// In python use: from pyspark.sql.functions import broadcast, split, lit
import org.apache.spark.sql.functions.{broadcast, split, lit}

// Load the two source CSV files. Each file has a header row, and Spark is asked
// to infer the column types from the data.
// Note: despite the "Bucketed" suffix, these are plain CSV-backed DataFrames;
// nothing is bucketed until they are written out to a table.
val matchesBucketed = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/matches.csv")

val matchDetailsBucketed = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/match_details.csv")

// spark.sql("""DROP TABLE IF EXISTS bootcamp.matches_bucketed""")

// DDL for the Iceberg table bootcamp.matches_bucketed: four columns, partitioned by
// completion_date and by a 16-way bucket of match_id.
val bucketedDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
    match_id STRING,
    is_team_game BOOLEAN,
    playlist_id STRING,
    completion_date TIMESTAMP
)
USING iceberg
PARTITIONED BY (completion_date, bucket(16, match_id));
"""
// Execute the DDL so the target table exists with the declared column types and
// partition spec before the append below. IF NOT EXISTS makes re-running this harmless.
spark.sql(bucketedDDL)

// Append the four table columns from the matches CSV, declaring the same layout as
// the DDL: partitioned by completion_date and bucketed 16 ways on match_id.
matchesBucketed
  .select("match_id", "is_team_game", "playlist_id", "completion_date")
  .write
  .mode("append")
  .partitionBy("completion_date")
  .bucketBy(16, "match_id")
  .saveAsTable("bootcamp.matches_bucketed")

// DDL for the Iceberg table bootcamp.match_details_bucketed: four columns,
// partitioned only by a 16-way bucket of match_id.
val bucketedDetailsDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
    match_id STRING,
    player_gamertag STRING,
    player_total_kills INTEGER,
    player_total_deaths INTEGER
)
USING iceberg
PARTITIONED BY (bucket(16, match_id));
"""
spark.sql(bucketedDetailsDDL)

// Append the four table columns from the match-details CSV, bucketed 16 ways on
// match_id to match the DDL above.
matchDetailsBucketed
  .select("match_id", "player_gamertag", "player_total_kills", "player_total_deaths")
  .write
  .mode("append")
  .bucketBy(16, "match_id")
  .saveAsTable("bootcamp.match_details_bucketed")

// Disable size-based automatic broadcast joins before printing the join plans below.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

//matchesBucketed.createOrReplaceTempView("matches")
//matchDetailsBucketed.createOrReplaceTempView("match_details")

//spark.sql("""
//    SELECT * FROM bootcamp.match_details_bucketed mdb JOIN bootcamp.matches_bucketed md 
//    ON mdb.match_id = md.match_id
//    AND md.completion_date = DATE('2016-01-01')
//        
//""").explain()
//
//
//spark.sql("""
//    SELECT * FROM match_details mdb JOIN matches md ON mdb.match_id = md.match_id    
//""").explain()

// spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "1000000000000")

// val broadcastFromThreshold = matches.as("m").join(matchDetails.as("md"), $"m.match_id" === $"md.match_id")
//   .select($"m.completion_date", $"md.player_gamertag",  $"md.player_total_kills")
//   .take(5)

// val explicitBroadcast = matches.as("m").join(broadcast(matchDetails).as("md"), $"m.match_id" === $"md.match_id")
//   .select($"md.*", split($"completion_date", " ").getItem(0).as("ds"))

// explain() only prints the plan and returns Unit, so each val holds the joined
// DataFrame itself and explain() is invoked as a separate statement.
// NOTE(review): both joins use the same CSV-backed DataFrames and differ only in their
// aliases, so the two printed plans are expected to match. If the goal is to inspect
// the bucketed tables, read them with spark.table(...) instead — confirm intent.
val bucketedValues = matchDetailsBucketed.as("mdb").join(matchesBucketed.as("mb"), $"mb.match_id" === $"mdb.match_id")
bucketedValues.explain()

val values = matchDetailsBucketed.as("m").join(matchesBucketed.as("md"), $"m.match_id" === $"md.match_id")
values.explain()

// explicitBroadcast.write.mode("overwrite").insertInto("match_details_bucketed")

// matches.withColumn("ds", split($"completion_date", " ").getItem(0)).write.mode("overwrite").insertInto("matches_bucketed")

// spark.sql(bucketedSQL)

In [1]:
import org.apache.spark.sql.functions.{broadcast, split, lit}

// Load the two source CSV files. Each file has a header row, and Spark is asked
// to infer the column types from the data.
// Note: despite the "Bucketed" suffix, these are plain CSV-backed DataFrames.
val matchesBucketed = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/matches.csv")

val matchDetailsBucketed = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/match_details.csv")

// Print the first five rows of the matches data as a quick sanity check.
matchesBucketed.show(5)

Intitializing Scala interpreter ...

Spark Web UI available at http://2e8125773246:4041
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1734346756353)
SparkSession available as 'spark'


+--------------------+--------------------+------------+--------------------+--------------------+-------------+-------------------+--------------+---------+--------------------+
|            match_id|               mapid|is_team_game|         playlist_id|     game_variant_id|is_match_over|    completion_date|match_duration|game_mode|      map_variant_id|
+--------------------+--------------------+------------+--------------------+--------------------+-------------+-------------------+--------------+---------+--------------------+
|11de1a94-8d07-416...|c7edbf0f-f206-11e...|        true|f72e0ef0-7c4a-430...|1e473914-46e4-408...|         true|2016-02-22 00:00:00|          NULL|     NULL|                NULL|
|d3643e71-3e51-43e...|cb914b9e-f206-11e...|       false|d0766624-dbd7-453...|257a305e-4dd3-41f...|         true|2016-02-14 00:00:00|          NULL|     NULL|                NULL|
|d78d2aae-36e4-48a...|c7edbf0f-f206-11e...|        true|f72e0ef0-7c4a-430...|1e473914-46e4-408...|       

import org.apache.spark.sql.functions.{broadcast, split, lit}
matchesBucketed: org.apache.spark.sql.DataFrame = [match_id: string, mapid: string ... 8 more fields]
matchDetailsBucketed: org.apache.spark.sql.DataFrame = [match_id: string, player_gamertag: string ... 34 more fields]


In [2]:
// Remove any existing bootcamp.matches_bucketed table; a no-op when the table is absent.
spark.sql(""" DROP TABLE IF EXISTS bootcamp.matches_bucketed """)

res1: org.apache.spark.sql.DataFrame = []


In [3]:
// Remove any existing bootcamp.match_details_bucketed table; a no-op when the table is absent.
spark.sql(""" DROP TABLE IF EXISTS bootcamp.match_details_bucketed """)

res2: org.apache.spark.sql.DataFrame = []


In [4]:
// DDL for the Iceberg table bootcamp.matches_bucketed: four columns, partitioned by
// completion_date and by a 16-way bucket of match_id. IF NOT EXISTS leaves an
// already-present table untouched.
// NOTE(review): completion_date is a TIMESTAMP used as a partition column without a
// transform (e.g. days(...)); confirm that per-timestamp partitioning is intended.
val bucketedDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
    match_id STRING,
    is_team_game BOOLEAN,
    playlist_id STRING,
    completion_date TIMESTAMP
)
USING iceberg
PARTITIONED BY (completion_date, bucket(16, match_id));
"""

// Create the table (if needed) before any data is appended to it.
spark.sql(bucketedDDL)

bucketedDDL: String =
"
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
    match_id STRING,
    is_team_game BOOLEAN,
    playlist_id STRING,
    completion_date TIMESTAMP
)
USING iceberg
PARTITIONED BY (completion_date, bucket(16, match_id));
"
res3: org.apache.spark.sql.DataFrame = []


In [None]:
// Preview the four match columns: cap the input at 1000 rows and print the first 20.
matchesBucketed
  .select("match_id", "is_team_game", "playlist_id", "completion_date")
  .limit(1000)
  .show(20)

In [None]:
// Append at most 1000 rows of the four match columns to bootcamp.matches_bucketed,
// partitioned by completion_date and bucketed 16 ways on match_id.
matchesBucketed
  .select("match_id", "is_team_game", "playlist_id", "completion_date")
  .limit(1000)
  .write
  .mode("append")
  .partitionBy("completion_date")
  .bucketBy(16, "match_id")
  .saveAsTable("bootcamp.matches_bucketed")

In [None]:
// DDL for the Iceberg table bootcamp.match_details_bucketed: four columns,
// partitioned only by a 16-way bucket of match_id. IF NOT EXISTS leaves an
// already-present table untouched.
val bucketedDetailsDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
    match_id STRING,
    player_gamertag STRING,
    player_total_kills INTEGER,
    player_total_deaths INTEGER
)
USING iceberg
PARTITIONED BY (bucket(16, match_id));
"""
// Create the table (if needed) before any data is appended to it.
spark.sql(bucketedDetailsDDL)

In [None]:
// Append the four match-detail columns to bootcamp.match_details_bucketed,
// bucketed 16 ways on match_id.
matchDetailsBucketed
  .select("match_id", "player_gamertag", "player_total_kills", "player_total_deaths")
  .write
  .mode("append")
  .bucketBy(16, "match_id")
  .saveAsTable("bootcamp.match_details_bucketed")

In [None]:
// explain() only prints the plan and returns Unit, so keep the joined DataFrame in
// the val and print its plan as a separate statement.
// NOTE(review): this joins the CSV-backed DataFrames, not the bootcamp.*_bucketed
// tables; use spark.table(...) if the bucketed-table plan is what should be
// inspected — confirm intent.
val bucketedValues = matchDetailsBucketed.as("mdb").join(matchesBucketed.as("mb"), $"mdb.match_id" === $"mb.match_id")
bucketedValues.explain()

In [None]:
// Register the CSV-backed DataFrames as temp views under the names used by the
// second query below.
Seq(
  "matches" -> matchesBucketed,
  "match_details" -> matchDetailsBucketed
).foreach { case (viewName, frame) => frame.createOrReplaceTempView(viewName) }

// Turn off automatic broadcast joins before printing either plan.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

// Plan 1: join of the two bucketed Iceberg tables, restricted to one completion date.
spark.sql("""
    SELECT * FROM bootcamp.match_details_bucketed mdb JOIN bootcamp.matches_bucketed md 
    ON mdb.match_id = md.match_id
    AND md.completion_date = DATE('2016-01-01')
        
""").explain()


// Plan 2: the same match_id join over the temp views, for comparison.
spark.sql("""
    SELECT * FROM match_details mdb JOIN matches md ON mdb.match_id = md.match_id    
""").explain()