In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session named "sesion_1" that uses every
# available core ("local[*]") and requests 5g of driver memory.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "5g")
    .appName("sesion_1")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Display the SparkContext that backs this session as the cell output.
spark.sparkContext

In [None]:
# Import from the public IPython.display module (IPython.core.display is a
# deprecated import path) and import display explicitly instead of relying on
# the kernel injecting it.
from IPython.display import HTML, display

# Inject CSS so <pre> blocks keep their whitespace and are not wrapped, which
# keeps wide text tables aligned in the notebook output.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [None]:
# Display the URL of the Spark web UI for this application.
spark.sparkContext.uiWebUrl

In [None]:
import pyspark.sql.functions as f
import pyspark.sql.types as t

In [None]:
# Load the two parquet datasets that the rest of the notebook works with.
clients_df = spark.read.parquet("../../resources/data/parquet/big_clients")
contracts_df = spark.read.parquet("../../resources/data/parquet/big_contracts")

# Preview the first two rows of each DataFrame.
contracts_df.show(n=2)
clients_df.show(n=2)

#### spark.sql.adaptive.enabled
##### Default value: true since Apache Spark 3.2.0.
Adaptive Query Execution (AQE) is an optimization technique in Spark SQL that makes use of the runtime statistics to choose the most efficient query execution plan.

In [None]:
# Turn Adaptive Query Execution off for the queries that follow.
spark.conf.set("spark.sql.adaptive.enabled", "false")

In [None]:
# Group contracts by cod_titular, compute the max and min fec_alta per group,
# and count the resulting rows (the count is shown as the cell output).
(
    contracts_df
    .groupBy("cod_titular")
    .agg(f.max("fec_alta"), f.min("fec_alta"))
    .count()
)

In [None]:
# Turn Adaptive Query Execution back on for the queries that follow.
spark.conf.set("spark.sql.adaptive.enabled", "true")

In [None]:
# Same aggregation as before the AQE switch: max/min fec_alta per cod_titular,
# then count the grouped rows (the count is shown as the cell output).
(
    contracts_df
    .groupBy("cod_titular")
    .agg(f.max("fec_alta"), f.min("fec_alta"))
    .count()
)

#### spark.sql.autoBroadcastJoinThreshold
##### Default value: 10485760 (10 MB)
Configures the maximum size in bytes for a table that will be broadcast to all worker nodes when performing a join. By setting this value to -1, broadcasting can be disabled.

In [None]:
# Turn Adaptive Query Execution off for the joins that follow.
spark.conf.set("spark.sql.adaptive.enabled", "false")

In [None]:
# A threshold of -1 disables automatic broadcast joins.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

In [None]:
# Print the physical plan of the contracts/clients join on "id".
(
    contracts_df
    .join(clients_df, on=["id"])
    .explain()
)

In [None]:
# Join contracts with clients on "id" and write the result as parquet,
# overwriting any previous output at this path.
(
    contracts_df
    .join(clients_df, on=["id"])
    .write
    .mode("overwrite")
    .parquet("../../resources/data/parquet/t_broadcast_join")
)

In [None]:
# 3 MiB expressed in bytes (evaluates to 3145728).
3*1024*1024

In [None]:
# Set the automatic broadcast threshold to 3145728 bytes (3 * 1024 * 1024).
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "3145728")

In [None]:
# Re-run the join on "id" and overwrite the parquet output at the same path.
(
    contracts_df
    .join(clients_df, on=["id"])
    .write
    .mode("overwrite")
    .parquet("../../resources/data/parquet/t_broadcast_join")
)

#### spark.sql.adaptive.autoBroadcastJoinThreshold
##### Default value: Same as spark.sql.autoBroadcastJoinThreshold
Configures the maximum size in bytes for a table that will be broadcast to all worker nodes when performing a join. Setting this value to -1 disables broadcasting. The default value is the same as spark.sql.autoBroadcastJoinThreshold. Note that this setting is only used by the adaptive framework (AQE).

In [None]:
# Turn Adaptive Query Execution on; the adaptive broadcast threshold only applies under AQE.
spark.conf.set("spark.sql.adaptive.enabled", "true")

In [None]:
# Display the current value of the adaptive broadcast join threshold.
spark.conf.get("spark.sql.adaptive.autoBroadcastJoinThreshold")

In [None]:
# Run the join on "id" with AQE enabled and overwrite the parquet output.
(
    contracts_df
    .join(clients_df, on=["id"])
    .write
    .mode("overwrite")
    .parquet("../../resources/data/parquet/t_broadcast_join")
)

In [None]:
# Set the adaptive broadcast threshold to 2097152 bytes (2 * 1024 * 1024).
spark.conf.set("spark.sql.adaptive.autoBroadcastJoinThreshold", "2097152")

In [None]:
# Run the join on "id" again after changing the adaptive threshold and
# overwrite the parquet output.
(
    contracts_df
    .join(clients_df, on=["id"])
    .write
    .mode("overwrite")
    .parquet("../../resources/data/parquet/t_broadcast_join")
)

#### spark.sql.broadcastTimeout
##### Default value: 300
Timeout in seconds for the broadcast wait time in broadcast joins

      "java.util.concurrent.TimeoutException: Futures timed out after [300 seconds] at"

This error is fixed by increasing the value of spark.sql.broadcastTimeout, for example: spark.conf.set("spark.sql.broadcastTimeout", "3600")

#### spark.default.parallelism
##### Default number of partitions in **RDDs** returned by transformations like join, reduceByKey, and parallelize when not set by user.
For distributed shuffle operations like reduceByKey and join, the default is the largest number of partitions in a parent RDD. For operations like parallelize with no parent RDDs, it depends on the cluster manager:
- Local mode: number of cores on the local machine
- Mesos fine grained mode: 8
- Others: total number of cores on all executor nodes or 2, whichever is larger

In [None]:
# Read the text file as an RDD of lines and display how many partitions it has.
data = spark.sparkContext.textFile("../../resources/data/txt/green_eggs_and_ham.txt")
data.getNumPartitions()

In [None]:
# Word count over the text RDD: replace each double space with a single one,
# upper-case the line, split on single spaces, pair every token with 1 and sum
# the ones per token. The number of distinct tokens is the cell output.
result = (
    data
    .flatMap(lambda text_line: text_line.replace("  ", " ").upper().split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)
result.count()

In [None]:
# Display the number of partitions of the reduceByKey output.
result.getNumPartitions()

In [None]:
# Try to change spark.default.parallelism on the already-running session.
# NOTE(review): the comment on the next word-count cell states this has no
# effect once the Spark session exists.
spark.conf.set("spark.default.parallelism", 17) 

In [None]:
# Same word count as before: replace double spaces, upper-case, split on
# single spaces, pair every token with 1 and sum per token.
result = (
    data
    .flatMap(lambda text_line: text_line.replace("  ", " ").upper().split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)
# spark.default.parallelism has no effect once the Spark session has been created.
result.count()

In [None]:
# Display the number of partitions of the reduceByKey output.
result.getNumPartitions()

In [None]:
from pyspark.sql import SparkSession

# spark.default.parallelism has to be supplied when the session is built.
# getOrCreate() returns the already-running session (and its SparkContext) if
# one exists, in which case the new value would not reach RDD operations.
# Stop the existing session first so a fresh one is created with the setting.
if "spark" in globals():
    spark.stop()

spark = (
    SparkSession.builder
    .config("spark.driver.memory", "5g")
    .config("spark.default.parallelism", "17")
    .appName("sesion_1")
    .master("local[*]")
    .getOrCreate()
)

# DataFrames created from the stopped session are no longer usable; reload them
# from the same parquet paths so the cells further down keep working.
contracts_df = spark.read.parquet("../../resources/data/parquet/big_contracts")
clients_df = spark.read.parquet("../../resources/data/parquet/big_clients")

In [None]:
# Read the text file again, this time asking for at least 8 partitions, and
# display the resulting partition count.
data = spark.sparkContext.textFile(
    "../../resources/data/txt/green_eggs_and_ham.txt",
    minPartitions=8,
)
data.getNumPartitions()

In [None]:
# Word count over the text RDD: replace each double space with a single one,
# upper-case the line, split on single spaces, pair every token with 1 and sum
# the ones per token. The number of distinct tokens is the cell output.
result = (
    data
    .flatMap(lambda text_line: text_line.replace("  ", " ").upper().split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)
result.count()

In [None]:
# Display the number of partitions of the reduceByKey output.
result.getNumPartitions()

In [None]:
# Distribute the integers 0..99 as an RDD (no parent RDD) and count them.
first_hundred = list(range(100))
data = spark.sparkContext.parallelize(first_hundred)
data.count()

In [None]:
# Display how many partitions the parallelized RDD was given.
data.getNumPartitions()

In [None]:
# Display the value the session reports for spark.default.parallelism.
spark.conf.get("spark.default.parallelism")

#### spark.sql.shuffle.partitions
##### Default value: 200
The default number of partitions to use when shuffling data for joins or aggregations. Used in DataFrames or DataSets.

In [None]:
# Turn off AQE and automatic broadcast joins (-1 disables broadcasting) for
# the joins in the following cells.
spark.conf.set("spark.sql.adaptive.enabled", "false")
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

In [None]:
# Join contracts with clients on "id" and count the joined rows.
(
    contracts_df
    .join(clients_df, on=["id"])
    .count()
)

In [None]:
# Use 100 shuffle partitions for subsequent joins and aggregations.
spark.conf.set("spark.sql.shuffle.partitions","100")

In [None]:
# Repeat the join on "id" and count the joined rows after changing
# spark.sql.shuffle.partitions.
(
    contracts_df
    .join(clients_df, on=["id"])
    .count()
)

In [None]:
# Display the current value of spark.sql.shuffle.partitions.
spark.conf.get("spark.sql.shuffle.partitions")

#### spark.sql.files.maxPartitionBytes
##### Default value: 134217728 (128 MB)
The maximum number of bytes to pack into a single partition when reading files. This configuration is effective only when using file-based sources such as Parquet, JSON and ORC.

In [None]:
# Join contracts with clients on "id" and count the joined rows.
(
    contracts_df
    .join(clients_df, on=["id"])
    .count()
)

In [None]:
# Display the current value of spark.sql.files.maxPartitionBytes.
spark.conf.get("spark.sql.files.maxPartitionBytes")

In [None]:
# 128 MiB expressed in bytes (evaluates to 134217728).
128*1024*1024

#### spark.sql.adaptive.skewJoin.enabled
##### Default value: true
When true and spark.sql.adaptive.enabled is true, Spark dynamically handles skew in sort-merge join by splitting (and replicating if needed) skewed partitions.

https://spark.apache.org/docs/latest/sql-performance-tuning.html#optimizing-skew-join

In [None]:
# Enable AQE and disable broadcast joins through both the static and the
# adaptive threshold (-1 disables broadcasting).
for conf_key, conf_value in (
    ("spark.sql.adaptive.enabled", "true"),
    ("spark.sql.autoBroadcastJoinThreshold", "-1"),
    ("spark.sql.adaptive.autoBroadcastJoinThreshold", "-1"),
):
    spark.conf.set(conf_key, conf_value)

In [None]:
# Show how many contract rows exist per cod_titular value.
(
    contracts_df
    .groupBy("cod_titular")
    .count()
    .show()
)

In [None]:
# Take a fixed-seed sample of clients (no replacement, fraction 0.00003) and
# show the row count per cod_client value.
(
    clients_df
    .sample(withReplacement=False, fraction=0.00003, seed=0)
    .groupBy("cod_client")
    .count()
    .show()
)

In [None]:
# Print the plan for joining contracts with the sampled clients (minus their
# "id" column) on cod_titular == cod_client.
(
    contracts_df
    .join(
        clients_df.sample(withReplacement=False, fraction=0.00003, seed=0).drop("id"),
        f.col("cod_titular") == f.col("cod_client"),
    )
    .explain()
)

In [None]:
# Join contracts with the sampled clients (minus their "id" column) on
# cod_titular == cod_client and write the result as parquet, overwriting any
# previous output at this path.
(
    contracts_df
    .join(
        clients_df.sample(withReplacement=False, fraction=0.00003, seed=0).drop("id"),
        f.col("cod_titular") == f.col("cod_client"),
    )
    .write
    .mode("overwrite")
    .parquet("../../resources/data/parquet/t_skew_join")
)

In [None]:
# Enable AQE skew-join handling and set its factor, byte threshold and the
# advisory partition size used by the adaptive framework.
for conf_key, conf_value in (
    ("spark.sql.adaptive.skewJoin.enabled", "true"),
    ("spark.sql.adaptive.skewJoin.skewedPartitionFactor", "2.0"),
    ("spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes", "50m"),
    ("spark.sql.adaptive.advisoryPartitionSizeInBytes", "10m"),
):
    spark.conf.set(conf_key, conf_value)

In [None]:
# Re-run the join of contracts with the sampled clients on
# cod_titular == cod_client and overwrite the parquet output.
(
    contracts_df
    .join(
        clients_df.sample(withReplacement=False, fraction=0.00003, seed=0).drop("id"),
        f.col("cod_titular") == f.col("cod_client"),
    )
    .write
    .mode("overwrite")
    .parquet("../../resources/data/parquet/t_skew_join")
)

#### spark.sql.adaptive.coalescePartitions.enabled
##### Default value: true
When true and spark.sql.adaptive.enabled is true, Spark will coalesce contiguous shuffle partitions according to the target size (specified by spark.sql.adaptive.advisoryPartitionSizeInBytes), to avoid too many small tasks.

https://spark.apache.org/docs/latest/sql-performance-tuning.html#coalescing-post-shuffle-partitions

In [None]:
# Re-read the contracts dataset from the same parquet path as the initial load.
contracts_df = spark.read.parquet("../../resources/data/parquet/big_contracts")

In [None]:
# With AQE disabled, repartition the contracts by "id" and write them out.
spark.conf.set("spark.sql.adaptive.enabled", "false")

# NOTE(review): the output path is t_skew_join, the same one the skew-join
# cells write to -- confirm this overwrite is intended.
(
    contracts_df
    .repartition("id")
    .write
    .mode("overwrite")
    .parquet("../../resources/data/parquet/t_skew_join")
)

In [None]:
# Display whether AQE partition coalescing is currently enabled.
spark.conf.get("spark.sql.adaptive.coalescePartitions.enabled")

In [None]:
# With AQE enabled, repartition the contracts by "id" and overwrite the
# parquet output.
spark.conf.set("spark.sql.adaptive.enabled", "true")
(
    contracts_df
    .repartition("id")
    .write
    .mode("overwrite")
    .parquet("../../resources/data/parquet/t_skew_join")
)

In [None]:
# Turn off coalescePartitions.parallelismFirst and set the advisory partition
# size to 64m.
spark.conf.set("spark.sql.adaptive.coalescePartitions.parallelismFirst", "false")
spark.conf.set("spark.sql.adaptive.advisoryPartitionSizeInBytes", "64m")

In [None]:
# Repartition the contracts by "id" again under the current AQE settings and
# overwrite the parquet output.
(
    contracts_df
    .repartition("id")
    .write
    .mode("overwrite")
    .parquet("../../resources/data/parquet/t_skew_join")
)