In [None]:
%%HTML <style>pre { white-space: pre !important; }</style>

In [None]:
import org.apache.spark.sql.SparkSession

// Local Spark session for this notebook, running with master "local[*]".
val spark = SparkSession
    .builder()
    .appName("sesion_1")
    .master("local[*]")
    .getOrCreate()

// Turn adaptive query execution off for the session
// (presumably so plans and partition counts shown below are not rewritten at runtime — confirm).
spark.conf.set("spark.sql.adaptive.enabled", false)

In [None]:
// A threshold of -1 disables automatic broadcast joins for this session.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

In [None]:
spark.sparkContext.uiWebUrl

In [None]:
import org.apache.spark.sql.{functions => f}
import org.apache.spark.sql.{types => t}

In [None]:
// Contracts input; `cod_titular` is renamed to `cod_client` so it shares its name with the
// join key used later against the clients data.
val bigContractsDf = spark.read
    .parquet("../../resources/data/parquet/big_contracts")
    .withColumnRenamed("cod_titular", "cod_client")

// Clients input, read as-is.
val bigClientsDf = spark.read
    .parquet("../../resources/data/parquet/big_clients")

// Peek at one row of each input.
bigContractsDf.show(numRows = 1)
bigClientsDf.show(numRows = 1)

In [None]:
// Row counts of both inputs.
// println takes a single argument; passing two relies on deprecated argument
// auto-tupling and prints a tuple such as "(Contracts:,123)". Build one string instead.
println(s"Contracts: ${bigContractsDf.count()}")
println(s"Clients: ${bigClientsDf.count()}")

### Coalesce y Repartition

##### Número de particiones

In [None]:
println(bigContractsDf.rdd.getNumPartitions)
println(bigClientsDf.rdd.getNumPartitions)

In [None]:
bigClientsDf.groupBy(f.spark_partition_id()).count().show()

In [None]:
bigContractsDf.groupBy(f.spark_partition_id()).count().show()

In [None]:
bigContractsDf.join(bigClientsDf, Seq("id")).rdd.getNumPartitions

In [None]:
bigContractsDf.join(bigClientsDf, Seq("id")).explain()

In [None]:
bigContractsDf.join(bigClientsDf.sample(false, 0.0001), "id").count()

##### repartition -> Aumenta o disminuye el número de particiones (siempre provoca un shuffle)

In [None]:
bigContractsDf.repartition(16).explain()

In [None]:
bigContractsDf.repartition(16).groupBy(f.spark_partition_id()).count().show(20)

In [None]:
bigContractsDf.repartition(f.col("cod_client")).explain()

In [None]:
bigContractsDf.repartition(30, f.col("cod_client")).explain()

In [None]:
bigContractsDf.select("cod_client").distinct().count()

In [None]:
bigContractsDf.repartition(30, f.col("cod_client")).groupBy(f.spark_partition_id()).count().show()

In [None]:
bigContractsDf.repartition(30, f.col("cod_client")).groupBy(f.spark_partition_id(), f.col("cod_client")).count().show()

In [None]:
bigContractsDf.repartition(30, f.col("id")).groupBy(f.spark_partition_id(), f.col("id")).count().show()

In [None]:
bigContractsDf.repartition(30, f.col("id")).groupBy(f.spark_partition_id()).count().show()

In [None]:
bigContractsDf.repartition(30, f.col("id")).write.mode("overwrite").parquet("../../resources/data/parquet/t_repartition")

##### coalesce -> Únicamente disminuye el número de particiones; puede provocar DATA SKEW

In [None]:
bigContractsDf.coalesce(9).rdd.getNumPartitions

In [None]:
bigContractsDf.repartition(f.col("cod_client")).groupBy(f.spark_partition_id()).count().show()

In [None]:
bigContractsDf.repartition(f.col("cod_client")).coalesce(4).groupBy(f.spark_partition_id()).count().show()

In [None]:
bigContractsDf.repartition(f.col("cod_client")).repartition(4).groupBy(f.spark_partition_id()).count().show()

##### Aplicamos transformaciones con tabla grande

In [None]:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.DataFrame

/**
 * Joins contracts with a fixed-seed 0.1% sample of clients on `cod_client` and keeps the
 * distinct rows whose amount dropped by more than 800 relative to the row five positions
 * earlier in a per-`id` window ordered by `operation_day` descending.
 *
 * The joined data must expose the columns `id`, `vip`, `amount`, `operation_day` and
 * `fec_alta`; the clients' own `id` column is dropped before the join.
 *
 * @param contractsDf contracts side of the join
 * @param clientsDf   clients side of the join; sampled without replacement (fraction 0.001, seed 0)
 * @return distinct rows with `id`, `amount`, `diff_amount`, `fec_alta`, `operation_day`,
 *         `count_vip_true` and `count_vip_false`
 */
def transformation1(contractsDf: DataFrame, clientsDf: DataFrame): DataFrame = {
    val vip = f.col("vip")
    val amount = f.col("amount")

    // Both windows are partitioned by `id`; the second one is additionally ordered newest first.
    val perId = Window.partitionBy("id")
    val perIdNewestFirst = perId.orderBy(f.col("operation_day").desc)

    val sampledClients = clientsDf
        .sample(withReplacement = false, fraction = 0.001, seed = 0)
        .drop("id")

    // 1 when `vip` equals the expected value, 0 otherwise (including when the comparison is null).
    def vipFlag(expected: Boolean) =
        f.when(vip === expected, f.lit(1)).otherwise(f.lit(0))

    val joined = contractsDf.join(sampledClients, Seq("cod_client"))

    val withVipCounts = joined
        .withColumn("vip_true", vipFlag(true))
        .withColumn("vip_false", vipFlag(false))
        .withColumn("count_vip_true", f.sum("vip_true").over(perId))
        .withColumn("count_vip_false", f.sum("vip_false").over(perId))

    val withDiff = withVipCounts
        .withColumn("prev_amount", f.lag(amount, 5).over(perIdNewestFirst))
        .withColumn("diff_amount", amount - f.col("prev_amount"))

    withDiff
        .where(f.col("diff_amount") < -800)
        .where(f.col("fec_alta") > "2015-01-01")
        .select("id", "amount", "diff_amount", "fec_alta", "operation_day", "count_vip_true", "count_vip_false")
        .distinct()
}



val joinTablesDf = transformation1(bigContractsDf, bigClientsDf)

In [None]:
joinTablesDf.write.mode("overwrite").parquet("../../resources/data/parquet/join_tables_df")

In [None]:
/**
 * Computes, for every row, the min / max / mean / population standard deviation of
 * `diff_amount` over all rows sharing the same `fec_alta`.
 *
 * @param df input exposing `id`, `fec_alta` and `diff_amount`
 * @return one row per input row with `id`, `min_diff_amount`, `max_diff_amount`,
 *         `mean_diff_amount` and `stddev_diff_amount`
 */
def transformation2(df: DataFrame): DataFrame = {
    val byFecAlta = Window.partitionBy("fec_alta")
    val diffAmount = f.col("diff_amount")

    val minDiff = f.min(diffAmount).over(byFecAlta).alias("min_diff_amount")
    val maxDiff = f.max(diffAmount).over(byFecAlta).alias("max_diff_amount")
    val meanDiff = f.mean(diffAmount).over(byFecAlta).alias("mean_diff_amount")
    val stddevDiff = f.stddev_pop(diffAmount).over(byFecAlta).alias("stddev_diff_amount")

    df.select(f.col("id"), minDiff, maxDiff, meanDiff, stddevDiff)
}

/**
 * Derives a new set of columns from the VIP counters and `fec_alta`.
 *
 * Note that the output `id` is NOT the input `id`: it is the absolute difference between
 * `count_vip_true` and `count_vip_false`.
 *
 * @param df input exposing `count_vip_true`, `count_vip_false` and `fec_alta`
 * @return one row per input row with `id`, `new_fec_alta`, `new_operation_day`,
 *         `new_count_vip_true` and `new_count_vip_false`
 */
def transformation3(df: DataFrame): DataFrame = {
    val vipTrue = f.col("count_vip_true")
    val vipFalse = f.col("count_vip_false")
    val fecAlta = f.col("fec_alta")

    val derivedId = f.abs(vipTrue - vipFalse).alias("id")
    val newFecAlta = f.date_sub(fecAlta, vipFalse.cast(t.IntegerType)).alias("new_fec_alta")
    // NOTE(review): derived from `fec_alta`, not `operation_day`, despite the alias — confirm intended.
    val newOperationDay = f.date_sub(fecAlta, vipTrue.cast(t.IntegerType)).alias("new_operation_day")
    // NOTE(review): both counters below are the same sum with the operands swapped — confirm intended.
    val newCountTrue = (vipTrue + vipFalse).alias("new_count_vip_true")
    val newCountFalse = (vipFalse + vipTrue).alias("new_count_vip_false")

    df.select(derivedId, newFecAlta, newOperationDay, newCountTrue, newCountFalse)
}

In [None]:
// Build the transformation1 result once as a (lazy) DataFrame definition.
val joinTablesDf = transformation1(bigContractsDf, bigClientsDf)

// Two independent derivations of the same DataFrame.
val transformation2Df = transformation2(joinTablesDf)
val transformation3Df = transformation3(joinTablesDf)

// joinTablesDf appears three times in this plan and is not persisted here,
// so its lineage may be evaluated once per use when an action runs.
val expensiveDf = joinTablesDf
    .join(transformation2Df, Seq("id"))
    .join(transformation3Df, Seq("id"))

In [None]:
expensiveDf.write.mode("overwrite").parquet("../../resources/data/parquet/expensive_df")

### Cache y Persist

### What is Caching?
In applications that reuse the same datasets over and over, one of the most useful optimizations is caching. Caching will place a DataFrame or table into temporary storage across the executors in your cluster and make subsequent reads faster.

**Without Spark Caching**
```
          +------------------+     +------------------+
input --> | Transformation 1 | --> | Transformation 2 | --> Output 1
          +------------------+     +------------------+
          +------------------+     +------------------+
input --> | Transformation 1 | --> | Transformation 3 | --> Output 2
          +------------------+     +------------------+
          +------------------+     +------------------+
input --> | Transformation 1 | --> | Transformation 4 | --> Output 3
          +------------------+     +------------------+
```
**With Spark Caching**
```
                                                     +------------------+
                                               +---> | Transformation 2 | --> Output 1
                                               |     +------------------+
          +------------------+     +-------+   |     +------------------+
input --> | Transformation 1 | --> | Cache | --+---> | Transformation 3 | --> Output 2
          +------------------+     +-------+   |     +------------------+
                                               |     +------------------+
                                               +---> | Transformation 4 | --> Output 3
                                                     +------------------+
```

In [None]:
// - Store the contents of a DataFrame at any stage of the processing

// Default storage level for DataFrames -> MEMORY_AND_DISK (deserialized; PySpark labels it MEMORY_AND_DISK_DESER)
// Default storage level for RDDs -> MEMORY_ONLY
import org.apache.spark.storage.StorageLevel

In [None]:
val joinPersistedDf = joinTablesDf.persist(StorageLevel.DISK_ONLY)

In [None]:
// Caching is lazy, so it only takes effect once an action runs on the DataFrame
joinPersistedDf.show()

In [None]:
joinPersistedDf.count()

##### unpersist

In [None]:
joinPersistedDf.unpersist()

##### aplicación de cache/persist

In [None]:
// Same pipeline as the uncached version, but the shared input is persisted first.
val joinTablesDf = transformation1(bigContractsDf, bigClientsDf)

// Mark the transformation1 result for DISK_ONLY storage; nothing is stored until an action runs.
val joinPersistedDf = joinTablesDf.persist(StorageLevel.DISK_ONLY)

// Both derivations and the final joins read from the persisted DataFrame.
val transformation2Df = transformation2(joinPersistedDf)
val transformation3Df = transformation3(joinPersistedDf)

val expensiveDf = joinPersistedDf
    .join(transformation2Df, Seq("id"))
    .join(transformation3Df, Seq("id"))

In [None]:
expensiveDf.write.mode("overwrite").parquet("../../resources/data/parquet/expensive_df")

In [None]:
// Release the cached data. The DataFrame that was persisted is joinPersistedDf;
// expensiveDf was never persisted, so unpersisting it would leave the cache in place.
joinPersistedDf.unpersist()

### Broadcast

In [None]:
bigContractsDf
    .join(bigClientsDf.sample(false,0.001, 0).drop("id"), Seq("cod_client"))
    .write.mode("overwrite").parquet("../../resources/data/parquet/join_tables_df")

In [None]:
// Same join and output path as the previous cell, but the sampled clients DataFrame is
// wrapped in f.broadcast, which marks it as small enough to be used in a broadcast join.
bigContractsDf
    .join(f.broadcast(bigClientsDf.sample(false,0.001, 0).drop("id")), Seq("cod_client"))
    .write.mode("overwrite").parquet("../../resources/data/parquet/join_tables_df")

In [None]:
// Re-enable automatic broadcast joins: 10485760 bytes = 10 MiB (the documented Spark default).
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "10485760")

In [None]:
val joinTablesDf = transformation1(bigContractsDf, bigClientsDf)

val transformation2Df = transformation2(joinTablesDf)
val transformation3Df = transformation3(joinTablesDf)

val expensiveDf = joinTablesDf
    .join(transformation2Df, Seq("id"))
    .join(transformation3Df, Seq("id"))

In [None]:
expensiveDf.write.mode("overwrite").parquet("../../resources/data/parquet/expensive_df")

#### Broadcast + persist

In [None]:
// Persisted variant of the pipeline. If the notebook is run top to bottom, the automatic
// broadcast threshold has been set to 10485760 by the time this cell executes.
val joinTablesDf = transformation1(bigContractsDf, bigClientsDf)

// Mark the transformation1 result for DISK_ONLY storage; nothing is stored until an action runs.
// NOTE(review): no matching unpersist() follows in the visible notebook — confirm whether one is needed.
val joinPersistedDf = joinTablesDf.persist(StorageLevel.DISK_ONLY)

val transformation2Df = transformation2(joinPersistedDf)
val transformation3Df = transformation3(joinPersistedDf)

val expensiveDf = joinPersistedDf
    .join(transformation2Df, Seq("id"))
    .join(transformation3Df, Seq("id"))

In [None]:
expensiveDf.write.mode("overwrite").parquet("../../resources/data/parquet/expensive_df")