- #### Transformaciones
    - ##### select - when
    - ##### where/filter


In [None]:
import org.apache.spark.sql.{SparkSession, DataFrame, Column}

// Local Spark session for this notebook; `getOrCreate` reuses an already
// active session instead of building a second one.
val spark = SparkSession
    .builder()
    .master("local[*]")
    .appName("sesion_1")
    .getOrCreate()

// Handle to the SparkContext backing the session.
val sc = spark.sparkContext

In [None]:
// Show the address of the Spark web UI for this session (an Option, empty when the UI is disabled).
spark.sparkContext.uiWebUrl

In [None]:
/** Reads a comma-separated CSV file with a header row into a DataFrame.
  *
  * Schema inference is disabled, so every column is loaded as a string.
  *
  * @param path location of the CSV file (or directory) to read
  * @return the file contents as a DataFrame
  */
def readCsv(path: String): DataFrame = {
    val csvOptions = Map(
        "header" -> "true",
        "delimiter" -> ",",
        "inferSchema" -> "false"
    )
    spark.read.options(csvOptions).csv(path)
}

// Directory holding the CSV inputs, relative to the notebook location.
val BasePath = "../../resources/data/csv/"

// Raw contracts data; all columns are strings because readCsv does not infer types.
val contractsDf = readCsv(s"${BasePath}contracts.csv")

contractsDf.show()

In [None]:
// `:+` returns a new Seq with the element appended at the end;
// the select expressions below are built with the same operator.
Seq(1, 2, 3) :+ 4

In [None]:
import org.apache.spark.sql.{functions => f}
import org.apache.spark.sql.{types => t}

// Select

/** Builds column references for the names in `l1` that are not in `l2`.
  *
  * The order of `l1` is kept. `Seq.diff` is a multiset difference, so each
  * entry of `l2` removes at most one matching occurrence from `l1`.
  *
  * @param l1 candidate column names
  * @param l2 column names to leave out
  * @return one `Column` per remaining name
  */
def difference(l1: Seq[String], l2: Seq[String]): Seq[Column] = {
    val remaining = l1.diff(l2)
    for (name <- remaining) yield f.col(name)
}

// Enriched contracts DataFrame:
//   - `activo` cast to boolean, `fec_alta` cast to date and renamed `fec_alta_ini`
//   - `actual_date`: the current date
//   - `fec_alta_fin`: `fec_alta_ini` plus a pseudo-random number of days (0 to 10, seed 0)
//   - `diff`: days between `fec_alta_fin` and `fec_alta_ini`
//
// Columns that depend on other derived columns (`fec_alta_fin`, `diff`) are added in
// later steps rather than in the same `select`. Referencing an alias defined in the
// same projection only resolves through lateral column alias support (Spark 3.4+);
// on earlier versions it fails with an AnalysisException.
// The helper column `randm_num` is dropped once `fec_alta_fin` has been computed.
val resultedDf = contractsDf
    .select(
        difference(contractsDf.columns.toSeq, Seq("fec_alta", "activo")) ++ Seq(
            // Cast first, then alias, so the output name is explicit.
            f.col("activo").cast(t.BooleanType).alias("activo"),
            f.col("fec_alta").cast(t.DateType).alias("fec_alta_ini"),
            f.current_date().alias("actual_date"),
            f.round(f.rand(0) * f.lit(10)).cast(t.IntegerType).alias("randm_num")
        ): _*
    )
    .withColumn("fec_alta_fin", f.date_add(f.col("fec_alta_ini"), f.col("randm_num")))
    .withColumn("diff", f.datediff(f.col("fec_alta_fin"), f.col("fec_alta_ini")))
    .drop("randm_num")

resultedDf.show()
resultedDf.printSchema()

In [None]:
// select - when

// Upper bounds on `cod_producto` for each quality tier. The `when` branches below are
// evaluated in order, so each bound only applies to rows not matched by an earlier one.
// NOTE(review): `cod_producto` is read as a string (no schema inference), so these
// comparisons rely on Spark's implicit cast to a numeric type.
val cond_1: Column = f.col("cod_producto") <= 300 // low ("baja")
val cond_2: Column = f.col("cod_producto") <= 600 // medium ("media")
val cond_3: Column = f.col("cod_producto") <= 1000 // high ("alta")

// Adds:
//   - `calidad`: quality tier derived from `cod_producto`
//   - `prioridad_alta`: "ok" for active rows whose tier is "alta" or "muy alta", else null
//   - `activo`: true for active rows, null otherwise
//
// The `calidad` expression is reused directly in `prioridad_alta` instead of being
// referenced by its alias: an alias defined in the same `select` can only be resolved
// through lateral column alias support (Spark 3.4+) and fails on earlier versions.
val selectWhenDf = {
    val calidad: Column = f.when(cond_1, f.lit("baja"))
        .when(cond_2, f.lit("media"))
        .when(cond_3, f.lit("alta"))
        .otherwise(f.lit("muy alta"))
    val isActive: Column = f.col("activo") === f.lit(true)

    resultedDf.select(
        difference(resultedDf.columns.toSeq, Seq("activo")) ++ Seq(
            calidad.alias("calidad"),
            f.when(isActive && calidad.isin("alta", "muy alta"), f.lit("ok")).alias("prioridad_alta"),
            f.when(isActive, f.lit(true)).alias("activo")
        ): _*
    )
}

selectWhenDf.show()

In [None]:
// where / filter
// Applies the predicates one after another (each step keeps only the rows that
// satisfy it), then prints the surviving rows.
Seq(
    // rows flagged as active (`activo` is null otherwise)
    f.col("activo").isNotNull,
    // leave out a fixed list of product codes
    f.not(f.col("cod_producto").isin("100", "200", "150", "300")),
    // start date inside the inclusive range
    f.col("fec_alta_ini").between("2014-02-01", "2018-10-01"),
    // medium quality, or one of two specific holders
    (f.col("calidad") === "media") || (f.col("cod_titular").isin("00006", "00001"))
).foldLeft(selectWhenDf)((df, condition) => df.filter(condition))
    .show()

In [None]:
// Persist the result as parquet, replacing any previous output at this location.
val contractsTmpPath = "../../resources/data/parquet/contracts_tmp"

selectWhenDf.write
    .mode("overwrite")
    .parquet(contractsTmpPath)