In [None]:
import org.apache.spark.sql.{SparkSession, DataFrame}

// Create (or reuse, via getOrCreate) the Spark session for this notebook,
// running against the "local[*]" master under the application name "sesion_1".
val spark = SparkSession
    .builder()
    .master("local[*]")
    .appName("sesion_1")
    .getOrCreate()
// SparkContext that backs the session above.
val sc = spark.sparkContext

In [None]:
// Evaluate the web UI URL reported by the SparkContext so the notebook shows it.
sc.uiWebUrl

In [None]:
/** Reads a CSV file into a DataFrame using the shared `spark` session.
  *
  * The reader is configured with a header row ("header" = "true"), a comma
  * delimiter, and "inferSchema" set to "false".
  *
  * @param path location of the CSV file handed to Spark's CSV reader
  * @return the DataFrame produced by the reader
  */
def readCsv(path: String): DataFrame = {
    // Reader options kept as data so they are easy to scan; applied in this order.
    val csvOptions = Seq(
        "header" -> "true",
        "delimiter" -> ",",
        "inferSchema" -> "false"
    )
    val configuredReader = csvOptions.foldLeft(spark.read) {
        case (reader, (key, value)) => reader.option(key, value)
    }
    configuredReader.csv(path)
}

// Relative directory that holds the CSV input files.
val BasePath = "../../resources/data/csv/"
// Contracts dataset, loaded through readCsv.
val contractsDf = readCsv(s"${BasePath}contracts.csv")

// Preview the first two rows.
contractsDf.show(2)

#### Transformaciones


In [None]:
%%HTML <style>pre { white-space: pre !important; }</style>

In [None]:
import org.apache.spark.sql.{functions => f}
import org.apache.spark.sql.{types => t}
import org.apache.spark.sql.Column

// Select

/** Builds Spark columns for the names in `l1` that remain after removing `l2`.
  *
  * The removal uses `Seq.diff`, so the relative order of `l1` is kept.
  *
  * @param l1 candidate column names
  * @param l2 column names to exclude
  * @return one `Column` per remaining name
  */
def difference(l1: Seq[String], l2: Seq[String]): Seq[Column] = {
    val remainingNames = l1.diff(l2)
    remainingNames.map(f.col)
}
//difference(contractsDf.columns, Seq("fec_alta", "activo")),


// castear con string
// Derives typed, date and timestamp example columns from the raw contracts data.
//
// The projection is built in stages so that every column exists before another
// expression refers to it by name: "fec_alta_fin" needs "fec_alta_ini" and
// "randm_num", and "diff" needs "fec_alta_fin". Referencing an alias created in
// the very same select only works when Spark's lateral column alias resolution
// is available and enabled; staging avoids that dependency while producing the
// same columns in the same order.
val resultedDf = contractsDf
    .select(
        f.col("cod_iuc"), f.col("cod_producto"), f.col("cod_titular"),
        f.col("activo").cast(t.BooleanType),
        // Renamed to "fec_alta_ini" and cast to a date.
        f.col("fec_alta").alias("fec_alta_ini").cast(t.DateType),
        f.current_date().alias("actual_date"),
        // Seeded random value scaled by 10, rounded and cast to an integer.
        f.round(f.rand(0) * f.lit(10)).cast(t.IntegerType).alias("randm_num")
    )
    // End date = start date plus the random number of days.
    .withColumn("fec_alta_fin", f.date_add(f.col("fec_alta_ini"), f.col("randm_num")))
    // Day difference between the end and start dates.
    .withColumn("diff", f.datediff(f.col("fec_alta_fin"), f.col("fec_alta_ini")))
    .select(
        // Keep everything built so far, then append the timestamp examples.
        f.col("*"),
        f.current_timestamp().alias("actual_timestamp"),
        f.current_timestamp().cast(t.LongType).alias("actual_unix_timestamp"),
        f.from_unixtime(f.unix_timestamp(), "EEEE, MMM d yyyy").alias("from_unixtime"),
        f.lit(0).cast(t.TimestampType).alias("first_timestamp")
    )

// Extends resultedDf with string-manipulation and explode examples, then drops
// the helper column "randm_num".
val resultedDf2 = {
    // "actual_timestamp" split on a single space; item 0 and item 1 are exposed
    // below as "date" and "time".
    val timestampParts = f.split(f.col("actual_timestamp"), " ")

    val derivedColumns = Seq(
        timestampParts.getItem(0).alias("date"),
        timestampParts.getItem(1).alias("time"),
        // One output row per element of the literal array (1, 2, 3).
        f.explode(f.array(f.lit(1), f.lit(2), f.lit(3))).alias("explode"),
        // Replace a trailing digit 1-9 of "cod_iuc" with "A".
        f.regexp_replace(f.col("cod_iuc"), "[1-9]$", "A").alias("replace")
    )

    val existingColumns = resultedDf.columns.map(f.col).toSeq

    resultedDf
        .select(existingColumns ++ derivedColumns: _*)
        .drop("randm_num")
}

// Print up to 20 rows without truncating cell contents, then the schema.
resultedDf2.show(20, truncate = false)
resultedDf2.printSchema()

In [None]:
// select- when

// Thresholds on "cod_producto" used to label each row's quality.
val cond_1 = f.col("cod_producto") <= 300 // "baja" (low)
val cond_2 = f.col("cod_producto") <= 600 // "media" (medium)
val cond_3 = f.col("cod_producto") <= 1000 // "alta" (high)

// Adds the conditional columns "calidad", "prioridad_alta" and a rebuilt "activo"
// to every column of resultedDf2 except the original "activo".
//
// The "calidad" expression is built once and reused inside "prioridad_alta",
// rather than looking up the alias "calidad" by name within the same select;
// that lookup only resolves when Spark's lateral column alias support is
// available and enabled.
val selectWhenDf = {
    // True when the source "activo" column equals true.
    val isActive = f.col("activo") === f.lit(true)

    // First matching threshold wins; anything above the last one is "muy alta".
    val calidad = f.when(cond_1, f.lit("baja"))
        .when(cond_2, f.lit("media"))
        .when(cond_3, f.lit("alta"))
        .otherwise(f.lit("muy alta"))

    resultedDf2
        .select(
            difference(resultedDf2.columns, Seq("activo")) :+
            calidad.alias("calidad") :+
            // "ok" for active rows of high quality; null otherwise (no otherwise branch).
            f.when(isActive && calidad.isin("alta", "muy alta"), f.lit("ok")).alias("prioridad_alta") :+
            // true for active rows; null otherwise (no otherwise branch).
            f.when(isActive, f.lit(true)).alias("activo") :_*
        )
}

selectWhenDf.show()

In [None]:
// where / filter
// Chain of filter / where examples applied to selectWhenDf; the surviving rows
// are printed with show().
selectWhenDf
    // Keep rows whose "activo" is not null.
    .filter(f.col("activo").isNotNull)
    // Exclude the listed product codes.
    .filter(!f.col("cod_producto").isin("100", "200", "150", "300"))
    // Keep start dates inside the given range (bounds passed as strings).
    .where(f.col("fec_alta_ini").between("2014-02-01", "2018-10-01"))
    .where((f.col("calidad") === "media") || (f.col("cod_titular").isin("00006", "00001")))
    // Inside a character class '|' is a literal character, not alternation, so
    // "1 or 5" is written as [15].
    .filter(f.col("cod_titular").rlike("0000[15]"))
    // Copy of "prioridad_alta", used to demonstrate a null-safe equality comparison.
    .withColumn("prioridad_alta_2", f.col("prioridad_alta"))
    .filter(f.col("prioridad_alta").eqNullSafe(f.col("prioridad_alta_2")))
    .show()