In [1]:
import org.apache.spark.sql.{SparkSession, DataFrame}

// Obtain the SparkSession used throughout the notebook: application name
// "sesion_1", running against the "local[*]" master.
val spark = {
    val sessionBuilder = SparkSession.builder
    sessionBuilder
        .appName("sesion_1")
        .master("local[*]")
        .getOrCreate()
}
// SparkContext that belongs to the session above.
val sc = spark.sparkContext

sc = org.apache.spark.SparkContext@20960afa


spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@11e41082


org.apache.spark.SparkContext@20960afa

In [2]:
// Display the URL of the Spark web UI attached to this SparkContext.
sc.uiWebUrl

Some(http://23LAP5CD0176GH7:4041)

In [3]:
/** Loads the CSV file at `path` into a DataFrame.
  *
  * The first line is treated as the header, fields are separated by commas and
  * schema inference is switched off.
  *
  * @param path location of the CSV file to read
  * @return the contents of the file as a DataFrame
  */
def readCsv(path: String): DataFrame = {
    val csvOptions = Map(
        "header" -> "true",
        "delimiter" -> ",",
        "inferSchema" -> "false"
    )
    spark.read.options(csvOptions).csv(path)
}

// Directory, relative to the notebook, that holds the CSV input files.
val BasePath = "../../resources/data/csv/"
// Contracts data set, loaded through readCsv.
val contractsDf = readCsv(s"${BasePath}contracts.csv")

// Peek at the first two rows.
contractsDf.show(2)

+-------+-----------+------------+----------+------+
|cod_iuc|cod_titular|cod_producto|  fec_alta|activo|
+-------+-----------+------------+----------+------+
|  30000|      00006|         100|2012-05-01|  true|
|  30001|      00006|         200|2014-05-01|  true|
+-------+-----------+------------+----------+------+
only showing top 2 rows



contractsDf = [cod_iuc: string, cod_titular: string ... 3 more fields]


readCsv: (path: String)org.apache.spark.sql.DataFrame
BasePath: String = ../../resources/data/csv/


[cod_iuc: string, cod_titular: string ... 3 more fields]

#### Transformaciones


In [4]:
%%HTML <style>pre { white-space: pre !important; }</style>

In [9]:
import org.apache.spark.sql.{functions => f}
import org.apache.spark.sql.{types => t}
import org.apache.spark.sql.Column

// Select

/** Builds Spark columns for the names of `l1` that remain after removing `l2`.
  *
  * Uses `Seq.diff`, so the surviving names keep their order from `l1`.
  *
  * @param l1 candidate column names
  * @param l2 column names to leave out
  * @return one `Column` per remaining name
  */
def difference(l1: Seq[String], l2: Seq[String]): Seq[Column] =
    for (colName <- l1.diff(l2)) yield f.col(colName)
//difference(contractsDf.columns, Seq("fec_alta", "activo")),


// castear con string
// Builds the demo DataFrame in two stages.
//
// fec_alta_fin and diff are computed from fec_alta_ini / randm_num, which are
// themselves created in this cell. Referencing an alias from inside the very
// select that defines it only resolves on Spark versions with lateral column
// alias support (3.4+) and fails analysis on older ones, so the base columns
// are materialised first and the dependent ones are added afterwards.
// randm_num has to be materialised (not re-expressed) because rand is
// non-deterministic: both dependent columns must see the same value.
// Resulting column names and their order are unchanged.
val resultedDf = contractsDf
    // Stage 1: columns that depend only on the source data.
    .select(
        f.col("cod_iuc"), f.col("cod_producto"), f.col("cod_titular"),
        f.col("activo").cast(t.BooleanType),
        f.col("fec_alta").alias("fec_alta_ini").cast(t.DateType),
        f.current_date().alias("actual_date"),
        // Seeded random value scaled by 10 and rounded to an integer number of days.
        f.round(f.rand(0) * f.lit(10)).cast(t.IntegerType).alias("randm_num")
        )
    // Stage 2: columns derived from the ones created above.
    .withColumn("fec_alta_fin", f.date_add(f.col("fec_alta_ini"), f.col("randm_num")))
    .withColumn("diff", f.datediff(f.col("fec_alta_fin"), f.col("fec_alta_ini")))
    // Current-time columns in several representations.
    .withColumn("actual_timestamp", f.current_timestamp())
    .withColumn("actual_unix_timestamp", f.current_timestamp().cast(t.LongType))
    .withColumn("from_unixtime", f.from_unixtime(f.unix_timestamp(), "EEEE, MMM d yyyy"))
    // Literal 0 cast to a timestamp.
    .withColumn("first_timestamp", f.lit(0).cast(t.TimestampType))

// Extends resultedDf with string-split, explode and regexp_replace examples,
// then removes the helper column randm_num.
val resultedDf2 = {
    // actual_timestamp split on the blank between its date and time parts.
    val timestampParts = f.split(f.col("actual_timestamp"), " ")
    val existingCols = resultedDf.columns.map(f.col)
    val derivedCols = Seq(
        timestampParts.getItem(0).alias("date"),
        timestampParts.getItem(1).alias("time"),
        // One output row per element of the literal array (1, 2, 3).
        f.explode(f.array(f.lit(1), f.lit(2), f.lit(3))).alias("explode"),
        // Swap a trailing digit 1-9 of cod_iuc for the letter "A".
        f.regexp_replace(f.col("cod_iuc"), "[1-9]$", "A").alias("replace")
    )
    resultedDf
        .select(existingCols ++ derivedCols: _*)
        .drop("randm_num")
}

// Print up to 20 rows without truncating cell contents, then the schema.
resultedDf2.show(20, truncate = false)
resultedDf2.printSchema()

+-------+------------+-----------+------+------------+-----------+------------+----+-----------------------+---------------------+--------------------+-------------------+----------+------------+-------+-------+
|cod_iuc|cod_producto|cod_titular|activo|fec_alta_ini|actual_date|fec_alta_fin|diff|actual_timestamp       |actual_unix_timestamp|from_unixtime       |first_timestamp    |date      |time        |explode|replace|
+-------+------------+-----------+------+------------+-----------+------------+----+-----------------------+---------------------+--------------------+-------------------+----------+------------+-------+-------+
|30000  |100         |00006      |true  |2012-05-01  |2024-01-16 |2012-05-09  |8   |2024-01-16 09:39:17.082|1705419557           |Tuesday, Jan 16 2024|1969-12-31 18:00:00|2024-01-16|09:39:17.082|1      |30000  |
|30000  |100         |00006      |true  |2012-05-01  |2024-01-16 |2012-05-09  |8   |2024-01-16 09:39:17.082|1705419557           |Tuesday, Jan 16 2024|1

resultedDf2 = [cod_iuc: string, cod_producto: string ... 14 more fields]


import org.apache.spark.sql.{functions=>f}
import org.apache.spark.sql.{types=>t}
difference: (l1: Seq[String], l2: Seq[String])Seq[org.apache.spark.sql.Column]
resultedDf: org.apache.spark.sql.DataFrame = [cod_iuc: string, cod_producto: string ... 11 more fields]


[cod_iuc: string, cod_producto: string ... 14 more fields]

In [12]:
// select- when

// Thresholds on cod_producto used to grade each row. They are evaluated in
// order by the when-chain below, so each one only applies to rows that did not
// match the previous ones.
val cond_1 = f.col("cod_producto") <= 300 // label "baja" (low)
val cond_2 = f.col("cod_producto") <= 600 // label "media" (medium)
val cond_3 = f.col("cod_producto") <= 1000 // label "alta" (high)

// Adds calidad / prioridad_alta and rewrites activo, keeping every other column
// of resultedDf2.
//
// prioridad_alta depends on calidad. Instead of referring to the "calidad"
// alias from inside the select that defines it (which only resolves on Spark
// versions with lateral column alias support), the calidad expression is bound
// to a local value and reused; it is deterministic, so reusing it yields the
// same result on every Spark version.
val selectWhenDf = {
    val calidad = f.when(cond_1, f.lit("baja"))
        .when(cond_2, f.lit("media"))
        .when(cond_3, f.lit("alta"))
        .otherwise(f.lit("muy alta"))
    val isActive = f.col("activo") === f.lit(true)

    resultedDf2.select(
        difference(resultedDf2.columns, Seq("activo")) :+
        calidad.alias("calidad") :+
        // "ok" only for active rows graded alta / muy alta; null otherwise.
        f.when(isActive && calidad.isin("alta", "muy alta"), f.lit("ok")).alias("prioridad_alta") :+
        // No otherwise branch: activo becomes null unless it was true.
        f.when(isActive, f.lit(true)).alias("activo") : _*
    )
}

selectWhenDf.show()

+-------+-------+--------------+------+
|cod_iuc|calidad|prioridad_alta|activo|
+-------+-------+--------------+------+
|  30000|   baja|          null|  true|
|  30000|   baja|          null|  true|
|  30000|   baja|          null|  true|
|  30001|   baja|          null|  true|
|  30001|   baja|          null|  true|
|  30001|   baja|          null|  true|
|  30002|   baja|          null|  null|
|  30002|   baja|          null|  null|
|  30002|   baja|          null|  null|
|  30003|   baja|          null|  true|
|  30003|   baja|          null|  true|
|  30003|   baja|          null|  true|
|  30002|   baja|          null|  true|
|  30002|   baja|          null|  true|
|  30002|   baja|          null|  true|
|  30004|  media|          null|  null|
|  30004|  media|          null|  null|
|  30004|  media|          null|  null|
|  30005|  media|          null|  true|
|  30005|  media|          null|  true|
+-------+-------+--------------+------+
only showing top 20 rows



selectWhenDf = [cod_iuc: string, calidad: string ... 2 more fields]


cond_1: org.apache.spark.sql.Column = (cod_producto <= 300)
cond_2: org.apache.spark.sql.Column = (cod_producto <= 600)
cond_3: org.apache.spark.sql.Column = (cod_producto <= 1000)


[cod_iuc: string, calidad: string ... 2 more fields]

In [20]:
// where / filter
// Demonstrates chained filter / where calls (the two are interchangeable).
selectWhenDf
    // Keep rows whose activo flag is set.
    .filter(f.col("activo").isNotNull)
    // Exclude a fixed list of product codes.
    .filter(!f.col("cod_producto").isin("100", "200", "150", "300"))
    // Keep start dates inside the given range (bounds included).
    .where(f.col("fec_alta_ini").between("2014-02-01", "2018-10-01"))
    .where((f.col("calidad") === "media") || (f.col("cod_titular").isin("00006", "00001")))
    // Holder code containing 0000 followed by 1 or 5. Inside a character class
    // "|" is a literal pipe, not alternation, so the class is written as [15].
    .filter(f.col("cod_titular").rlike("0000[15]"))
    // Null-safe equality demo: a column compared with its own copy, so rows
    // where both sides are null are kept as well.
    .withColumn("prioridad_alta_2", f.col("prioridad_alta"))
    .filter(f.col("prioridad_alta").eqNullSafe(f.col("prioridad_alta_2")))
    .show()

+-------+-------+--------------+------+----------------+
|cod_iuc|calidad|prioridad_alta|activo|prioridad_alta_2|
+-------+-------+--------------+------+----------------+
|  30007|   alta|            ok|  true|              ok|
|  30007|   alta|            ok|  true|              ok|
|  30007|   alta|            ok|  true|              ok|
|  30006|  media|          null|  true|            null|
|  30006|  media|          null|  true|            null|
|  30006|  media|          null|  true|            null|
+-------+-------+--------------+------+----------------+

