In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .appName("sesion_1") \
        .master("local[*]") \
        .getOrCreate()

In [6]:
spark

In [7]:
contracts_df = spark.read\
        .option("header","true")\
        .option("delimiter",",")\
        .option("inferSchema","false")\
        .csv("../../resources/data/csv/contracts.csv")

contracts_df.show(2)

+-------+-----------+------------+----------+------+
|cod_iuc|cod_titular|cod_producto|  fec_alta|activo|
+-------+-----------+------------+----------+------+
|  30000|      00006|         100|2012-05-01|  true|
|  30001|      00006|         200|2014-05-01|  true|
+-------+-----------+------------+----------+------+
only showing top 2 rows



#### Transformaciones


In [8]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [26]:
import pyspark.sql.functions as f
import pyspark.sql.types as t

# Select

resulted_df = contracts_df\
    .select(
        f.col("cod_iuc"), f.col("cod_titular"), f.col("cod_producto"),
        f.col("activo").cast(t.BooleanType()),
        f.col("fec_alta").cast(t.DateType()).alias("fec_alta_ini"), 
        f.current_date().alias("actual_date"),
        f.round(f.rand(0) * f.lit(10)).cast(t.IntegerType()).alias("randm_num"),
        f.date_add(f.col("fec_alta_ini"), f.col("randm_num")).alias("fec_alta_fin"),
        f.datediff(f.col("fec_alta_fin"), f.col("fec_alta_ini")).alias("diff"),
        f.current_timestamp().alias("actual_timestamp"),
        f.current_timestamp().cast(t.LongType()).alias("actual_unix_timestamp"),
        f.lit(0).cast(t.TimestampType()).alias("first_timestamp")
)
resulted_df.show(1, False)

+-------+-----------+------------+------+------------+-----------+---------+------------+----+-----------------------+---------------------+-------------------+
|cod_iuc|cod_titular|cod_producto|activo|fec_alta_ini|actual_date|randm_num|fec_alta_fin|diff|actual_timestamp       |actual_unix_timestamp|first_timestamp    |
+-------+-----------+------------+------+------------+-----------+---------+------------+----+-----------------------+---------------------+-------------------+
|30000  |00006      |100         |true  |2012-05-01  |2024-01-15 |8        |2012-05-09  |8   |2024-01-15 23:26:00.376|1705382760           |1969-12-31 18:00:00|
+-------+-----------+------------+------+------------+-----------+---------+------------+----+-----------------------+---------------------+-------------------+
only showing top 1 row



In [58]:
resulted2_df = resulted_df \
    .select(*resulted_df.columns,
            f.split(f.col("actual_timestamp"), " ").alias("array"),
            f.split(f.col("actual_timestamp"), " ").getItem(0).alias("date"),
            f.split(f.col("actual_timestamp"), " ").getItem(1).alias("time"),
            f.explode(f.array(f.lit(1), f.lit(2), f.lit(3))).alias("explode"),
            f.regexp_replace(f.col("cod_iuc"), f.lit("[1-9]$"), f.lit("A")).alias("replace")
           )

resulted2_df.show(5, False)

+-------+-----------+------------+------+------------+-----------+---------+------------+----+-----------------------+---------------------+-------------------+--------------------------+----------+------------+-------+-------+
|cod_iuc|cod_titular|cod_producto|activo|fec_alta_ini|actual_date|randm_num|fec_alta_fin|diff|actual_timestamp       |actual_unix_timestamp|first_timestamp    |array                     |date      |time        |explode|replace|
+-------+-----------+------------+------+------------+-----------+---------+------------+----+-----------------------+---------------------+-------------------+--------------------------+----------+------------+-------+-------+
|30000  |00006      |100         |true  |2012-05-01  |2024-01-16 |8        |2012-05-09  |8   |2024-01-16 00:02:22.366|1705384942           |1969-12-31 18:00:00|[2024-01-16, 00:02:22.366]|2024-01-16|00:02:22.366|1      |30000  |
|30000  |00006      |100         |true  |2012-05-01  |2024-01-16 |8        |2012-05-09  

In [59]:
# select- when

cond_1 = f.col("cod_producto") <= 300 # baja
cond_2 = f.col("cod_producto") <= 600 # media
cond_3 = f.col("cod_producto") <= 1000 # alta

cond_4 = (f.col("activo") == f.lit(True)) & (f.col("calidad").isin("alta", "muy alta"))

cond_5 = f.col("activo") == f.lit(True)

def diff(l1, l2):
    return list(set(l1) - set(l2))

resulted3_df = resulted2_df \
    .select(
        *diff(resulted2_df.columns, ["activo"]),
        f.when(cond_1, f.lit("baja"))
            .when(cond_2, f.lit("media"))
            .when(cond_3, f.lit("alta"))
            .otherwise(f.lit("muy alta")).alias("calidad"),
        f.when(cond_4, f.lit("ok")).alias("prioridad_alta"),
        f.when(cond_5, True).alias("activo")
    )

resulted3_df.show(50, False)

+--------------------------+-----------------------+-----------+------------+---------+------------+-------+----+-------+---------------------+------------+-----------+----------+-------------------+------------+-------+--------+--------------+------+
|array                     |actual_timestamp       |actual_date|time        |randm_num|fec_alta_ini|replace|diff|explode|actual_unix_timestamp|fec_alta_fin|cod_titular|date      |first_timestamp    |cod_producto|cod_iuc|calidad |prioridad_alta|activo|
+--------------------------+-----------------------+-----------+------------+---------+------------+-------+----+-------+---------------------+------------+-----------+----------+-------------------+------------+-------+--------+--------------+------+
|[2024-01-16, 00:03:21.229]|2024-01-16 00:03:21.229|2024-01-16 |00:03:21.229|8        |2012-05-01  |30000  |8   |1      |1705385001           |2012-05-09  |00006      |2024-01-16|1969-12-31 18:00:00|100         |30000  |baja    |null          |

In [65]:
# where / filter
resulted3_df \
    .filter(f.col("activo").isNotNull()) \
    .filter(~f.col("cod_producto").isin("100", "200", "150", "300")) \
    .where(f.col("fec_alta_ini").between("2014-02-01", "2018-10-01")) \
    .where((f.col("calidad") == "media") | (f.col("cod_titular").isin("00006", "00001"))) \
    .filter(f.col("cod_titular").rlike("0000[1|5]")) \
    .withColumn("prioridad_alta_2", f.col("prioridad_alta"))\
    .filter(f.col("prioridad_alta").eqNullSafe(f.col("prioridad_alta_2")))\
    .show()


+--------------------+--------------------+-----------+------------+---------+------------+-------+----+-------+---------------------+------------+-----------+----------+-------------------+------------+-------+-------+--------------+------+----------------+
|               array|    actual_timestamp|actual_date|        time|randm_num|fec_alta_ini|replace|diff|explode|actual_unix_timestamp|fec_alta_fin|cod_titular|      date|    first_timestamp|cod_producto|cod_iuc|calidad|prioridad_alta|activo|prioridad_alta_2|
+--------------------+--------------------+-----------+------------+---------+------------+-------+----+-------+---------------------+------------+-----------+----------+-------------------+------------+-------+-------+--------------+------+----------------+
|[2024-01-16, 09:5...|2024-01-16 09:51:...| 2024-01-16|09:51:16.779|        2|  2016-07-01|  3000A|   2|      1|           1705420276|  2016-07-03|      00001|2024-01-16|1969-12-31 18:00:00|         700|  30007|   alta|    