In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .appName("sesion_1")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Leaving the session object as the cell's last expression makes the notebook
# render its representation, which confirms the session was created.
spark

In [None]:
# Load the contracts CSV. The first row is the header, fields are
# comma-separated, and schema inference is disabled, so every column is
# read as a string and must be cast explicitly downstream.
contracts_df = (
    spark.read
    .option("header", "true")
    .option("delimiter", ",")
    .option("inferSchema", "false")
    .csv("../../resources/data/csv/contracts.csv")
)

# Quick sanity check: print the first two rows.
contracts_df.show(2)

#### Transformaciones


In [None]:
from IPython.core.display import HTML
# Inject a CSS rule forcing `white-space: pre` on every <pre> element in the
# notebook page. Presumably this keeps wide `DataFrame.show()` tables on a
# single line instead of wrapping them.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [None]:
import pyspark.sql.functions as f
import pyspark.sql.types as t

# Select
#
# The projection is built in stages. "fec_alta_fin" depends on the aliases
# "fec_alta_ini" and "randm_num", and "diff" depends on "fec_alta_fin".
# An alias created in a select() is only guaranteed to be resolvable by a
# *later* step, so each dependent column is added after the columns it reads
# already exist. Referencing them inside the same select() relies on lateral
# column alias support, which not every Spark version provides.
# The resulting column names and their order match the single-select form.
resulted_df = (
    contracts_df
    # Stage 1: columns that depend only on the raw input.
    .select(
        f.col("cod_iuc"), f.col("cod_titular"), f.col("cod_producto"),
        f.col("activo").cast(t.BooleanType()),
        f.col("fec_alta").cast(t.DateType()).alias("fec_alta_ini"),
        f.current_date().alias("actual_date"),
        # Seeded pseudo-random integer in [0, 10].
        f.round(f.rand(0) * f.lit(10)).cast(t.IntegerType()).alias("randm_num"),
    )
    # Stage 2: end date = start date + randm_num days.
    .withColumn("fec_alta_fin", f.date_add(f.col("fec_alta_ini"), f.col("randm_num")))
    # Stage 3: day difference between the derived end date and the start date.
    .withColumn("diff", f.datediff(f.col("fec_alta_fin"), f.col("fec_alta_ini")))
    # Stage 4: timestamp examples that do not depend on any other column.
    .select(
        "*",
        f.current_timestamp().alias("actual_timestamp"),
        # Casting a timestamp to long yields seconds since the Unix epoch.
        f.current_timestamp().cast(t.LongType()).alias("actual_unix_timestamp"),
        # Casting 0 to timestamp yields the Unix epoch itself.
        f.lit(0).cast(t.TimestampType()).alias("first_timestamp"),
    )
)
resulted_df.show(1, False)

In [None]:
# Splitting the timestamp on the space separates its date part from its
# time part; the expression is built once and reused for three columns.
timestamp_parts = f.split(f.col("actual_timestamp"), " ")

resulted2_df = resulted_df.select(
    *resulted_df.columns,
    timestamp_parts.alias("array"),
    timestamp_parts.getItem(0).alias("date"),
    timestamp_parts.getItem(1).alias("time"),
    # explode() emits one output row per array element, so every input row
    # appears three times (explode = 1, 2, 3).
    f.explode(f.array(f.lit(1), f.lit(2), f.lit(3))).alias("explode"),
    # Replace a trailing digit 1-9 of cod_iuc with the letter "A".
    f.regexp_replace(f.col("cod_iuc"), f.lit("[1-9]$"), f.lit("A")).alias("replace"),
)

resulted2_df.show(5, False)

In [None]:
# select - when
#
# Quality tiers by product code. They are meant to be evaluated in order
# inside a when() chain, so each threshold implicitly excludes the previous
# tiers.
cond_1 = f.col("cod_producto") <= 300   # low ("baja")
cond_2 = f.col("cod_producto") <= 600   # medium ("media")
cond_3 = f.col("cod_producto") <= 1000  # high ("alta")

# The contract is flagged as active.
cond_5 = f.col("activo") == f.lit(True)

# Active contract whose derived quality is "alta" or "muy alta".
cond_4 = cond_5 & (f.col("calidad").isin("alta", "muy alta"))

def diff(l1, l2):
    """Return the distinct items of ``l1`` that are not in ``l2``.

    The result keeps the order in which items first appear in ``l1``. This
    matters because the result is used as a column list: a set-based
    difference would return the names in an arbitrary order that can change
    between interpreter runs.

    Args:
        l1: Iterable of hashable items to keep from.
        l2: Iterable of hashable items to exclude.

    Returns:
        A list of the unique items of ``l1`` absent from ``l2``, in ``l1`` order.
    """
    excluded = set(l2)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return [item for item in dict.fromkeys(l1) if item not in excluded]

# "calidad" is materialised in its own step because cond_4 reads
# col("calidad"): an alias created in a select() is only guaranteed to be
# resolvable by a later step, not by sibling expressions of the same select().
# Final column order is unchanged: the remaining input columns, then
# "calidad", "prioridad_alta" and the recomputed "activo".
resulted3_df = (
    resulted2_df
    .withColumn(
        "calidad",
        # First matching tier wins; anything above 1000 (or not matching any
        # tier) falls through to "muy alta".
        f.when(cond_1, f.lit("baja"))
        .when(cond_2, f.lit("media"))
        .when(cond_3, f.lit("alta"))
        .otherwise(f.lit("muy alta")),
    )
    .select(
        # Every original column except "activo", which is re-derived below.
        *diff(resulted2_df.columns, ["activo"]),
        f.col("calidad"),
        # when() without otherwise() yields NULL for non-matching rows.
        f.when(cond_4, f.lit("ok")).alias("prioridad_alta"),
        # True for active rows, NULL otherwise (inactive rows become NULL).
        f.when(cond_5, True).alias("activo"),
    )
)

resulted3_df.show(50, False)

In [None]:
# where / filter
#
# Successive row filters (where() is an alias of filter()); a row must pass
# all of them to be shown.
(
    resulted3_df
    # Keep rows whose recomputed "activo" flag is set (it is NULL otherwise).
    .filter(f.col("activo").isNotNull())
    # Exclude a fixed list of product codes.
    .filter(~f.col("cod_producto").isin("100", "200", "150", "300"))
    # Start date within the inclusive range.
    .where(f.col("fec_alta_ini").between("2014-02-01", "2018-10-01"))
    # Either medium quality or one of two specific holders.
    .where((f.col("calidad") == "media") | (f.col("cod_titular").isin("00006", "00001")))
    # Holder code containing "00001" or "00005". Inside a character class
    # "|" is a literal pipe, not alternation, so the class is written [15].
    # NOTE(review): rlike is unanchored, so this matches anywhere in the value.
    .filter(f.col("cod_titular").rlike("0000[15]"))
    # eqNullSafe demo: unlike ==, it treats NULL <=> NULL as true, so rows
    # with a NULL "prioridad_alta" are kept when compared with their own copy.
    .withColumn("prioridad_alta_2", f.col("prioridad_alta"))
    .filter(f.col("prioridad_alta").eqNullSafe(f.col("prioridad_alta_2")))
    .show()
)
