- #### Transformaciones
    - ##### select - when
    - ##### where/filter


In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook, running locally
# with master "local[*]".
spark = (
    SparkSession.builder
    .appName("sesion_1")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
def read_csv(path):
    """Load a comma-delimited CSV with a header row into a DataFrame.

    Schema inference is switched off, so the reader is not asked to guess
    column types from the data.

    Args:
        path: Location of the CSV input handed to the session's CSV reader.

    Returns:
        The DataFrame produced by the reader.
    """
    reader_options = (
        ("header", "true"),
        ("delimiter", ","),
        ("inferSchema", "false"),
    )
    reader = spark.read
    for key, value in reader_options:
        reader = reader.option(key, value)
    return reader.csv(path)

# Directory holding the CSV inputs, relative to the working directory.
base_path = "../../resources/data/csv/"

# Load the contracts dataset through the shared CSV helper.
contracts_df = read_csv(f"{base_path}contracts.csv")

# Print a preview of the loaded rows.
contracts_df.show()

In [None]:
import pyspark.sql.functions as f
import pyspark.sql.types as t

# Select

def difference(l1, l2):
    """Return the distinct items of ``l1`` that do not appear in ``l2``.

    The result keeps the order in which items first occur in ``l1``. A plain
    set difference would return them in arbitrary order, which makes the
    column order of any ``select`` built from the result change between runs.

    Args:
        l1: Iterable of hashable items to keep from.
        l2: Iterable of hashable items to exclude.

    Returns:
        A new list with the surviving items, without duplicates.
    """
    excluded = set(l2)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return [item for item in dict.fromkeys(l1) if item not in excluded]

# Rebuild "activo" and "fec_alta" with proper types and pass every other
# column through unchanged. The columns derived from "fec_alta_ini" and
# "randm_num" are added in later steps: referencing an alias from inside the
# same select() that defines it only resolves on Spark versions with lateral
# column alias support (3.4+), and fails analysis elsewhere.
resulted_df = (
    contracts_df
    .select(
        *difference(contracts_df.columns, ["fec_alta", "activo"]),
        f.col("activo").cast(t.BooleanType()),
        f.col("fec_alta").cast(t.DateType()).alias("fec_alta_ini"),
        f.current_date().alias("actual_date"),
        # Seeded random offset in days: an integer between 0 and 10.
        f.round(f.rand(0) * f.lit(10)).cast(t.IntegerType()).alias("randm_num"),
    )
    .withColumn("fec_alta_fin", f.date_add(f.col("fec_alta_ini"), f.col("randm_num")))
    .withColumn("diff", f.datediff(f.col("fec_alta_fin"), f.col("fec_alta_ini")))
    # The random offset is only a helper for building "fec_alta_fin".
    .drop("randm_num")
)

resulted_df.show()
resulted_df.printSchema()

In [None]:
# select- when

# Upper bounds for each quality tier. They are checked in order inside the
# when-chain, so each bound only applies to rows that missed the previous one.
cond_1 = f.col("cod_producto") <= 300  # "baja" (low)
cond_2 = f.col("cod_producto") <= 600  # "media" (medium)
cond_3 = f.col("cod_producto") <= 1000  # "alta" (high)

# Build the tier expression once and reuse it, instead of looking up the
# "calidad" alias from inside the same select() that defines it; that lookup
# only resolves on Spark versions with lateral column alias support (3.4+).
calidad_expr = (
    f.when(cond_1, f.lit("baja"))
    .when(cond_2, f.lit("media"))
    .when(cond_3, f.lit("alta"))
    .otherwise(f.lit("muy alta"))
)
is_active = f.col("activo") == f.lit(True)

select_when_df = resulted_df.select(
    *difference(resulted_df.columns, ["activo"]),
    calidad_expr.alias("calidad"),
    # No otherwise(): rows that do not match get NULL.
    f.when(is_active & calidad_expr.isin("alta", "muy alta"), f.lit("ok")).alias("prioridad_alta"),
    # Keeps True as-is; False and NULL both become NULL.
    f.when(is_active, f.lit(True)).alias("activo"),
)

select_when_df.show()

In [None]:
# where / filter
# Each predicate is applied as its own filter step, so a row must satisfy
# all of them to be shown.
activo_is_set = f.col("activo").isNotNull()
product_is_excluded = f.col("cod_producto").isin("100", "200", "150", "300")
alta_in_range = f.col("fec_alta_ini").between("2014-02-01", "2018-10-01")
media_or_selected_holder = (
    (f.col("calidad") == "media")
    | (f.col("cod_titular").isin("00006", "00001"))
)

(
    select_when_df
    .filter(activo_is_set)
    .filter(~product_is_excluded)
    .filter(alta_in_range)
    .filter(media_or_selected_holder)
    .show()
)

In [None]:
# Persist the result as Parquet, using "overwrite" mode for the target path.
(
    select_when_df.write
    .mode("overwrite")
    .parquet("../../resources/data/parquet/contracts_tmp")
)