In [None]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by the rest of the notebook: getOrCreate() returns
# an already-running session if there is one, otherwise it builds a new one with
# the "local[*]" master and the application name "sesion_1".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("sesion_1")
    .getOrCreate()
)

In [None]:

# Load the contracts CSV. The first line is treated as the header, fields are
# comma-separated, and column types are inferred from the data instead of being
# declared up front.
contracts_df = (
    spark.read
    .option("delimiter", ",")
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("../../resources/data/csv/contracts.csv")
)

# Preview the rows, then print the schema that was inferred.
contracts_df.show()
contracts_df.printSchema()

In [None]:
import pyspark.sql.functions as f

In [None]:
# Import from the public IPython.display module (not the internal
# IPython.core.display) and bring `display` in explicitly, so this cell does not
# depend on the name the IPython kernel injects into the namespace.
from IPython.display import HTML, display

# Inject a CSS rule that stops <pre> blocks from wrapping, so wide text output
# (such as the tables printed by DataFrame.show()) keeps one table row per line.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

#### Transformaciones

In [None]:
# sort / orderBy
# Sort by fec_alta ascending; rows with the same fec_alta are ordered by
# cod_producto descending. The `ascending` list is matched to the columns by
# position.
(
    contracts_df
    .orderBy("fec_alta", "cod_producto", ascending=[True, False])
    .show()
)

In [None]:
# dropDuplicates / distinct
contracts_df.show()

# Keep a single row per (cod_producto, cod_titular) pair. Which of the duplicate
# rows is kept is not defined, so the other columns may vary between runs.
contracts_df.dropDuplicates(["cod_producto", "cod_titular"]).show()

# Deduplicate first and sort afterwards. dropDuplicates()/distinct() are not
# guaranteed to preserve the order of their input, so a sort placed before them
# may be lost and the displayed result would not be reliably ordered.
(
    contracts_df
    .select("cod_titular", "activo")
    .dropDuplicates()
    .orderBy("cod_titular", "activo")
    .show()
)

# distinct() with no arguments removes fully identical rows, the same as the
# argument-less dropDuplicates() above.
(
    contracts_df
    .select("cod_titular", "activo")
    .distinct()
    .orderBy("cod_titular", "activo")
    .show()
)

In [None]:
# agg
# Aggregate over the whole DataFrame (no grouping), computing every metric below
# in a single pass and showing them under explicit aliases.
(
    contracts_df
    .agg(
        f.count("*").alias("count"),
        f.sum("cod_titular").alias("sum_cod_titular"),
        f.max(f.col("cod_producto")).alias("max_cod_producto"),
        f.min(f.col("cod_producto")).alias("min_cod_producto"),
        # mean and avg are both requested, side by side, under different aliases.
        f.mean(f.col("cod_producto")).alias("mean_cod_producto"),
        f.avg(f.col("cod_producto")).alias("avg_cod_producto"),
        # Approximate distinct count (rsd=0.05) next to the exact distinct count.
        f.approx_count_distinct("fec_alta", rsd=0.05).alias("approx_fec_count"),
        f.count_distinct("fec_alta").alias("distinct_fec_count"),
        f.max(f.col("fec_alta")).alias("max_fec_alta"),
    )
    .show()
)

In [None]:
# groupBy -> count, max, min, avg, mean, sum
contracts_df.show()

# Shortcut aggregations called directly on the grouped data.
contracts_df.groupBy("cod_titular").count().show()
contracts_df.groupBy("cod_titular").sum("cod_producto").show()
# Left disabled in the original notebook.
# NOTE(review): presumably because the grouped max() shortcut only accepts
# numeric columns -- confirm against the inferred type of fec_alta.
#contracts_df.groupBy(f.col("cod_titular")).max("fec_alta").show()

# Several metrics in one agg() call, producing one output row per cod_titular.
(
    contracts_df
    .groupBy("cod_titular")
    .agg(
        f.count("*").alias("count"),
        f.sum("cod_titular").alias("sum_cod_titular"),
        f.max(f.col("cod_producto")).alias("max_cod_producto"),
        f.min(f.col("cod_producto")).alias("min_cod_producto"),
        # mean and avg are both requested, side by side, under different aliases.
        f.mean(f.col("cod_producto")).alias("mean_cod_producto"),
        f.avg(f.col("cod_producto")).alias("avg_cod_producto"),
        # Approximate distinct count (rsd=0.05) next to the exact distinct count.
        f.approx_count_distinct("fec_alta", rsd=0.05).alias("approx_fec_count"),
        f.count_distinct("fec_alta").alias("distinct_fec_count"),
        f.max(f.col("fec_alta")).alias("max_fec_alta"),
    )
    .show()
)

In [None]:
# groupBy -> pivot
contracts_df.show()

# One row per cod_titular. Only the cod_producto values 100 and 200 are pivoted
# into columns, and each of them gets a row count and a max(fec_alta).
(
    contracts_df
    .groupBy("cod_titular")
    .pivot("cod_producto", values=[100, 200])
    .agg(f.count("*"), f.max("fec_alta"))
    .show()
)

In [None]:
# groupBy -> unpivot
# Build the pivoted table (one row per cod_titular, with a count and a
# max(fec_alta) for cod_producto 100 and 200), then turn the two max(fec_alta)
# columns back into rows: the source column name goes into "metrica" and its
# value into "valor". The pivoted count columns are neither ids nor values, so
# they do not appear in the result.
(
    contracts_df
    .groupBy(f.col("cod_titular"))
    .pivot("cod_producto", [100, 200])
    .agg(f.count("*"), f.max("fec_alta"))
    .unpivot(
        ids="cod_titular",
        values=["100_max(fec_alta)", "200_max(fec_alta)"],
        # Fixed typo: the variable column was misspelled "metria".
        variableColumnName="metrica",
        valueColumnName="valor",
    )
    .show()
)

In [None]:
## union/unionAll unionByName

In [None]:
# Work only with the rows whose "activo" column compares equal to "false".
contracts_tmp_df = contracts_df.filter(f.col("activo") == "false")

contracts_tmp_df.show()

# unionAll pairs columns by position, not by name: the third column is aliased
# "cod_producto_" on the left side and is plain "cod_producto" on the right side.
(
    contracts_tmp_df
    .select("cod_titular", "fec_alta", f.col("cod_producto").alias("cod_producto_"))
    .unionAll(contracts_tmp_df.select("cod_titular", "fec_alta", "cod_producto"))
    .show()
)

# unionByName pairs columns by name, so the different column order on the right
# side does not matter. allowMissingColumns=True lets the right side carry the
# extra "activo" column, which is filled with nulls for rows from the left side.
(
    contracts_tmp_df
    .select("cod_titular", "fec_alta", "cod_producto")
    .unionByName(
        contracts_tmp_df.select("cod_producto", "fec_alta", "cod_titular", "activo"),
        allowMissingColumns=True,
    )
    .show()
)