In [None]:
from pyspark.sql import SparkSession

# Start a Spark session named "sesion_1" on the local master, or reuse the
# session that already exists in this kernel.
spark = (
    SparkSession.builder
    .appName("sesion_1")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Read the contracts CSV: the first line is the header, fields are separated by
# commas, and column types are inferred from the data.
contracts_df = (
    spark.read
    .options(header="true", delimiter=",", inferSchema="true")
    .csv("../../resources/data/csv/contracts.csv")
)

contracts_df.show()

In [None]:
# Import from the public IPython.display module and bring `display` in
# explicitly instead of relying on the global injected by the notebook.
from IPython.display import HTML, display

# Stop <pre> blocks from wrapping so wide text tables keep one row per line.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [None]:
import pyspark.sql.functions as f

# Preview the contracts sorted by cod_iuc.
contracts_df.orderBy("cod_iuc").show()

#### Transformaciones

In [None]:
# Window functions
    # Aggregation -> max, min, count, avg, mean, sum, ...
    # Ranking -> row_number, rank, dense_rank, ...
    # Offset -> lag, lead

from pyspark.sql import Window

# window_1: rows grouped by cod_iuc, no ordering.
# window_2: the same grouping, ordered by ascending fec_alta.
window_1 = Window.partitionBy("cod_iuc")
window_2 = window_1.orderBy(f.col("fec_alta").asc())

# NOTE(review): window_2 has an ordering but no explicit frame. Spark's
# documented default in that case is a growing frame (partition start up to
# the current row), so the two window_2 aggregates below are presumably
# running values rather than per-partition totals -- confirm this is intended.
(
    contracts_df
    .select(
        *contracts_df.columns,
        f.count("*").over(window_1).alias("count_w"),
        f.max(f.col("fec_alta")).over(window_2).alias("max_fec_alta_w"),
        f.sum(f.col("cod_producto")).over(window_2).alias("sum_cod_producto_w"),
    )
    .orderBy("cod_iuc")
    .show()
)

In [None]:
# Reference view of the input, sorted by cod_iuc.
contracts_df.orderBy("cod_iuc").show()

# Rank rows inside each cod_iuc group by ascending cod_titular.
window_3 = Window.partitionBy("cod_iuc").orderBy(f.col("cod_titular").asc())

# Show the three ranking functions side by side over the same window.
(
    contracts_df
    .select(
        *contracts_df.columns,
        f.row_number().over(window_3).alias("row_number"),
        f.rank().over(window_3).alias("rank"),
        f.dense_rank().over(window_3).alias("dense_rank"),
    )
    .show(n=100)
)

In [None]:
# Reference view of the input, sorted by cod_iuc.
contracts_df.orderBy("cod_iuc").show()

# Both windows group by cod_iuc and order by descending fec_alta.
# NOTE(review): window_4 and window_5 are defined identically -- confirm
# whether one of them was meant to use a different ordering.
window_4 = Window.partitionBy("cod_iuc").orderBy(f.col("fec_alta").desc())
window_5 = Window.partitionBy("cod_iuc").orderBy(f.col("fec_alta").desc())


# lag/lead look two rows back/ahead inside the window; "1970-01-01" is used
# when no such row exists.
(
    contracts_df
    .select(
        *contracts_df.columns,
        f.lag(f.col("fec_alta"), offset=2, default="1970-01-01").over(window_4).alias("lag"),
        f.lead(f.col("fec_alta"), offset=2, default="1970-01-01").over(window_5).alias("lead"),
    )
    .show(n=100)
)

In [None]:
# DataFrame.na functions
    # drop -> alias of DataFrame.dropna
    # fill -> alias of DataFrame.fillna
    # replace -> alias of DataFrame.replace


def cond_column(col_name, threshold, seed=None):
    """Build a column expression that randomly replaces values with NULL.

    For every row a random number is drawn with ``f.rand``, multiplied by 10
    and rounded to 0 decimals. When the rounded draw is >= ``threshold`` the
    result is NULL; otherwise the original value of ``col_name`` is kept.

    Args:
        col_name: Name of the column whose values are kept or nulled.
        threshold: Rounded draws at or above this value produce NULL.
        seed: Optional seed forwarded to ``f.rand``. ``None`` (the default)
            keeps the unseeded behaviour; pass an int to make the null
            pattern repeatable between runs.

    Returns:
        An unaliased Column expression.
    """
    draw = f.rand() if seed is None else f.rand(seed)
    return f.when(f.round(draw * 10, 0) >= threshold, None).otherwise(f.col(col_name))


# Base seed for the synthetic nulls, so the cells below show the same data after
# a kernel restart (assuming the input rows and their partitioning are
# unchanged -- Spark documents rand as non-deterministic in the general case).
NULL_SEED = 42

# Each column gets its own seed; sharing one seed would null the three
# columns in exactly the same rows.
null_contracts_df = (
    contracts_df
    .orderBy("cod_iuc", "cod_titular")
    .select(
        "cod_iuc", "cod_titular",
        cond_column("cod_producto", 5, seed=NULL_SEED).alias("cod_producto"),
        cond_column("fec_alta", 5, seed=NULL_SEED + 1).alias("fec_alta"),
        cond_column("activo", 5, seed=NULL_SEED + 2).alias("activo"),
    )
)
null_contracts_df.show()

In [None]:
# dropna with its three arguments spelled out by keyword.
# NOTE(review): the PySpark dropna documentation states that `thresh`
# overrides `how`, so how="all" presumably has no effect here and rows are
# dropped when fewer than 2 of the listed columns are non-null -- confirm
# which rule was intended.
(
    null_contracts_df
    .dropna(how="all", thresh=2, subset=["cod_producto", "fec_alta", "activo"])
    .show()
)

In [None]:
# Fill nulls with a per-column replacement value (dict form: column -> value).
(
    null_contracts_df
    .fillna(value={"cod_producto": 10000, "fec_alta": "2014-05-01", "activo": True})
    .show()
)

In [None]:
# Show the frame with nulls again for comparison (20 rows, truncated cells:
# the same values show() uses by default).
null_contracts_df.show(n=20, truncate=True)

In [None]:
# Replace values according to the mapping, only in cod_producto and cod_titular.
# NOTE(review): the mapping mixes a boolean pair (True -> False) with numeric
# pairs, and "activo" is not in `subset` -- confirm what the boolean entry is
# expected to do on these two columns.
(
    null_contracts_df
    .replace(
        to_replace={400: 2, 150: 150000, True: False},
        subset=["cod_producto", "cod_titular"],
    )
    .orderBy("cod_iuc", "cod_titular")
    .show()
)

In [None]:
# sample
# Print the total row count, then show a sample drawn with replacement,
# fraction 0.5 and a fixed seed of 0, sorted by cod_iuc.
print(contracts_df.count())
(
    contracts_df
    .sample(withReplacement=True, fraction=0.5, seed=0)
    .orderBy("cod_iuc")
    .show()
)

In [None]:
# subtract / exceptAll
# Take the contracts unioned with themselves and subtract the active contracts
# (activo == True), also unioned with themselves.
# NOTE(review): the self-unions only duplicate rows; presumably they matter
# for the exceptAll variant rather than for subtract -- confirm.
(
    contracts_df
    .union(contracts_df)
    .subtract(
        contracts_df.filter(f.col("activo") == True)
        .union(contracts_df.filter(f.col("activo") == True))
    )
    .show()
)