In [None]:
from pyspark.sql import SparkSession

# Get or create the SparkSession for this notebook, using the "local[*]" master.
spark = (
    SparkSession.builder
    .appName("sesion_1")
    .master("local[*]")
    .getOrCreate()
)

In [None]:

# Load both CSV sources: first row is the header, fields are comma-separated,
# and column types are inferred by Spark.
contracts_df = (
    spark.read
    .option("header", "true")
    .option("delimiter", ",")
    .option("inferSchema", "true")
    .csv("../../resources/data/csv/contracts.csv")
)
clients_df = (
    spark.read
    .option("header", "true")
    .option("delimiter", ",")
    .option("inferSchema", "true")
    .csv("../../resources/data/csv/clients.csv")
)

# Preview two rows and the inferred schema of each DataFrame.
contracts_df.show(2)
contracts_df.printSchema()
clients_df.show(2)
clients_df.printSchema()

In [None]:
import pyspark.sql.functions as f

In [None]:
# Import from the public IPython.display API (IPython.core.display is an internal
# module) and import `display` explicitly instead of relying on the name the
# notebook injects into the namespace.
from IPython.display import HTML, display

# Inject CSS so <pre> output keeps its whitespace and is not wrapped.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

#### Transformaciones

In [None]:
# Joins

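# inner -> [inner, cross] Mantiene información de ambas tablas (columnas) para los registros (filas) coincidentes; "cross" solo equivale a inner cuando se indica condición de join (sin condición genera el producto cartesiano)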
# outer -> [outer, full, fullouter, full_outer] Mantiene información de ambas tablas (columnas y filas) para los registros coincidentes y no-coincidentes
# left -> [left, leftouter, left_outer] Mantiene columnas de ambas tablas y filas únicamente de la tabla izquierda, elimina filas no coincidentes de la tabla derecha
# right -> [right, rightouter, right_outer] Mantiene columnas de ambas tablas y filas únicamente de la tabla derecha, elimina filas no coincidentes de la tabla izquierda
# left_semi -> [semi, leftsemi, left_semi] Mantiene filas y columnas únicamente de la tabla izquierda para los registros coincidentes
# left_anti -> [anti, leftanti, left_anti] Mantiene filas y columnas únicamente de la tabla izquierda para los registros no-coincidentes

In [None]:
# Clients aged 40 to 50 (both bounds inclusive).
clients_tmp_df = clients_df.filter(f.col("edad").between(40, 50))

# Contracts whose "activo" flag is False, with "cod_titular" renamed to
# "cod_client" so it shares its name with the join column used below.
contracts_tmp_df = (
    contracts_df
    .filter(f.col("activo") == False)
    .withColumnRenamed("cod_titular", "cod_client")
)

clients_tmp_df.show()
contracts_tmp_df.show()

# No join type is passed, so Spark applies its default on the shared column.
contracts_tmp_df.join(clients_tmp_df, on=["cod_client"]).show()

In [None]:
# crossJoin -> WARNING
# Rebuild the filtered DataFrames; here the contracts keep their original
# "cod_titular" column name (no rename).
clients_tmp_df = clients_df.filter(f.col("edad").between(40, 50))
contracts_tmp_df = contracts_df.filter(f.col("activo") == False)

clients_tmp_df.show()
contracts_tmp_df.show()

# NOTE(review): the cross join below runs on the unfiltered DataFrames, not on
# the *_tmp_df ones shown above -- confirm that is intended.
(
    clients_df
    .crossJoin(contracts_df)
    .count()
)

In [None]:
# UDF - User Defined Functions - WARNING

import pyspark.sql.types as t

def upper_case(value):
    """Return ``value`` upper-cased, or an empty string when ``value`` is None."""
    if value is None:
        return ""
    return value.upper()

def len_concat(value1, value2):
    """Return the length of ``value1 + value2``, treating None as an empty string."""
    left, right = ("" if item is None else item for item in (value1, value2))
    return len(left + right)

@f.udf(returnType=t.LongType())
def sum_values(value_1, value_2):
    """Spark UDF declared with a LongType result: add both inputs, counting None as 0."""
    left = value_1 if value_1 is not None else 0
    right = value_2 if value_2 is not None else 0
    return left + right

# Wrap the plain Python helpers as Spark UDFs with explicit return types.
upper_case_udf = f.udf(upper_case, returnType=t.StringType())
len_concat_udf = f.udf(len_concat, returnType=t.IntegerType())


# Outer join of contracts_tmp_df with clients_tmp_df on cod_client == cod_titular.
join_df = contracts_tmp_df.join(
    clients_tmp_df,
    on=f.col("cod_client") == f.col("cod_titular"),
    how="outer",
)
join_df.show()

# Keep every joined column and append one derived column per UDF.
# NOTE(review): sum_values receives "cod_titular" twice -- confirm that is intended.
(
    join_df
    .select(
        *join_df.columns,
        upper_case_udf(f.col("nombre")).alias("nombre_mayus"),
        len_concat_udf(f.col("nombre"), f.col("provincia")).alias("len_concat"),
        sum_values(f.col("cod_titular"), f.col("cod_titular")).alias("sum_values"),
    )
    .show()
)

In [None]:
# spark.sql

# Expose the existing upper-case UDF to Spark SQL under the name UDF_NAME.
spark.udf.register(name="UDF_NAME", f=upper_case_udf)

# Publish the joined DataFrame as a temp view and call the UDF from SQL.
join_df.createOrReplaceTempView(name="view")
spark.sql(
    """
    SELECT UDF_NAME(nombre) AS udf_result, nombre FROM view 
"""
).show()


# Register an inline lambda as a second SQL UDF (no return type is passed to
# register) and query it through the same temp view.
spark.udf.register(
    "UDF_LAMBDA",
    lambda value: value.upper() if value is not None else "",
)
spark.sql(
    """
    SELECT UDF_LAMBDA(nombre) AS udf_result, nombre FROM view 
"""
).show()