- #### Transformaciones
    - ##### agg
    - ##### describe
    - ##### summary
    - ##### crossJoin
    - ##### sample
    - ##### na functions
    - ##### UDF

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook, running locally.
spark = (
    SparkSession.builder
    .appName("sesion_10")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Root folder of the parquet datasets read below.
base_path = "../../resources/data/tmp/parquet/"

# Load the two datasets stored under the "09" folder.
movies_df = spark.read.parquet(f"{base_path}09/movies")
top_movies_df = spark.read.parquet(f"{base_path}09/top_movies")

# Show a single row as a quick check of the loaded data.
movies_df.show(1)

In [None]:
# Total number of rows in the movies dataset.
movies_df.count()

In [None]:
# Print the column names and types of the movies dataset.
movies_df.printSchema()

### agg

In [None]:
import pyspark.sql.functions as f
# Aggregate over the whole DataFrame (no grouping columns).
movies_df.agg(
    # Row count vs. approximate and exact distinct counts of movie_id.
    f.count(f.col("movie_id")).alias("count_movie_id"),
    f.approx_count_distinct("movie_id").alias("approx_count_movie_id"),
    f.count_distinct("movie_id").alias("count_d_movie_id"),
    # Smallest tag / rating time values.
    f.min(f.col("min_time_tag")).alias("min_time_tag"),
    f.min(f.col("min_time_rating")).alias("min_time_rating"),
    # Largest rating count and largest size of the tag_count collection.
    f.max(f.col("count_rating")).alias("max_count_rating"),
    f.max(f.size(f.col("tag_count"))).alias("max_tags"),
).show()

## equivalente a DataFrame.groupBy().agg( ... )

### describe

In [None]:
# Basic statistics with no columns specified.
movies_df.describe().show()

In [None]:
### Column names can be passed explicitly; otherwise describe() covers all numerical or string columns
movies_df.describe(
    "year",
    "avg_rating",
    "stddev_rating",
    "count_rating",
    "min_time_tag",
    "min_time_rating",
).show()

### summary

In [None]:
## Use summary for expanded statistics and control over which statistics to compute.
## Here no statistics are named.
movies_df.summary().show()

In [None]:
# Request only the listed statistics.
movies_df.summary("count", "min", "25%", "75%", "max").show()

### crossJoin

In [None]:
# Keep the summary statistics as a DataFrame so it can be cross-joined later.
df = movies_df.summary()
df.show()

In [None]:
# Small helper DataFrame of consecutive ids built with spark.range.
df_ids = spark.range(start=1, end=10, step=1)
df_ids.show()

In [None]:
# Cartesian product of the summary rows with the ids; count the resulting rows.
df.crossJoin(df_ids).count()

### sample

In [None]:
# Row count scaled by 0.00009, the same fraction passed to sample() in the next cell.
movies_df.count() * 0.00009

In [None]:
# Take a seeded sample (without replacement) of a subset of the columns.
movies_sample_df = (
    movies_df
    .select(
        "movie_id",
        "title",
        "genres",
        "year",
        "min_time_tag",
        "avg_rating",
        "stddev_rating",
        "count_rating",
        "min_time_rating",
    )
    .sample(withReplacement=False, fraction=0.00009, seed=1)
)

# Number of rows that actually ended up in the sample.
movies_sample_df.count()

In [None]:
# Display the sampled rows.
movies_sample_df.show()

### na

In [None]:
# Keep only five rows (and a subset of columns) to build the null examples from.
first_movies_df = (
    movies_df
    .select(
        "movie_id",
        "title",
        "genres",
        "year",
        "min_time_tag",
        "avg_rating",
        "stddev_rating",
        "count_rating",
        "min_time_rating",
    )
    .limit(5)
)
# first_movies_df.show(1)

In [None]:
# Ids produced by spark.range, with the "id" column renamed to "movie_id".
id_df = (
    spark
    .range(start=1, end=11, step=1)
    .select(f.col("id").alias("movie_id"))
)

In [None]:
# Left join on movie_id: ids without a match in first_movies_df keep nulls
# in every other column.
null_df = id_df.join(first_movies_df, on=["movie_id"], how="left")
null_df.show()

##### na.fill

In [None]:
# na.fill is an alias of fillna

# null_df.na.fill("Unknow").show()
# null_df.na.fill(0).show()
# null_df.na.fill(0, ["avg_rating", "stddev_rating", "count_rating"]).show()

# only works with the types int, float, string and bool; it does not work with complex types
#null_df.na.fill({"title": "Unknow", "year": 1970}).show()

##### na.replace

In [None]:
# na.replace is an alias of replace()

# null_df.na.replace(1995, 1900).show()
# null_df.na.replace([1995, 1, 2], [1970, 0, 0]).show()
# null_df.na.replace([1995, 1, 2, 3.89], [1970, 0, 0, 4], ["movie_id", "year"]).show()

# Mixing several value types is not allowed
# null_df.na.replace([1995, 1, 2, "Toy Story (1995)"], [1970, 0, 0, "Toy Story"], ["movie_id", "year"]).show() # -> error

#null_df.na.replace({1995:1970, 1:0, 2:0, 3.89:4}, subset=["movie_id", "year"]).show()

##### na.drop

In [None]:
# na.drop is an alias of dropna

# Chain two fills before experimenting with na.drop: first with 0, then with "Unknow".
drop_df = (
    null_df
    .na.fill(0)
    .na.fill("Unknow")
)

# drop_df.show()

# drop_df.na.drop("any").show()
# drop_df.select("genres", "min_time_tag", "min_time_rating").na.drop("all").show()
# drop_df.na.drop("all", thresh=6).show()

# drop_df.na.drop("any", subset=["genres", "avg_rating", "stddev_rating"]).show()
# drop_df.na.drop("any", thresh=3, subset=["genres", "avg_rating", "stddev_rating"]).show()

### UDF (User Defined Functions)

In [None]:
from pyspark.sql.types import *

In [None]:
## Forma 1 - utilizando la funcion udf

def some_str_function(param_1: str, param_2: str) -> str:
    """Concatenate two strings, ``param_1`` first.

    Args:
        param_1: Left-hand part of the result.
        param_2: Right-hand part of the result.

    Returns:
        The result of ``param_1 + param_2``.
    """
    joined = param_1 + param_2
    return joined

# Wrap the plain Python function as a Spark UDF with a string return type.
# The function is passed directly: a lambda that only forwards its two
# arguments adds nothing.
function_type1_udf = f.udf(some_str_function, StringType())

# Apply the UDF to two columns and show the first row without truncation.
(
    movies_df
    .select(
        "movie_id",
        "title",
        function_type1_udf(f.col("movie_id"), f.col("title")).alias("concat"),
    )
    .show(1, False)
)

In [None]:
## Form 2 - using the udf function as a decorator
@f.udf(returnType=StringType())
def some_str_function_(param_1: str, param_2: str) -> str:
    """Concatenate two strings, ``param_1`` first.

    The decorator turns this function into a Spark UDF declared with a
    string return type.
    """
    joined = param_1 + param_2
    return joined

# Apply the decorated UDF to two columns and show the first row untruncated.
(
    movies_df
    .select(
        "movie_id",
        "title",
        some_str_function_(f.col("movie_id"), f.col("title")).alias("concat"),
    )
    .show(n=1, truncate=False)
)

##### Utilizar UDF en sql

In [None]:
# Expose movies_df to Spark SQL under the view name "movies".
movies_df.createOrReplaceTempView("movies")

# Read the view back by name and show one row.
spark.table("movies").show(1)

In [None]:
# Register the plain Python function so SQL can call it as CONCAT_UDF_V1.
spark.udf.register("CONCAT_UDF_V1", some_str_function)

# Call the registered function from a SQL query.
(
    spark
    .sql("select CONCAT_UDF_V1(movie_id, title) as concat, movie_id, title from movies")
    .show(1)
)

In [None]:
# Register the already-built UDF object so SQL can call it as CONCAT_UDF_V2.
spark.udf.register("CONCAT_UDF_V2", function_type1_udf)

# Call the registered function from a SQL query.
(
    spark
    .sql("select CONCAT_UDF_V2(movie_id, title) as concat, movie_id, title from movies")
    .show(1)
)

##### UDF isn't null-safe!!

In [None]:
def sum_values(val_1, val_2):
    """Return ``val_1 + val_2``.

    No ``None`` check is performed: the arguments are added as they come.
    """
    total = val_1 + val_2
    return total

# Wrap sum_values as a UDF with a double return type. The function is passed
# directly: a lambda that only forwards its arguments adds nothing.
number_ops_udf = f.udf(sum_values, DoubleType())

# Make the UDF callable from SQL.
spark.udf.register("NUMBER_UPS_UDF", number_ops_udf)

# Build the query restricted to rows whose avg_rating is null. The chain is
# parenthesized so the optional action can be toggled without relying on a
# trailing backslash followed by a comment line.
# NOTE(review): .show(10) is presumably left commented out because sum_values
# has no None handling and the action would fail on these rows - confirm.
(
    spark
    .sql("""select avg_rating, stddev_rating, NUMBER_UPS_UDF(avg_rating, stddev_rating) as sum_values from movies""")
    .filter(f.col("avg_rating").isNull())
    # .show(10)
)

In [None]:
def sum_values_null_safe(val_1, val_2):
    """Add two values, treating a missing operand as "no result".

    Args:
        val_1: First operand, or ``None``.
        val_2: Second operand, or ``None``.

    Returns:
        ``0.0`` when either argument is ``None``; otherwise ``val_1 + val_2``.
    """
    # Logical `or` (not bitwise `|`) is the idiomatic way to combine the two
    # identity checks, and it short-circuits.
    if val_1 is None or val_2 is None:
        return 0.0
    return val_1 + val_2

# Wrap the null-safe function as a UDF with a double return type. The function
# is passed directly: a lambda that only forwards its arguments adds nothing.
number_ops_udf = f.udf(sum_values_null_safe, DoubleType())

# Register under the name NUMBER_UPS_UDF so the SQL below calls this version.
spark.udf.register("NUMBER_UPS_UDF", number_ops_udf)

# Run the query on the rows whose avg_rating is null and show up to 10 of them.
(
    spark
    .sql("""select avg_rating, stddev_rating, NUMBER_UPS_UDF(avg_rating, stddev_rating) as sum_values from movies""")
    .filter(f.col("avg_rating").isNull())
    .show(10)
)