In [0]:
from pyspark import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType
from pyspark.sql.functions import regexp_extract, split, from_unixtime, col, avg, min, max, desc
from pyspark.sql.functions import grouping, explode, array_contains, udf


# Uso de UDFs: Funciones Personalizadas en Spark

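Las UDF (User-Defined Functions) permiten aplicar funciones de Python a las columnas de un DataFrame. Úsalas solo cuando las funciones nativas de Spark no sean suficientes: suelen ser más lentas porque los datos deben serializarse entre la JVM y el intérprete de Python.
En este cuaderno analizamos los títulos de las películas del dataset MovieLens.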


In [0]:
# Movies table: explicit schema, so column types are declared rather than inferred.
movies_schema = StructType(fields=[
    StructField("movieId", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("genres", StringType(), True),
])

# DBFS location of the movies CSV file.
MOVIES_CSV_PATH = "dbfs:/FileStore/tables/movies.csv"

# Regular expressions are written as raw strings: sequences such as "\|" or
# "\(" in a normal string literal are invalid Python escapes and trigger a
# DeprecationWarning / SyntaxWarning on recent interpreters.
GENRES_SEPARATOR_RE = r"\|"               # genres are pipe-separated
YEAR_RE = r"^.+\(([0-9]+)\)$"             # trailing "(digits)" -> group 1
TITLE_RE = r"^(.+?) \([0-9]+\)$"          # text before the trailing " (digits)"

movies_raw = (
    spark.read
    .option("header", True)
    .schema(movies_schema)
    .csv(MOVIES_CSV_PATH)
)

moviesDf = (
    movies_raw
    # Replace the "genres" string with an array of genre names.
    .withColumn("genresSplit", split(movies_raw["genres"], GENRES_SEPARATOR_RE))
    .drop("genres")
    .withColumnRenamed("genresSplit", "genres")
    # Extract the parenthesised number at the end of the title as an integer "year".
    .withColumn(
        "year",
        regexp_extract(movies_raw["title"], YEAR_RE, 1).cast(IntegerType()),
    )
    # Keep only the part of the title that precedes the trailing " (digits)".
    .withColumn("title_temp", regexp_extract(movies_raw["title"], TITLE_RE, 1))
    .drop("title")
    .withColumnRenamed("title_temp", "title")
)

In [0]:
# UDF body: count the number of words in a movie title.
def contar_palabras(titulo) -> int:
    """Return the number of whitespace-separated words in ``titulo``.

    Args:
        titulo: The title string, or ``None``.

    Returns:
        The word count. ``None``, the empty string and whitespace-only
        strings all yield 0.
    """
    if not titulo:
        return 0
    # str.split() with no separator collapses runs of whitespace and ignores
    # leading/trailing whitespace, so empty tokens are never counted as words
    # (split(" ") would report 3 "words" for "Toy  Story").
    return len(titulo.split())

# Wrap the Python function as a Spark UDF that returns an integer column.
contar_palabras_udf = udf(f=contar_palabras, returnType=IntegerType())




In [0]:
%time
# Aplicar UDF
moviesDf = moviesDf.withColumn("num_palabras_titulo", contar_palabras_udf(col("title")))
moviesDf.select("title", "num_palabras_titulo").show(10, truncate=False)



CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs
+---------------------------+-------------------+
|title                      |num_palabras_titulo|
+---------------------------+-------------------+
|Toy Story                  |2                  |
|Jumanji                    |1                  |
|Grumpier Old Men           |3                  |
|Waiting to Exhale          |3                  |
|Father of the Bride Part II|6                  |
|Heat                       |1                  |
|Sabrina                    |1                  |
|Tom and Huck               |3                  |
|Sudden Death               |2                  |
|GoldenEye                  |1                  |
+---------------------------+-------------------+
only showing top 10 rows



In [0]:
%time
# Comparar con una función Spark nativa (split)
from pyspark.sql.functions import size, split
moviesDf = moviesDf.withColumn("num_palabras_nativo", size(split(col("title"), " ")))
moviesDf.select("title", "num_palabras_titulo", "num_palabras_nativo").show(10, truncate=False)

# Nota: siempre que sea posible, prefiere las funciones Spark nativas por rendimiento.

CPU times: user 1 µs, sys: 1 µs, total: 2 µs
Wall time: 4.05 µs
+---------------------------+-------------------+-------------------+
|title                      |num_palabras_titulo|num_palabras_nativo|
+---------------------------+-------------------+-------------------+
|Toy Story                  |2                  |2                  |
|Jumanji                    |1                  |1                  |
|Grumpier Old Men           |3                  |3                  |
|Waiting to Exhale          |3                  |3                  |
|Father of the Bride Part II|6                  |6                  |
|Heat                       |1                  |1                  |
|Sabrina                    |1                  |1                  |
|Tom and Huck               |3                  |3                  |
|Sudden Death               |2                  |2                  |
|GoldenEye                  |1                  |1                  |
+-------------------------