In [None]:
from datetime import datetime
import time
import numpy as np
import pandas as pd

import pyspark.sql.functions as f
from pyspark.sql.types import ArrayType, StringType, IntegerType, BooleanType, DoubleType, FloatType, DateType, StructType, StructField, ArrayType

from sklearn.linear_model import LinearRegression

import seaborn as sns

# UDFs

In this notebook, you'll learn how to create and use User Defined Functions (UDFs) in PySpark. UDFs are a way to extend the functionality of PySpark by creating custom functions that can be used in PySpark SQL queries.

These functions are especially useful when you need to perform complex operations on your data that are not supported by the built-in functions in PySpark.

Let's start by creating the dummy orders DataFrame that we've used in the previous notebook. This DataFrame contains information about orders placed by customers, and has a column with complex data types.

In [None]:
# Sample data
# Each raw order is written compactly as:
#   (order_id, order date as YYYY-MM-DD, [(item name, quantity, price), ...], discounted item names)
_raw_orders = [
    (5642, "2024-05-18",
     [("Apple", 1.0, 2.99), ("Banana", 1.7, 1.99)],
     ['Apple']),
    (9762, "2024-05-02",
     [("Strawberry", 0.5, 6.99), ("Apple", 3.0, 2.99), ("Peach", 2.5, 3.39)],
     ['Apple', 'Peach']),
    (3652, "2024-05-23",
     [("Banana", 1.5, 1.99)],
     []),
    (1276, "2024-05-10",
     [("Apple", 2.0, 2.99), ("Banana", 0.5, 1.99), ("Strawberry", 1.0, 6.99),
      ("Strawberry", 1.0, 6.99), ("Peach", 1.0, 3.39)],
     ['Peach', 'Banana']),
    (8763, "2024-05-14",
     [("Strawberry", 1.0, 6.99), ("Peach", 1.0, 3.39), ("Mango", 1.5, 5.99)],
     ['Mango']),
    (7652, "2024-05-22",
     [("Banana", 1.0, 1.99), ("Mango", 1.5, 5.99)],
     ['Mango', 'Banana']),
    (7631, "2024-05-22",
     [("Banana", 1.0, 1.99), ("Banana", 2.5, 1.99)],
     []),
]

# Expand every raw order into the dictionary layout expected by the DataFrame schema
data = [
    {
        "order_id": order_id,
        "order_date": datetime.strptime(order_day, "%Y-%m-%d").date(),
        "items": [
            {"name": name, "quantity": quantity, "price": price}
            for name, quantity, price in order_items
        ],
        'items_discount': discounted,
    }
    for order_id, order_day, order_items, discounted in _raw_orders
]

# Define the schema
# Struct describing a single entry of the 'items' array
item_struct = StructType([
    StructField('name', StringType(), nullable=False),
    StructField('quantity', FloatType(), nullable=False),
    StructField('price', FloatType(), nullable=False),
])

schema = StructType([
    StructField('order_id', IntegerType(), nullable=False),
    StructField('order_date', DateType(), nullable=False),
    # Non-nullable array whose elements (the item structs) cannot be null either
    StructField('items', ArrayType(item_struct, containsNull=False), nullable=False),
    StructField("items_discount", ArrayType(StringType()), nullable=True),
])


# Create DataFrame
df_fruitshop = spark.createDataFrame(data, schema=schema)

df_fruitshop.display()

Let's check the schema of this DataFrame.

In [None]:
# Print the schema of the orders DataFrame
df_fruitshop.printSchema()

## UDFs VS Built-in Functions

UDFs can be used to extract information from complex data types, such as arrays and structs. Let's see how we can answer some questions from the last notebook using UDFs instead of the built-in functions in PySpark.

1. Get the name of each product in the 'items' column.

In the last notebook, we used the `transform` spark function. This applies a function to each element in an array and returns an array of the same length.

In [None]:
# Build the column expression: `transform` takes an array column and a function,
# and applies that function to every element of the array.
# Here each element of 'items' is reduced to its 'name' field.
fn = f.transform(f.col('items'), lambda item: item['name'])

# Store the result of the transformation in a new 'item_names' column
df_fruitshop_item_names = df_fruitshop.withColumn('item_names', fn)

df_fruitshop_item_names.display()

Let's see how we can achieve the same result using a UDF.

First we need to define a python function that receives a certain argument and returns the desired output.

In [None]:
def extract_item_names(items):
    """Return the 'name' of every entry in `items`, in the same order."""
    names = []
    for item in items:
        names.append(item['name'])
    return names

Then we need to register this function as a UDF using the `udf` function from the `pyspark.sql.functions` module.

The `udf` function receives two arguments: the python function that we want to use as a UDF, and the return type of this function.

In [None]:
# Wrap the Python function as a Spark UDF; `returnType` declares the Spark
# type of the value the function returns (an array of strings).
extract_item_names_udf = f.udf(
    extract_item_names,
    returnType=ArrayType(StringType()),
)

Finally, we can use this UDF in a PySpark SQL query to extract the information we need.

In [None]:
# Evaluate the UDF on the 'items' column and keep its output as 'item_names'
item_names_col = extract_item_names_udf(f.col('items'))
df_fruitshop_item_names = df_fruitshop.withColumn('item_names', item_names_col)

df_fruitshop_item_names.display()

The UDF is applied to the column `items` of each row in the DataFrame. Therefore, it is important that the function we defined is expecting the same data type as the column we are applying it to.

Our function expects a list of structs, just like the `items` column in the DataFrame.

2. Get the number of items in each order.

In the last notebook, we used the `size` spark function. This returns the number of elements in an array.

In [None]:
# `size` gives the number of elements in the 'items' array of each order
nr_items_col = f.size(f.col('items'))
df_fruitshop_size = df_fruitshop_item_names.withColumn('nr_items', nr_items_col)

df_fruitshop_size.display()

Now let's see how we can achieve the same result using a UDF.

In [None]:
# Python function behind the UDF: it receives the item list of one order and returns its size
def count_items(items):
    """Return how many entries `items` holds."""
    nr_items = len(items)
    return nr_items

# Turn the function into a UDF whose declared Spark return type is an integer
count_items_udf = f.udf(count_items, returnType=IntegerType())

# Run the UDF over the 'items' column and keep the result as 'nr_items'
nr_items_col = count_items_udf(f.col('items'))
df_fruitshop_size = df_fruitshop_item_names.withColumn('nr_items', nr_items_col)

df_fruitshop_size.display()

3. Get the number of unique item names in each order.

In the last notebook, we used the `size` and `array_distinct` spark functions. The `array_distinct` function returns the distinct elements in an array, while the `size` function returns the number of elements in that array.

In [None]:
# Step 1: remove repeated names from the 'item_names' array
df_fruitshop_distinct_names = df_fruitshop_size.withColumn(
    'unique_item_names', f.array_distinct(f.col('item_names'))
)

# Step 2: count the names that are left
df_fruitshop_unique = df_fruitshop_distinct_names.withColumn(
    'nr_unique_items', f.size(f.col('unique_item_names'))
)

df_fruitshop_unique.display()

When using UDFs, we can combine these two functions into a single UDF.

When our UDF is applied to the `items` column, we can get the number of unique item names in each order using python functions instead of the built-in functions in PySpark.

In [None]:
# Python function behind the UDF: it receives the item list of one order and
# returns the number of different item names in it
def count_unique_items(items):
    """Return the number of distinct 'name' values among `items`."""
    # A set keeps each name only once
    distinct_names = {item['name'] for item in items}
    return len(distinct_names)

# Expose the function to Spark as a UDF that returns an integer
count_unique_items_udf = f.udf(count_unique_items, returnType=IntegerType())

# One UDF call on 'items' produces the count directly, without an intermediate column
nr_unique_items_col = count_unique_items_udf(f.col('items'))
df_fruitshop_unique = df_fruitshop_size.withColumn('nr_unique_items', nr_unique_items_col)

df_fruitshop_unique.display()

4. Check if an order contains a specific item (Banana).

In the last notebook, we used the `array_contains` spark function. This function returns true if the specified value is found in the array.

In [None]:
# `array_contains` is true for the orders whose 'item_names' array includes 'Banana'
has_banana_col = f.array_contains(f.col('item_names'), 'Banana')
df_fruitshop_banana = df_fruitshop_unique.withColumn('has_banana', has_banana_col)

df_fruitshop_banana.display()

Once again, we do not need any intermediate column to check if 'Banana' is in the 'items' column. We can apply a single UDF to the 'items' column to check if 'Banana' is in the list of items for each order.

In [None]:
def contains_banana(items):
    """Return True when at least one entry of `items` is named 'Banana'."""
    # Collect every name first, then test membership
    names = []
    for item in items:
        names.append(item['name'])
    return 'Banana' in names

# Register the check as a UDF that returns a boolean
contains_banana_udf = f.udf(contains_banana, returnType=BooleanType())

# Apply it straight to the 'items' column; no intermediate 'item_names' column is needed
has_banana_col = contains_banana_udf(f.col('items'))
df_fruitshop_banana = df_fruitshop_unique.withColumn('has_banana', has_banana_col)

df_fruitshop_banana.display()

## Complex operations

Now you know how to create and use UDFs in PySpark.

However, we have only recreated the functionality of already existing PySpark functions. UDFs are especially useful when you need to perform complex operations on your data that are not supported by the built-in functions in PySpark.

Let's see some examples.


1. Calculate the average price per unit of each order, considering only items that were bought in a quantity greater than 1.

Let's break the question into steps:
- For each order, consider only items with quantity greater than 1.
- Calculate the total price and the total quantity of items bought in each order.
- Calculate the average price per unit of each order.

In [None]:
# Python function behind the UDF: average price per unit over the items bought in a quantity > 1
def avg_price_per_unit(items):
    """Return the quantity-weighted average price of the items with quantity above 1.

    Items whose quantity is not greater than 1 are ignored. When no item
    qualifies, 0.0 is returned.
    """
    spent = 0.0
    units = 0.0

    for item in items:
        quantity = item['quantity']
        # Skip every item that does not satisfy quantity > 1
        if not quantity > 1:
            continue
        spent += item['price'] * quantity
        units += quantity

    # Avoid dividing by zero when nothing qualified
    return spent / units if units > 0 else 0.0
    
# Register the function as a UDF that returns a double
avg_price_per_unit_udf = f.udf(avg_price_per_unit, returnType=DoubleType())

# Apply the UDF to the 'items' column: one average price per unit for each order
avg_price_col = avg_price_per_unit_udf(f.col('items'))
avg_price_per_unit_df = df_fruitshop.withColumn('avg_price_per_unit', avg_price_col)

avg_price_per_unit_df.display()

2. Identify the order with the highest total cost, considering both item price and quantity.

In [None]:
# Define a UDF to calculate total cost for each order
def calculate_total_cost(items):
    """Return the total cost of an order: the sum of quantity * price over its items.

    The result is always a float, including 0.0 for an order without items.
    """
    # Start from a float rather than the int 0. The UDF built on this function
    # is registered with DoubleType, and a regular PySpark UDF yields null when
    # the returned Python value does not match the declared type, which is what
    # an int result for an empty order would cause.
    total_cost = 0.0

    for item in items:
        total_cost += item['quantity'] * item['price']

    return total_cost

# Register the function as a UDF that returns a double
calculate_total_cost_udf = f.udf(calculate_total_cost, returnType=DoubleType())

# Add a column with the total cost for each order
df_fruitshop_costed = df_fruitshop.withColumn(
    'total_cost', calculate_total_cost_udf(f.col('items'))
)

# Sort from the most to the least expensive order and keep only the first row
df_fruitshop_with_total_cost = (
    df_fruitshop_costed
    .orderBy(f.desc(f.col('total_cost')))
    .limit(1)
)

df_fruitshop_with_total_cost.display()

## Pandas UDFs

Pandas UDFs are similar to regular UDFs, but they operate on Pandas DataFrames instead of individual rows. This allows you to perform vectorized operations on Pandas DataFrame chunks, which are more efficient than row-by-row operations typical in regular UDFs.

Here are two of the main advantages of using Pandas UDFs:

1. **Performance Optimization:** Pandas UDFs can perform vectorized operations on Pandas DataFrame chunks, which are more efficient than row-by-row operations typical in regular UDFs.
2. **Ease of Use and Familiarity:** Pandas UDFs allow you to leverage the rich functionality of Pandas, including its extensive libraries for data manipulation, statistical operations, and more.

Let's download some data to demonstrate how to use Pandas UDFs in PySpark.

In [None]:
%sh

# -nc (no-clobber): skip the download when spotify.json is already present, so
# re-running this cell does not leave extra copies named spotify.json.1, .2, ...
wget -nc https://raw.githubusercontent.com/inesmcm26/lp-big-data/main/data/spotify.json

In [None]:
%fs

cp file:/databricks/driver/spotify.json dbfs:/FileStore/lp-big-data/spotify.json

The dataset we've downloaded contains information about spotify playlists. Let's look at the schema and the first few rows of this DataFrame.

In [None]:
# Read the playlists JSON file from the location it was copied to
spotify_path = "/FileStore/lp-big-data/spotify.json"
df_spotify = spark.read.format('json').load(spotify_path)

df_spotify.display()

In [None]:
# Print the schema of the playlists DataFrame
df_spotify.printSchema()

As you can see in the schema, each record corresponds to a playlist.

Each playlist is characterized by its name, the number of followers, a collaborative indicator, and a list of tracks. This list of tracks is a complex data type, as it contains a list of structs with information about each track.

Let's answer some questions to demonstrate how you can use Pandas UDFs in PySpark.

1. Get the number of tracks in each playlist

This is a simple question and we can answer it using built-in functions in PySpark.

Let's do it and record the time it takes to run this query.

In [None]:
# Time the built-in approach; the measurement spans both defining and displaying the result
start = time.time()

# `size` returns the number of elements in each playlist's 'tracks' array
num_tracks_col = f.size(f.col("tracks"))
df_num_tracks = df_spotify.withColumn("num_tracks", num_tracks_col)

df_num_tracks.display()

end = time.time()

# Elapsed wall-clock time in seconds
time_spark_functions = end - start

We can also use a simple UDF to answer this question. Let's see how we can do it.

In [None]:
# Python function that counts the length of a list of tracks
def count_tracks(tracks):
    """Return how many entries `tracks` holds."""
    nr_tracks = len(tracks)
    return nr_tracks

# Turn the function into a UDF whose declared Spark return type is an integer
count_tracks_udf = f.udf(count_tracks, returnType=IntegerType())

# Time the regular-UDF approach; the measurement spans both defining and displaying the result
start = time.time()

# Apply the UDF to the tracks column
num_tracks_col = count_tracks_udf(f.col("tracks"))
df_num_tracks = df_spotify.withColumn("num_tracks", num_tracks_col)

df_num_tracks.display()

end = time.time()

# Elapsed wall-clock time in seconds
time_udf = end - start

Finally, let's see how we can use a Pandas UDF to answer this question.

A pandas UDF is a special type of udf that uses a Pandas DataFrame/Series as input and returns a Pandas DataFrame/Series as output. This allows you to perform vectorized operations on Pandas DataFrame chunks, which are more efficient than row-by-row operations typical in regular UDFs.

To define a Pandas UDF, you need to use the `pandas_udf` decorator from the `pyspark.sql.functions` module. This decorator receives the return type of the function as argument.

Since we will apply the Pandas UDF to the 'tracks' column, our function should expect a pandas Series as input. It will return the length of each element in the Series, which corresponds to another Series.

In [None]:
# `pandas_udf` is used as a decorator; its argument is the Spark type of the returned values.
# Note: this rebinds the name `count_tracks`, which previously held the plain Python function.
@f.pandas_udf(IntegerType())
def count_tracks(tracks: pd.Series) -> pd.Series:
    """Return a Series holding the length of each element of `tracks`."""
    # `apply` calls `len` on every element of the Series
    lengths = tracks.apply(len)
    return lengths

# Time the Pandas-UDF approach; the measurement spans both defining and displaying the result
start = time.time()

# Apply the Pandas UDF to the tracks column
num_tracks_col = count_tracks(f.col("tracks"))
df_num_tracks = df_spotify.withColumn("num_tracks", num_tracks_col)

df_num_tracks.display()

end = time.time()

# Elapsed wall-clock time in seconds
time_pandas_udf = end - start

Now let's compare the performance of the three methods we used to answer the question.

In [None]:
# Compare the elapsed time of the three approaches used to count the tracks
ax = sns.barplot(
    x=['Built-In Functions', 'UDF', 'Pandas UDF'],
    y=[time_spark_functions, time_udf, time_pandas_udf],
)
# Label the chart so it can be read on its own: the bars are elapsed seconds
ax.set(xlabel='Method', ylabel='Elapsed time (s)', title='Number of tracks per playlist');

We can see that using built-in functions is the fastest way to answer this question.

Using Pandas UDFs is more efficient than using regular UDFs, but it is still slower than using built-in functions.

2. Get the number of albums in each playlist

In [None]:
# 1. Using pyspark built-in functions
start = time.time()

# Pull the album name out of every track of the playlist ...
album_names_col = f.transform(f.col("tracks"), lambda track: track["album_name"])
df_with_albums = df_num_tracks.withColumn("albums", album_names_col)

# ... then count the distinct album names
num_albums_col = f.size(f.array_distinct(f.col("albums")))
df_num_albums = df_with_albums.withColumn("num_albums", num_albums_col)

df_num_albums.display()

end = time.time()

# Elapsed wall-clock time in seconds
time_spark_functions = end - start


In [None]:
# 2. Using UDF
def count_albums(tracks):
    """Return the number of distinct 'album_name' values among `tracks`."""
    # A set keeps each album name only once
    album_names = {song["album_name"] for song in tracks}
    return len(album_names)

# Register the function as a UDF that returns an integer
count_albums_udf = f.udf(count_albums, returnType=IntegerType())

# Time the regular-UDF approach; the measurement spans both defining and displaying the result
start = time.time()

num_albums_col = count_albums_udf(f.col("tracks"))
df_num_albums = df_num_tracks.withColumn("num_albums", num_albums_col)

df_num_albums.display()

end = time.time()

# Elapsed wall-clock time in seconds
time_udf = end - start

In [None]:
# 3. Using pandas udfs
# Note: this rebinds the name `count_albums`, which previously held the plain Python function.
@f.pandas_udf(IntegerType())
def count_albums(tracks: pd.Series) -> pd.Series:
    """Return a Series with the number of distinct album names of each element of `tracks`."""

    def _distinct_albums(playlist_tracks):
        # Same computation as the regular UDF, for one element of the Series
        return len(set([song["album_name"] for song in playlist_tracks]))

    return tracks.apply(_distinct_albums)

# Time the Pandas-UDF approach; the measurement spans both defining and displaying the result
start = time.time()

num_albums_col = count_albums(f.col("tracks"))
df_num_albums = df_num_tracks.withColumn("num_albums", num_albums_col)

df_num_albums.display()

end = time.time()

# Elapsed wall-clock time in seconds
time_pandas_udf = end - start

Let's compare the performance of the three methods we used to answer the question.

In [None]:
# Compare the elapsed time of the three approaches used to count the albums
ax = sns.barplot(
    x=['Built-In Functions', 'UDF', 'Pandas UDF'],
    y=[time_spark_functions, time_udf, time_pandas_udf],
)
# Label the chart so it can be read on its own: the bars are elapsed seconds
ax.set(xlabel='Method', ylabel='Elapsed time (s)', title='Number of albums per playlist');

Now we can see that the built-in function is slower, probably because we need to use two functions to get the desired result.

On the other hand, Pandas UDFs are still more efficient than regular UDFs.

3. Get the total duration of each playlist

In [None]:
# 1. Using pyspark built-in functions
start = time.time()

# Fold each playlist's own 'tracks' array into a running sum with the
# higher-order `aggregate` function. Unlike explode + groupBy('playlist_name'),
# this keeps one row per input playlist (playlists sharing a name are not
# merged), keeps all the other columns, and computes the same per-row result
# as the UDF variants it is compared against.
# Both the start value and each duration are cast to long so the accumulator
# type stays the same at every step.
total_duration_col = f.aggregate(
    f.col('tracks'),
    f.lit(0).cast('long'),
    lambda acc, track: acc + track['duration_ms'].cast('long'),
)

df_total_duration = df_num_albums.withColumn('total_duration', total_duration_col)

df_total_duration.display()

end = time.time()

# Elapsed wall-clock time in seconds
time_spark_functions = end - start

In [None]:
# 2. Using UDF
def sum_durations(tracks):
    """Return the sum of the 'duration_ms' values of `tracks`."""
    durations = (song['duration_ms'] for song in tracks)
    return sum(durations)

# Register the function as a UDF that returns an integer
sum_durations_udf = f.udf(sum_durations, returnType=IntegerType())

# Time the regular-UDF approach; the measurement spans both defining and displaying the result
start = time.time()

total_duration_col = sum_durations_udf(f.col('tracks'))
df_total_duration = df_num_albums.withColumn('total_duration', total_duration_col)

df_total_duration.display()

end = time.time()

# Elapsed wall-clock time in seconds
time_udf = end - start

In [None]:
# 3. Using pandas udfs
# Note: this rebinds the name `sum_durations`, which previously held the plain Python function.
@f.pandas_udf(IntegerType())
def sum_durations(tracks: pd.Series) -> pd.Series:
    """Return a Series with the summed 'duration_ms' of each element of `tracks`."""

    def _playlist_duration(playlist_tracks):
        # Same computation as the regular UDF, for one element of the Series
        return sum([song['duration_ms'] for song in playlist_tracks])

    return tracks.apply(_playlist_duration)

# Time the Pandas-UDF approach; the measurement spans both defining and displaying the result
start = time.time()

total_duration_col = sum_durations(f.col('tracks'))
df_total_duration = df_num_albums.withColumn('total_duration', total_duration_col)

df_total_duration.display()

end = time.time()

# Elapsed wall-clock time in seconds
time_pandas_udf = end - start

Once again, let's compare the performance of the three methods we used to answer the question.

In [None]:
# Compare the elapsed time of the three approaches used to sum the track durations
ax = sns.barplot(
    x=['Built-In Functions', 'UDF', 'Pandas UDF'],
    y=[time_spark_functions, time_udf, time_pandas_udf],
)
# Label the chart so it can be read on its own: the bars are elapsed seconds
ax.set(xlabel='Method', ylabel='Elapsed time (s)', title='Total duration per playlist');

Now we can see that using built-in functions is faster than using regular UDFs, but slower than using Pandas UDFs.

This is because the operation we are computing is complex and leveraging the rich functionality of Pandas is more efficient than using built-in functions.

4. Finally, let's predict the number of followers of each playlist based on the number of tracks, number of albums, and total duration.

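To answer this question we should use a Pandas UDF, for two reasons. First, it is a complex question that involves multiple operations.

Second, we can perform vectorized operations on Pandas DataFrame chunks, which are more efficient than the row-by-row operations typical of regular UDFs. Keep in mind that the function receives one chunk of rows at a time, so the model below is fitted on each chunk separately rather than on the whole DataFrame.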

Let's use a linear regression model to predict the number of followers of each playlist based on the number of tracks, number of albums, and total duration.

In [None]:
@f.pandas_udf(FloatType())
def predict_num_followers_udf(
    num_tracks: pd.Series,
    num_albums: pd.Series,
    total_duration: pd.Series,
    num_followers: pd.Series
) -> pd.Series:
    """Fit a linear regression on the received rows and return its predictions for them.

    The features are the number of tracks, the number of albums and the total
    duration; the target is the number of followers. The model is fitted on
    exactly the rows passed to this call and then predicts on those same rows.
    """
    # Stack the three feature Series as rows, then transpose so that each
    # input row becomes one row of the feature matrix
    features = np.vstack([num_tracks, num_albums, total_duration]).T
    target = num_followers.values

    regressor = LinearRegression()
    regressor.fit(features, target)

    return pd.Series(regressor.predict(features))

# Feed the three feature columns and the target column to the Pandas UDF
predicted_followers_col = predict_num_followers_udf(
    f.col('num_tracks'),
    f.col('num_albums'),
    f.col('total_duration'),
    f.col('num_followers'),
)

# Keep the predictions next to the original columns
df_predictions = df_total_duration.withColumn('predicted_num_followers', predicted_followers_col)

df_predictions.display()

----

Now you know how to create and use UDFs and Pandas UDFs in PySpark. Go to the `exercises` notebook and practice what you've learned.