In [None]:
from datetime import datetime

import numpy as np
import pandas as pd
import pyspark.sql.functions as f
from pyspark.sql.types import (
    ArrayType,
    BooleanType,
    DateType,
    DoubleType,
    FloatType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
)
from sklearn.linear_model import LinearRegression

Create the dataframe with complex data types

In [None]:
import pandas as pd

# Sample data: one record per order, each holding the purchased items and the
# names of the items that were discounted in that order.
def _item(name, quantity, price):
    """Build a single order line with `name`, `quantity` and `price` keys."""
    return {"name": name, "quantity": quantity, "price": price}


def _order(order_id, order_date, items, items_discount):
    """Build one order record; `order_date` is given as a `YYYY-MM-DD` string."""
    return {
        "order_id": order_id,
        "order_date": datetime.strptime(order_date, "%Y-%m-%d").date(),
        "items": items,
        'items_discount': items_discount,
    }


data = [
    _order(
        5642, "2024-05-18",
        [_item("Apple", 1.0, 2.99), _item("Banana", 1.7, 1.99)],
        ['Apple'],
    ),
    _order(
        9762, "2024-05-02",
        [
            _item("Strawberry", 0.5, 6.99),
            _item("Apple", 3.0, 2.99),
            _item("Peach", 2.5, 3.39),
        ],
        ['Apple', 'Peach'],
    ),
    _order(
        3652, "2024-05-23",
        [_item("Banana", 1.5, 1.99)],
        [],
    ),
    _order(
        1276, "2024-05-10",
        [
            _item("Apple", 2.0, 2.99),
            _item("Banana", 0.5, 1.99),
            _item("Strawberry", 1.0, 6.99),
            _item("Strawberry", 1.0, 6.99),
            _item("Peach", 1.0, 3.39),
        ],
        ['Peach', 'Banana'],
    ),
    _order(
        8763, "2024-05-14",
        [
            _item("Strawberry", 1.0, 6.99),
            _item("Peach", 1.0, 3.39),
            _item("Mango", 1.5, 5.99),
        ],
        ['Mango'],
    ),
    _order(
        7652, "2024-05-22",
        [_item("Banana", 1.0, 1.99), _item("Mango", 1.5, 5.99)],
        ['Mango', 'Banana'],
    ),
    _order(
        7631, "2024-05-22",
        [_item("Banana", 1.0, 1.99), _item("Banana", 2.5, 1.99)],
        [],
    ),
]

# Define the schema, building the nested types first so the top level stays flat.

# One purchased item: name, quantity and price, none of them nullable
item_struct = StructType([
    StructField('name', StringType(), False),
    StructField('quantity', FloatType(), False),
    StructField('price', FloatType(), False),
])

# Array of items; the second argument (containsNull) is False
items_array = ArrayType(item_struct, False)

schema = StructType([
    StructField('order_id', IntegerType(), False),
    StructField('order_date', DateType(), False),
    StructField('items', items_array, False),
    StructField("items_discount", ArrayType(StringType()), True),
])


# Build the Spark DataFrame from the sample records using the explicit schema
df_fruitshop = spark.createDataFrame(data=data, schema=schema)

# Render the resulting table in the notebook
df_fruitshop.display()

Transform

In [None]:
# Column expression built with `transform`: it receives an array column and a
# function, and applies that function to every element of the array.
fn = f.transform(f.col('items'), lambda item: item['name'])

# Evaluate the expression and store the result in a new `item_names` column
df_fruitshop_transformed = df_fruitshop.withColumn('item_names', fn)

df_fruitshop_transformed.display()

In [None]:
def extract_item_names(items):
    """Return the `name` of every entry in `items`, keeping the input order."""
    names = []
    for entry in items:
        names.append(entry['name'])
    return names

# Register the Python function as a UDF that returns an array of strings
extract_item_names_udf = f.udf(extract_item_names, returnType=ArrayType(StringType()))

# Same result as the built-in `transform` version, computed through the UDF
df_fruitshop.withColumn('item_names', extract_item_names_udf(f.col('items'))).display()

Size

In [None]:

# `size` gives the number of elements held by the `items` array of each order
df_fruitshop_size = df_fruitshop_transformed.withColumn('nr_items', f.size(f.col('items')))

df_fruitshop_size.display()

In [None]:
def count_items(items):
    """Return how many entries the `items` sequence holds."""
    n_items = len(items)
    return n_items

# Register the Python function as a UDF that returns an integer
count_items_udf = f.udf(count_items, returnType=IntegerType())

# Same result as the built-in `size` version, computed through the UDF
df_fruitshop.withColumn('nr_items', count_items_udf(f.col('items'))).display()

Array distinct

In [None]:
df_fruitshop_unique = (
    df_fruitshop_size
    # Remove repeated names so every item name appears once per order
    .withColumn('unique_item_names', f.array_distinct(f.col('item_names')))
    # Count the distinct item names of each order
    .withColumn('nr_unique_items', f.size(f.col('unique_item_names')))
)

df_fruitshop_unique.display()

In [None]:
def count_unique_items(items):
    """Return the number of distinct item names found in `items`."""
    distinct_names = {entry['name'] for entry in items}
    return len(distinct_names)

# `count_unique_items` returns a single int, so the UDF is declared with
# IntegerType(). The value a Python UDF produces must match its declared
# return type; an array type here does not match the returned int.
count_unique_items_udf = f.udf(count_unique_items, IntegerType())

# Same result as the built-in `array_distinct` + `size` version, via the UDF
(
    df_fruitshop
    .withColumn(
        'nr_unique_items',
        count_unique_items_udf(f.col('items'))
    )
).display()

Array contains

In [None]:
# Keep only the orders whose distinct item names include 'Banana'
filtered_fruitshop_df = df_fruitshop_unique.where(
    f.array_contains(f.col('unique_item_names'), 'Banana')
)

filtered_fruitshop_df.display()

In [None]:
def contains_banana(items):
    """Return True when at least one entry of `items` is named 'Banana'."""
    # Collect every name first, then test membership on the full list
    names = []
    for entry in items:
        names.append(entry['name'])
    return 'Banana' in names

# Register the Python predicate as a UDF that returns a boolean
contains_banana_udf = f.udf(contains_banana, returnType=BooleanType())

# Same filter as the built-in `array_contains` version, evaluated through the UDF
df_fruitshop.where(contains_banana_udf(f.col('items'))).display()

Here are some questions that would be hard to answer using built-in functions:

1. For each order, calculate the quantity-weighted average price per unit of the items whose quantity is greater than 1.

In [None]:
# Python function behind the UDF: weighted average unit price of the items
# of one order whose quantity is greater than 1
def avg_price_per_unit(items):
    """Return the quantity-weighted average price of the items with quantity > 1.

    Items whose quantity is not greater than 1 are ignored. When no item
    qualifies, 0.0 is returned.
    """
    cost_sum = 0.0
    quantity_sum = 0.0

    for entry in items:
        if not entry['quantity'] > 1:
            continue
        cost_sum += entry['price'] * entry['quantity']
        quantity_sum += entry['quantity']

    return cost_sum / quantity_sum if quantity_sum > 0 else 0.0
    
# Register the Python function as a UDF that returns a double
avg_price_per_unit_udf = f.udf(avg_price_per_unit, returnType=DoubleType())

# The UDF receives the `items` array of one order, so the new column holds
# one average per order
avg_price_per_unit_df = df_fruitshop.withColumn('avg_price_per_unit', avg_price_per_unit_udf(f.col('items')))

avg_price_per_unit_df.display()

2. Identify the order with the highest total cost, considering both item price and quantity.

In [None]:
# Define a UDF to calculate total cost for each order
def calculate_total_cost(items):
    """Return the total cost of an order as a float.

    The total is the sum of `quantity * price` over every entry of `items`.
    The accumulator starts at 0.0 rather than 0 so the result is always a
    float, including for an empty `items` list: the UDF wrapping this function
    is declared with DoubleType(), and a Python int would not match that type.
    """
    total_cost = 0.0

    for item in items:
        total_cost += item['quantity'] * item['price']

    return total_cost

# Register the Python function as a UDF that returns a double
calculate_total_cost_udf = f.udf(calculate_total_cost, returnType=DoubleType())

# Attach the total cost of every order as a new column
df_fruitshop_with_total_cost = df_fruitshop.withColumn('total_cost', calculate_total_cost_udf(f.col('items')))

# Sort from most to least expensive and keep only the first row
max_total_cost_order = (
    df_fruitshop_with_total_cost
    .orderBy(f.desc('total_cost'))
    .limit(1)
)

max_total_cost_order.display()

Pandas UDFs. Explain the theory behind them.

Find a way to show the advantages.

Pandas UDFs:

1. **Performance Optimization:** Pandas UDFs can perform vectorized operations on Pandas DataFrame chunks, which are more efficient than row-by-row operations typical in regular UDFs.
2. **Ease of Use and Familiarity:** Pandas UDFs allow you to leverage the rich functionality of Pandas, including its extensive libraries for data manipulation, statistical operations, and more.

Load the spotify playlists dataset

In [None]:
%sh 

wget https://github.com/inesmcm26/lp-big-data/raw/main/data/spotify.csv

In [None]:
%fs

cp file:/databricks/driver/spotify.csv dbfs:/FileStore/lp-big-data/spotify.csv

Answer some questions using

1. PySpark functions
2. UDFs
3. Pandas UDFs

Performance of UDFs vs. pandas UDFs

1. Get the number of tracks in each playlist

In [None]:
# 1. Built-in approach: `size` counts the elements of the `tracks` array.
# NOTE(review): `df_spotify` is not created anywhere in the visible part of
# this notebook — confirm the spotify dataset is loaded into it before this
# cell runs.
df_num_tracks = df_spotify.withColumn(
    "num_tracks",
    f.size(f.col("tracks")),
)

# 2. Plain Python function, wrapped as a Spark UDF below
def count_tracks(tracks):
    """Return the number of entries in a playlist's `tracks` sequence."""
    track_count = len(tracks)
    return track_count

# Register the Python function as a UDF that returns an integer
count_tracks_udf = f.udf(count_tracks, returnType=IntegerType())

# Same `num_tracks` column as the built-in version, computed through the UDF
df_num_tracks = df_spotify.withColumn(
    "num_tracks",
    count_tracks_udf(f.col("tracks")),
)


# 3. Using pandas UDFs
# NOTE(review): this definition reuses the name `count_tracks`, so it replaces
# the plain Python function of the same name defined earlier in this cell.
@f.pandas_udf(IntegerType())
def count_tracks(tracks: pd.Series) -> pd.Series:
    """Return, for every playlist in the Series, the length of its track list."""
    return tracks.apply(lambda playlist_tracks: len(playlist_tracks))

df_num_tracks = df_spotify.withColumn(
    "num_tracks",
    count_tracks(f.col("tracks")),
)

2. Get the number of albums in each playlist

In [None]:
# 1. Using pyspark built-in functions
df_num_albums = (
    df_num_tracks
    # Extract the album of every track in the playlist
    .withColumn("albums", f.transform(f.col("tracks"), lambda x: x["album"]))
    # Count the distinct albums. The column is named `num_albums` (it was
    # misspelled `num_albuns`) because the prediction step later in this
    # notebook reads f.col("num_albums").
    .withColumn("num_albums", f.size(f.array_distinct(f.col("albums"))))
)

# 2. Using UDF
def count_albums(tracks):
    """Return the number of distinct albums among `tracks`.

    Reads the `album` field of every track, the same field the built-in
    version in this cell uses; reading `name` would count distinct track
    names instead.
    """
    # assumes `album` values are hashable (e.g. strings) — TODO confirm
    return len({song["album"] for song in tracks})

# Register the Python function as a UDF that returns an integer
count_albums_udf = f.udf(count_albums, IntegerType())

# The column is named `num_albums` (it was misspelled `num_albuns`) because
# the prediction step later in this notebook reads f.col("num_albums").
df_num_albums = (
    df_num_tracks
    .withColumn("num_albums", count_albums_udf(f.col("tracks")))
)

# 3. Using pandas UDFs
@f.pandas_udf(IntegerType())
def count_albums(tracks: pd.Series) -> pd.Series:
    """Return, for every playlist in the Series, its number of distinct albums.

    Reads the `album` field of every track, the same field the built-in
    version in this cell uses; reading `name` would count distinct track
    names instead.
    """
    # assumes `album` values are hashable (e.g. strings) — TODO confirm
    return tracks.apply(lambda x: len({song["album"] for song in x}))

# The column is named `num_albums` (it was misspelled `num_albuns`) because
# the prediction step later in this notebook reads f.col("num_albums").
df_num_albums = (
    df_num_tracks
    .withColumn("num_albums", count_albums(f.col("tracks")))
)

3. Get the total duration of each playlist

In [None]:
# 1. Using pyspark built-in functions
df_total_duration = (
    df_num_albums
    # First apply transform to get a list of durations
    .withColumn("durations", f.transform(f.col("tracks"), lambda x: x["duration_ms"]))
    # Then fold the array into one total per row. `f.sum` aggregates a column
    # across rows and cannot add up the elements of an array, so the
    # higher-order `f.aggregate` is used instead. The accumulator is a long
    # so that large totals in milliseconds do not overflow 32 bits.
    .withColumn(
        "total_duration",
        f.aggregate(
            f.col("durations"),
            f.lit(0).cast("long"),
            lambda acc, duration: acc + duration.cast("long"),
        ),
    )
)


# 2. Plain Python function, wrapped as a Spark UDF below
def sum_durations(tracks):
    """Return the sum of the `duration_ms` values of every entry in `tracks`."""
    return sum(song["duration_ms"] for song in tracks)

# Declared as LongType: the total is a sum of millisecond durations, and a
# playlist of about 24.9 days of audio (2**31 ms) no longer fits in the
# 32-bit IntegerType.
sum_durations_udf = f.udf(sum_durations, LongType())

df_total_duration = (
    df_num_albums
    .withColumn("total_duration", sum_durations_udf(f.col("tracks")))
)

# 3. Using pandas UDFs
# Declared as LongType: the total is a sum of millisecond durations, and a
# playlist of about 24.9 days of audio (2**31 ms) no longer fits in the
# 32-bit IntegerType.
@f.pandas_udf(LongType())
def sum_durations(tracks: pd.Series) -> pd.Series:
    """Return, for every playlist in the Series, the sum of its `duration_ms` values."""
    return tracks.apply(lambda x: sum(song["duration_ms"] for song in x))

df_total_duration = (
    df_num_albums
    .withColumn("total_duration", sum_durations(f.col("tracks")))
)

Predict the number of followers based on the number of tracks, number of albums and total duration.

In [None]:
@f.pandas_udf(FloatType())
def predict_num_followers_udf(
    num_tracks: pd.Series,
    num_albums: pd.Series,
    total_duration: pd.Series,
    num_followers: pd.Series
) -> pd.Series:
    """Fit a linear regression on the given rows and return its predictions.

    The three feature Series (number of tracks, number of albums and total
    duration) are stacked into a matrix with one row per playlist, a
    LinearRegression is fitted against `num_followers`, and the fitted model
    is evaluated on the same rows it was trained on.

    NOTE(review): the model only sees the rows passed in this call;
    presumably Spark invokes the UDF once per batch, so the predictions may
    depend on how the data is batched — confirm.
    """
    # vstack yields shape (3, n_rows); transpose to (n_rows, 3) for sklearn
    features = np.vstack([num_tracks, num_albums, total_duration]).T
    target = num_followers.values

    regressor = LinearRegression()
    regressor.fit(features, target)

    return pd.Series(regressor.predict(features))

# Add the model output as a new column.
# NOTE(review): the UDF reads the `num_tracks`, `num_albums`, `total_duration`
# and `num_followers` columns, which must all exist in `df_total_duration`
# under exactly these names.
df_predictions = df_total_duration.withColumn(
    "predicted_num_followers",
    predict_num_followers_udf(
        f.col("num_tracks"),
        f.col("num_albums"),
        f.col("total_duration"),
        f.col("num_followers"),
    ),
)

df_predictions.display()