In [None]:
import numpy as np
import pandas as pd

import pyspark.sql.functions as f
from pyspark.sql.types import StringType, IntegerType, DoubleType, FloatType, MapType, LongType

from sklearn.linear_model import LinearRegression

Read the Spotify data from DBFS and answer the following questions.

In [None]:
# Load the playlist dataset with Spark's JSON reader.
df_spotify = spark.read.json("/FileStore/lp-big-data/spotify.json")

1. What is the average number of tracks in the playlists?

Use PySpark built-in functions to answer the question.

In [None]:
# Measure the length of each playlist's `tracks` array, then average those
# lengths over all rows.
(
    df_spotify
    .select(f.size('tracks').alias('num_tracks'))
    .agg(f.avg(f.col('num_tracks')).alias('avg_num_tracks'))
).display()

2. What is the average duration of a track in each playlist?

Answer using
- PySpark built-in functions
- UDF

In [None]:
# Using pyspark built-in functions

(
    df_spotify
    # Tag every row with a unique id before exploding. Grouping on the name
    # alone would merge distinct playlists that happen to share a name,
    # whereas each row of df_spotify is treated as one playlist elsewhere
    # in this notebook.
    .withColumn('_playlist_row_id', f.monotonically_increasing_id())
    # One output row per (playlist, track). explode emits nothing for an
    # empty or null array, so such playlists do not appear in the result.
    .withColumn('track', f.explode(f.col('tracks')))
    .groupBy('_playlist_row_id', 'playlist_name')
    .agg(f.avg(f.col('track.duration_ms')).alias('avg_track_duration'))
    # The id was only needed as a grouping key.
    .drop('_playlist_row_id')
).display()

In [None]:
# Using udfs

def avg_duration_udf(tracks):
    """Return the mean ``duration_ms`` of the tracks in one playlist.

    Args:
        tracks: Sequence of track records, each indexable by the key
            ``'duration_ms'``. May be ``None`` when the column value is null.

    Returns:
        ``None`` for a null playlist, ``0.0`` for an empty playlist, and
        otherwise the arithmetic mean of the track durations.
    """
    # A null `tracks` value arrives as None; propagate it as null rather
    # than failing on len(None).
    if tracks is None:
        return None
    # Avoid dividing by zero for playlists without tracks.
    if len(tracks) == 0:
        return 0.0
    total_duration = sum(track['duration_ms'] for track in tracks)
    return total_duration / len(tracks)

# Wrap the plain Python function as a Spark UDF producing a double column.
avg_duration = f.udf(avg_duration_udf, returnType=DoubleType())

# Apply the UDF row by row to the `tracks` array of every playlist.
(
    df_spotify
    .withColumn('avg_track_duration', avg_duration('tracks'))
).display()

3. Which playlist has the highest number of unique artists?

Answer using
- PySpark built-in functions
- Pandas UDF

In [None]:
# Using pyspark built-in functions

(
    df_spotify
    # Reduce each track struct to its artist name, giving one array of
    # names per playlist.
    .withColumn('artists', f.transform(f.col('tracks'), lambda track: track['artist_name']))
    # Deduplicate the names, then count what is left.
    .withColumn('artists_count', f.size(f.array_distinct(f.col('artists'))))
    # Keep only the top-ranked playlist.
    .orderBy(f.desc('artists_count'))
    .limit(1)
).display()

In [None]:
# Using pandas udfs

@f.pandas_udf(IntegerType())
def count_artists(tracks_col: pd.Series) -> pd.Series:
    """Count the distinct ``artist_name`` values in each playlist.

    Args:
        tracks_col: One entry per playlist; each entry is an iterable of
            track records indexable by ``"artist_name"``.

    Returns:
        A Series with the number of distinct artist names per entry.
    """
    def _distinct_artist_count(tracks):
        # A set keeps each artist name once.
        return len({song["artist_name"] for song in tracks})

    return tracks_col.apply(_distinct_artist_count)

# Attach the per-playlist artist count and keep only the top-ranked row.
(
    df_spotify
    .withColumn("artists_count", count_artists("tracks"))
    .orderBy(f.col("artists_count").desc())
    .limit(1)
).display()

4. What is the total duration of all tracks by each unique artist in each playlist?

Create a new column `total_duration_per_artist` of the type `MapType(StringType(), IntegerType())` that contains the total duration of all tracks by each unique artist in each playlist.

Answer using a Pandas UDF.

In [None]:
@f.pandas_udf(MapType(StringType(), LongType()))
def total_duration_per_artist_udf(tracks: pd.Series) -> pd.Series:
    """Sum ``duration_ms`` per artist for every playlist in the batch.

    Args:
        tracks: One entry per playlist; each entry is an iterable of track
            records indexable by ``'artist_name'`` and ``'duration_ms'``.
            A null entry is passed through as null.

    Returns:
        A Series with the same length and index as ``tracks`` whose entries
        are ``{artist_name: total_duration_ms}`` dicts.
    """
    def _durations_by_artist(track_list):
        # Null playlist -> null map instead of a TypeError while iterating.
        if track_list is None:
            return None
        artist_durations = {}
        for track in track_list:
            artist_name = track['artist_name']
            artist_durations[artist_name] = (
                artist_durations.get(artist_name, 0) + track['duration_ms']
            )
        return artist_durations

    # Build the result positionally. Collecting into a dict keyed by
    # str(track_list) collapses playlists with identical (or empty) track
    # lists into a single entry, so the output could be shorter than the
    # input batch and would not line up row for row.
    return pd.Series(
        [_durations_by_artist(track_list) for track_list in tracks],
        index=tracks.index,
        dtype=object,
    )

# Add the artist -> total duration map as a new column on every playlist.
(
    df_spotify
    .withColumn(
        "total_duration_per_artist",
        total_duration_per_artist_udf("tracks"),
    )
).display()

5. Train a linear regression model to predict the total duration of a playlist based on the number of artists and number of tracks in the playlist.

Also create a column with the absolute difference between the actual total duration and the predicted total duration.

In the end, calculate the MAE of the model.

In [None]:
@f.pandas_udf(DoubleType())
def predict_total_duration_udf(tracks: pd.Series) -> pd.Series:
    """Predict the total duration of a spotify playlist based
    on the number of artists and number of tracks in the playlist.

    A linear regression is fitted on the playlists in ``tracks`` and then
    applied to those same playlists.

    NOTE(review): Spark invokes a Series-to-Series pandas UDF once per
    internal batch, so a separate model is fitted for each batch rather
    than one model for the whole DataFrame - confirm this is acceptable.

    Args:
        tracks: One entry per playlist; each entry is an iterable of track
            records indexable by ``"artist_name"`` and ``"duration_ms"``.

    Returns:
        A float64 Series of predicted total durations, aligned with
        ``tracks``. The UDF is declared as DoubleType so the float64
        predictions are not truncated to float32.
    """
    # Nothing to fit on an empty batch; LinearRegression.fit rejects an
    # input with zero samples.
    if len(tracks) == 0:
        return pd.Series([], index=tracks.index, dtype="float64")

    # Extract features using pandas
    num_artists = tracks.apply(lambda x: len({song["artist_name"] for song in x}))
    num_tracks = tracks.apply(len)
    total_duration = tracks.apply(lambda x: sum(song["duration_ms"] for song in x))

    # Feature matrix with one row per playlist: [num_artists, num_tracks].
    X = np.vstack([num_artists, num_tracks]).T
    y = total_duration.values

    model = LinearRegression()
    model.fit(X, y)
    predictions = model.predict(X)
    return pd.Series(predictions, index=tracks.index)

@f.pandas_udf(DoubleType())
def get_total_duration_udf(tracks: pd.Series) -> pd.Series:
    """Return the summed ``duration_ms`` of every playlist in the batch.

    Args:
        tracks: One entry per playlist; each entry is an iterable of track
            records indexable by ``"duration_ms"``.

    Returns:
        A float64 Series of total durations, one per playlist. Double
        precision is used because a 32-bit float cannot represent every
        integer above 2**24 exactly, so larger millisecond totals would be
        rounded.
    """
    totals = tracks.apply(lambda x: sum(song["duration_ms"] for song in x))
    # Make the dtype match the declared Spark type explicitly.
    return totals.astype("float64")

# Score every playlist, attach the actual total duration, and record the
# absolute gap between the two as `abs_error`.
df_spotify_predictions = (
    df_spotify
    .withColumn('predicted_total_duration', predict_total_duration_udf('tracks'))
    .withColumn('total_duration', get_total_duration_udf('tracks'))
    .withColumn(
        'abs_error',
        f.abs(f.col('predicted_total_duration') - f.col('total_duration')),
    )
)

df_spotify_predictions.display()

In [None]:
# Mean absolute error (MAE): the average of the per-playlist `abs_error`.
(
    df_spotify_predictions
    .agg(f.avg(f.col('abs_error')))
).show()