In [None]:
import numpy as np
import pandas as pd

import pyspark.sql.functions as f
from pyspark.sql.types import StringType, IntegerType, DoubleType, FloatType, MapType, LongType

from sklearn.linear_model import LinearRegression

In [None]:
%sh 

# Download the Spotify playlists dataset; the next cell reads it from
# /databricks/driver/spotify.json.
# -O pins the output file name so that re-running this cell overwrites the
# existing file instead of creating spotify.json.1, spotify.json.2, ...
wget -O spotify.json https://raw.githubusercontent.com/inesmcm26/lp-big-data-mercedes/main/data/spotify.json

In [None]:
# Parse the downloaded JSON with pandas first, then hand the frame over to Spark.
spotify_pdf = pd.read_json('file:/databricks/driver/spotify.json')
df_spotify = spark.createDataFrame(spotify_pdf)

In [None]:
# Show the inferred schema; the cells below use the `playlist_id` and `tracks` columns.
df_spotify.printSchema()

1. What is the average number of tracks amongst all the playlists?

Use PySpark built-in functions to answer the question.

In [None]:
# Count the tracks of every playlist, then average those counts over all playlists.
(
  df_spotify
  .withColumn('nr_tracks', f.size(f.col('tracks')))
  .agg(f.avg(f.col('nr_tracks')))
).display()

2. What is the average duration of a track in milliseconds in each playlist?

Answer using
- PySpark built-in functions
- UDFs

In [None]:
# Explode each playlist's `tracks` array into one row per track (the struct
# fields become columns), then average `duration_ms` within every playlist.
(
    df_spotify
    .select(f.col('playlist_id'), f.inline(f.col('tracks')))
    .groupBy(f.col('playlist_id'))
    .agg(f.avg(f.col('duration_ms')).alias('avg_duration'))
).display()

In [None]:
# Using udfs

def avg_duration(tracks):
    """Return the mean ``duration_ms`` of the tracks of one playlist.

    Args:
        tracks: Sequence of track records that can be indexed with
            ``'duration_ms'``, or ``None`` when the playlist has no
            ``tracks`` value at all.

    Returns:
        The average duration as a float, ``0.0`` for an empty playlist, or
        ``None`` when ``tracks`` is ``None``.
    """
    # A null array reaches the function as None; len(None) would raise a
    # TypeError and fail the whole job, so propagate the null instead.
    if tracks is None:
        return None
    if len(tracks) == 0:
        return 0.0
    total_duration = sum(track['duration_ms'] for track in tracks)
    return total_duration / len(tracks)

# Register the plain Python function as a Spark UDF that yields a double.
avg_duration_udf = f.udf(avg_duration, returnType=DoubleType())

# One output row per playlist: its id and the UDF-computed average duration.
(
    df_spotify
    .select(
        f.col('playlist_id'),
        avg_duration_udf(f.col('tracks')).alias('avg_track_duration'),
    )
).display()

3. Which playlist has the highest number of unique artists?

Answer using
- PySpark built-in functions
- UDFs

In [None]:
# Using pyspark built-in functions

(
    df_spotify
    # Pull the artist URI out of every track struct.
    .withColumn('artists_uri', f.transform(f.col('tracks'), lambda track: track['artist_uri']))
    # Drop duplicate URIs and count what is left.
    .withColumn('nr_unique_artists', f.size(f.array_distinct(f.col('artists_uri'))))
    .orderBy(f.col('nr_unique_artists').desc())
    .select('playlist_id', 'nr_unique_artists')
    # Keep only the top playlist.
    .limit(1)
).display()

In [None]:
# Using udfs
def nr_unique_artists(tracks):
  """Count the distinct ``artist_uri`` values among the tracks of one playlist.

  Args:
    tracks: Sequence of track records that can be indexed with
      ``'artist_uri'``, or ``None`` when the playlist has no ``tracks`` value.

  Returns:
    The number of distinct artist URIs (``0`` for an empty playlist), or
    ``None`` when ``tracks`` is ``None``.
  """
  # Iterating over None would raise a TypeError and fail the whole job;
  # propagate the null instead.
  if tracks is None:
    return None
  return len({track['artist_uri'] for track in tracks})

# Register the counter as a Spark UDF returning an integer.
nr_unique_artists_udf = f.udf(nr_unique_artists, returnType=IntegerType())

# Rank playlists by their number of distinct artists and keep the top one.
(
  df_spotify
  .withColumn('nr_unique_artists', nr_unique_artists_udf(f.col('tracks')))
  .orderBy(f.col('nr_unique_artists').desc())
  .select('playlist_id', 'nr_unique_artists')
  .limit(1)
).display()

4. What is the total duration of all tracks by each unique artist in each playlist?

Create a new column `total_duration_per_artist` of the type `MapType(StringType(), IntegerType())` that maps each artist URI to the total duration of tracks by that artist.

Answer using UDFs

In [None]:
def map_artist_duration(tracks):
    """Sum ``duration_ms`` per ``artist_uri`` over the tracks of one playlist.

    Args:
        tracks: Sequence of track records that can be indexed with
            ``'artist_uri'`` and ``'duration_ms'``, or ``None`` when the
            playlist has no ``tracks`` value.

    Returns:
        A dict mapping each artist URI to the total duration of that artist's
        tracks (empty for an empty playlist), or ``None`` when ``tracks`` is
        ``None``.
    """
    # Iterating over None would raise a TypeError and fail the whole job;
    # propagate the null instead.
    if tracks is None:
        return None

    artists_duration = {}
    for track in tracks:
        artist_uri = track['artist_uri']
        # Artists seen for the first time start from a total of 0.
        artists_duration[artist_uri] = artists_duration.get(artist_uri, 0) + track['duration_ms']

    return artists_duration

# Register the aggregation as a Spark UDF returning map<string, bigint>.
map_artist_duration_udf = f.udf(
    map_artist_duration,
    returnType=MapType(StringType(), LongType()),
)

# Add the per-artist totals as a new column next to the original ones.
df_spotify.withColumn(
    'total_duration_per_artist',
    map_artist_duration_udf(f.col('tracks')),
).display()

**(Optional for later) Extra exercises about Pandas UDFs:**

5. Repeat question 4 but answer using Pandas UDFs

In [None]:
@f.pandas_udf(MapType(StringType(), LongType()))
def total_duration_per_artist_udf(tracks: pd.Series) -> pd.Series:
    """Map each artist URI to the summed ``duration_ms`` of that artist's tracks.

    Args:
        tracks: A batch of values from the ``tracks`` column; every element is
            the sequence of track records of one playlist (or ``None``).

    Returns:
        A Series with exactly one element per input element, in the same
        order: a ``{artist_uri: total_duration_ms}`` dict, or ``None`` where
        the input element is ``None``.
    """
    # Collect results in a list, not in a dict keyed by the playlist content:
    # a dict would merge playlists with identical track lists and return fewer
    # rows than it received, which a pandas UDF must never do.
    per_playlist = []

    for track_list in tracks:
        if track_list is None:
            per_playlist.append(None)
            continue

        artist_durations = {}
        for track in track_list:
            # Key on the URI (not the display name), as question 4 requires.
            artist_uri = track['artist_uri']
            artist_durations[artist_uri] = artist_durations.get(artist_uri, 0) + track['duration_ms']

        per_playlist.append(artist_durations)

    # dtype=object keeps the dicts as-is and also covers an empty batch.
    return pd.Series(per_playlist, index=tracks.index, dtype=object)

# Apply the pandas UDF to every playlist's tracks and show the enriched frame.
df_spotify.withColumn(
    "total_duration_per_artist",
    total_duration_per_artist_udf(f.col("tracks")),
).display()

6. Train a linear regression model to predict the total duration of a playlist based on the number of artists and number of tracks in the playlist.

Also create a column with the absolute difference between the actual total duration and the predicted total duration.

In the end, calculate the MAE of the model.

In [None]:
# Your code goes here
# The decorator must sit on its own line: fused into the comment above it is
# ignored and the function is never registered as a pandas UDF.
@f.pandas_udf(DoubleType())
def predict_total_duration_udf(tracks: pd.Series) -> pd.Series:
    """Predict the total duration of a spotify playlist based
    on the number of artists and number of tracks in the playlist.

    Args:
        tracks: A batch of values from the ``tracks`` column; every element is
            the sequence of track records of one playlist.

    Returns:
        A Series with one predicted total duration (in the unit of
        ``duration_ms``) per input element, in the same order.

    NOTE(review): the model is fitted inside the UDF on the very rows it then
    predicts. Spark is expected to call a Series-to-Series pandas UDF once per
    batch, so each batch would get its own model; confirm this is acceptable
    for the exercise.
    """
    # Nothing to fit on; LinearRegression.fit would raise on zero samples.
    if tracks.empty:
        return pd.Series([], dtype='float64')

    # Extract features and target using pandas
    num_artists = tracks.apply(lambda songs: len({song["artist_name"] for song in songs}))
    num_tracks = tracks.apply(len)
    total_duration = tracks.apply(lambda songs: sum(song["duration_ms"] for song in songs))

    # Shape (n_playlists, 2): one row per playlist, columns = the two features.
    X = np.vstack([num_artists, num_tracks]).T
    y = total_duration.values
    model = LinearRegression()
    model.fit(X, y)
    predictions = model.predict(X)
    # DoubleType (float64) is declared above because float32 cannot represent
    # millisecond totals above 2**24 (about 4.7 hours) exactly.
    return pd.Series(predictions, index=tracks.index)

@f.pandas_udf(DoubleType())
def get_total_duration_udf(tracks: pd.Series) -> pd.Series:
    """Return the summed ``duration_ms`` of every playlist in the batch.

    Args:
        tracks: A batch of values from the ``tracks`` column; every element is
            the sequence of track records of one playlist.

    Returns:
        A float64 Series with one total per input element, in the same order.
    """
    # DoubleType instead of FloatType: float32 only holds integers exactly up
    # to 2**24, so longer playlists (about 4.7 hours of milliseconds and up)
    # would be rounded and distort the absolute error computed from this value.
    totals = tracks.apply(lambda songs: sum(song["duration_ms"] for song in songs))
    return totals.astype('float64')

# Attach to every playlist the predicted total, the actual total and the
# absolute gap between the two.
df_spotify_predictions = (
    df_spotify
    .withColumn('predicted_total_duration', predict_total_duration_udf(f.col('tracks')))
    .withColumn('total_duration', get_total_duration_udf(f.col('tracks')))
    .withColumn(
        'abs_error',
        f.abs(f.col('predicted_total_duration') - f.col('total_duration')),
    )
)

df_spotify_predictions.display()

In [None]:
# MAE of the model: the mean of the per-playlist `abs_error` column.
df_spotify_predictions.agg(f.avg('abs_error')).show()