<a href="https://colab.research.google.com/github/lauragabrysiak/mitx_applied_data_science/blob/main/spotipy_spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [61]:
# Mount Google Drive into the Colab filesystem at /content/gdrive/.
# NOTE(review): the cells visible here read data from /content/sample_data, not from
# the mounted drive — confirm the mount is still required.
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [62]:
# Installing Spotify Web API spotipy
!pip install findspark pyspark spotipy



In [117]:
import os
import time
import warnings
# A dictionary output that does not raise a key error
from collections import defaultdict

import pandas as pd

from spotipy.oauth2 import SpotifyClientCredentials
import spotipy

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

import findspark
findspark.init()

warnings.filterwarnings('ignore')

### Setting up your PySpark Session

In [89]:
# Create the notebook's Spark session, or reuse the one that is already running
session_builder = SparkSession.builder.appName("SparkSpotipyIntegration")
spark = session_builder.getOrCreate()

In [90]:
# Load the source dataset into pandas; rows that cannot be parsed are skipped
# instead of aborting the read.
df_final = pd.read_csv(
    '/content/sample_data/df_final.csv',
    on_bad_lines='skip',
)

In [107]:
# Number of leading rows sent to Spark (and therefore to the Spotify lookup below).
# Kept small because every row triggers Web API requests.
SAMPLE_ROWS = 500

# .head() already returns a pandas DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
df = spark.createDataFrame(df_final.head(SAMPLE_ROWS))

In [108]:
# Print the column names and types Spark inferred from the pandas frame
df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- song_id: long (nullable = true)
 |-- play_count: long (nullable = true)
 |-- song_title: string (nullable = true)
 |-- song_release: string (nullable = true)
 |-- song_artist: string (nullable = true)
 |-- song_year: long (nullable = true)



In [113]:
# Sanity check of the Spark frame: song_title is re-assigned to itself, so the
# data is unchanged; the first 3 rows are printed for inspection.
title_column = df["song_title"]
df_check = df.withColumn("song_title", title_column)
df_check.show(n=3)

+-------+-------+----------+--------------------+---------------+---------------+---------+
|user_id|song_id|play_count|          song_title|   song_release|    song_artist|song_year|
+-------+-------+----------+--------------------+---------------+---------------+---------+
|   6958|    447|         1|  Daisy And Prudence|   Distillation|   Erin McKeown|     2000|
|   6958|    512|         1|The Ballad of Mic...|        Sawdust|    The Killers|     2004|
|   6958|    549|         1|I Stand Corrected...|Vampire Weekend|Vampire Weekend|     2007|
+-------+-------+----------+--------------------+---------------+---------------+---------+
only showing top 3 rows



### Spotify Web API

Source: https://developer.spotify.com/documentation/web-api/reference/get-track

In [114]:
# Spotipy credentials are read from the environment so that no secret is stored
# in (or committed with) the notebook. Set them before running this cell, e.g.
#   os.environ['SPOTIPY_CLIENT_ID'] = getpass.getpass('client id: ')
client_id = os.environ.get('SPOTIPY_CLIENT_ID')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET')

if not client_id or not client_secret:
    # Fail here with a clear message instead of on the first API request.
    raise RuntimeError(
        "Spotify credentials not found: set the SPOTIPY_CLIENT_ID and "
        "SPOTIPY_CLIENT_SECRET environment variables."
    )

# Set up Spotipy with the client-credentials flow (app-level access, no user login)
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
)

In [118]:
def get_spotify_track_info(artist, track):
    """Look up one track on Spotify and return its metadata as a flat dict.

    Uses the module-level spotipy client ``sp``: one search request for the
    best match of ``artist`` / ``track``, followed by one artist request for
    the first credited artist of that match.

    Args:
        artist: Artist name used in the search query.
        track: Track title used in the search query.

    Returns:
        A dict with track, album and artist fields (keys match the UDF schema),
        or ``None`` when an input is empty/None, when the search has no match,
        or when any error occurs during the lookup (the error is printed).
    """
    # Nothing meaningful can be searched without both values; skip the API call.
    if not artist or not track:
        return None

    try:
        result = sp.search(q=f'artist:{artist} track:{track}', type='track', limit=1)

        items = result['tracks']['items']
        if not items:
            # No matching track found
            return None

        track_info = items[0]
        album_info = track_info['album']
        primary_artist = track_info['artists'][0]

        # Fetch the artist object once and reuse it for all three artist fields
        # (previously this issued three identical requests per track).
        artist_info = sp.artist(primary_artist['id'])

        return {
            'artist': artist,
            'track': track,
            'spotify_id': track_info['id'],
            'title': track_info['name'],
            'popularity': track_info['popularity'],
            'is_local': track_info['is_local'],
            'explicit': track_info['explicit'],
            'duration_ms': track_info['duration_ms'],

            # Album info
            'album': album_info['name'],
            'release_date': album_info['release_date'],
            'album_type': album_info['album_type'],
            'album_available_markets': album_info['available_markets'],

            # Artist Info
            'artist_name': primary_artist['name'],
            'artist_popularity': artist_info['popularity'],
            'artist_followers': artist_info['followers']['total'],
            'artist_genres': artist_info['genres']
        }

    except Exception as e:
        # Best-effort enrichment: a failed lookup yields a null row, not a failed job.
        print(f"Error processing {artist} - {track}: {e}")
        return None

# Wrap get_spotify_track_info as a Spark UDF. The struct schema below lists one
# field per key of the dict the function returns, in the same order.
string_list_type = ArrayType(StringType())

spotify_metadata_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in [
        ("artist", StringType()),
        ("track", StringType()),
        ("spotify_id", StringType()),
        ("title", StringType()),
        ("popularity", IntegerType()),
        ("is_local", StringType()),
        ("explicit", StringType()),
        ("duration_ms", IntegerType()),
        ("album", StringType()),
        ("release_date", StringType()),
        ("album_type", StringType()),
        ("album_available_markets", string_list_type),
        ("artist_name", StringType()),
        ("artist_popularity", IntegerType()),
        ("artist_followers", IntegerType()),
        ("artist_genres", string_list_type),
    ]
])

spotify_udf = udf(get_spotify_track_info, spotify_metadata_schema)

In [119]:
# Add a struct column "metadata" holding the Spotify lookup for each row's artist/title
metadata_column = spotify_udf(F.col('song_artist'), F.col('song_title'))
result_df = df.withColumn("metadata", metadata_column)

In [None]:
# Preview 3 enriched rows without truncating long values.
# NOTE(review): this is the first place in the notebook that result_df is materialised,
# so the Spotify lookups presumably start running here — confirm.
result_df.show(3, truncate=False)

In [None]:
# Write the result DataFrame back to a CSV file, then shut Spark down.
# The write must happen BEFORE spark.stop(): result_df belongs to this session
# and cannot be evaluated once the session is stopped.

# Keep the enriched rows in memory so the Spotify lookups are not repeated
# if Spark re-evaluates the UDF for each column derived from "metadata".
enriched_df = result_df.persist()

# The CSV writer cannot serialise struct or array columns, so expand the
# "metadata" struct into top-level columns and join array values into strings.
csv_ready_df = enriched_df.select("*", "metadata.*").drop("metadata")
for array_column in ("album_available_markets", "artist_genres"):
    csv_ready_df = csv_ready_df.withColumn(
        array_column, F.concat_ws("|", F.col(array_column))
    )

csv_ready_df.write.csv('df_final_spotify_augmented.csv', header=True, mode='overwrite')

# All Spark work is finished; release the session's resources.
enriched_df.unpersist()
spark.stop()