# Spotify API Feature Extraction

* Spotify Mlllion Playlist Dataset Challenge [Homepage](https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge)
* [Spotify Web API docs](https://developer.spotify.com/documentation/web-api/reference/#/)

**Community Examples**
* [Extracting song lists](https://github.com/tojhe/recsys-spotify/blob/master/processing/songlist_extraction.py)
* [construct audio features with Spotify API](https://github.com/tojhe/recsys-spotify/blob/master/processing/audio_features_construction.py)
* [Using Spotify API](https://towardsdatascience.com/extracting-song-data-from-the-spotify-api-using-python-b1e79388d50)

### Package Installation

In [1]:
# ! pip3 install -U spotipy google-cloud-storage google-cloud-aiplatform gcsfs --user -q
# ! pip3 install --user kfp google-cloud-pipeline-components --upgrade -q

In [2]:
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"
! python3 -c "import google.cloud.aiplatform; print('aiplatform SDK version: {}'.format(google.cloud.aiplatform.__version__))"

KFP SDK version: 1.8.19
google_cloud_pipeline_components version: 1.0.39
aiplatform SDK version: 1.22.0


### Parameters

In [3]:
PROJECT_ID = 'hybrid-vertex' #update
LOCATION = 'us-central1' 

BUCKET_NAME = 'matching-engine-content'
# BUCKET_NAME = 'spotify-million-playlists'

PIPELINE_VERSION = 'v3' # pipeline code
PIPELINE_TAG = f'{PIPELINE_VERSION}-spotify-feature-enrich'
print("PIPELINE_TAG:", PIPELINE_TAG)

PIPELINE_TAG: v3-spotify-feature-enrich


Create Bucket if needed

In [4]:
# !gsutil mb -l $LOCATION gs://$BUCKET_NAME

In [5]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import re
from tqdm import tqdm

import pandas as pd
import numpy as np
import json
from io import BytesIO

import time

import gcsfs

# GCP
from google.cloud import aiplatform
from google.cloud import storage

# Pipelines
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from google_cloud_pipeline_components import aiplatform as gcc_aip
from google_cloud_pipeline_components.types import artifact_types

# Kubeflow SDK
# TODO: fix these
from kfp.v2 import dsl
import kfp
import kfp.v2.dsl
from kfp.v2.google import client as pipelines_client
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component)


pd.set_option('display.max_columns', 100)

### clients & credentials

GCP clients

Spotify shoulld be stored in a json file with a your credentials

In [6]:
# Setup clients
aiplatform.init(
    project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_NAME
)

In [7]:
# Get spotify credentials
# This file has id and secret stored as attributes

creds = open('spotify-creds.json')
spotify_creds = json.load(creds)
creds.close()

Pipeline setup

In [8]:
# Pipeline Stuff
import os
PIPELINES = {}

PIPELINES_FILEPATH = f'gs://{BUCKET_NAME}/pipelines/pipelines.json' # <--- TODO: CHANGE THIS; can be blank json file
print("PIPELINES_FILEPATH:", PIPELINES_FILEPATH)

if os.path.isfile(PIPELINES_FILEPATH):
    with open(PIPELINES_FILEPATH) as f:
        PIPELINES = json.load(f)
else:
    PIPELINES = {}

def save_pipelines():
    with open(PIPELINES_FILEPATH, 'w') as f:
        json.dump(PIPELINES, f)

PIPELINES_FILEPATH: gs://matching-engine-content/pipelines/pipelines.json


# Create Pipeline Components

### Audio Features

In [15]:
@kfp.v2.dsl.component(
    base_image="python:3.9",
    packages_to_install=['fsspec', 'google-cloud-bigquery',
                         'google-cloud-storage',
                         'gcsfs',
                         'spotipy','requests','db-dtypes',
                         'numpy','pandas','pyarrow','absl-py', 'pandas-gbq==0.17.4'])
def call_spotify_api_split_audio(
    project: str,
    location: str,
    client_id: str,
    target_table: str,
    client_secret: str,
    unique_table: str,
    sleep_param: int,
) -> NamedTuple('Outputs', [('track_audio_feat_split_1_gcs_uri', str),]):
    print(f'pip install complete')
    import os
    import spotipy
    from spotipy.oauth2 import SpotifyClientCredentials
    import re
    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    import pandas as pd
    import json
    from io import BytesIO
    import time
    from google.cloud import storage
    import gcsfs
    import numpy as np
    from requests.exceptions import ReadTimeout, HTTPError, ConnectionError, RequestException
    from absl import logging
    from google.cloud import bigquery
    import pandas_gbq

    # print(f'package import complete')

    logging.set_verbosity(logging.INFO)
    logging.info(f'package import complete')

    
    bq_client = bigquery.Client(
      project=project, location=location
    )

    logging.info(f'spotipy auth complete')
    def spot_audio_features(uri, client_id, client_secret):

        # Authenticate
        client_credentials_manager = SpotifyClientCredentials(
            client_id=client_id, 
            client_secret=client_secret
        )
        sp = spotipy.Spotify(
            client_credentials_manager = client_credentials_manager, 
            requests_timeout=10, 
            retries=10 )
        ############################################################################
        # Create Track Audio Features DF for Split
        ############################################################################

        #Audio features
        uri = [u.replace('"', '') for u in uri] #fix the quotes 
        a_feats = sp.audio_features(uri)
        features = pd.json_normalize(a_feats, ).to_dict('list')
        if features is None:
            features = {}
        #Artist of the track, for genres and popularity
        popularity = []
        #tracks API call
        tracks = sp.tracks(uri)
        # if tracks:
        for track in tracks['tracks']:
            if track is not None:
                popularity.append(track['popularity'])
            else:
                popularity.append(-1)

        audio_df = pd.DataFrame(features)
        audio_df['popularity'] = popularity
        audio_df['track_uri'] = uri
        return audio_df

        bq_client = bigquery.Client(
          project=project, location=location
        )

        audio_featureDF = pd.DataFrame()

    query = f"select distinct track_uri from `{unique_table}`" 
    # uri_list_length_df = bq_client.query(query).result().to_dataframe()
    # uri_list_length = uri_list_length_df['count'][0]

    count = 1
    uri_batch = []
    # handling bad track/artist_uris

    #refactor
    schema = [{'name':'danceability', 'type': 'FLOAT'},
            {'name':'energy', 'type': 'FLOAT'},
            {'name':'key', 'type': 'FLOAT'},
            {'name':'loudness', 'type': 'FLOAT'},
            {'name':'mode', 'type': 'INTEGER'},
            {'name':'speechiness', 'type': 'FLOAT'},
            {'name':'acousticness', 'type': 'FLOAT'},
            {'name':'instrumentalness', 'type': 'FLOAT'},
            {'name':'liveness', 'type': 'FLOAT'},
            {'name':'valence', 'type': 'FLOAT'},
            {'name':'followers', 'type': 'FLOAT'},
            {'name':'tempo', 'type': 'FLOAT'},
            {'name':'type', 'type': 'STRING'},
            {'name':'id', 'type': 'STRING'},
            {'name':'uri', 'type': 'STRING'},
            {'name':'track_href', 'type': 'STRING'},
            {'name':'analysis_url', 'type': 'STRING'},
            {'name':'duration_ms_y', 'type': 'INTEGER'},
            {'name':'time_signature', 'type': 'INTEGER'},
            {'name':'popularity', 'type': 'INTEGER'},
            {'name':'track_uri', 'type': 'STRING'},
    ]

    tracks = bq_client.query(query).result().to_dataframe()
    track_list = tracks.track_uri.to_list()
    uri_list_length = len(track_list)
    for uri in track_list:
        if count % 50 == 0 or uri_list_length == count: #grab a batch of 50 songs
            uri_batch.append(uri)
            ### Try catch block for function
            try:
                audio_featureDF = spot_audio_features(uri_batch, client_id, client_secret)

            except ReadTimeout:
                logging.info("'Spotify timed out... trying again...'")
                audio_featureDF = spot_audio_features(uri_batch, client_id, client_secret)
            except HTTPError as err: #JW ADDED
                logging.info(f"HTTP Error: {err}")
            except spotipy.exceptions.SpotifyException as spotify_error: #jw_added
                logging.info(f"Spotify error: {spotify_error}")
            try:
                audio_featureDF.to_gbq(
                    destination_table=target_table, 
                    project_id=f'{project}', # TODO: param
                    location='us-central1', 
                    table_schema=schema,
                    progress_bar=False, 
                    reauth=False, 
                    if_exists='append'
                    )
            except pandas_gbq.gbq.InvalidSchema as invalid_schema:
                logging.info('invalid schema, skipping')
                pass
            logging.info(f'{count} of {uri_list_length} complete!')
            uri_batch = []
            count += 1
            time.sleep(sleep_param)
        else:
            uri_batch.append(uri)
            count += 1


    logging.info(f'audio features appended')

    return (
      f'DONE',
    )

### Artists

In [16]:
### Artist tsracks api call

@kfp.v2.dsl.component(
    base_image="python:3.9",
    packages_to_install=['fsspec',' google-cloud-bigquery',
                         'google-cloud-storage',
                         'gcsfs',
                         'spotipy','requests','db-dtypes',
                         'numpy','pandas','pyarrow','absl-py', 'pandas-gbq==0.17.4'])
def call_spotify_api_split_artist(
    project: str,
    location: str,
    unique_table: str,
    client_id: str,
    client_secret: str,
    sleep_param: int,
    target_table: str,
) -> NamedTuple('Outputs', [('track_audio_feat_split_1_gcs_uri', str),]):
    print(f'pip install complete')
    import os
    import spotipy
    from spotipy.oauth2 import SpotifyClientCredentials
    import re
    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    import pandas as pd
    import json
    from io import BytesIO
    import time
    from google.cloud import storage
    import gcsfs
    import numpy as np
    from requests.exceptions import ReadTimeout, HTTPError, ConnectionError, RequestException
    from absl import logging
    from google.cloud import bigquery
    import pandas_gbq
    # print(f'package import complete')

    logging.set_verbosity(logging.INFO)
    logging.info(f'package import complete')

    storage_client = storage.Client(
        project=project
    )
    
    logging.info(f'spotipy auth complete')
    
    def spot_audio_features(uri, client_id, client_secret):

        # Authenticate
        client_credentials_manager = SpotifyClientCredentials(
            client_id=client_id, 
            client_secret=client_secret
        )
        sp = spotipy.Spotify(
            client_credentials_manager = client_credentials_manager, 
            requests_timeout=10, 
            retries=10 )
        ############################################################################
        # Create Track Audio Features DF for Split
        ############################################################################

        #Audio features

        #Artist of the track, for genres and popularity
        features = {}
        #tracks API call
        # if tracks:

        #artists api call
        uri = [u.replace('"', '') for u in uri] #fix the quotes 
        artists = sp.artists(uri)

        artist_pop = []
        artist_genres = []
        followers = []
        id_list = uri
        for artist in artists['artists']:
            if artist is not None:
                artist_pop.append(artist['popularity'])
                artist_genres.append(artist['genres'])
                # if artist['followers']['total'] is None:
                followers.append(artist['followers']['total'])
                # else:
                #   followers.append(-1)
            else:
                artist_pop.append(-1)
                artist_genres.append('unknown')

        # logging.info(print(f"artist: {artist_pop}"))
        # logging.info(print(f"genres: {artist_genres}"))
        # logging.info(print(f"followers: {followers}"))
        features["artist_pop"] = artist_pop
        features["genres"] = artist_genres
        features['followers'] = followers
        features['artist_uri'] = id_list
        audio_df = pd.DataFrame(features)
        audio_df['genres'] = audio_df['genres'].astype(str)
        # logging.info(print(audio_df)) #logging
        return audio_df

    bq_client = bigquery.Client(
      project=project, location=location
    )


    query = f"select distinct artist_uri from `{unique_table}`"
    logging.info(f'finished downloading tracks')
    # uri_list_length_df = bq_client.query(query).result().to_dataframe()
    # n_artists = uri_list_length_df['count'][0]
    # logging.info(f'number of distinct artists: {n_artists}')
    audio_featureDF = pd.DataFrame()


    schema = [{'name': 'artist_pop', 'type': 'INTEGER'},
            {'name':'genres', 'type': 'STRING'},
            {'name':'followers', 'type': 'INTEGER'},
            {'name':'artist_uri', 'type': 'STRING'}
    ]
    count = 1
    uri_batch = []
    # handling bad track/artist_uris

    ats = bq_client.query(query).result().to_dataframe()
    artist_set = ats.artist_uri.to_list()
    uri_list_length = len(artist_set)
    for uri in artist_set:
        if count % 50 == 0 or uri_list_length == count: #grab a batch of 50 artists
            uri_batch.append(uri)
            ### Try catch block for function
            try:
                audio_featureDF = spot_audio_features(uri_batch, client_id, client_secret)
            except ReadTimeout:
                logging.info("'Spotify timed out... trying again...'")
                audio_featureDF = spot_audio_features(uri_batch, client_id, client_secret)
            except HTTPError as err: #JW ADDED
                logging.info(f"HTTP Error: {err}")
            except spotipy.exceptions.SpotifyException as spotify_error: #jw_added
                logging.info(f"Spotify error: {spotify_error}")
            try:
                audio_featureDF.to_gbq(
                    destination_table=target_table, 
                    project_id=f'{project}', # TODO: param
                    location='us-central1', 
                    # table_schema=schema,
                    progress_bar=False, 
                    reauth=False, 
                    if_exists='append'
                    )
            except pandas_gbq.gbq.InvalidSchema as invalid_schema:
                logging.info('invalid schema, skipping')
                pass
            logging.info(f'{count} of {uri_list_length} complete!')
            uri_batch = []
            count += 1

            time.sleep(sleep_param)

        else:
            uri_batch.append(uri)
            count += 1

    return (
          f'DONE',
      )

## Build Pipeline

In [17]:
from typing import Dict

@kfp.v2.dsl.pipeline(
  name=f'spotify-feature-enrichment-{PIPELINE_TAG}'.replace('_', '-')
)
def pipeline(
    project: str,
    location: str,
    unique_table: str,
    target_table_audio: str,
    target_table_artist: str,
    spotify_id: str = spotify_creds['id'],
    spotify_secret: str = spotify_creds['secret'],
    ):


    call_spotify_api_split_artist_op = call_spotify_api_split_artist(
        project=project,
        location=location,
        client_id=spotify_id,
        client_secret=spotify_secret,
        sleep_param=20,
        unique_table=unique_table,
        target_table=target_table_artist,
    )

    call_spotify_api_split_audio_op = call_spotify_api_split_audio(
        project=project,
        location=location,
        client_id=spotify_id,
        client_secret=spotify_secret,
        sleep_param=20,
        unique_table=unique_table,
        target_table=target_table_audio,
    )

In [18]:
kfp.v2.compiler.Compiler().compile(
  pipeline_func=pipeline, 
  package_path='custom_container_pipeline_spec.json',
)

In [19]:
# jtotten-project #
GCS_BUCKET = 'matching-engine-content'
BQ_DATASET = 'mdp_eda_test'
VERSION = 5


PIPELINE_PARAMETERS = dict(
    project = PROJECT_ID,
    location = 'us-central1',
    unique_table = f'{PROJECT_ID}.{BQ_DATASET}.tracks_unique'
    target_table_audio = f'{PROJECT_ID}.{BQ_DATASET}.audio_features',
    target_table_artist = f'{PROJECT_ID}.{BQ_DATASET}.artist_features',
)

PIPELINE_PARAMETERS

{'project': 'hybrid-vertex',
 'location': 'us-central1',
 'unique_table': 'hybrid-vertex.mdp_eda_test.tracks_unique'}

In [20]:
job = aiplatform.PipelineJob(display_name = f'spotify-feature-enrichment-{PIPELINE_TAG}'.replace('_', '-'),
                             template_path = 'custom_container_pipeline_spec.json',
                             pipeline_root = f'gs://{BUCKET_NAME}/{VERSION}',
                             parameter_values = PIPELINE_PARAMETERS,
                             project = PROJECT_ID,
                             location = LOCATION,
                              enable_caching=True)

job.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/934903580331/locations/us-central1/pipelineJobs/spotify-feature-enrichment-v3-spotify-feature-enrich-20230223220852
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/934903580331/locations/us-central1/pipelineJobs/spotify-feature-enrichment-v3-spotify-feature-enrich-20230223220852')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/spotify-feature-enrichment-v3-spotify-feature-enrich-20230223220852?project=934903580331
