## environment setup

In [1]:
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy import SpotifyException
from IPython.display import clear_output
import pandas as pd
import numpy as np
import requests
import spotipy
import os

In [13]:
# Input dataset (CSV) that is loaded into `data`.
DATA_PATH = r"D:\Documents\datasets\AIST4010\muse\muse_v3.csv"
# Folder that receives the downloaded `<spotify_id>.mp3` preview clips.
DATA_STORE = r"D:\Documents\datasets\AIST4010\muse\songs"
# CSV holding the subset of rows whose preview clip was fetched.
VALID_DATA_PATH = r"D:\Documents\datasets\AIST4010\muse\valid_data.csv"

# Credentials are read from the environment so that secrets never have to be typed
# into (and saved with) the notebook. Each one falls back to an empty string when
# the variable is not set, which matches the previous placeholder values.
# client ID for fetching spotify preview URL
SPOTIFY_CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID", r"")
# client secret for fetching spotify preview URL
SPOTIFY_CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET", r"")
# OAUTH_TOKEN for making queries to spotify API endpoints directly
OAUTH_TOKEN = os.environ.get("SPOTIFY_OAUTH_TOKEN", r"")
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIFY_CLIENT_ID, client_secret=SPOTIFY_CLIENT_SECRET)

## data preparation

In [3]:
# Read the three emotion-score columns as 32-bit floats and key the rows by Spotify id.
data_types = dict.fromkeys(('valence_tags', 'arousal_tags', 'dominance_tags'), np.float32)
data = pd.read_csv(DATA_PATH, dtype=data_types).set_index('spotify_id')

data.head()

Unnamed: 0_level_0,lastfm_url,track,artist,seeds,number_of_emotion_tags,valence_tags,arousal_tags,dominance_tags,mbid,genre
spotify_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4xkOaSrkexMciUUogZKVTS,https://www.last.fm/music/eminem/_/%2527till%2...,'Till I Collapse,Eminem,['aggressive'],6,4.55,5.273125,5.690625,cab93def-26c5-4fb0-bedd-26ec4c1619e1,rap
3fOc9x06lKJBhz435mInlH,https://www.last.fm/music/metallica/_/st.%2banger,St. Anger,Metallica,['aggressive'],8,3.71,5.833,5.42725,727a2529-7ee8-4860-aef6-7959884895cb,metal
3Y96xd4Ce0J47dcalLrEC8,https://www.last.fm/music/rick%2bross/_/speedi...,Speedin',Rick Ross,['aggressive'],1,3.08,5.87,5.49,,rap
6tqFC1DIOphJkCwrjVzPmg,https://www.last.fm/music/m.i.a./_/bamboo%2bbanga,Bamboo Banga,M.I.A.,"['aggressive', 'fun', 'sexy', 'energetic']",13,6.555071,5.537214,5.691357,99dd2c8c-e7c1-413e-8ea4-4497a00ffa18,hip-hop
5bU4KX47KqtDKKaLM4QCzh,https://www.last.fm/music/dope/_/die%2bmf%2bdie,Die MF Die,Dope,['aggressive'],7,3.771177,5.348235,5.441765,b9eb3484-5e0e-4690-ab5a-ca91937032a5,metal


In [4]:
# data filtering: one chained pipeline, each step working on the previous step's result

data = (
    data
    .loc[data.index.notna()]                                     # keep only rows that have a spotify id
    .loc[lambda frame: ~frame.index.duplicated()]                # keep the first row of every repeated id
    .assign(track=lambda frame: frame.track + ' - ' + frame.artist)  # "<track> - <artist>"
    .drop(columns='artist')                                      # artist now lives inside `track`
)
print(len(data))

60656


## fetch data

In [None]:
# through spotipy module
# ALERT: may have max retries error after fetching for a while
# fetch api setup
# The retry policy is handed to the constructor: the client builds its HTTP session
# there, so assigning these attributes afterwards would not change the retry behaviour.
spotify = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
    retries=10,
    status_retries=10,
    backoff_factor=0.4,
)

# fetching
track_ids = data.index
ttl_track = len(track_ids)
valid_track = [False] * ttl_track  # valid_track[i] is True once the clip of track_ids[i] is saved
os.makedirs(DATA_STORE, exist_ok=True)  # make sure the download folder exists

for base in range(0, ttl_track, 50):  # tracks are queried 50 ids at a time
    track_queries = list(track_ids[base:base + 50])
    # `tracks()` returns the decoded JSON object; the track list sits under "tracks".
    tracks = spotify.tracks(track_queries)["tracks"]
    high_lv_features = spotify.audio_features(track_queries)
    for idx, (track_id, track, track_feature) in enumerate(zip(track_queries, tracks, high_lv_features)):
        # An entry is None when the API has nothing for that id.
        if track is None or track_feature is None:
            continue
        preview_url = track.get("preview_url", None)
        speechiness = track_feature.get("speechiness", None)
        # also filter out songs that consists of too many words
        if not preview_url or speechiness is None or speechiness > 0.6:
            continue
        try:
            song_preview = requests.get(preview_url, timeout=30)
        except requests.RequestException:
            continue  # download failed: leave the track marked as invalid
        if not song_preview.ok:
            continue  # do not store an error page as an .mp3
        with open(os.path.join(DATA_STORE, f'{track_id}.mp3'), 'wb') as f:
            f.write(song_preview.content)
        valid_track[base + idx] = True  # only after the clip is really on disk
    clear_output(wait=True)
    ttl_valid = sum(valid_track)
    processed = base + len(track_queries)  # number of tracks examined so far
    print(f"{ttl_valid:<6} / {len(track_ids):<6} - {processed - ttl_valid} invalid")

valid_data = data.loc[valid_track]  # data with preview url
valid_data.to_csv(VALID_DATA_PATH)

In [24]:
# Alternative route: call the Spotify Web API endpoints directly with an OAuth token
# obtained from the Spotify API website.
# ALERT: the token may expire after a while (~an hour)
header = {
    "Accept": "application/json",
    "Authorization": f"Bearer {OAUTH_TOKEN}",  # define OAUTH_TOKEN in environment
    "Content-Type": "application/json",
}

def fetch_tracks(track_ids):
    """Fetch the Spotify track objects for a batch of track ids.

    Sends a single GET request to the ``/v1/tracks`` endpoint, authenticated with
    the module-level ``header``.

    Args:
        track_ids: Iterable of Spotify track id strings.

    Returns:
        The ``tracks`` list of the JSON response; ``[]`` when ``track_ids`` is
        empty (no request is sent); ``None`` when the response status is not 200
        (the failure is reported on stdout).
    """
    track_ids = list(track_ids)
    if not track_ids:
        return []
    response = requests.get(r"https://api.spotify.com/v1/tracks?ids="+"%2C".join(track_ids), headers=header)
    if response.status_code != 200:
        print(f"Fail to fetch tracks {track_ids[0]} ... {track_ids[-1]}.")
        # Show the body of the failed response; raw text is used because an error
        # body is not guaranteed to be valid JSON.
        print(f"{response.text}")
        return None
    return response.json()['tracks']


def fetch_features(track_ids):
    """Fetch the Spotify audio features for a batch of track ids.

    Sends a single GET request to the ``/v1/audio-features`` endpoint,
    authenticated with the module-level ``header``.

    Args:
        track_ids: Iterable of Spotify track id strings.

    Returns:
        The ``audio_features`` list of the JSON response; ``[]`` when
        ``track_ids`` is empty (no request is sent); ``None`` when the response
        status is not 200 (the failure is reported on stdout).
    """
    track_ids = list(track_ids)
    if not track_ids:
        return []
    response = requests.get(r"https://api.spotify.com/v1/audio-features?ids="+"%2C".join(track_ids), headers=header)
    if response.status_code != 200:
        print(f"Fail to fetch audio features {track_ids[0]} ... {track_ids[-1]}.")
        # Show the body of the failed response; raw text is used because an error
        # body is not guaranteed to be valid JSON.
        print(f"{response.text}")
        return None
    return response.json()['audio_features']
    

# fetching
track_ids = data.index
ttl_track = len(track_ids)
valid_track = [False] * ttl_track  # valid_track[i] is True once the clip of track_ids[i] is saved
os.makedirs(DATA_STORE, exist_ok=True)  # make sure the download folder exists

for base in range(0, ttl_track, 50):  # both endpoints are queried 50 ids at a time
    track_queries = track_ids[base:base + 50]
    tracks = fetch_tracks(track_queries)
    high_lv_features = fetch_features(track_queries)
    if tracks is None or high_lv_features is None:
        # A failed request (for instance an expired token) yields None. Stop here
        # rather than silently marking every remaining track as invalid.
        print(f"Stopped at offset {base}; tracks from this offset on were not processed.")
        break
    for idx, (track_id, track, track_feature) in enumerate(zip(track_queries, tracks, high_lv_features)):
        # An entry is None when the API has nothing for that id.
        if track is None or track_feature is None:
            continue
        preview_url = track.get("preview_url", None)
        speechiness = track_feature.get("speechiness", None)
        # also filter out songs that consists of too many words
        if not preview_url or speechiness is None or speechiness > 0.6:
            continue
        try:
            song_preview = requests.get(preview_url, timeout=30)
        except requests.RequestException:
            continue  # download failed: leave the track marked as invalid
        if not song_preview.ok:
            continue  # do not store an error page as an .mp3
        with open(os.path.join(DATA_STORE, f'{track_id}.mp3'), 'wb') as f:
            f.write(song_preview.content)
        valid_track[base + idx] = True  # only after the clip is really on disk
    clear_output(wait=True)
    ttl_valid = sum(valid_track)
    processed = base + len(track_queries)  # number of tracks examined so far
    print(f"{ttl_valid:<6} / {len(track_ids):<6} - {processed - ttl_valid} invalid")

# valid_data = data.loc[valid_track]  # data with preview url
# valid_data.to_csv(VALID_DATA_PATH)

## Import the valid data

In [30]:
# Reload the rows saved to VALID_DATA_PATH and key them by Spotify id again.
track_data = pd.read_csv(VALID_DATA_PATH).set_index("spotify_id")

print(track_data.shape)
track_data.head()

(39971, 9)


Unnamed: 0_level_0,lastfm_url,track,seeds,number_of_emotion_tags,valence_tags,arousal_tags,dominance_tags,mbid,genre
spotify_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3fOc9x06lKJBhz435mInlH,https://www.last.fm/music/metallica/_/st.%2banger,St. Anger - Metallica,['aggressive'],8,3.71,5.833,5.42725,727a2529-7ee8-4860-aef6-7959884895cb,metal
5bU4KX47KqtDKKaLM4QCzh,https://www.last.fm/music/dope/_/die%2bmf%2bdie,Die MF Die - Dope,['aggressive'],7,3.771177,5.348235,5.441765,b9eb3484-5e0e-4690-ab5a-ca91937032a5,metal
6DoXuH326aAYEN8CnlLmhP,https://www.last.fm/music/deftones/_/7%2bwords,7 Words - Deftones,"['aggressive', 'angry']",10,3.807121,5.473939,4.729091,1a826083-5585-445f-a708-415dc90aa050,nu metal
104YdibC7VQy78xAVmgRYr,https://www.last.fm/music/fiona%2bapple/_/limp,Limp - Fiona Apple,"['aggressive', 'angry', 'bitter']",20,3.737211,5.610204,4.626735,4435982c-b83e-4daa-af2b-9f3430036bb7,singer-songwriter
5fU6qjmD38P90BMsuqpiuU,https://www.last.fm/music/metallica/_/sweet%2b...,Sweet Amber - Metallica,['aggressive'],4,3.582759,5.757241,5.34,fe1cc051-faa7-4953-b331-f6196cd3ddae,metal


In [40]:
# Number of tracks per genre, most frequent genre first.
genre_cnt = track_data["genre"].value_counts()
for genre_name, n_tracks in genre_cnt.items():
    print(f"{genre_name:<20} - {n_tracks}")

indie                - 3096
rock                 - 2851
electronic           - 2516
pop                  - 2214
folk                 - 1325
ambient              - 1273
alternative          - 988
soundtrack           - 813
hip-hop              - 783
soul                 - 719
jazz                 - 615
singer-songwriter    - 614
experimental         - 593
industrial           - 542
piano                - 533
acoustic             - 521
indie rock           - 421
alternative rock     - 404
punk                 - 402
classic rock         - 376
new age              - 373
metal                - 356
progressive rock     - 351
classical            - 341
trip-hop             - 337
dance                - 336
electronica          - 316
indie pop            - 294
post-rock            - 284
trance               - 282
hard rock            - 279
downtempo            - 257
country              - 249
new wave             - 232
chill                - 227
hip hop              - 224
post-punk            -