In [2]:
# Imports Required for Accessing the MSD data
import os
import sys
import glob
import tables

# Imports Required for Querying the Spotify API and Creating the Spotify Audio Features Dataset
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotifyusercredentials
import pandas as pd

# Spotify API authentication without a user login: the client ID and secret
# are read from the local `spotifyusercredentials` module rather than being
# hardcoded in the notebook.
client_credentials_manager = SpotifyClientCredentials(
    client_id=spotifyusercredentials.client_ID,
    client_secret=spotifyusercredentials.client_SECRET,
)
# Shared client object used by the query functions defined further down.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [3]:
# Establishes path to the Million Song Dataset (or subset)
msd_path = './data/MillionSongSubset'

# Establishes path to the Million Song Dataset code
msd_code_path = './code/MSongsDB'
assert os.path.isdir(msd_code_path), 'wrong path'  # sanity check

# Make the MSD Python sources importable. The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
msd_python_src = os.path.join(msd_code_path, 'PythonSrc')
if msd_python_src not in sys.path:
    sys.path.append(msd_python_src)

# Imports the MSD Python modules we need (must come after the sys.path update)
import hdf5_getters as GETTERS

In [3]:
# we define this very useful function to iterate through all the files in the dataset
def apply_to_all_files(basedir, func=lambda x: x, ext='.h5'):
    """
    From a base directory, go through all subdirectories,
    find all files with the given extension, apply the
    given function 'func' to all of them.
    If no 'func' is passed, we do nothing except counting.
    INPUT
       basedir  - base directory of the dataset
       func     - function to apply to all filenames; its return value
                  is ignored
       ext      - extension, .h5 by default (appended to a '*' wildcard)
    RETURN
       number of files
    """
    cnt = 0
    # iterate over all files in all subdirectories
    for root, _dirs, _files in os.walk(basedir):
        # Escape the directory part of the pattern: without this, a folder
        # whose name contains glob metacharacters ('[', ']', '*', '?') is
        # interpreted as a pattern and its files are silently skipped.
        matches = glob.glob(os.path.join(glob.escape(root), '*' + ext))
        # count files
        cnt += len(matches)
        # apply function to all files
        for path in matches:
            func(path)
    return cnt

In [96]:
# Defines a function that queries the Spotify API for the audio features of a file
def get_spotify_audio_features(filename, dataframe):
    """
    Look up one MSD track on Spotify and append its audio features.

    Reads the track id, title, release and artist name from the HDF5 file
    `filename`, searches Spotify for the best single match, fetches that
    track's audio features and appends them to `dataframe` as one row
    indexed by the MSD track id.

    INPUT
       filename   - path to an MSD .h5 track file
       dataframe  - DataFrame accumulated so far
    RETURN
       a new DataFrame with the extra row, or `dataframe` unchanged when
       Spotify returns no match or no audio features for the track
    """
    h5_file = GETTERS.open_h5_file_read(filename)
    try:
        # The getters return bytes, so decode before building the query
        msd_track_id = GETTERS.get_track_id(h5_file).decode('utf-8')
        song_title = GETTERS.get_title(h5_file).decode('utf-8')
        album_name = GETTERS.get_release(h5_file).decode('utf-8')
        artist_name = GETTERS.get_artist_name(h5_file).decode('utf-8')
    finally:
        # Always release the handle, even if reading the metadata fails
        h5_file.close()

    # Build the search query from the song title, album title, and artist
    api_query = f'track:{song_title} album:{album_name} artist:{artist_name}'
    results = sp.search(q=api_query, type='track', limit=1)

    # An empty result list used to raise IndexError on items[0]
    items = results['tracks']['items']
    if not items:
        print(f"No results found for file: {filename}")
        return dataframe

    spotify_track_id = items[0]['id']
    audio_features = sp.audio_features([spotify_track_id])
    # Skip a missing/None entry instead of appending a junk row
    if not audio_features or audio_features[0] is None:
        print(f"No audio features found for file: {filename}")
        return dataframe

    # Append the audio features as a row keyed by the MSD track id
    audio_features_df = pd.DataFrame(audio_features, index=[msd_track_id])
    return pd.concat([dataframe, audio_features_df], ignore_index=False)

In [1]:
import os
import glob
import pandas as pd

# Defines a function that queries the Spotify API for the audio features of a file
def get_spotify_audio_features(filename, dataframe):
    """
    Query the Spotify API for the audio features of one MSD track file.

    The track id, song title, release and artist name are read from the
    HDF5 file `filename`; Spotify is searched for a single best match and
    that track's audio features are appended to `dataframe` as a row
    indexed by the MSD track id.

    INPUT
       filename   - path to an MSD .h5 track file
       dataframe  - DataFrame of audio features collected so far
    RETURN
       the updated DataFrame; `dataframe` itself when no Spotify match
       (or no audio features) is found, in which case a message is printed
    """
    # Opens the file and make sure it is closed whatever happens below
    h5_file = GETTERS.open_h5_file_read(filename)
    try:
        # The getters return bytes, hence the decode calls
        msd_track_id = GETTERS.get_track_id(h5_file).decode('utf-8')
        song_title = GETTERS.get_title(h5_file).decode('utf-8')
        album_name = GETTERS.get_release(h5_file).decode('utf-8')
        artist_name = GETTERS.get_artist_name(h5_file).decode('utf-8')
    finally:
        h5_file.close()

    # Build the search query
    api_query = f'track:{song_title} album:{album_name} artist:{artist_name}'
    # Queries Spotify API for song data using the song title, album title, and artist
    results = sp.search(q=api_query, type='track', limit=1)

    # Check if any tracks were found (restores the guard that had been
    # commented out, which left the indented body as a syntax error)
    tracks_found = results['tracks']['items']
    if len(tracks_found) > 0:
        # Extracts the Spotify track id from the results
        spotify_track_id = tracks_found[0]['id']
        # Queries Spotify API for audio features using the track id
        audio_features = sp.audio_features([spotify_track_id])
        if audio_features and audio_features[0] is not None:
            # Appends the audio features to the running dataframe
            audio_features_df = pd.DataFrame(audio_features, index=[msd_track_id])
            dataframe = pd.concat([dataframe, audio_features_df], ignore_index=False)
        else:
            # Skip a missing/None entry instead of appending a junk row
            print(f"No audio features found for file: {filename}")
    else:
        print(f"No results found for file: {filename}")
    # Return the updated dataframe
    return dataframe


# Combines the two functions
def get_all_audio_features(basedir, ext='.h5'):
    """
    From a base directory, go through all subdirectories,
    find all files with the given extension, and get the audio features
    for each .h5 file from the Spotify API.

    INPUT
       basedir  - base directory of the dataset
       ext      - extension, .h5 by default (appended to a '*' wildcard)
    RETURN
       a dataframe of all the audio features, built by threading it
       through get_spotify_audio_features once per file
    """
    dataframe = pd.DataFrame()  # Create an empty dataframe to store the audio features
    # iterate over all files in all subdirectories
    for root, _dirs, _files in os.walk(basedir):
        # Escape the directory part so folders whose names contain glob
        # metacharacters ('[', ']', '*', '?') are not silently skipped.
        matches = glob.glob(os.path.join(glob.escape(root), '*' + ext))
        # apply function to all files
        for path in matches:
            dataframe = get_spotify_audio_features(path, dataframe)
    return dataframe

In [4]:
# Bind the result to a name: the crawl makes Spotify API calls for every file,
# so the DataFrame should not be recoverable only through `_` / Out[N].
audio_features_df = get_all_audio_features(msd_path, ext='.h5')
# Preview the first rows instead of rendering the whole frame
audio_features_df.head()

No results found for file: ./data/MillionSongSubset\A\A\A\TRAAADZ128F9348C2E.h5
No results found for file: ./data/MillionSongSubset\A\A\A\TRAAAVG12903CFA543.h5
No results found for file: ./data/MillionSongSubset\A\A\B\TRAABCL128F4286650.h5
No results found for file: ./data/MillionSongSubset\A\A\B\TRAABDL12903CAABBA.h5
No results found for file: ./data/MillionSongSubset\A\A\B\TRAABJL12903CDCF1A.h5
No results found for file: ./data/MillionSongSubset\A\A\B\TRAABVM128F92CA9DC.h5
No results found for file: ./data/MillionSongSubset\A\A\B\TRAABXG128F9318EBD.h5
No results found for file: ./data/MillionSongSubset\A\A\B\TRAABYN12903CFD305.h5
No results found for file: ./data/MillionSongSubset\A\A\C\TRAACHN128F1489601.h5
No results found for file: ./data/MillionSongSubset\A\A\C\TRAACOW128F933E35F.h5
No results found for file: ./data/MillionSongSubset\A\A\C\TRAACSL128F93462F4.h5
No results found for file: ./data/MillionSongSubset\A\A\C\TRAACVS128E078BE39.h5
No results found for file: ./data/Millio

KeyboardInterrupt: 