In [1]:
# Imports
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd

In [2]:
# Authenticate spotipy
# The credentials file is read as two lines: client id first, client secret second.
with open("spotify_credentials.txt") as f:
    # strip() instead of slicing off the last character: it removes the trailing
    # newline (and stray whitespace) when present, and leaves the value intact
    # when the line has no newline at all.
    client_id = f.readline().strip()
    client_secret = f.readline().strip()

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [3]:
def get_genre_by_search(client, genre, limit=1000, pop_thresh=0):
    '''
    Retrieves tracks and their audio features that fall under a particular genre by a direct search.

    Parameters:
    client (spotipy.client.Spotify object): Client to perform the search with.
        Only its ``search`` and ``audio_features`` methods are used.
    genre (str): Genre of songs to obtain.
    limit (int): Max number of songs to retrieve. Defaults to 1000.
    pop_thresh (float): Minimum popularity a song must have to be included. Defaults to 0.

    Returns:
    pandas.DataFrame: One row per track, at most ``limit`` rows. Columns are
        'name', 'artist', 'album', 'id', 'popularity', followed by whatever fields
        the audio-features response provides, followed by 'genre'. Tracks for which
        no audio features were returned keep their row with missing feature values.
        When the search yields nothing, an empty frame is returned.
    '''
    page_size = 50            # Spotify API maxes out at 50 items per search request
    max_search_offset = 1000  # Search returns a maximum of 1000 results
    track_batch_size = 100    # ids sent per audio_features request

    # See https://developer.spotify.com/documentation/web-api/reference/search/ for search documentation.
    # NOTE(review): multi-word genres (e.g. 'hip hop') are sent unquoted; confirm
    # whether the search API needs genre:"hip hop" to match the whole phrase.
    query = f'genre:{genre}'

    # Page through the search results until enough tracks pass the popularity filter.
    filtered_tracks = []
    offset = 0
    while len(filtered_tracks) < limit and offset < max_search_offset:
        results = client.search(q=query, offset=offset, type='track', limit=page_size)
        tracks = results['tracks']['items']
        if not tracks:
            # An empty page means the search is exhausted; further requests are wasted.
            break
        filtered_tracks.extend(track for track in tracks if track['popularity'] >= pop_thresh)
        offset += page_size

    # Pages arrive in chunks, so the last one can overshoot the requested limit.
    filtered_tracks = filtered_tracks[:limit]

    # Get track data to use for the audio features query
    track_columns = ['name', 'artist', 'album', 'id', 'popularity']
    track_data = [
        {
            'name': track['name'],
            'artist': track['artists'][0]['name'],  # first listed artist only
            'album': track['album']['name'],
            'id': track['id'],
            'popularity': track['popularity'],
        }
        for track in filtered_tracks
    ]
    # Explicit columns keep the schema (and the 'id' lookup below) valid for empty results.
    track_df = pd.DataFrame(track_data, columns=track_columns)

    # Retrieve audio features
    ids = track_df['id'].tolist()
    features = []
    for start in range(0, len(ids), track_batch_size):
        batch = ids[start:start + track_batch_size]
        audio_features = client.audio_features(batch) or [None] * len(batch)
        # A None entry means no features came back for that track; an empty dict
        # keeps the row aligned with track_df and yields missing values.
        features.extend(item if item is not None else {} for item in audio_features)
    features_df = pd.DataFrame(features)

    # Combine dataframes (rows are aligned by position: same order, same default index)
    # NOTE(review): audio-feature objects presumably carry their own 'id' field, in
    # which case the result has two 'id' columns; kept as-is so existing consumers
    # of the output are unaffected. Confirm before deduplicating.
    combined_df = pd.concat([track_df, features_df], axis=1)
    combined_df['genre'] = genre
    return combined_df

In [4]:
# Pull one frame per genre; the five genres were picked to be clearly distinct from each other.
TRACKS_PER_GENRE = 1000  # upper bound on songs requested for every genre

country_df = get_genre_by_search(client=sp, genre='country', limit=TRACKS_PER_GENRE)
hiphop_df = get_genre_by_search(client=sp, genre='hip hop', limit=TRACKS_PER_GENRE)
heavy_metal_df = get_genre_by_search(client=sp, genre='heavy metal', limit=TRACKS_PER_GENRE)
classical_df = get_genre_by_search(client=sp, genre='classical', limit=TRACKS_PER_GENRE)
techno_df = get_genre_by_search(client=sp, genre='techno', limit=TRACKS_PER_GENRE)

In [6]:
# Stack the per-genre frames into a single dataset with a fresh 0..n-1 index,
# then write it out as CSV (index column omitted).
genre_frames = [country_df, hiphop_df, heavy_metal_df, classical_df, techno_df]
final_data = pd.concat(genre_frames, ignore_index=True)
final_data.to_csv('genre_dataset.csv', index=False)