In [1]:
import numpy as np
import pandas as pd
import re
import random
import time
import requests
import base64
import itertools
import spotipy
from spotipy.oauth2 import SpotifyOAuth

# Credentials: apikeys.csv has no header row; its first row is flattened to an
# array whose entries 0 and 1 are later handed to refresh_key.
api_keys = pd.read_csv('apikeys.csv', header=None).iloc[0].to_numpy()

# Track table: read, discard the album/position/year columns, then drop every
# row that still contains a missing value.
data = (
    pd.read_csv('tracks_features.csv')
    .drop(columns=['album', 'album_id', 'track_number', 'disc_number', 'year'])
    .dropna()
)

# Data Collection

## Functions

In [24]:
#Spotify api
def refresh_key(CLIENT_ID, CLIENT_SECRET):
    """Request a client-credentials access token and return a spotipy client.

    Parameters
    ----------
    CLIENT_ID, CLIENT_SECRET
        Application credentials; they are joined as "id:secret" and sent
        base64-encoded in a Basic Authorization header.

    Returns
    -------
    spotipy.Spotify
        Client constructed with the freshly issued access token.

    Raises
    ------
    RuntimeError
        If the token endpoint answers with anything other than HTTP 200.
    """
    # Base64 encode the client ID and client secret
    client_credentials = f"{CLIENT_ID}:{CLIENT_SECRET}"
    client_credentials_base64 = base64.b64encode(client_credentials.encode())

    # Request the access token
    token_url = 'https://accounts.spotify.com/api/token'
    headers = {
        'Authorization': f'Basic {client_credentials_base64.decode()}'
    }
    # named `payload` so the notebook-level `data` frame is not shadowed here
    payload = {
        'grant_type': 'client_credentials'
    }

    response = requests.post(token_url, data=payload, headers=headers)

    if response.status_code != 200:
        print("Error obtaining access token.")
        # Raise instead of calling exit(): a helper should not tear down the
        # interpreter/kernel, and the caller gets the status code to act on.
        raise RuntimeError(
            f"Spotify token request failed with HTTP {response.status_code}: {response.text}"
        )

    access_token = response.json()['access_token']
    print("Access token obtained successfully.")

    #spotify access
    return spotipy.Spotify(auth=access_token)

# Shared spotipy client: the track and playlist helpers in this notebook read
# the global `sp`, so this must run before any of them is called.
sp = refresh_key(api_keys[0],api_keys[1])

Access token obtained successfully.


In [26]:
#given track data, add genre and popularity and reformat it.
def get_track_info(track_row, debug=False):
    """Build one DataFrame row per binned genre for a single track.

    The track and its first-listed artist are looked up through the global
    spotipy client ``sp``; the track's popularity and the artist's binned
    genre(s) are combined with the feature columns already in ``track_row``.

    Parameters
    ----------
    track_row : mapping or pandas.Series
        Must provide 'id', 'name', 'artists', 'artist_ids', 'release_date',
        'explicit' and the audio-feature keys read below. 'artists' and
        'artist_ids' are expected to be stringified lists such as
        "['a', 'b']"; only the first entry of each is used.
    debug : bool
        When true, print the artist's genre words if none could be binned
        (useful for extending ``similar_genres``).

    Returns
    -------
    pandas.DataFrame
        One row per distinct binned genre, in a stable order; a single row
        with genre 'other' when nothing matched.
    """
    import ast  # function-scope import: only the list parser below needs it

    def first_listed(raw):
        """Return the first element of a stringified list."""
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError):
            parsed = None
        if isinstance(parsed, (list, tuple)) and parsed:
            # literal_eval keeps apostrophes and ", " inside names intact
            return str(parsed[0])
        # malformed value: fall back to the earlier character-stripping parse
        return raw.strip("['").strip("]").replace("'", "").split(', ')[0]

    track = sp.track(track_row['id'])
    #removes artists besides main artist from category
    artist_name_formatted = first_listed(track_row['artists'])
    artist_id_formatted = first_listed(track_row['artist_ids'])
    artist = sp.artist(artist_id_formatted)

    ### genre ###
    artist_genres = set(artist['genres'])
    top_genres = ['rap', 'pop', 'rock', 'country', 'edm', 'latin', 'alt',
                  'r&b', 'instrumental', 'jazz', 'indie', 'folk']
    #common genres to be binned with top genres. obtained by running function with random songs selected
    #('reggae' was previously misspelled 'raggae', which could never match)
    similar_genres = {'hip':'rap', 'dance':'edm', 'classical':'instrumental', 'orchestra':'instrumental', 'metal':'alt',
                      'grunge':'alt', 'blues':'jazz', 'soul':'jazz', 'punk':'alt', 'symphony':'instrumental',
                      'guitar':'instrumental', 'piano':'instrumental', 'brass':'instrumental', 'reggae':'jazz',
                      'ensemble':'instrumental', 'orchestral':'instrumental', 'house':'edm', 'disco':'edm','reggaeton':'latin',
                      'ambient':'instrumental', 'prog':'rock'}

    #check if genre found; listed in top_genres order so the row order is stable
    artist_top_genre = [genre for genre in top_genres if genre in artist_genres]

    if not artist_top_genre: #split combined genres (ex. detroit rap -> rap)
        genre_words = sorted({word for s in artist_genres for word in re.split(' |-', s)})
        artist_top_genre = [genre for genre in top_genres if genre in genre_words]

        if not artist_top_genre: #look for similar genres (ex. hip-hop -> rap)
            # dict.fromkeys de-duplicates: several words may map to one bin
            # (ex. 'classical piano' -> instrumental, once)
            artist_top_genre = list(dict.fromkeys(
                similar_genres[word] for word in genre_words if word in similar_genres))

            if not artist_top_genre: #still not found
                if debug: print(genre_words) #used to find 'similar_genres'
                artist_top_genre = ['other']

    g = len(artist_top_genre) #used to adjust for multi-genre artists

    track_info = pd.DataFrame({
        #track info
        'track_id': [track_row['id']] * g,
        'track_name': [track_row['name']] * g,
        'artist_id': [artist_id_formatted] * g,
        'artist_name': [artist_name_formatted] * g,
        'artist_genre': artist_top_genre,
        'release_date': [track_row['release_date']] * g,
        'popularity': [track['popularity']] * g,

        #song data (scalars are broadcast to all g rows)
        'explicit': track_row['explicit'],
        'danceability': track_row['danceability'],
        'energy': track_row['energy'],
        'key': track_row['key'],
        'loudness': track_row['loudness'],
        'mode': track_row['mode'],
        'speechiness': track_row['speechiness'],
        'acousticness': track_row['acousticness'],
        'instrumentalness': track_row['instrumentalness'],
        'liveness': track_row['liveness'],
        'valence': track_row['valence'],
        'tempo': track_row['tempo']
    })
    return(track_info)

#get_track_info(data.iloc[1])

In [27]:
#data generation
def generate_data(dataset, length = 0, debug = False, perc = False):
    """Enrich a random sample of tracks via get_track_info and stack the results.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Track table; each sampled row is passed to get_track_info.
    length : int
        Number of tracks to sample without replacement; 0 means every row.
        random.sample raises ValueError if this exceeds the number of rows.
    debug : bool
        Forwarded to get_track_info.
    perc : bool
        Accepted for compatibility; currently unused (percentage progress and
        periodic token refresh were sketched here but never enabled).

    Returns
    -------
    pandas.DataFrame
        All per-track frames concatenated, with a leading 'index' column
        counting rows from 1 (kept so the output columns match what earlier
        callers received). An empty frame when nothing was sampled.
    """
    total_rows = dataset.shape[0]
    sample_size = total_rows if length == 0 else length

    random_index = random.sample(range(0, total_rows), sample_size)

    # Collect the frames and concatenate once. The earlier version seeded the
    # result with the first sampled track, fetched it again in the loop and then
    # dropped a single row, which left duplicates for multi-genre tracks.
    frames = []
    for i in random_index:
        print(i)
        frames.append(get_track_info(dataset.iloc[i], debug))

    if not frames:
        return pd.DataFrame()

    df = pd.concat(frames, ignore_index=True)
    df.index = range(1, len(df) + 1)  # reproduces the legacy 1-based 'index' column
    return(df.reset_index())

### hide

In [229]:
#given playlist id, return info on all tracks


#using old code to comply with playlist function
def get_track_info_original(track_id):
    """Fetch one track by id and return one DataFrame row per binned genre.

    Everything is pulled through the global spotipy client ``sp``: the track,
    the first artist listed on the track's album, and the track's audio
    features.

    Parameters
    ----------
    track_id : str
        Spotify track id.

    Returns
    -------
    pandas.DataFrame
        One row per distinct binned genre, in a stable order; a single row
        with genre 'other' when nothing matched. Audio-feature columns are NaN
        when the API returned no features for the track.
    """
    track = sp.track(track_id)
    album_artist = track['album']['artists'][0]
    artist_id = album_artist['id']
    artist = sp.artist(artist_id)

    ### genre ###
    artist_genres = set(artist['genres'])
    top_genres = ['rap', 'pop', 'rock', 'country', 'edm', 'latin', 'alt',
                  'r&b', 'instrumental', 'jazz', 'indie', 'folk']
    #common genres to be binned with top genres. obtained by running function with random songs selected
    #('reggae' was previously misspelled 'raggae', which could never match)
    similar_genres = {'hip':'rap', 'dance':'edm', 'classical':'instrumental', 'orchestra':'instrumental', 'metal':'alt',
                      'grunge':'alt', 'blues':'jazz', 'soul':'jazz', 'punk':'alt', 'symphony':'instrumental',
                      'guitar':'instrumental', 'piano':'instrumental', 'brass':'instrumental', 'reggae':'jazz',
                      'ensemble':'instrumental', 'orchestral':'instrumental', 'house':'edm', 'disco':'edm','reggaeton':'latin',
                      'ambient':'instrumental', 'prog':'rock'}

    #check if genre found; listed in top_genres order so the row order is stable
    artist_top_genre = [genre for genre in top_genres if genre in artist_genres]

    if not artist_top_genre: #split combined genres (ex. detroit rap -> rap)
        genre_words = sorted({word for s in artist_genres for word in re.split(' |-', s)})
        artist_top_genre = [genre for genre in top_genres if genre in genre_words]

        if not artist_top_genre: #look for similar genres (ex. hip-hop -> rap)
            # dict.fromkeys de-duplicates: several words may map to one bin
            artist_top_genre = list(dict.fromkeys(
                similar_genres[word] for word in genre_words if word in similar_genres))

            if not artist_top_genre: #still not found
                print(genre_words) #used to find 'similar_genres'
                artist_top_genre = ['other']

    g = len(artist_top_genre) #used to adjust for multi-genre artists

    # NOTE(review): the API is expected to return a None entry for tracks it has
    # no features for - confirm against the spotipy docs. Such tracks get NaN
    # features instead of raising on the subscript.
    features_list = sp.audio_features(track_id)
    audio_features = features_list[0] if features_list else None
    if audio_features is None:
        audio_features = {}

    columns = {
        #track info
        'track_id': [track_id] * g,
        'track_name': [track['name']] * g,
        'artist_id': [artist_id] * g,
        'artist_name': [album_artist['name']] * g,
        'artist_genre': artist_top_genre,
        'release_date': [track['album']['release_date']] * g,
        'popularity': [track['popularity']] * g,

        #song data (scalars are broadcast to all g rows)
        'explicit': track['explicit'],
    }
    feature_names = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                     'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
    for feature in feature_names:
        columns[feature] = audio_features.get(feature, float('nan'))

    track_info = pd.DataFrame(columns)
    return(track_info)


def get_playlist_info(playlist_id):
    """Return get_track_info_original rows for every track of a playlist.

    Parameters
    ----------
    playlist_id : str
        Spotify playlist id, looked up through the global spotipy client ``sp``.

    Returns
    -------
    pandas.DataFrame
        The per-track frames stacked in playlist order, with the per-track row
        number exposed as a leading 'index' column. An empty frame when the
        playlist has no usable tracks.
    """
    playlist = sp.playlist(playlist_id)

    # The playlist response carries only one page of items, so follow the
    # 'next' links rather than indexing up to the reported total.
    page = playlist['tracks']
    items = list(page['items'])
    while page.get('next'):
        page = sp.next(page)
        if not page:
            break
        items.extend(page['items'])

    # Entries without a track or track id cannot be looked up; skip them.
    track_ids = [item['track']['id'] for item in items
                 if item.get('track') and item['track'].get('id')]

    frames = []
    for position, track_id in enumerate(track_ids):
        if position:
            time.sleep(.1) #slowing to avoid api limit
        frames.append(get_track_info_original(track_id))

    if not frames:
        return pd.DataFrame()

    # one concat over all frames; the earlier range(1, total-1) loop skipped the last track
    return(pd.concat(frames).reset_index())

## Dataset

In [28]:
#create dataset
#37i9dQZF1Fa1IIVtEpGUcU
# playlist_id = '4H38k1jiSdakPvIBGLruGS'
# test_data = get_playlist_info(playlist_id)
# test_data = test_data[test_data['artist_genre'] != 'other']
#get_track_info(data.iloc[1])

In [29]:
#final_data = generate_data(data,5)
#5 2s
#50 14s
#500 
#5000 

Sources

https://medium.com/@shruti.somankar/building-a-music-recommendation-system-using-spotify-api-and-python-f7418a21fa41

https://www.unchainedmusic.io/blog-posts/top-music-genres-in-order-the-most-popular-genres-worldwide

https://kworb.net/spotify/songs.html

https://docs.pysimplegui.com/en/latest/

https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs?resource=download