In [1]:
# Import dependencies
import requests
from config import spotify_token
import pandas as pd
import numpy as np
import time
import re

In [2]:
base_url = 'https://api.spotify.com'
headers_dict = {'Content-Type': 'application/json', 
                'Authorization': f'Bearer {spotify_token}',
                'country': 'US'}

In [3]:
# Create a list of all the 50 category ids
t0 = time.time()
query_url = base_url + f'/v1/browse/categories?limit=50'
results = requests.get(query_url, headers=headers_dict).json()

categories_list = []
for id, category in enumerate(results['categories']['items']):
    categories_list.append(category['id'])

t1 = time.time()
print(f'Run time: {t1-t0} seconds')
categories_list

Run time: 0.14841914176940918 seconds


['toplists',
 '2020',
 'holidays',
 'hiphop',
 'pop',
 'country',
 'workout',
 'at_home',
 'rock',
 'latin',
 'mood',
 'rnb',
 'jre_podcast',
 'gaming',
 'shows_with_music',
 'focus',
 'edm_dance',
 'blackhistorymonth',
 'chill',
 'indie_alt',
 'inspirational',
 'decades',
 'instrumental',
 'alternative',
 'wellness',
 'in_the_car',
 'pride',
 'party',
 'sleep',
 'classical',
 'jazz',
 'roots',
 'soul',
 'sessions',
 'dinner',
 'romance',
 'kpop',
 'punk',
 'regional_mexican',
 'popculture',
 'blues',
 'arab',
 'desi',
 'radar',
 'anime',
 'thirdparty',
 'afro',
 'comedy',
 'metal',
 'caribbean']

In [4]:
# Create a dictionary of categories and playlists
t0 = time.time()
category_playlist_ids = {}
for cat in categories_list:
    query_url = base_url + f'/v1/browse/categories/{cat}/playlists?limit=5'
    playlists = []
    results = requests.get(query_url, headers=headers_dict).json()
    try: 
        for id, playlist in enumerate(results['playlists']['items']):
            playlists.append(playlist['id'])
        if len(playlists) > 0:
            category_playlist_ids[cat] = playlists
    except:
        print(f'No playlists for category: {cat}')
print(f'Playlists found for {len(category_playlist_ids)} categories.')
t1 = time.time()
print(f'Run time: {t1-t0} seconds')

No playlists for category: jre_podcast
No playlists for category: shows_with_music
No playlists for category: blackhistorymonth
No playlists for category: alternative
No playlists for category: sessions
No playlists for category: regional_mexican
No playlists for category: popculture
No playlists for category: radar
No playlists for category: anime
No playlists for category: thirdparty
No playlists for category: comedy
Playlists found for 38 categories.
Run time: 8.781615257263184 seconds


In [29]:
# Create a DataFrame of song names, artists, categories, and genres
t0 = time.time()
song_genres_df = pd.DataFrame(columns=['song', 'song_id', 'artist', 'artist_id', 'category'])
song_genres_df
index = 0
for category in category_playlist_ids:
    # Determine how many songs to get from each playlist
    playlists = category_playlist_ids[category]
    len_limit = {1: 25, 2:10, 3: 8, 4: 6, 5: 5}
    length = len(category_playlist_ids[category])
    limit = len_limit[length]
    for playlist_id in playlists:
        # Get songs from each playlist
        query_url = base_url + f'/v1/playlists/{playlist_id}/tracks?limit={limit}'
        results = requests.get(query_url, headers=headers_dict).json()
        try:
            # Get the song, artist, and genres for each song
            for id, item in enumerate(results['items']):
                song = item['track']['name']
                song_id = item['track']['id']
                artist = item['track']['artists'][0]['name']
                artist_id = item['track']['artists'][0]['id']
                song_genres_df.loc[index] = [song, song_id, artist, artist_id, category]
                index+=1
        except:
            print(f'No results for playlist {playlist_id}')

t1 = time.time()
print(f'Run time: {t1-t0} seconds')
song_genres_df.head()


Run time: 631.41060090065 seconds


Unnamed: 0,song,song_id,artist,artist_id,category
0,Budum,4yGNNviGXeLZlf70IwWwEK,Jada Kingdom,2FgooFaZzZy6PUyJImk0kG,toplists
1,Lighter,7CUN0vUHWCCC6k0q7VetJe,Tarrus Riley,4frHO7KPcfMjhnVdIMJ98c,toplists
2,Call Me If,7LzpbhQyI2HIyk73V8UE6r,Dexta Daps,28UDeKu2FPrU0T7dpUiSGY,toplists
3,Lockdown,0izUjTuDrUy2FgQOSRALSU,Koffee,1gWjcmBsveEYMxOZ0VRi32,toplists
4,Gal Policy,7yJG5oJwYfom8AVXDcgFLx,Kranium,1LKo6ZA3RNvKtLa6zDu32S,toplists


In [31]:
# Create a dictionary to check how many songs there are for each category
t0 = time.time()
songs_per_cat = {}
for category in category_playlist_ids:
    songs_per_cat[category] = len(song_genres_df[song_genres_df['category']==category])
t1 = time.time()
print(f'Run time: {t1-t0} seconds')
songs_per_cat

Run time: 0.18999123573303223 seconds


{'toplists': 1307,
 '2020': 1307,
 'holidays': 1307,
 'hiphop': 1307,
 'pop': 1307,
 'country': 1307,
 'workout': 1307,
 'at_home': 1307,
 'rock': 1307,
 'latin': 1307,
 'mood': 1307,
 'rnb': 1307,
 'gaming': 1307,
 'focus': 1307,
 'edm_dance': 1307,
 'blackhistorymonth': 1307,
 'chill': 1307,
 'indie_alt': 1307,
 'inspirational': 1307,
 'decades': 1307,
 'instrumental': 1307,
 'wellness': 1307,
 'pride': 1307,
 'party': 1307,
 'sleep': 1307,
 'classical': 1307,
 'jazz': 1307,
 'roots': 1307,
 'soul': 1307,
 'dinner': 1307,
 'romance': 1307,
 'kpop': 1307,
 'punk': 1307,
 'blues': 1307,
 'arab': 1307,
 'desi': 1307,
 'afro': 1307,
 'metal': 1307,
 'caribbean': 1307}

In [7]:
# Create a dictionary of genres for each artist
t0 = time.time()
artist_genres = {}
count = 0
for artist_id in song_genres_df['artist_id'].unique():
    query_url = base_url + f'/v1/artists/{artist_id}'
    artist = requests.get(query_url, headers=headers_dict).json()
    genres = artist['genres']
    genres = list(set([genre.split()[0] for genre in genres]))
    artist_genres[artist_id] = genres
    if genres == []:
        artist_name = artist['name']
        count+=1

t1 = time.time()
print(f'Run time: {t1-t0} seconds')
print(f'No genres found for {count} artists.')
len(artist_genres)

Run time: 106.29341506958008 seconds
No genres found for 108 artists.


746

In [8]:
# Add a column of artist genres
t0 = time.time()
genre_column = []
for artist_id in song_genres_df['artist_id']:
    genre_column.append(artist_genres[artist_id])
song_genres_df['genres'] = genre_column

t1 = time.time()
print(f'Run time: {t1-t0} seconds')
song_genres_df.head()

Run time: 0.0027539730072021484 seconds


Unnamed: 0,song,song_id,artist,artist_id,category,genres
0,Monster (Shawn Mendes & Justin Bieber),2Z8yfpFX0ZMavHkcIeHiO1,Shawn Mendes,7n2wHs1TKAczGzO7Dd2rGr,toplists,"[pop, viral, canadian, post-teen, dance]"
1,Therefore I Am,54bFM56PmE4YLRnqpW6Tha,Billie Eilish,6qqNVTkY8uBg9cP3Jd7DAH,toplists,"[pop, electropop]"
2,Levitating (feat. DaBaby),463CkQjx2Zk1yXoBuierM9,Dua Lipa,6M2wZ9GZgrQXHCFfjv46we,toplists,"[pop, uk, dance]"
3,positions,7igeByaBM0MgGsgXtNxDJ7,Ariana Grande,66CXWjxzNUsdJxJ2JdwvnR,toplists,"[pop, post-teen]"
4,HOLIDAY,6zFMeegAMYQo0mt8rXtrli,Lil Nas X,7jVv8c5Fj3E9VhNjxT4snq,toplists,"[lgbtq+, pop, queer, country]"


In [13]:
# Create a function that adds a column of filtered genres
def filter_genres(song_genres_df):
    # Create a non-destructive copy
    song_genres_df = pd.DataFrame(song_genres_df)
    
    # Create a list of all genres
    genre_list = []
    genre_columns = song_genres_df['genres']
    for artist_genres in genre_columns:
        genre_list.extend(artist_genres)
    genre_list = list(set(genre_list))
    print(f'{len(genre_list)} unique genres found.')
    
    # Find most popular genres
    popular_genres = dict.fromkeys(genre_list, 0)
    for artist_genres in genre_columns:
        for genre in artist_genres:
            popular_genres[genre]+=1
    popular_genres = dict(sorted(popular_genres.items(), key=lambda kv: kv[1], reverse=True))
    
    # Create a list of genres that applies to all songs
    t0 = time.time()
    common_genres = []
    song_index_list = []
    for index, row in song_genres_df.iterrows():
        if row['genres'] == []:
            song_index_list.append(index)
    counter = 0
    while len(set(song_index_list)) < len(song_genres_df):
        new_genre = list(popular_genres.keys())[counter]
        new_indeces = []
        for index, row in song_genres_df.iterrows():
            if (new_genre in row['genres']) & (index not in song_index_list):
                new_indeces.append(index)
        if new_indeces != []:
            song_index_list.extend(new_indeces)
            common_genres.append(new_genre)
        counter+=1
    t1 = time.time()
    print(f'Run time to find common genres: {t1-t0} seconds')
    print(f'All songs are encompassed by {len(common_genres)} genres.')
    
    # Add a column of filtered genres
    filtered_genres = []
    for artist_genres in genre_columns:
        filtered = []
        for genre in artist_genres:
            if genre in common_genres:
                filtered.append(genre)
        filtered_genres.append(filtered)
    song_genres_df['filtered_genres'] = filtered_genres
    
    return song_genres_df


In [14]:
# Run the filter_genres function
song_genres_df = filter_genres(song_genres_df)
song_genres_df.head()

385 unique genres found.
Run time to find common genres: 35.76010608673096 seconds
All songs are encompassed by 111 genres.


Unnamed: 0,song,song_id,artist,artist_id,category,genres,filtered_genres
0,Monster (Shawn Mendes & Justin Bieber),2Z8yfpFX0ZMavHkcIeHiO1,Shawn Mendes,7n2wHs1TKAczGzO7Dd2rGr,toplists,"[pop, viral, canadian, post-teen, dance]","[pop, viral, canadian, dance]"
1,Therefore I Am,54bFM56PmE4YLRnqpW6Tha,Billie Eilish,6qqNVTkY8uBg9cP3Jd7DAH,toplists,"[pop, electropop]",[pop]
2,Levitating (feat. DaBaby),463CkQjx2Zk1yXoBuierM9,Dua Lipa,6M2wZ9GZgrQXHCFfjv46we,toplists,"[pop, uk, dance]","[pop, uk, dance]"
3,positions,7igeByaBM0MgGsgXtNxDJ7,Ariana Grande,66CXWjxzNUsdJxJ2JdwvnR,toplists,"[pop, post-teen]",[pop]
4,HOLIDAY,6zFMeegAMYQo0mt8rXtrli,Lil Nas X,7jVv8c5Fj3E9VhNjxT4snq,toplists,"[lgbtq+, pop, queer, country]","[pop, country]"


In [15]:
# Create lists of audio features
t0 = time.time()
audio_features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 
                  'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 
                  'duration_ms', 'time_signature']
danceability = [] 
energy = [] 
key = []
loudness = [] 
mode = []
speechiness = []
acousticness = []
instrumentalness = []
liveness = []
valence = []
tempo = []
duration_ms = []
time_signature = []
for song_id in song_genres_df['song_id']:
    query_url = base_url + f'/v1/audio-features/{song_id}'
    features = requests.get(query_url, headers=headers_dict).json()
    danceability.append(features['danceability'])
    energy.append(features['energy'])
    key.append(features['key'])
    loudness.append(features['loudness'])
    mode.append(features['mode'])
    speechiness.append(features['speechiness'])
    acousticness.append(features['acousticness'])
    instrumentalness.append(features['instrumentalness'])
    liveness.append(features['liveness'])
    valence.append(features['valence'])
    tempo.append(features['tempo'])
    duration_ms.append(features['duration_ms'])
    time_signature.append(features['time_signature'])

t1 = time.time()
print(f'Run time: {t1-t0} seconds')
print(f'Audio features added for {len(danceability)} songs.')

Run time: 129.6081621646881 seconds
Audio features added for 910 songs.


In [16]:
# Add columns of audio features
t0 = time.time()
song_genres_df['audio_ft_danceability'] = danceability
song_genres_df['audio_ft_energy'] = energy
song_genres_df['audio_ft_key'] = key
song_genres_df['audio_ft_loudness'] = loudness
song_genres_df['audio_ft_mode'] = mode
song_genres_df['audio_ft_speechiness'] = speechiness
song_genres_df['audio_ft_acousticness'] = acousticness
song_genres_df['audio_ft_instrumentalness'] = instrumentalness
song_genres_df['audio_ft_liveness'] = liveness
song_genres_df['audio_ft_valence'] = valence
song_genres_df['audio_ft_tempo'] = tempo
song_genres_df['audio_ft_duration_ms'] = duration_ms
song_genres_df['audio_ft_time_signature'] = time_signature

t1 = time.time()
print(f'Run time: {t1-t0} seconds')
song_genres_df.head()


Run time: 0.023520946502685547 seconds


Unnamed: 0,song,song_id,artist,artist_id,category,genres,filtered_genres,audio_ft_danceability,audio_ft_energy,audio_ft_key,audio_ft_loudness,audio_ft_mode,audio_ft_speechiness,audio_ft_acousticness,audio_ft_instrumentalness,audio_ft_liveness,audio_ft_valence,audio_ft_tempo,audio_ft_duration_ms,audio_ft_time_signature
0,Monster (Shawn Mendes & Justin Bieber),2Z8yfpFX0ZMavHkcIeHiO1,Shawn Mendes,7n2wHs1TKAczGzO7Dd2rGr,toplists,"[pop, viral, canadian, post-teen, dance]","[pop, viral, canadian, dance]",0.652,0.383,2,-7.076,0,0.0516,0.0676,0.0,0.0828,0.549,145.765,178994,4
1,Therefore I Am,54bFM56PmE4YLRnqpW6Tha,Billie Eilish,6qqNVTkY8uBg9cP3Jd7DAH,toplists,"[pop, electropop]",[pop],0.889,0.34,11,-7.773,0,0.0697,0.218,0.13,0.055,0.716,94.009,174321,4
2,Levitating (feat. DaBaby),463CkQjx2Zk1yXoBuierM9,Dua Lipa,6M2wZ9GZgrQXHCFfjv46we,toplists,"[pop, uk, dance]","[pop, uk, dance]",0.702,0.825,6,-3.787,0,0.0601,0.00883,0.0,0.0674,0.915,102.977,203064,4
3,positions,7igeByaBM0MgGsgXtNxDJ7,Ariana Grande,66CXWjxzNUsdJxJ2JdwvnR,toplists,"[pop, post-teen]",[pop],0.736,0.802,0,-4.759,1,0.0864,0.468,0.0,0.094,0.675,144.005,172325,4
4,HOLIDAY,6zFMeegAMYQo0mt8rXtrli,Lil Nas X,7jVv8c5Fj3E9VhNjxT4snq,toplists,"[lgbtq+, pop, queer, country]","[pop, country]",0.81,0.511,5,-6.924,0,0.164,0.12,0.0,0.0832,0.837,151.947,154998,4


In [17]:
# Save DataFrame to CSV
song_genres_df.to_csv('../Data2/song_genres.csv', index=False)

In [None]:
# Create a dictionary of audio features
#t0 = time.time()
#audio_features = defaultdict(list)
#features_list = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 
#                  'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 
#                  'duration_ms', 'time_signature']
#for song_id in song_genres_df['song_id']:
#    query_url = base_url + f'/v1/audio-features/{song_id}'
#    features = requests.get(query_url, headers=headers_dict).json()
#    for feat in features_list:
#        audio_features[feat].append(features[feat])
#audio_features = dict(audio_features)

#t1 = time.time()
#print(f'Run time: {t1-t0} seconds')
#print(f'Audio features added for {len(danceability)} songs.')
#audio_features

In [None]:
# Create a list of all genres
genre_list = []
for artist_genres in genre_column:
    genre_list.extend(artist_genres)
genre_list = list(set(genre_list))
len(genre_list)

In [None]:
genre_list = ['afro' if genre.startswith('afro') else genre for genre in genre_list]
genre_list = ['arab' if genre.startswith('arab') else genre for genre in genre_list]
genre_list = ['bass' if genre.startswith('bass') else genre for genre in genre_list]
genre_list = ['british' if genre.startswith('brit') else genre for genre in genre_list]
genre_list = ['bubble' if genre.startswith('bubble') else genre for genre in genre_list]
genre_list = ['chill' if genre.startswith('chill') else genre for genre in genre_list]
genre_list = ['dance' if genre.startswith('dance') else genre for genre in genre_list]
genre_list = ['electric' if genre.startswith('electr') else genre for genre in genre_list]
genre_list = ['euro' if genre.startswith('euro') else genre for genre in genre_list]
genre_list = ['folk' if genre.startswith('folk') else genre for genre in genre_list]
genre_list = ['indie' if genre.startswith('indie') else genre for genre in genre_list]
genre_list = ['neo-classical' if genre.startswith('neoclassical') else genre for genre in genre_list]
genre_list = ['post' if genre.startswith('post') else genre for genre in genre_list]
genre_list = ['reggae' if genre.startswith('reggae') else genre for genre in genre_list]
genre_list = ['rock' if genre.startswith('rock') else genre for genre in genre_list]
genre_list = ['south' if genre.startswith('south') else genre for genre in genre_list]
genre_list = list(set(genre_list))
print(len(genre_list))