In [1]:
# imports
import numpy as np
import pandas as pd

from tqdm import tqdm

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
sp_cid = 'Insert cid here'
sp_secret = 'Insert secret here'
client_credentials_manager = SpotifyClientCredentials(client_id=sp_cid, client_secret=sp_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [3]:
df = pd.read_csv('../data/track_dataset_no_audio_features.csv', index_col=0)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113000 entries, 0 to 112999
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   track_name   113000 non-null  object
 1   artist_name  113000 non-null  object
 2   album_name   113000 non-null  object
 3   genre        113000 non-null  object
 4   duration_ms  113000 non-null  int64 
 5   popularity   113000 non-null  int64 
 6   explicit     113000 non-null  bool  
 7   track_id     113000 non-null  object
 8   artist_id    113000 non-null  object
dtypes: bool(1), int64(2), object(6)
memory usage: 7.9+ MB


In [5]:
grouped = df.groupby(['artist_name','track_name'], as_index=True).size()
# number of duplicate tracks
len(grouped[grouped > 1])

19540

In [6]:
# number of songs after removal of dupes
len(grouped)

77629

In [7]:
df['genre'].unique()

array(['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient',
       'anime', 'black-metal', 'bluegrass', 'blues', 'brazil',
       'breakbeat', 'british', 'cantopop', 'chicago-house', 'children',
       'chill', 'classical', 'club', 'comedy', 'country', 'dance',
       'dancehall', 'death-metal', 'deep-house', 'detroit-techno',
       'disco', 'disney', 'drum-and-bass', 'dub', 'dubstep', 'edm',
       'electro', 'electronic', 'emo', 'folk', 'forro', 'french', 'funk',
       'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove',
       'grunge', 'guitar', 'happy', 'hard-rock', 'hardcore', 'hardstyle',
       'heavy-metal', 'hip-hop', 'honky-tonk', 'house', 'idm', 'indian',
       'indie', 'indie-pop', 'industrial', 'iranian', 'j-dance', 'j-idol',
       'j-pop', 'j-rock', 'jazz', 'k-pop', 'kids', 'latin', 'latino',
       'malay', 'mandopop', 'metal', 'metalcore', 'minimal-techno', 'mpb',
       'new-age', 'opera', 'pagode', 'party', 'piano', 'pop', 'pop-film',
       'pow

In [8]:
final_selection = ['acoustic','alternative','ambient','classical','country',
                   'disco','edm','funk','gospel','heavy-metal','hip-hop','jazz','k-pop','latin','opera','r-n-b','rock',
                   'reggae','techno','world-music']
test_df = df[df['genre'].isin(final_selection)]

In [9]:
print(len(test_df))

tgrouped = test_df.groupby(['artist_name','track_name'], as_index=True).size()
len(tgrouped[tgrouped > 1])

20000


2277

In [10]:
new = test_df.drop_duplicates(subset=['track_name','artist_name'])

In [11]:
len(new)

16025

In [12]:
new.groupby('genre').count()

Unnamed: 0_level_0,track_name,artist_name,album_name,duration_ms,popularity,explicit,track_id,artist_id
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
acoustic,949,949,949,949,949,949,949,949
alternative,958,958,958,958,958,958,958,958
ambient,932,932,932,932,932,932,932,932
classical,920,920,920,920,920,920,920,920
country,941,941,941,941,941,941,941,941
disco,878,878,878,878,878,878,878,878
edm,853,853,853,853,853,853,853,853
funk,598,598,598,598,598,598,598,598
gospel,930,930,930,930,930,930,930,930
heavy-metal,957,957,957,957,957,957,957,957


In [13]:
# Join all genres
multi_genre_df = df.groupby(['track_name','artist_name','album_name','duration_ms','popularity','explicit','track_id','artist_id'])['genre'].apply(', '.join).reset_index()

In [14]:
multi_genre_df

Unnamed: 0,track_name,artist_name,album_name,duration_ms,popularity,explicit,track_id,artist_id,genre
0,"""45""",The Gaslight Anthem,Handwritten,202493,52,False,25Sd73fleKUVPNqITPZkn1,7If8DXZN7mlGdQkLE2FaMo,"hard-rock, hardcore, j-rock, punk, punk-rock, ska"
1,"""A Lovely Night""",Laura Osnes,Rodgers + Hammerstein's Cinderella (Original B...,151466,35,False,7ygnMprPnDekUSeduRDhDt,2TkSDFFCkyiMrHyL1a5BbF,show-tunes
2,"""A"" You're Adorable",Brian Hyland,The Bashful Blond,151680,43,False,6CvreUbryAmXAUo35wuCQJ,6YROFUbu5zRCHi2xkir5pk,"rock-n-roll, rockabilly"
3,"""Befiehl du deine Wege"" (Herzlich tut mich ver...",Johann Sebastian Bach,Classical Spring: Bach,131569,0,False,4e21qOZFA4BUDeaMVVYc6g,5aIqB5nVVvmFsvSdExz408,german
4,"""C"" is for Cookie",Cookie Monster,Sesame Street: Platinum All-Time Favorites,89204,53,False,6EVN97RIyZBCegsYyEnkMv,0KUfoAHP20vQHuDhiEAa8r,children
...,...,...,...,...,...,...,...,...,...
85196,예뻤어 You Were Beautiful,DAY6,SUNRISE,283160,68,False,71WZ7yFuwxmQz5jJUpvkGv,5TnQc2N1iKlFjYD7CPGvFc,k-pop
85197,오랜 소원 It's You,Girls' Generation,Holiday Night - The 6th Album,232615,32,False,2o0tBYtr3tBBqesoZzHklI,0Sadg1vgvaPqGTOjxu0N6c,j-idol
85198,유로파 Europa,Girls' Generation,Mr. Mr. - The 4th Mini Album,202026,38,False,13raMIk3oXkgjP2hAvVCu2,0Sadg1vgvaPqGTOjxu0N6c,j-idol
85199,유리아이 (Lost In Love),Girls' Generation,I GOT A BOY - The 4th Album,239903,37,False,3ds1h65uxnS3zVmaVLGTHQ,0Sadg1vgvaPqGTOjxu0N6c,j-idol


In [15]:
# Join all duplicates of songs and consolidate the genres for each song if multiple 
multi_genre_df['genre'] = multi_genre_df.genre.map(lambda x: list(set(x.split(', '))))

In [16]:
multi_genre_df

Unnamed: 0,track_name,artist_name,album_name,duration_ms,popularity,explicit,track_id,artist_id,genre
0,"""45""",The Gaslight Anthem,Handwritten,202493,52,False,25Sd73fleKUVPNqITPZkn1,7If8DXZN7mlGdQkLE2FaMo,"[hardcore, punk-rock, j-rock, hard-rock, ska, ..."
1,"""A Lovely Night""",Laura Osnes,Rodgers + Hammerstein's Cinderella (Original B...,151466,35,False,7ygnMprPnDekUSeduRDhDt,2TkSDFFCkyiMrHyL1a5BbF,[show-tunes]
2,"""A"" You're Adorable",Brian Hyland,The Bashful Blond,151680,43,False,6CvreUbryAmXAUo35wuCQJ,6YROFUbu5zRCHi2xkir5pk,"[rockabilly, rock-n-roll]"
3,"""Befiehl du deine Wege"" (Herzlich tut mich ver...",Johann Sebastian Bach,Classical Spring: Bach,131569,0,False,4e21qOZFA4BUDeaMVVYc6g,5aIqB5nVVvmFsvSdExz408,[german]
4,"""C"" is for Cookie",Cookie Monster,Sesame Street: Platinum All-Time Favorites,89204,53,False,6EVN97RIyZBCegsYyEnkMv,0KUfoAHP20vQHuDhiEAa8r,[children]
...,...,...,...,...,...,...,...,...,...
85196,예뻤어 You Were Beautiful,DAY6,SUNRISE,283160,68,False,71WZ7yFuwxmQz5jJUpvkGv,5TnQc2N1iKlFjYD7CPGvFc,[k-pop]
85197,오랜 소원 It's You,Girls' Generation,Holiday Night - The 6th Album,232615,32,False,2o0tBYtr3tBBqesoZzHklI,0Sadg1vgvaPqGTOjxu0N6c,[j-idol]
85198,유로파 Europa,Girls' Generation,Mr. Mr. - The 4th Mini Album,202026,38,False,13raMIk3oXkgjP2hAvVCu2,0Sadg1vgvaPqGTOjxu0N6c,[j-idol]
85199,유리아이 (Lost In Love),Girls' Generation,I GOT A BOY - The 4th Album,239903,37,False,3ds1h65uxnS3zVmaVLGTHQ,0Sadg1vgvaPqGTOjxu0N6c,[j-idol]


In [17]:
# We still have dupes!!!
dupes = multi_genre_df.groupby(['artist_name','track_name'], as_index=True).size()
len(dupes[dupes > 1])

5191

In [18]:
# Remove dupes that didn't get consolidated because of minor differences (i.e. same song, different album)
multi_genre_df['num_genres'] = multi_genre_df['genre'].str.len()
sorted_multi = multi_genre_df.sort_values(by='num_genres',ascending=False).drop(columns='num_genres').reset_index(drop=True)
sorted_multi.drop_duplicates(subset=['track_name','artist_name'],inplace=True)

In [19]:
sorted_multi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77629 entries, 0 to 85200
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   track_name   77629 non-null  object
 1   artist_name  77629 non-null  object
 2   album_name   77629 non-null  object
 3   duration_ms  77629 non-null  int64 
 4   popularity   77629 non-null  int64 
 5   explicit     77629 non-null  bool  
 6   track_id     77629 non-null  object
 7   artist_id    77629 non-null  object
 8   genre        77629 non-null  object
dtypes: bool(1), int64(2), object(6)
memory usage: 5.4+ MB


In [20]:
# No more dupes
dupes = sorted_multi.groupby(['artist_name','track_name'], as_index=True).size()
len(dupes[dupes > 1])

0

In [21]:
def audio_feature_collector(track_id_lst):
    """
    Takes a list of track id's and returns a dataframe filled
    with the audio features for the provided songs.

    Parameters:
    -----------
    track_id_lst: list of track id's

    Output:
    -------
    Returns a dataframe object
    """
    audio_features = []
    batchsize = 100

    # Iterate over 100 song batches (due to API limit per request)
    for i in tqdm(range(0,len(track_id_lst),batchsize)):
        batch = track_id_lst[i:i+batchsize]
        # Collect features for 100 tracks
        feature_results = sp.audio_features(batch)
        # Store individual track info in list
        for track in feature_results:
            if track is not None:
                audio_features.append(track)

    df = pd.DataFrame.from_dict(data=audio_features,orient='columns')
    # Rename column to match column name from song dataframe for merge
    return df.rename(columns={'id':'track_id'})

In [23]:
af = audio_feature_collector(sorted_multi.track_id.to_list())

100%|██████████| 777/777 [06:41<00:00,  1.93it/s]


In [24]:
af

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,track_id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.701,0.815,0,-5.995,1,0.0381,0.017400,0.000627,0.0969,0.671,138.142,audio_features,6M6UoxIPn4NOWW0x7JPRfv,spotify:track:6M6UoxIPn4NOWW0x7JPRfv,https://api.spotify.com/v1/tracks/6M6UoxIPn4NO...,https://api.spotify.com/v1/audio-analysis/6M6U...,279027,4
1,0.509,0.646,11,-6.285,0,0.0365,0.000280,0.004680,0.0973,0.142,124.014,audio_features,72hSmnleYTiiOo23q8ZJIS,spotify:track:72hSmnleYTiiOo23q8ZJIS,https://api.spotify.com/v1/tracks/72hSmnleYTii...,https://api.spotify.com/v1/audio-analysis/72hS...,297460,4
2,0.338,0.819,3,-5.666,1,0.0736,0.023400,0.000000,0.1290,0.698,118.905,audio_features,7t19ubLwfZubJh2HpCUTrC,spotify:track:7t19ubLwfZubJh2HpCUTrC,https://api.spotify.com/v1/tracks/7t19ubLwfZub...,https://api.spotify.com/v1/audio-analysis/7t19...,167307,4
3,0.325,0.867,4,-5.530,1,0.0809,0.005480,0.000004,0.1030,0.336,144.866,audio_features,0Kd1R41kTx9Uzk30yv5PP7,spotify:track:0Kd1R41kTx9Uzk30yv5PP7,https://api.spotify.com/v1/tracks/0Kd1R41kTx9U...,https://api.spotify.com/v1/audio-analysis/0Kd1...,264677,4
4,0.482,0.833,5,-5.611,1,0.0449,0.000346,0.000000,0.3650,0.740,148.039,audio_features,1fBl642IhJOE5U319Gy2Go,spotify:track:1fBl642IhJOE5U319Gy2Go,https://api.spotify.com/v1/tracks/1fBl642IhJOE...,https://api.spotify.com/v1/audio-analysis/1fBl...,212293,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77624,0.693,0.708,0,-7.816,1,0.0310,0.309000,0.003050,0.0857,0.949,125.315,audio_features,6c0G8K6G2LlIfJTsRLJjfs,spotify:track:6c0G8K6G2LlIfJTsRLJjfs,https://api.spotify.com/v1/tracks/6c0G8K6G2LlI...,https://api.spotify.com/v1/audio-analysis/6c0G...,187973,4
77625,0.774,0.776,6,-5.331,1,0.3040,0.178000,0.000000,0.2830,0.654,99.071,audio_features,2Uvwc7D5rhphlzfBDnpYpU,spotify:track:2Uvwc7D5rhphlzfBDnpYpU,https://api.spotify.com/v1/tracks/2Uvwc7D5rhph...,https://api.spotify.com/v1/audio-analysis/2Uvw...,311533,4
77626,0.269,0.942,2,-4.604,1,0.0575,0.000173,0.000001,0.1610,0.665,88.850,audio_features,1tSBySlxQCtYcRXAq0lz7u,spotify:track:1tSBySlxQCtYcRXAq0lz7u,https://api.spotify.com/v1/tracks/1tSBySlxQCtY...,https://api.spotify.com/v1/audio-analysis/1tSB...,156162,4
77627,0.829,0.727,7,-8.606,1,0.1650,0.429000,0.000000,0.0709,0.907,101.644,audio_features,6tBzYurAiGkaGopgYPdNo7,spotify:track:6tBzYurAiGkaGopgYPdNo7,https://api.spotify.com/v1/tracks/6tBzYurAiGka...,https://api.spotify.com/v1/audio-analysis/6tBz...,156360,4


In [26]:
# Remove unwanted features
af.drop(['type','uri','track_href','analysis_url','duration_ms','time_signature'],axis=1,inplace=True)

In [29]:
final = pd.merge(sorted_multi,af,how='inner',on='track_id')

In [30]:
final

Unnamed: 0,track_name,artist_name,album_name,duration_ms,popularity,explicit,track_id,artist_id,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Young Folks,Peter Bjorn and John,Writer's Block,279026,65,False,6M6UoxIPn4NOWW0x7JPRfv,6u11Qbko2N2hP4lTBYjX86,"[swedish, indie-pop, dance, indie, alt-rock, p...",0.701,0.815,0,-5.995,1,0.0381,0.017400,0.000627,0.0969,0.671,138.142
1,Black Sheep,Metric,Black Sheep,297459,68,False,72hSmnleYTiiOo23q8ZJIS,1rCIEwPp5OnXW0ornlSsRl,"[indie-pop, dance, indie, alt-rock, punk-rock,...",0.509,0.646,11,-6.285,0,0.0365,0.000280,0.004680,0.0973,0.142,124.014
2,Be Nice To Me,The Front Bottoms,Rose,167306,63,False,7t19ubLwfZubJh2HpCUTrC,5ictveRyhWRs8Gt8Dvt1hS,"[j-pop, indie-pop, emo, indie, alt-rock, punk-...",0.338,0.819,3,-5.666,1,0.0736,0.023400,0.000000,0.1290,0.698,118.905
3,Twin Size Mattress,The Front Bottoms,Talon Of The Hawk,264676,65,False,0Kd1R41kTx9Uzk30yv5PP7,5ictveRyhWRs8Gt8Dvt1hS,"[j-pop, indie-pop, emo, indie, alt-rock, punk-...",0.325,0.867,4,-5.530,1,0.0809,0.005480,0.000004,0.1030,0.336,144.866
4,Animal,Neon Trees,Habits,212293,72,False,1fBl642IhJOE5U319Gy2Go,0RpddSzUHfncUWNJXKOsjy,"[indie-pop, indie, alt-rock, punk-rock, pop, r...",0.482,0.833,5,-5.611,1,0.0449,0.000346,0.000000,0.3650,0.740,148.039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77624,I Love The Nightlife (Disco 'Round),Alicia Bridges,I Love The Nightlife,187973,47,False,6c0G8K6G2LlIfJTsRLJjfs,1UY4oIFpjCKe5qIhKDcloe,[disco],0.693,0.708,0,-7.816,1,0.0310,0.309000,0.003050,0.0857,0.949,125.315
77625,I Love The Dough (feat. Jay-Z & Angela Winbush...,The Notorious B.I.G.,Life After Death (2014 Remastered Edition),311533,56,True,2Uvwc7D5rhphlzfBDnpYpU,5me0Irg2ANcsgc93uaYrpb,[hardcore],0.774,0.776,6,-5.331,1,0.3040,0.178000,0.000000,0.2830,0.654,99.071
77626,I Love Seattle,Tacocat,Lost Time,156162,35,False,1tSBySlxQCtYcRXAq0lz7u,3h0MN1neFknEvlYKxFmSQW,[power-pop],0.269,0.942,2,-4.604,1,0.0575,0.000173,0.000001,0.1610,0.665,88.850
77627,I Love Rocky Road,"""Weird Al"" Yankovic","""Weird Al"" Yankovic",156360,37,False,6tBzYurAiGkaGopgYPdNo7,1bDWGdIC2hardyt55nlQgG,[comedy],0.829,0.727,7,-8.606,1,0.1650,0.429000,0.000000,0.0709,0.907,101.644


In [32]:
# Replace booleans with integers
final.explicit = df.explicit.astype(int)

# Convert from milliseconds to seconds
final['duration_s'] = np.round(final.duration_ms/1000,0)

In [33]:
final.to_csv('../data/multi_genre_track_audio_dataset.csv')