In [1]:
import os
import re
import json
from pathlib import Path

import requests
import spotipy
import pandas as pd
from tqdm import tqdm
from requests.exceptions import ReadTimeout
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv
from more_itertools import chunked

In [2]:
# Load environment variables from a .env file before any API client is created.
# NOTE(review): presumably this supplies the Spotify credentials, including the
# SPOTIFY_OAUTH_TOKEN read later via os.getenv — confirm against the .env file.
load_dotenv()

True

# Data Acquisition

In [3]:
# Billboard folders: raw yearly downloads and the tabulated interim output.
RAW_DATA_PATH_BILLBOARD = Path('data/raw/billboard')
INTERIM_DATA_BILLBOARD = Path('data/interim/billboard')

# Spotify folders: raw data and data derived from the external API.
RAW_DATA_PATH_SPOTIFY = Path('data/raw/spotify')
EXTERNAL_DATA_PATH_SPOTIFY = Path('data/external/spotify')

# Create every folder up front; existing folders are left untouched.
for _data_dir in (
    RAW_DATA_PATH_BILLBOARD,
    INTERIM_DATA_BILLBOARD,
    RAW_DATA_PATH_SPOTIFY,
    EXTERNAL_DATA_PATH_SPOTIFY,
):
    _data_dir.mkdir(parents=True, exist_ok=True)

## Billboard Data

In [4]:
# URL template for one year of chart data; `{}` is filled with the year.
BILLBOARD_BASE_DOWNLOAD_URL = "https://raw.githubusercontent.com/kevinschaich/billboard/master/data/years/{}.json"

# Inclusive range of chart years to download.
BILLBOARD_FIRST_YEAR, BILLBOARD_LAST_YEAR = 1950, 2015
BILLBOARD_DOWNLOAD_YEARS = list(range(BILLBOARD_FIRST_YEAR, BILLBOARD_LAST_YEAR + 1))

# Destination of the single table built from all yearly files.
BILLBOARD_TABLE_FILEPATH = INTERIM_DATA_BILLBOARD / "billboard_data.csv"

### Download

In [5]:
# Download one JSON file per chart year into the raw Billboard folder.
for year in tqdm(BILLBOARD_DOWNLOAD_YEARS):
    # A timeout keeps a stalled connection from hanging the whole notebook run.
    r = requests.get(BILLBOARD_BASE_DOWNLOAD_URL.format(year), timeout=30)
    if r.status_code != 200:
        # Report the failure and skip the write: saving an error body as
        # `<year>.json` would corrupt the raw data read by later cells.
        print(f"Error: {year}")
        continue
    with open(RAW_DATA_PATH_BILLBOARD/"{}.json".format(year), 'wb') as f:
        f.write(r.content)

100%|██████████| 66/66 [01:06<00:00,  1.00s/it]


### Tabulating Data

In [6]:
# Read every yearly JSON file into its own DataFrame.
years_dfs = [
    pd.DataFrame(json.loads((RAW_DATA_PATH_BILLBOARD/"{}.json".format(year)).read_text()))
    for year in tqdm(BILLBOARD_DOWNLOAD_YEARS)
]

# Stack the yearly frames, turn the outer concat key into a `year` column,
# and replace the remaining per-year row index with a flat 0..n-1 index.
billboard_table_df = (
    pd.concat(years_dfs, keys=BILLBOARD_DOWNLOAD_YEARS)
    .reset_index(level=0)
    .rename(columns={'level_0': 'year'})
    .reset_index(drop=True)
)

print(len(billboard_table_df))

billboard_table_df.to_csv(BILLBOARD_TABLE_FILEPATH)

100%|██████████| 66/66 [00:00<00:00, 199.29it/s]


4028


## Spotify Data

In [7]:
# Client-credentials auth manager, built without explicit arguments.
# NOTE(review): presumably the client id/secret are picked up from the
# environment populated by load_dotenv() — confirm.
auth_manager = SpotifyClientCredentials()
# Shared Spotify client used by the lookup cells below (requests_timeout=15).
sp = spotipy.Spotify(auth_manager=auth_manager, requests_timeout=15)

# Output CSVs: matched Spotify track IDs and the audio features fetched for them.
SONG_IDS_FILEPATH = EXTERNAL_DATA_PATH_SPOTIFY / "song_ids.csv"
SONG_AUDIO_FEATURES_FILEPATH = EXTERNAL_DATA_PATH_SPOTIFY / "song_audiofeatures.csv"

In [21]:
# TODO: Move this into a script
def spotify_api_get_song_id(track=None, artist=None, override_query=None):
    """Search Spotify for a single track and return its basic identity.

    The query is ``artist:<artist> track:<track>`` unless ``override_query``
    is given, in which case that string is sent as-is. Only the first search
    hit is considered.

    Returns:
        A ``(id, name, artist_names)`` tuple taken from the first hit.
        All three are ``None`` when the search times out, the response lacks
        the expected keys, or there are no hits.
    """
    if override_query is None:
        query = f'artist:{artist} track:{track}'
    else:
        query = override_query

    try:
        search_response = sp.search(q=query, type='track', limit=1)
        song_info = search_response['tracks']['items'][0]
    except (KeyError, ReadTimeout, IndexError):
        # Best-effort lookup: any of these failures counts as "not found".
        song_info = {}

    artist_entries = song_info.get('artists')
    if artist_entries is None:
        artist_names = None
    else:
        artist_names = [entry.get('name') for entry in artist_entries]

    return song_info.get('id'), song_info.get('name'), artist_names

def spotify_api_by_requests(query, timeout=15):
    """Search Spotify for a single track by calling the Web API directly.

    Sends ``query`` to the ``/v1/search`` endpoint, authenticating with the
    bearer token stored in the ``SPOTIFY_OAUTH_TOKEN`` environment variable,
    and looks only at the first hit.

    Args:
        query: Free-text search string passed as the ``q`` parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(id, name, artist_names)`` tuple taken from the first hit.
        All three are ``None`` when the request times out, the response is
        not the expected JSON (for example an error payload), or there are
        no hits.
    """
    payload = {'q': query, 'type': 'track', 'limit': 1}
    headers = {"Authorization": f"Bearer {os.getenv('SPOTIFY_OAUTH_TOKEN')}"}

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        r = requests.get(
            'https://api.spotify.com/v1/search',
            headers=headers,
            params=payload,
            timeout=timeout,
        )
    except ReadTimeout:
        return None, None, None

    if r.status_code != 200:
        # Surface HTTP errors (e.g. an expired token) so they are not mistaken
        # for "track not found" further down the pipeline.
        print(f"Error: {r.status_code}")

    try:
        song_info = r.json()['tracks']['items'][0]
    except (KeyError, IndexError, ValueError):
        # ValueError covers a body that is not valid JSON.
        song_info = {}

    api_song_id = song_info.get('id')
    api_song_name = song_info.get('name')
    api_song_artists = song_info.get('artists')

    if api_song_artists is not None:
        api_song_artists = [entry.get('name') for entry in api_song_artists]

    return api_song_id, api_song_name, api_song_artists

### Get song names and artists from billboard data

In [9]:
# Collect one (title, artist) pair per chart entry, across all downloaded years.
songs_artists = []

for year in tqdm(BILLBOARD_DOWNLOAD_YEARS):
    year_path = RAW_DATA_PATH_BILLBOARD/"{}.json".format(year)
    with open(year_path, 'r') as f:
        for entry in json.load(f):
            songs_artists.append((entry['title'], entry['artist']))

len(songs_artists)

100%|██████████| 66/66 [00:00<00:00, 513.12it/s]


4028

### Get song IDs from Spotify 

In [11]:
# Rebuild the (title, artist) pairs from the tabulated Billboard CSV so this
# cell depends only on the file on disk.
# NOTE(review): assumes every title/artist read back from the CSV is a
# non-empty string — TODO confirm there are no blank cells.
billboard_table_df = pd.read_csv(BILLBOARD_TABLE_FILEPATH, index_col=0)
song_artists = list(
    billboard_table_df[['title', 'artist']].itertuples(index=False, name=None)
)

# Punctuation removed by the "without symbols" fallback; compiled once.
symbols_pattern = re.compile(r'[-\.\+,\(\)]')

spotify_ids_dict = {'name': [], 'artist': [], 'api_id': [], 'api_name': [], 'api_artists': []}
# Iterate the list built above. The loop previously read `songs_artists`, a
# different variable created in another cell, leaving `song_artists` unused.
for song_name, song_artist in tqdm(song_artists):

    # Base search: exact title and artist.
    base_api_music = spotify_api_get_song_id(song_name, song_artist)

    # Fallback 1: retry without apostrophes.
    if any(x is None for x in base_api_music):
        song_name = song_name.replace("'", '')
        song_artist = song_artist.replace("'", '')
        base_api_music = spotify_api_get_song_id(song_name, song_artist)

    # Fallback 2: retry without other symbols.
    if any(x is None for x in base_api_music):
        song_name = symbols_pattern.sub("", song_name)
        song_artist = symbols_pattern.sub("", song_artist)
        base_api_music = spotify_api_get_song_id(song_name, song_artist)

    # Fallback 3: wide free-text search on "<title> <artist>".
    if any(x is None for x in base_api_music):
        base_api_music = spotify_api_get_song_id(override_query=" ".join([song_name, song_artist]))

    # Record the result. `name`/`artist` hold the strings as last searched,
    # i.e. possibly with apostrophes/symbols stripped by the fallbacks above.
    api_song_id = base_api_music[0]
    api_song_name = base_api_music[1]
    api_song_artists = ",".join(base_api_music[2]) if base_api_music[2] is not None else None

    spotify_ids_dict["name"].append(song_name)
    spotify_ids_dict["artist"].append(song_artist)
    spotify_ids_dict["api_id"].append(api_song_id)
    spotify_ids_dict["api_name"].append(api_song_name)
    spotify_ids_dict["api_artists"].append(api_song_artists)

spotify_ids_df = pd.DataFrame(spotify_ids_dict)
spotify_ids_df.to_csv(SONG_IDS_FILEPATH)

 35%|███▌      | 1412/4028 [09:09<16:57,  2.57it/s]  


KeyboardInterrupt: 

#### Missing Values Treatment

In [14]:
# Reload the saved lookup results and isolate rows with no Spotify ID.
spotify_ids_df = pd.read_csv(SONG_IDS_FILEPATH, index_col=0)
is_unmatched = spotify_ids_df['api_id'].isna()
missing_ids = spotify_ids_df.loc[is_unmatched]

print(missing_ids.shape[0])
missing_ids

3


Unnamed: 0,name,artist,api_id,api_name,api_artists
1048,Mighty Love Pt 1,Spinners,,,
1520,Its Now Or Never,John Schneider,,,
2242,Here We Go,CC Music Factory,,,


By querying the Spotify Search API directly (https://developer.spotify.com/console/get-search-item/), I found that some of these tracks do exist, but somehow the library did not return them. So I wrote a simple function that calls the API directly.

In [19]:
# Retry each unmatched song through the direct API call and write any hit
# back into the main table at the same row label.
for song in tqdm(list(missing_ids.itertuples())):
    found_id, found_name, found_artists = spotify_api_by_requests(
        " ".join([song.name, song.artist])
    )
    joined_artists = None if found_artists is None else ",".join(found_artists)

    updates = (
        ('api_id', found_id),
        ('api_name', found_name),
        ('api_artists', joined_artists),
    )
    for column, value in updates:
        spotify_ids_df.at[song.Index, column] = value

  0%|          | 0/3 [00:00<?, ?it/s]

{'error': {'status': 401, 'message': 'The access token expired'}}





KeyError: 'tracks'

Finally let's check the last missing values

In [None]:
# Recompute the unmatched rows after the direct-API retry.
still_unmatched = spotify_ids_df['api_id'].isna()
missing_ids = spotify_ids_df.loc[still_unmatched]

print(missing_ids.shape[0])
missing_ids

I didn't find the correct values in the Spotify API, so they really are missing. Let's save the dataframe.

In [None]:
# Write the ID table back to the same CSV it was loaded from, overwriting it.
spotify_ids_df.to_csv(SONG_IDS_FILEPATH)

### Get Audio Features from IDs
https://developer.spotify.com/documentation/web-api/reference/#object-audiofeaturesobject

In [68]:
# Fetch audio features for every matched track, 80 IDs per API request.
audio_features_list = []
search_ids = spotify_ids_df.api_id.dropna()
for songs in tqdm(list(chunked(search_ids, 80))):
    audio_features = sp.audio_features(songs)
    audio_features_list.extend(audio_features)

# Entries may be None; keep only real feature records. Identity comparison
# (`is not None`) is the idiomatic None test.
audio_features_df = pd.DataFrame([feat for feat in audio_features_list if feat is not None])
audio_features_df.to_csv(SONG_AUDIO_FEATURES_FILEPATH)

100%|██████████| 51/51 [00:37<00:00,  1.36it/s]


In [None]:
## Check the missing values

## Merge Everything

In [69]:
# Reload all three persisted tables from disk, using the first CSV column as index.
billboard_table_df, spotify_ids_df, audio_features_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        BILLBOARD_TABLE_FILEPATH,
        SONG_IDS_FILEPATH,
        SONG_AUDIO_FEATURES_FILEPATH,
    )
)

In [74]:
# Preview the audio features with exact duplicate rows removed. The result is
# only displayed, not assigned, so `audio_features_df` itself is unchanged.
audio_features_df.drop_duplicates()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.241,0.2370,1,-12.201,1,0.0371,0.95200,0.000000,0.1130,0.303,174.578,audio_features,5dae01pKNjRQtgOeAkFzPY,spotify:track:5dae01pKNjRQtgOeAkFzPY,https://api.spotify.com/v1/tracks/5dae01pKNjRQ...,https://api.spotify.com/v1/audio-analysis/5dae...,194987,3
1,0.455,0.0792,8,-14.181,1,0.0413,0.92800,0.000000,0.1380,0.220,112.112,audio_features,1VBj7RIbwUOjNetvbECBTz,spotify:track:1VBj7RIbwUOjNetvbECBTz,https://api.spotify.com/v1/tracks/1VBj7RIbwUOj...,https://api.spotify.com/v1/audio-analysis/1VBj...,197973,1
2,0.373,0.1240,5,-12.950,1,0.0342,0.96900,0.000891,0.1230,0.351,86.857,audio_features,0BHroBUvBAp561BYqC9LRK,spotify:track:0BHroBUvBAp561BYqC9LRK,https://api.spotify.com/v1/tracks/0BHroBUvBAp5...,https://api.spotify.com/v1/audio-analysis/0BHr...,185000,3
3,0.697,0.1820,7,-12.586,1,0.0357,0.78000,0.000000,0.1260,0.474,82.184,audio_features,0KnD456yC5JuweN932Ems3,spotify:track:0KnD456yC5JuweN932Ems3,https://api.spotify.com/v1/tracks/0KnD456yC5Ju...,https://api.spotify.com/v1/audio-analysis/0KnD...,158000,3
4,0.627,0.1930,8,-14.935,1,0.0451,0.88800,0.000001,0.1190,0.596,123.231,audio_features,19vjeNqhgk0vhH5TuxDok9,spotify:track:19vjeNqhgk0vhH5TuxDok9,https://api.spotify.com/v1/tracks/19vjeNqhgk0v...,https://api.spotify.com/v1/audio-analysis/19vj...,194333,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4012,0.376,0.8210,0,-3.974,1,0.1040,0.07850,0.000000,0.0823,0.331,120.462,audio_features,1wYZZtamWTQAoj8B812uKQ,spotify:track:1wYZZtamWTQAoj8B812uKQ,https://api.spotify.com/v1/tracks/1wYZZtamWTQA...,https://api.spotify.com/v1/audio-analysis/1wYZ...,199453,4
4013,0.829,0.5100,5,-9.334,0,0.0369,0.00821,0.001430,0.0829,0.450,119.994,audio_features,1HFfMOxCAT4GAwaPfCdmUs,spotify:track:1HFfMOxCAT4GAwaPfCdmUs,https://api.spotify.com/v1/tracks/1HFfMOxCAT4G...,https://api.spotify.com/v1/audio-analysis/1HFf...,208133,4
4014,0.706,0.5720,8,-5.799,0,0.0326,0.02620,0.000000,0.5850,0.574,139.982,audio_features,7seTcUFOhn5caSDbiSfsp0,spotify:track:7seTcUFOhn5caSDbiSfsp0,https://api.spotify.com/v1/tracks/7seTcUFOhn5c...,https://api.spotify.com/v1/audio-analysis/7seT...,214726,4
4015,0.672,0.5200,8,-7.747,1,0.0353,0.85900,0.000000,0.1150,0.370,120.001,audio_features,5O2P9iiztwhomNh8xkR9lJ,spotify:track:5O2P9iiztwhomNh8xkR9lJ,https://api.spotify.com/v1/tracks/5O2P9iiztwho...,https://api.spotify.com/v1/audio-analysis/5O2P...,226600,4


In [55]:
# Drop rows containing any missing value and index both tables by the Spotify
# track ID ("api_id" / "id") so they can be joined on it.
# NOTE: both names are reassigned in place, and the ID columns become the
# index, so re-running this cell without reloading the CSVs first will fail.
spotify_ids_df = spotify_ids_df.dropna().set_index("api_id")
audio_features_df = audio_features_df.dropna().set_index("id")

In [63]:
# Display the ID lookup table for inspection.
spotify_ids_df

Unnamed: 0,name,artist,api_id,api_name,api_artists
0,Mona Lisa,Nat King Cole,5dae01pKNjRQtgOeAkFzPY,Mona Lisa,Nat King Cole
1,I Wanna Be Loved,Andrews Sisters,1VBj7RIbwUOjNetvbECBTz,I Wanna Be Loved,"The Andrews Sisters,Gordon Jenkins & His Orche..."
2,Tennessee Waltz,Patti Page,0BHroBUvBAp561BYqC9LRK,Tennessee Waltz,Patti Page
3,Ill Never Be Free,Tennessee Ernie Ford,0KnD456yC5JuweN932Ems3,I'll Never Be Free,"Tennessee Ernie Ford,Kay Starr"
4,All My Love,Patti Page,19vjeNqhgk0vhH5TuxDok9,All My Love,Patti Page
...,...,...,...,...,...
4023,Here,Alessia Cara,1wYZZtamWTQAoj8B812uKQ,Here,Alessia Cara
4024,Waves,Mr. Probz,1HFfMOxCAT4GAwaPfCdmUs,Waves - Robin Schulz Radio Edit,"Mr. Probz,Robin Schulz"
4025,She Knows,Ne-Yo,7seTcUFOhn5caSDbiSfsp0,She Knows,"Ne-Yo,Juicy J"
4026,Night Changes,One Direction,5O2P9iiztwhomNh8xkR9lJ,Night Changes,One Direction


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
2,0.373,0.124,5,-12.950,1,0.0342,0.969000,0.000891,0.1230,0.351,86.857,audio_features,0BHroBUvBAp561BYqC9LRK,spotify:track:0BHroBUvBAp561BYqC9LRK,https://api.spotify.com/v1/tracks/0BHroBUvBAp5...,https://api.spotify.com/v1/audio-analysis/0BHr...,185000,3
14,0.373,0.124,5,-12.950,1,0.0342,0.969000,0.000891,0.1230,0.351,86.857,audio_features,0BHroBUvBAp561BYqC9LRK,spotify:track:0BHroBUvBAp561BYqC9LRK,https://api.spotify.com/v1/tracks/0BHroBUvBAp5...,https://api.spotify.com/v1/audio-analysis/0BHr...,185000,3
31,0.409,0.435,2,-9.399,1,0.0453,0.907000,0.000007,0.0559,0.567,78.136,audio_features,4BH8EuPAxeFAh1rSWtnGdD,spotify:track:4BH8EuPAxeFAh1rSWtnGdD,https://api.spotify.com/v1/tracks/4BH8EuPAxeFA...,https://api.spotify.com/v1/audio-analysis/4BH8...,168480,4
48,0.409,0.435,2,-9.399,1,0.0453,0.907000,0.000007,0.0559,0.567,78.136,audio_features,4BH8EuPAxeFAh1rSWtnGdD,spotify:track:4BH8EuPAxeFAh1rSWtnGdD,https://api.spotify.com/v1/tracks/4BH8EuPAxeFA...,https://api.spotify.com/v1/audio-analysis/4BH8...,168480,4
51,0.667,0.250,11,-18.533,1,0.2170,0.757000,0.000000,0.2500,0.412,144.145,audio_features,4KM77RUl2IKdXGhtOBbKIS,spotify:track:4KM77RUl2IKdXGhtOBbKIS,https://api.spotify.com/v1/tracks/4KM77RUl2IKd...,https://api.spotify.com/v1/audio-analysis/4KM7...,155902,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3978,0.279,0.742,4,-6.460,0,0.0898,0.000185,0.000000,0.5930,0.328,189.868,audio_features,2bL2gyO6kBdLkNSkxXNh6x,spotify:track:2bL2gyO6kBdLkNSkxXNh6x,https://api.spotify.com/v1/tracks/2bL2gyO6kBdL...,https://api.spotify.com/v1/audio-analysis/2bL2...,231013,4
3987,0.418,0.420,0,-6.444,1,0.0414,0.588000,0.000064,0.1100,0.184,84.094,audio_features,5Nm9ERjJZ5oyfXZTECKmRt,spotify:track:5Nm9ERjJZ5oyfXZTECKmRt,https://api.spotify.com/v1/tracks/5Nm9ERjJZ5oy...,https://api.spotify.com/v1/audio-analysis/5Nm9...,172724,4
4002,0.729,0.650,5,-3.539,1,0.0313,0.070200,0.000067,0.0829,0.347,110.020,audio_features,14OxJlLdcHNpgsm4DRwDOB,spotify:track:14OxJlLdcHNpgsm4DRwDOB,https://api.spotify.com/v1/tracks/14OxJlLdcHNp...,https://api.spotify.com/v1/audio-analysis/14Ox...,209160,4
4008,0.399,0.787,1,-2.880,1,0.0499,0.019700,0.000061,0.0685,0.572,117.089,audio_features,2s1sdSqGcKxpPr5lCl7jAV,spotify:track:2s1sdSqGcKxpPr5lCl7jAV,https://api.spotify.com/v1/tracks/2s1sdSqGcKxp...,https://api.spotify.com/v1/audio-analysis/2s1s...,216120,5


In [58]:
# Attach the Billboard name/artist and Spotify name/artists to each track's
# audio features, joining on the shared track-ID index.
# The same track ID can appear several times in BOTH tables, so a plain join
# would emit every left/right combination and multiply identical rows. Keeping
# a single feature row per ID first yields exactly one output row per matched
# row of `spotify_ids_df`.
spotify_df = audio_features_df.loc[
    ~audio_features_df.index.duplicated(keep='first')
].join(spotify_ids_df)

In [59]:
# Display the joined audio-features / song-identity table.
spotify_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,type,uri,track_href,analysis_url,duration_ms,time_signature,name,artist,api_name,api_artists
005lwxGU1tms6HGELIcUv9,0.699,0.760,5,-3.173,1,0.0677,0.002230,0.000000,0.1320,0.696,...,audio_features,spotify:track:005lwxGU1tms6HGELIcUv9,https://api.spotify.com/v1/tracks/005lwxGU1tms...,https://api.spotify.com/v1/audio-analysis/005l...,179640,4,I Kissed A Girl,Katy Perry,I Kissed A Girl,Katy Perry
00FROhC5g4iJdax5US8jRr,0.764,0.594,6,-10.050,1,0.1850,0.591000,0.000000,0.1450,0.695,...,audio_features,spotify:track:00FROhC5g4iJdax5US8jRr,https://api.spotify.com/v1/tracks/00FROhC5g4iJ...,https://api.spotify.com/v1/audio-analysis/00FR...,286441,4,Satisfy You,Puff Daddy,Satisfy You,Diddy
00Mb3DuaIH1kjrwOku9CGU,0.487,0.900,0,-4.417,1,0.0482,0.000068,0.000000,0.3580,0.484,...,audio_features,spotify:track:00Mb3DuaIH1kjrwOku9CGU,https://api.spotify.com/v1/tracks/00Mb3DuaIH1k...,https://api.spotify.com/v1/audio-analysis/00Mb...,204000,4,Sk8er Boi,Avril Lavigne,Sk8er Boi,Avril Lavigne
00U0pedRUMEzREpyRqbVT6,0.691,0.601,7,-9.698,1,0.0428,0.628000,0.017000,0.0709,0.850,...,audio_features,spotify:track:00U0pedRUMEzREpyRqbVT6,https://api.spotify.com/v1/tracks/00U0pedRUMEz...,https://api.spotify.com/v1/audio-analysis/00U0...,263813,4,Ill Remember,Madonna,I'll Remember (Theme from the Motion Picture W...,Madonna
00U1MDChdOTxWwtKoOoBXE,0.612,0.591,1,-7.020,1,0.0510,0.462000,0.000000,0.1110,0.349,...,audio_features,spotify:track:00U1MDChdOTxWwtKoOoBXE,https://api.spotify.com/v1/tracks/00U1MDChdOTx...,https://api.spotify.com/v1/audio-analysis/00U1...,184333,4,Stay,Lisa Loeb,Stay (I Missed You),Lisa Loeb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7zMcNqs55Mxer82bvZFkpg,0.621,0.417,10,-6.941,1,0.0231,0.288000,0.000000,0.1190,0.289,...,audio_features,spotify:track:7zMcNqs55Mxer82bvZFkpg,https://api.spotify.com/v1/tracks/7zMcNqs55Mxe...,https://api.spotify.com/v1/audio-analysis/7zMc...,203040,4,Teardrops On My Guitar,Taylor Swift,Teardrops On My Guitar - Radio Single Remix,Taylor Swift
7zMcNqs55Mxer82bvZFkpg,0.621,0.417,10,-6.941,1,0.0231,0.288000,0.000000,0.1190,0.289,...,audio_features,spotify:track:7zMcNqs55Mxer82bvZFkpg,https://api.spotify.com/v1/tracks/7zMcNqs55Mxe...,https://api.spotify.com/v1/audio-analysis/7zMc...,203040,4,Teardrops On My Guitar,Taylor Swift,Teardrops On My Guitar - Radio Single Remix,Taylor Swift
7zSDDsIlks515d0tZGM64x,0.656,0.347,9,-13.398,1,0.0280,0.488000,0.002210,0.0645,0.368,...,audio_features,spotify:track:7zSDDsIlks515d0tZGM64x,https://api.spotify.com/v1/tracks/7zSDDsIlks51...,https://api.spotify.com/v1/audio-analysis/7zSD...,225973,4,The Next Time I Fall,Peter Cetera,The Next Time I Fall (with Amy Grant),"Peter Cetera,Amy Grant"
7zd7QV2lCDGGmwOxMiULVY,0.392,0.552,11,-9.217,1,0.0335,0.062100,0.004670,0.0818,0.530,...,audio_features,spotify:track:7zd7QV2lCDGGmwOxMiULVY,https://api.spotify.com/v1/tracks/7zd7QV2lCDGG...,https://api.spotify.com/v1/audio-analysis/7zd7...,147827,4,Woman,Peter and Gordon,Woman,Peter And Gordon
