In [1]:
import os
import re
import json
from pathlib import Path

import requests
import spotipy
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from more_itertools import chunked
from requests.exceptions import ReadTimeout
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth

load_dotenv()

True

# Data Acquistion

This notebook aims to download all the necessary data used for the project. This data acquisition is done in 2 steps: (i) Downloads the billboard data provided on github [kevinschaich/billboard](https://github.com/kevinschaich/billboard); and (ii) for each song on the billboard dataset we extract informations via Spotify API.

In [3]:
RAW_DATA_PATH = Path('data/raw')
INTERIM_DATA_PATH = Path('data/interim')
EXTERNAL_DATA_PATH = Path('data/external')

RAW_DATA_PATH_BILLBOARD = RAW_DATA_PATH / 'billboard'
RAW_DATA_PATH_SPOTIFY = RAW_DATA_PATH / 'spotify'
INTERIM_DATA_BILLBOARD = INTERIM_DATA_PATH / 'billboard'
EXTERNAL_DATA_PATH_SPOTIFY = EXTERNAL_DATA_PATH / 'spotify'
BILLBOARD_TABLE_FILEPATH = INTERIM_DATA_BILLBOARD / "billboard_data.csv"
SONG_IDS_FILEPATH = EXTERNAL_DATA_PATH_SPOTIFY / "song_ids.csv"
SONG_AUDIO_FEATURES_FILEPATH = EXTERNAL_DATA_PATH_SPOTIFY / "song_audiofeatures.csv"

RAW_DATA_PATH_BILLBOARD.mkdir(exist_ok=True, parents=True)
INTERIM_DATA_BILLBOARD.mkdir(exist_ok=True, parents=True)
RAW_DATA_PATH_SPOTIFY.mkdir(exist_ok=True, parents=True)
EXTERNAL_DATA_PATH_SPOTIFY.mkdir(exist_ok=True, parents=True)

## Billboard Data

This step can be used to download the data provided on [kevinschaich/billboard](https://github.com/kevinschaich/billboard). The data is also tabulated for a better exploration using Tableau.

### Download

In [6]:
BILLBOARD_BASE_DOWNLOAD_URL = "https://raw.githubusercontent.com/kevinschaich/billboard/master/data/years/{}.json"
BILLBOARD_DOWNLOAD_YEARS = list(range(1950, 2015 + 1))

for year in tqdm(BILLBOARD_DOWNLOAD_YEARS):
    r = requests.get(BILLBOARD_BASE_DOWNLOAD_URL.format(year))
    if r.status_code != 200: print(f"Error: {year}")
    with open(RAW_DATA_PATH_BILLBOARD/"{}.json".format(year), 'wb') as f:
        f.write(r.content)

### Tabulating Data

In [8]:
def flatten_dict(dd, separator ='_', prefix =''): 
    return { prefix + separator + k if prefix else k : v 
             for kk, vv in dd.items() 
             for k, v in flatten_dict(vv, separator, kk).items() 
             } if isinstance(dd, dict) else { prefix : dd } 

years_dfs = []
for year in tqdm(BILLBOARD_DOWNLOAD_YEARS):
    with open(RAW_DATA_PATH_BILLBOARD/"{}.json".format(year), 'r') as f:
        flatten_data = [flatten_dict(x) for x in json.load(f)]
        years_dfs.append(pd.DataFrame(flatten_data))
                         
billboard_table_df = pd.concat(years_dfs, keys=BILLBOARD_DOWNLOAD_YEARS)
billboard_table_df = billboard_table_df.reset_index(level=0).rename(columns={'level_0': 'year'})
billboard_table_df = billboard_table_df.reset_index(drop=True)

print(len(billboard_table_df))
                         
billboard_table_df.to_csv(BILLBOARD_TABLE_FILEPATH) 

## Spotify Data

Here we are going to download all the audio features info presented in the billboard dataset. This needs to be done in 2 steps: (i) We need to get the IDs for each song by using the search API; these IDs will be used to get the (ii) Audio features by using the Audio Features API. 

**NOTE:** For this to work it is necessary to setup the Spotipy API. Go to README file for more information on how to setup the needed keys

In [None]:
auth_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(auth_manager=auth_manager, requests_timeout=15)

### Get song names and artists from billboard data

First, we need to get all the songs and artists names to use in the API.

In [11]:
songs_artists = []

for year in tqdm(BILLBOARD_DOWNLOAD_YEARS):
    with open(RAW_DATA_PATH_BILLBOARD/"{}.json".format(year), 'r') as f:
        data = json.load(f)
        year_songs_artists = [(d['title'], d['artist']) for d in data]
        songs_artists.extend(year_songs_artists)
        
len(songs_artists)

100%|██████████| 66/66 [00:00<00:00, 234.30it/s]


4028

### Getiing song IDs 

#### Spotipy API

On the code below, the IDs are extracted via Spotipy API ([Search API](https://developer.spotify.com/documentation/web-api/reference/#category-search)), to maximize the results we searched through 4 scenarios (each scenario search for the missing songs on the previous one): (i) Names and artists are searched as is on the API; (ii) Song names without apostrophes ;  (iii) Song names without symbols and (iv) a wide search using the name and the song as a search query. 

In [None]:
def spotify_api_get_song_id(track=None, artist=None, override_query=None):
    query = f'artist:{artist} track:{track}' if override_query is None else override_query
    
    try:
        track_id = sp.search(q=query, type='track', limit=1)
        song_info = track_id['tracks']['items'][0]
    except (KeyError, ReadTimeout, IndexError):
        song_info = {}
        
    api_song_id = song_info.get('id')
    api_song_name = song_info.get('name')
    api_song_artists = song_info.get('artists')
    
    if api_song_artists is not None: 
        api_song_artists = [artist.get('name') for artist in api_song_artists]

    return api_song_id, api_song_name, api_song_artists

In [12]:
billboard_table_df = pd.read_csv(BILLBOARD_TABLE_FILEPATH, index_col=0)
song_artists = [(value['title'], value['artist']) for index, value in billboard_table_df[['title', 'artist']].iterrows()]

spotify_ids_dict = {'name': [], 'artist': [], 'api_id': [], 'api_name': [], 'api_artists': []}
for song_name, song_artist in tqdm(songs_artists):
    
    # Base Search Songs
    base_api_music = spotify_api_get_song_id(song_name, song_artist)
    
    # Searching Without Apostrophes
    if any(x is None for x in base_api_music): 
        song_name_cleaned = song_name.replace("'", '')
        song_artist_cleaned = song_artist.replace("'", '')
        base_api_music = spotify_api_get_song_id(song_name_cleaned, song_artist_cleaned)
    
    # Searching Without Symbols
    if any(x is None for x in base_api_music):
        song_name_cleaned = re.sub(r'[-\.\+,\(\)]', "", song_name_cleaned)
        song_artist_cleaned = re.sub(r'[-\.\+,\(\)]', "", song_artist_cleaned)
        base_api_music = spotify_api_get_song_id(song_name_cleaned, song_artist_cleaned)
        
    # Wide search
    if any(x is None for x in base_api_music):
        base_api_music = spotify_api_get_song_id(override_query=" ".join([song_name_cleaned, song_artist_cleaned]))
        
    # Adding data to dataframe
    api_song_id = base_api_music[0]
    api_song_name = base_api_music[1]
    api_song_artists = ",".join(base_api_music[2]) if base_api_music[2] is not None else None
    spotify_ids_dict["name"].append(song_name)
    spotify_ids_dict["artist"].append(song_artist)
    spotify_ids_dict["api_id"].append(api_song_id)
    spotify_ids_dict["api_name"].append(api_song_name)
    spotify_ids_dict["api_artists"].append(api_song_artists)
    
spotify_ids_df = pd.DataFrame(spotify_ids_dict)
spotify_ids_df = spotify_ids_df.drop_duplicates()
spotify_ids_df.to_csv(SONG_IDS_FILEPATH)

100%|██████████| 4028/4028 [17:09<00:00,  3.91it/s]  


Now, let's look if there is still songs missing

In [13]:
spotify_ids_df = pd.read_csv(SONG_IDS_FILEPATH, index_col=0)
missing_ids = spotify_ids_df[spotify_ids_df.api_id.isnull()]

print(len(missing_ids))
missing_ids

7


Unnamed: 0,name,artist,api_id,api_name,api_artists
91,True Love,Jane Powell,,,
180,Billy,Kathy Linden,,,
183,The Story of My Life,Marty Robins,,,
1048,"Mighty Love, Pt. 1",Spinners,,,
1520,It's Now Or Never,John Schneider,,,
1747,Breakdance,Irene Cara,,,
2242,Here We Go,C+C Music Factory,,,


#### Spotify API via requests

For the missing cases using Spotipy API, by the time this project was done, some of the musics searched through Spotify app are present in the service but are not found in the library. My approach here was to use the API directly using HTTP requests.

In [None]:
def spotify_api_by_requests(query):
    
    payload = {'q': query, 'type': 'track', 'limit': 1}
    headers = {"Authorization": f"Bearer {os.getenv('SPOTIFY_OAUTH_TOKEN')}"}
    r = requests.get('https://api.spotify.com/v1/search', headers=headers, params=payload)
    try:
        song_info = r.json()['tracks']['items'][0]
    except (KeyError, IndexError):
        song_info = {}
        
    api_song_id = song_info.get('id')
    api_song_name = song_info.get('name')
    api_song_artists = song_info.get('artists')
    
    if api_song_artists is not None: 
        api_song_artists = [artist.get('name') for artist in api_song_artists]

    return api_song_id, api_song_name, api_song_artists

In [14]:
for song in tqdm(list(missing_ids.itertuples())):
    
    base_api_music = spotify_api_by_requests(" ".join([song.name, song.artist]))
    
    api_song_id = base_api_music[0]
    api_song_name = base_api_music[1]
    api_song_artists = ",".join(base_api_music[2]) if base_api_music[2] is not None else None
    
    spotify_ids_df.at[song.Index, 'api_id'] = api_song_id
    spotify_ids_df.at[song.Index, 'api_name'] = api_song_name
    spotify_ids_df.at[song.Index, 'api_artists'] = api_song_artists

100%|██████████| 7/7 [00:02<00:00,  2.48it/s]


By looking at the missing values, now we only have 2 cases (instead of 7). For these songs we really don't have any information, so, our work is done here

In [15]:
missing_ids = spotify_ids_df[spotify_ids_df.api_id.isnull()]
display(missing_ids)

print(f"# Missing IDs: {len(missing_ids)}")
print(f"# of Songs: {len(spotify_ids_df)}")
spotify_ids_df.to_csv(SONG_IDS_FILEPATH)

2


Unnamed: 0,name,artist,api_id,api_name,api_artists
1048,"Mighty Love, Pt. 1",Spinners,,,
1520,It's Now Or Never,John Schneider,,,


### Get Audio Features from IDs

Now that we have the IDs, we can get the audio features using the [Audio Features API](https://developer.spotify.com/documentation/web-api/reference/#endpoint-get-several-audio-features). Details about the meaning of each variable can be found on [Audio Feature Object Reference](https://developer.spotify.com/documentation/web-api/reference/#object-audiofeaturesobject).

In [24]:
spotify_ids_df = pd.read_csv(SONG_IDS_FILEPATH, index_col=0)

audio_features_list = []
search_ids = spotify_ids_df.api_id.dropna()

# To improve the performance and do less requisition we are getting information
# from 80 songs in the same request
for songs in tqdm(list(chunked(search_ids, 80))):
    audio_features = sp.audio_features(songs)
    audio_features_list.extend(audio_features)
    
audio_features_df = pd.DataFrame([feat for feat in audio_features_list if feat != None])
audio_features_df.to_csv(SONG_AUDIO_FEATURES_FILEPATH)

100%|██████████| 49/49 [00:15<00:00,  3.24it/s]


By checking the missing values, I found only one id that is really missing on the API.

In [25]:
# Checking if there are missing values
print(len(audio_features_df))

if set(search_ids) != set(audio_features_df.id.unique()):
    print(set(search_ids) - set(audio_features_df.id.unique()))

3882
{'6MFQeWtk7kxWGydnJB2y36'}


## Merge and Exporting Data

In [29]:
FINAL_DATA_PATH = INTERIM_DATA_PATH / 'spotify_billboard_data.csv'

spotify_ids_df = pd.read_csv(SONG_IDS_FILEPATH, index_col=0)
audio_features_df = pd.read_csv(SONG_AUDIO_FEATURES_FILEPATH, index_col=0)
billboard_table_df = pd.read_csv(BILLBOARD_TABLE_FILEPATH, index_col=0)

### Merging Spotify Data

In [30]:
spotify_ids_df = spotify_ids_df.set_index("api_id")
audio_features_df = audio_features_df.set_index("id")

spotify_df = spotify_ids_df.join(audio_features_df, how='outer')
print(len(spotify_df))

3917


### Merging Spotify and Billboard Data

In [31]:
final_data_df = pd.merge(
    left=spotify_df.reset_index().dropna().drop_duplicates(), 
    right=billboard_table_df, 
    how='right', 
    left_on=['name','artist'], 
    right_on=['title','artist'],
    suffixes=('_left', '_right')
)

len(final_data_df)

4028

### Exporting Data

In [32]:
final_data_df.to_csv(FINAL_DATA_PATH)