In [1]:
import os
import re
import json
from pathlib import Path

import requests
import spotipy
import pandas as pd
from tqdm import tqdm
from requests.exceptions import ReadTimeout
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth
from dotenv import load_dotenv
from more_itertools import chunked

In [2]:
load_dotenv()

True

# Data Acquistion

The objective of this notebook is to download all the data necessary for the project. First it downloads the billboard data provided on github [kevinschaich/billboard](https://github.com/kevinschaich/billboard). After that, for each song in the billboard dataset we get information using the Spotify API.

In [3]:
RAW_DATA_PATH = Path('data/raw')
INTERIM_DATA_PATH = Path('data/interim')
EXTERNAL_DATA_PATH = Path('data/external')

RAW_DATA_PATH_BILLBOARD = RAW_DATA_PATH / 'billboard'
RAW_DATA_PATH_SPOTIFY = RAW_DATA_PATH / 'spotify'
INTERIM_DATA_BILLBOARD = INTERIM_DATA_PATH / 'billboard'
EXTERNAL_DATA_PATH_SPOTIFY = EXTERNAL_DATA_PATH / 'spotify'

RAW_DATA_PATH_BILLBOARD.mkdir(exist_ok=True, parents=True)
INTERIM_DATA_BILLBOARD.mkdir(exist_ok=True, parents=True)
RAW_DATA_PATH_SPOTIFY.mkdir(exist_ok=True, parents=True)
EXTERNAL_DATA_PATH_SPOTIFY.mkdir(exist_ok=True, parents=True)

In [4]:
BILLBOARD_TABLE_FILEPATH = INTERIM_DATA_BILLBOARD / "billboard_data.csv"
SONG_IDS_FILEPATH = EXTERNAL_DATA_PATH_SPOTIFY / "song_ids.csv"
SONG_AUDIO_FEATURES_FILEPATH = EXTERNAL_DATA_PATH_SPOTIFY / "song_audiofeatures.csv"

## Billboard Data

This section downloads and tabulates all the json data provided in the billboard data repository from 1950 to 2015.

### Download

In [5]:
BILLBOARD_BASE_DOWNLOAD_URL = "https://raw.githubusercontent.com/kevinschaich/billboard/master/data/years/{}.json"
BILLBOARD_DOWNLOAD_YEARS = list(range(1950, 2015 + 1))

In [6]:
for year in tqdm(BILLBOARD_DOWNLOAD_YEARS):
    r = requests.get(BILLBOARD_BASE_DOWNLOAD_URL.format(year))
    if r.status_code != 200: print(f"Error: {year}")
    with open(RAW_DATA_PATH_BILLBOARD/"{}.json".format(year), 'wb') as f:
        f.write(r.content)

100%|██████████| 66/66 [00:31<00:00,  2.07it/s]


### Tabulating Data

In [7]:
def flatten_dict(dd, separator ='_', prefix =''): 
    return { prefix + separator + k if prefix else k : v 
             for kk, vv in dd.items() 
             for k, v in flatten_dict(vv, separator, kk).items() 
             } if isinstance(dd, dict) else { prefix : dd } 

In [8]:
years_dfs = []
for year in tqdm(BILLBOARD_DOWNLOAD_YEARS):
    with open(RAW_DATA_PATH_BILLBOARD/"{}.json".format(year), 'r') as f:
        flatten_data = [flatten_dict(x) for x in json.load(f)]
        years_dfs.append(pd.DataFrame(flatten_data))
                         
billboard_table_df = pd.concat(years_dfs, keys=BILLBOARD_DOWNLOAD_YEARS)
billboard_table_df = billboard_table_df.reset_index(level=0).rename(columns={'level_0': 'year'})
billboard_table_df = billboard_table_df.reset_index(drop=True)

print(len(billboard_table_df))
                         
billboard_table_df.to_csv(BILLBOARD_TABLE_FILEPATH) 

100%|██████████| 66/66 [00:00<00:00, 117.91it/s]


4028


## Spotify Data

This section retrieve, the id and the audio features of each song presented in the billboard dataset. 

For this to work it is necessary to create a .env file in the root folder of the project containing the following information: (1) SPOTIPY_CLIENT_ID, (2) SPOTIPY_CLIENT_SECRET, (3) SPOTIFY_OAUTH_TOKEN. For (1) and (2) the credentials can be obtained by creating an application on spotify, this can be done by following ["Web API Tutorial"](https://developer.spotify.com/documentation/web-api/quick-start/). For (3) you can use the [Developer's console](https://developer.spotify.com/console/get-search-item/), click on the button "Get Token", Login into your account and copy the value generated in the field. 

**OBS:** The SPOTIFY_OAUTH_TOKEN is temporary

In [9]:
auth_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(auth_manager=auth_manager, requests_timeout=15)

In [10]:
def spotify_api_get_song_id(track=None, artist=None, override_query=None):
    query = f'artist:{artist} track:{track}' if override_query is None else override_query
    
    try:
        track_id = sp.search(q=query, type='track', limit=1)
        song_info = track_id['tracks']['items'][0]
    except (KeyError, ReadTimeout, IndexError):
        song_info = {}
        
    api_song_id = song_info.get('id')
    api_song_name = song_info.get('name')
    api_song_artists = song_info.get('artists')
    
    if api_song_artists is not None: 
        api_song_artists = [artist.get('name') for artist in api_song_artists]

    return api_song_id, api_song_name, api_song_artists

def spotify_api_by_requests(query):
    
    payload = {'q': query, 'type': 'track', 'limit': 1}
    headers = {"Authorization": f"Bearer {os.getenv('SPOTIFY_OAUTH_TOKEN')}"}
    r = requests.get('https://api.spotify.com/v1/search', headers=headers, params=payload)
    try:
        song_info = r.json()['tracks']['items'][0]
    except (KeyError, IndexError):
        song_info = {}
        
    api_song_id = song_info.get('id')
    api_song_name = song_info.get('name')
    api_song_artists = song_info.get('artists')
    
    if api_song_artists is not None: 
        api_song_artists = [artist.get('name') for artist in api_song_artists]

    return api_song_id, api_song_name, api_song_artists

### Get song names and artists from billboard data

In [11]:
songs_artists = []

for year in tqdm(BILLBOARD_DOWNLOAD_YEARS):
    with open(RAW_DATA_PATH_BILLBOARD/"{}.json".format(year), 'r') as f:
        data = json.load(f)
        year_songs_artists = [(d['title'], d['artist']) for d in data]
        songs_artists.extend(year_songs_artists)
        
len(songs_artists)

100%|██████████| 66/66 [00:00<00:00, 234.30it/s]


4028

### Get song IDs from Spotify 

To get the audio features we first need to get the Ids for each music by using the [Search API](https://developer.spotify.com/documentation/web-api/reference/#category-search). 

In [12]:
billboard_table_df = pd.read_csv(BILLBOARD_TABLE_FILEPATH, index_col=0)
song_artists = [(value['title'], value['artist']) for index, value in billboard_table_df[['title', 'artist']].iterrows()]

spotify_ids_dict = {'name': [], 'artist': [], 'api_id': [], 'api_name': [], 'api_artists': []}
for song_name, song_artist in tqdm(songs_artists):
    
    # Base Search Songs
    base_api_music = spotify_api_get_song_id(song_name, song_artist)
    
    # Searching Without Apostrophes
    if any(x is None for x in base_api_music): 
        song_name_cleaned = song_name.replace("'", '')
        song_artist_cleaned = song_artist.replace("'", '')
        base_api_music = spotify_api_get_song_id(song_name_cleaned, song_artist_cleaned)
    
    # Searching Without Symbols
    if any(x is None for x in base_api_music):
        song_name_cleaned = re.sub(r'[-\.\+,\(\)]', "", song_name_cleaned)
        song_artist_cleaned = re.sub(r'[-\.\+,\(\)]', "", song_artist_cleaned)
        base_api_music = spotify_api_get_song_id(song_name_cleaned, song_artist_cleaned)
        
    # Wide search
    if any(x is None for x in base_api_music):
        base_api_music = spotify_api_get_song_id(override_query=" ".join([song_name_cleaned, song_artist_cleaned]))
        
    # Adding data to dataframe
    api_song_id = base_api_music[0]
    api_song_name = base_api_music[1]
    api_song_artists = ",".join(base_api_music[2]) if base_api_music[2] is not None else None

    spotify_ids_dict["name"].append(song_name)
    spotify_ids_dict["artist"].append(song_artist)
    spotify_ids_dict["api_id"].append(api_song_id)
    spotify_ids_dict["api_name"].append(api_song_name)
    spotify_ids_dict["api_artists"].append(api_song_artists)
    
spotify_ids_df = pd.DataFrame(spotify_ids_dict)
spotify_ids_df = spotify_ids_df.drop_duplicates()
spotify_ids_df.to_csv(SONG_IDS_FILEPATH)

100%|██████████| 4028/4028 [17:09<00:00,  3.91it/s]  


#### Missing Values Treatment

In [13]:
spotify_ids_df = pd.read_csv(SONG_IDS_FILEPATH, index_col=0)
missing_ids = spotify_ids_df[spotify_ids_df.api_id.isnull()]

print(len(missing_ids))
missing_ids

7


Unnamed: 0,name,artist,api_id,api_name,api_artists
91,True Love,Jane Powell,,,
180,Billy,Kathy Linden,,,
183,The Story of My Life,Marty Robins,,,
1048,"Mighty Love, Pt. 1",Spinners,,,
1520,It's Now Or Never,John Schneider,,,
1747,Breakdance,Irene Cara,,,
2242,Here We Go,C+C Music Factory,,,


For the cases above, by looking directly from spotify app, I found that some values are present in the service but somehow the library didn't get. To correct that, we are going to search directly using the API with the requests library.

In [14]:
for song in tqdm(list(missing_ids.itertuples())):
    
    base_api_music = spotify_api_by_requests(" ".join([song.name, song.artist]))
    
    api_song_id = base_api_music[0]
    api_song_name = base_api_music[1]
    api_song_artists = ",".join(base_api_music[2]) if base_api_music[2] is not None else None
    
    spotify_ids_df.at[song.Index, 'api_id'] = api_song_id
    spotify_ids_df.at[song.Index, 'api_name'] = api_song_name
    spotify_ids_df.at[song.Index, 'api_artists'] = api_song_artists

100%|██████████| 7/7 [00:02<00:00,  2.48it/s]


Now we just have 2 missing values, they are:

In [15]:
missing_ids = spotify_ids_df[spotify_ids_df.api_id.isnull()]

print(len(missing_ids))
missing_ids

2


Unnamed: 0,name,artist,api_id,api_name,api_artists
1048,"Mighty Love, Pt. 1",Spinners,,,
1520,It's Now Or Never,John Schneider,,,


I didn't found the correct values on spotify API, so they are really missing. Let's save the dataframe

In [16]:
print(len(spotify_ids_df))
spotify_ids_df.to_csv(SONG_IDS_FILEPATH)

3885


### Get Audio Features from IDs

With the IDs we can now get the audio features information using the [Audio Features API](https://developer.spotify.com/documentation/web-api/reference/#endpoint-get-several-audio-features). To understand the meaning of each variable you can read the [Audio Feature Object](https://developer.spotify.com/documentation/web-api/reference/#object-audiofeaturesobject).

In [24]:
spotify_ids_df = pd.read_csv(SONG_IDS_FILEPATH, index_col=0)

audio_features_list = []
search_ids = spotify_ids_df.api_id.dropna()

# To improve the performance and do less requisition we are getting information
# from 80 songs in the same request
for songs in tqdm(list(chunked(search_ids, 80))):
    audio_features = sp.audio_features(songs)
    audio_features_list.extend(audio_features)
    
audio_features_df = pd.DataFrame([feat for feat in audio_features_list if feat != None])
audio_features_df.to_csv(SONG_AUDIO_FEATURES_FILEPATH)

100%|██████████| 49/49 [00:15<00:00,  3.24it/s]


In [25]:
# Checking if there are missing values
print(len(audio_features_df))

if set(search_ids) != set(audio_features_df.id.unique()):
    print(set(search_ids) - set(audio_features_df.id.unique()))

3882
{'6MFQeWtk7kxWGydnJB2y36'}


I checked the value above and didn't find any song information, so the data is really missing. 

## Merge and Exporting Data

In [29]:
FINAL_DATA_PATH = INTERIM_DATA_PATH / 'spotify_billboard_data.csv'

spotify_ids_df = pd.read_csv(SONG_IDS_FILEPATH, index_col=0)
audio_features_df = pd.read_csv(SONG_AUDIO_FEATURES_FILEPATH, index_col=0)
billboard_table_df = pd.read_csv(BILLBOARD_TABLE_FILEPATH, index_col=0)

### Merging Spotify Data

In [30]:
spotify_ids_df = spotify_ids_df.set_index("api_id")
audio_features_df = audio_features_df.set_index("id")

spotify_df = spotify_ids_df.join(audio_features_df, how='outer')
print(len(spotify_df))

3917


### Merging Spotify and Billboard Data

In [31]:
final_data_df = pd.merge(
    left=spotify_df.reset_index().dropna().drop_duplicates(), 
    right=billboard_table_df, 
    how='right', 
    left_on=['name','artist'], 
    right_on=['title','artist'],
    suffixes=('_left', '_right')
)

len(final_data_df)

4028

### Exporting Data

In [32]:
final_data_df.to_csv(FINAL_DATA_PATH)