In [3]:
import json
from pathlib import Path

import requests
import spotipy
import pandas as pd
from tqdm import tqdm
from requests.exceptions import ReadTimeout
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv

In [4]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the Spotify API credentials come from — confirm.
load_dotenv()

True

# Data Acquisition

In [5]:
# Locations of the downloaded Billboard files and of the Spotify artefacts.
RAW_DATA_PATH_BILLBOARD = Path('data/raw/billboard')
RAW_DATA_PATH_SPOTIFY = Path('data/raw/spotify')
EXTERNAL_DATA_PATH_SPOTIFY = Path('data/external/spotify')

# Create every directory (and any missing parent) up front; existing ones are left alone.
for data_dir in (RAW_DATA_PATH_BILLBOARD, RAW_DATA_PATH_SPOTIFY, EXTERNAL_DATA_PATH_SPOTIFY):
    data_dir.mkdir(parents=True, exist_ok=True)

## Billboard Data

In [6]:
# URL template for one year's chart file; `{}` is filled with the year.
BILLBOARD_BASE_DOWNLOAD_URL = (
    "https://raw.githubusercontent.com/kevinschaich/billboard/"
    "master/data/years/{}.json"
)
# Every year from 1950 through 2015, inclusive.
BILLBOARD_DOWNLOAD_YEARS = [*range(1950, 2015 + 1)]

### Download

In [7]:
# Download one JSON file per year into RAW_DATA_PATH_BILLBOARD.
for year in tqdm(BILLBOARD_DOWNLOAD_YEARS):
    try:
        # A timeout keeps a stalled connection from hanging the whole loop.
        r = requests.get(BILLBOARD_BASE_DOWNLOAD_URL.format(year), timeout=15)
    except requests.RequestException as exc:
        print(f"Error: {year} ({exc})")
        continue

    if r.status_code != 200:
        # Do not persist the error body: it is not the chart JSON and would
        # break (or silently pollute) the later json.load of this file.
        print(f"Error: {year}")
        continue

    with open(RAW_DATA_PATH_BILLBOARD / "{}.json".format(year), 'wb') as f:
        f.write(r.content)

 14%|█▎        | 9/66 [00:12<01:21,  1.42s/it]


KeyboardInterrupt: 

## Spotify Data

In [13]:
# CSV files produced by the Spotify steps of this notebook.
SONG_IDS_FILEPATH = EXTERNAL_DATA_PATH_SPOTIFY.joinpath("song_ids.csv")
SONG_AUDIO_FEATURES_FILEPATH = EXTERNAL_DATA_PATH_SPOTIFY.joinpath("song_audiofeatures.csv")

# Spotify client using the client-credentials flow. No credentials are passed
# explicitly here — presumably they are read from the environment; confirm.
auth_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(
    auth_manager=auth_manager,
    requests_timeout=15,
)

In [10]:
# TODO: Move this into a script
def spotify_api_get_song_id(track=None, artist=None, override_query=None, limit=1):
    """Search Spotify for a track and return details of the first result.

    Args:
        track: Track title used in the default `artist:... track:...` query.
        artist: Artist name used in the default query.
        override_query: If not None, sent as the search query verbatim and
            `track`/`artist` are ignored.
        limit: Forwarded to `sp.search`; only the first returned item is used.

    Returns:
        A tuple `(id, name, artists)` taken from the first item of the search
        result. All three are None when the request times out (`ReadTimeout`)
        or the result contains no items.
    """
    if override_query is None:
        query = f'artist:{artist} track:{track}'
    else:
        query = override_query

    try:
        search_result = sp.search(q=query, type='track', limit=limit)
        first_hit = search_result['tracks']['items'][0]
    except (ReadTimeout, IndexError):
        # A timeout or an empty result list is treated as "not found".
        first_hit = {}

    return first_hit.get('id'), first_hit.get('name'), first_hit.get('artists')

### Get song names and artists

In [9]:
# Collect a (title, artist) pair for every entry of every yearly Billboard file.
songs_artists = []

for year in tqdm(BILLBOARD_DOWNLOAD_YEARS):
    # JSON text is UTF-8; be explicit instead of relying on the platform's
    # default encoding, which can mis-decode non-ASCII titles/artists.
    with open(RAW_DATA_PATH_BILLBOARD/"{}.json".format(year), 'r', encoding='utf-8') as f:
        data = json.load(f)

    # The file is fully parsed at this point, so it no longer needs to stay open.
    year_songs_artists = [(d['title'], d['artist']) for d in data]
    songs_artists.extend(year_songs_artists)

len(songs_artists)

100%|██████████| 66/66 [00:00<00:00, 94.17it/s] 


4028

### Get song IDs from Spotify 

In [1]:
# Column-wise accumulator; converted to a DataFrame once the loop has finished.
spotify_ids_dict = {
    column: [] for column in ('name', 'artist', 'api_id', 'api_name', 'api_artists')
}

for title, performer in tqdm(songs_artists):

    # Apostrophes are removed before the values are used in the search query.
    song_name = title.replace("'", '')
    song_artist = performer.replace("'", '')

    # Look the song up on Spotify; every field is None when nothing was found.
    api_song_id, api_song_name, api_song_artists = spotify_api_get_song_id(song_name, song_artist)

    # Collapse the artist entries of the match into one comma-separated string.
    if api_song_artists is None:
        joined_artists = None
    else:
        joined_artists = ",".join([entry.get('name') for entry in api_song_artists])

    # One value per column for this song.
    spotify_ids_dict["name"].append(song_name)
    spotify_ids_dict["artist"].append(song_artist)
    spotify_ids_dict["api_id"].append(api_song_id)
    spotify_ids_dict["api_name"].append(api_song_name)
    spotify_ids_dict["api_artists"].append(joined_artists)

spotify_ids_df = pd.DataFrame(spotify_ids_dict)
spotify_ids_df.to_csv(SONG_IDS_FILEPATH)

#### Checking missing values

In [14]:
# Reload the saved song IDs. The file is written by `to_csv` with the DataFrame
# index as its first column, so restore that column as the index instead of
# letting it show up as a spurious "Unnamed: 0" data column.
spotify_ids_df = pd.read_csv(SONG_IDS_FILEPATH, index_col=0)

In [None]:
# Rows whose `api_id` is missing, i.e. songs for which no Spotify ID was stored.
missing_id_mask = spotify_ids_df['api_id'].isna()
teste = spotify_ids_df.loc[missing_id_mask]
teste