## Spotify Song-Album Popularity

This CS109A project builds a model that predicts whether a song will be the most popular track on its album. This notebook collects the data for that model from the Spotify API.

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import json
import pandas as pd
import time
from tqdm import trange, tqdm_notebook

In [3]:
# Read the Spotify API client id and secret from a local JSON file so they are
# not hard-coded in the notebook. The context manager closes the file handle
# as soon as the JSON has been parsed instead of leaving it open.
with open('credentials.json') as credentials_file:
    credentials = json.load(credentials_file)
client_id = credentials['client_id']
client_secret = credentials['client_secret']

In [4]:
# Build the credentials manager from the client id/secret loaded above, then
# hand it to the Spotify client object used for every API call below.
client_credentials_manager = SpotifyClientCredentials(client_id=client_id,
                                                      client_secret=client_secret)
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
)

### Gather The Data From The API

In [5]:
# Column names of the output CSVs, in the order the values are collected for
# each track in the loop below.
columns = [
    # identifiers and position of the track on its album
    "track_id", "album_id", "track_number", "track_count",
    # track metadata
    "duration", "explicit",
    # popularity values (track, album, and track relative to its album)
    "track_pop", "album_pop", "comparative_pop",
    # audio features
    "danceability", "energy", "loudness", "speechiness", "acousticness",
    "instrumentalness", "liveness", "valence", "tempo",
]

In [6]:
# Controls how often the collection loop flushes its buffered rows to a CSV.
track_csv_limit = 10000

# Progress counters: output files produced, tracks processed, albums fetched.
num_output_csvs, num_tracks, num_albums = 0, 0, 0

# Failure counters, one per kind of API request made by the collection loop.
num_failed_track_calls = num_failed_album_calls = num_failed_audio_features = 0

# Reported in the per-file statistics; the visible collection loop never
# increments these two.
num_invalid_track_ids = num_invalid_album_ids = 0

# Track URIs already seen, so the same track is not requested twice.
visited_tracks = set()

In [7]:
# NOTE(review): neither name is referenced in the visible cells; presumably
# they are used by a later cell or are leftovers. TODO confirm.
error_offset = error_sum = 0

In [None]:
# Seconds to pause after the retry budget for one request has been exhausted.
COOL_DOWN_SECONDS = 600


def _call_with_backoff(api_call, *args, initial_sleep=10, step=10, max_sleep=120):
    """Call ``api_call(*args)``, pausing longer after each failure.

    After every failed attempt the pause grows by ``step`` seconds and the
    call is retried, until the pause exceeds ``max_sleep``.

    Returns:
        (succeeded, result, failures): ``succeeded`` is False when the retry
        budget was exhausted (``result`` is then None); ``failures`` is the
        number of attempts that raised.
    """
    failures = 0
    sleep_timer = initial_sleep
    while sleep_timer <= max_sleep:
        try:
            return True, api_call(*args), failures
        # `Exception` (not a bare `except`) so Ctrl-C / kernel interrupt still
        # stops the run instead of being counted as an API failure.
        except Exception:
            failures += 1
            sleep_timer += step
            time.sleep(sleep_timer)
    return False, None, failures


def _flush_rows(buffered_rows, chunk_id):
    """Write the buffered rows to ``clean_spotify_<chunk_id>.csv``.

    Returns the (buffer, next chunk id) pair to continue with: a fresh empty
    buffer and ``chunk_id + 1`` after a write, or the inputs unchanged when
    there was nothing to write.
    """
    if not buffered_rows:
        return buffered_rows, chunk_id
    chunk = pd.DataFrame(buffered_rows, columns=columns)
    chunk.to_csv(f'data/output/clean_spotify_{chunk_id}.csv', index=False)
    print(f"clean_spotify_{chunk_id}.csv succesfully made.")
    return [], chunk_id + 1


def _cool_down(buffered_rows, chunk_id):
    """Checkpoint the buffered rows, then sleep before the loop carries on."""
    print("Slept for over 2 minutes and still being limited.")
    print("Saving the data, sleeping for 10 minutes, then continuing.")
    flushed = _flush_rows(buffered_rows, chunk_id)
    time.sleep(COOL_DOWN_SECONDS)
    return flushed


rows = []
# Album popularity by album id, so each album is requested only once even
# though many tracks share it.
album_pop_by_id = {}

for csv_id in tqdm_notebook(range(3,333), desc="csv progress"):
    songs = pd.read_csv(f'data/Songs/songs{csv_id}.csv')
    for uri in tqdm_notebook(songs["track_uri"], desc=f"uri progress for songs{csv_id}"):
        # Skip tracks that were already handled
        if uri in visited_tracks:
            continue
        visited_tracks.add(uri)

        # Extract the track object
        ok, track_object, failures = _call_with_backoff(sp.track, uri)
        num_failed_track_calls += failures
        if not ok:
            # The track was never fetched: forget it so a later occurrence can
            # be retried, checkpoint, and move on to the next URI rather than
            # abandoning the rest of this file.
            visited_tracks.discard(uri)
            rows, num_output_csvs = _cool_down(rows, num_output_csvs)
            continue

        # Skip singles
        if track_object['album']['album_type'] == 'single':
            continue

        track_id = track_object['id']
        album_id = track_object['album']['id']
        track_count = track_object['album']['total_tracks']

        # Skip albums and compilations with one track (this also keeps the
        # comparative popularity below from dividing by zero)
        if track_count == 1:
            continue

        # Get the album popularity, asking the API only for unseen albums
        if album_id not in album_pop_by_id:
            ok, album, failures = _call_with_backoff(sp.album, album_id)
            num_failed_album_calls += failures
            if not ok:
                visited_tracks.discard(uri)
                rows, num_output_csvs = _cool_down(rows, num_output_csvs)
                continue
            album_pop_by_id[album_id] = album['popularity']
            num_albums += 1
        album_pop = album_pop_by_id[album_id]

        track_pop = track_object['popularity']
        # Track popularity minus the mean popularity the other tracks would
        # need for the album popularity to be the mean over all its tracks.
        comparative_pop = track_pop - (((album_pop * track_count) - track_pop) / (track_count - 1))

        # Get the audio features; indexing happens inside the retried call so
        # a malformed response is retried like any other failure.
        ok, features, failures = _call_with_backoff(
            lambda: sp.audio_features(track_id)[0])
        num_failed_audio_features += failures
        if not ok:
            visited_tracks.discard(uri)
            rows, num_output_csvs = _cool_down(rows, num_output_csvs)
            continue
        if features is None:
            # The API may return None in place of the feature dict for a
            # track; skip the track instead of crashing the whole collection.
            num_failed_audio_features += 1
            continue

        rows.append({
            'track_id': track_id,
            'album_id': album_id,
            'track_number': track_object['track_number'],
            'track_count': track_count,
            'duration': track_object['duration_ms'],
            'explicit': track_object['explicit'],
            'track_pop': track_pop,
            'album_pop': album_pop,
            'comparative_pop': comparative_pop,
            'danceability': features['danceability'],
            'energy': features['energy'],
            'loudness': features['loudness'],
            'speechiness': features['speechiness'],
            'acousticness': features['acousticness'],
            'instrumentalness': features['instrumentalness'],
            'liveness': features['liveness'],
            'valence': features['valence'],
            'tempo': features['tempo'],
        })
        # Counted here so the "were written" statistic matches the rows
        # actually collected.
        num_tracks += 1

        # Flush on buffer size so a chunk boundary cannot be missed when
        # tracks are skipped.
        if len(rows) >= track_csv_limit:
            rows, num_output_csvs = _flush_rows(rows, num_output_csvs)

    print(f"Up to the data collection from songs/{csv_id}.csv, the following are the statistics:")
    print(f"{num_tracks} were written. {num_albums} were written.")
    print(f"There were {num_failed_track_calls} failed calls to the API for tracks.")
    print(f"There were {num_failed_album_calls} failed calls to the API for albums.")
    print(f"There were {num_failed_audio_features} failed audio feature lookups.")
    print(f"There were {num_invalid_track_ids} invalid track IDs.")
    print(f"There were {num_invalid_album_ids} invalid album IDs.")

# Write whatever is left in the buffer so the last partial chunk is not lost.
rows, num_output_csvs = _flush_rows(rows, num_output_csvs)

HBox(children=(IntProgress(value=0, description='csv progress', max=330, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='uri progress for songs3', max=66512, style=ProgressStyle(desc…