Objective: Find common trends among billboard songs

To do:

1. Get songs on historical hot-100 charts
2. Get songs on spotify charts at same dates
3. Get song features: track popularity, track genre, artist, artist popularity
4. Get audio features
5. Run statistical analysis
6. Apply Barabasi models

In [1]:
import billboard
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy, json, glob, pprint
import spotipy.util as util
import datetime as dt
import pandas as pd
import numpy as np

### collect data from billboard

In [2]:
def get_chart(chartName, date):
    """
    Download one Billboard chart and return its entries as dicts.

    chartName (str): chart identifier handed to billboard.ChartData,
        e.g. 'hot-100'
    date (str): chart date formatted as 'YYYY-MM-DD'

    returns: the list built by parse_chart_data for that chart
    """
    return parse_chart_data(billboard.ChartData(chartName, date))

def parse_chart_data(chart):
    """
    Convert a chart into a list of plain dicts, one per chart entry.

    chart: iterable of entry objects that also exposes a `date` attribute
        (a billboard.ChartData instance when called from get_chart)

    returns: list of dict; each dict is a copy of one entry's instance
        attributes plus a 'date' key set to chart.date
    """
    # dict(vars(song), ...) copies the attribute dict. Writing 'date'
    # straight into song.__dict__ would silently add a `date` attribute to
    # every entry of the chart that was passed in.
    return [dict(vars(song), date=chart.date) for song in chart]

def dump_json(filename, data):
    """
    Serialize `data` as JSON into `filename`, replacing any existing content.

    filename (str): path of the file to write
    data: any object json.dump can serialize
    """
    # The with-statement closes the file on exit, so no explicit close().
    with open(filename, 'w') as f:
        json.dump(data, f)

In [29]:
## collect billboard chart data
day = dt.datetime(2016,1,1)
month = 0
while day <= dt.datetime.today()-dt.timedelta(1):
    if day.month != month:
        month = day.month + 0
        print month,' ',
    if day.weekday() == 4:
        date = str(day.date())
        tracks = get_chart('hot-100',date)
        dump_json('data/'+date+'.json',tracks)
    day+=dt.timedelta(1)

1   2   3   4   5   6   7   8   9   10   11   12  


### collect from spotify
Now that we have collected the Billboard data, we need to aggregate the dataset and fetch further details about each track from Spotify.

In [3]:
def parse_track_data(trackObj):
    """
    Flatten a Spotify track object into the fields kept for analysis.

    input:
        trackObj - dict with keys 'id', 'popularity', 'artists' (list of
            dicts that each have an 'id'), 'album' (dict with 'name' and
            'id') and 'preview_url'
    returns dict with keys
        trackId - copied from trackObj['id']
        trackPopularity - copied from trackObj['popularity']
        trackArtistId - list with the 'id' of every entry in 'artists'
        trackArtistNum - number of entries in trackArtistId
        trackAlbumName - copied from trackObj['album']['name']
        trackAlbumId - copied from trackObj['album']['id']
        preview_url - copied from trackObj['preview_url']
    """
    # A list comprehension yields a real list on every Python version.
    # map() is lazy on Python 3, where len() and json.dump both reject it.
    artist_ids = [artist['id'] for artist in trackObj['artists']]
    album = trackObj['album']
    return {
        'trackId': trackObj['id'],
        'trackPopularity': trackObj['popularity'],
        'trackArtistId': artist_ids,
        'trackArtistNum': len(artist_ids),
        'trackAlbumName': album['name'],
        'trackAlbumId': album['id'],
        'preview_url': trackObj['preview_url'],
    }


In [4]:
# Merge every saved 2016 chart file into a single DataFrame
files = glob.glob('data/2016*')
tracks = []
for filename in files:
    with open(filename,'r') as f:
        tracks.extend(json.load(f))
data = pd.DataFrame(tracks)
# Distinct Spotify ids, leaving out entries whose id is the empty string.
# Built as a list rather than with filter(): filter() is lazy on Python 3,
# and the fetch cells below need to slice this and take its len().
spotifyIds = [track_id for track_id in data.spotifyID.unique() if track_id != '']

In [5]:
sp = spotipy.Spotify()
track_props = []
cnt = 0
limit = 50
fetch = True
while fetch:
    results = sp.tracks(spotifyIds[cnt*limit:min((cnt+1)*limit,len(spotifyIds))])['tracks']
    for track in results:
        track_props.append(parse_track_data(track))
        print '.',
    cnt += 1
    if len(results)<limit:
        fetch = False
dump_json('data/tracks_properties.json',track_props)

. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .


Next, let's gather artist data. Each row of the track_props dataset stores its artists as a list, since several artists can feature on one song. We first flatten these lists into a single collection of unique artist ids, then use spotipy to fetch data about each unique artist.

In [11]:
def parse_artist_data(artistObj):
    """
    Pick the analysed fields out of a Spotify artist object.

    artistObj - dict with 'id', 'name', 'genres', 'popularity' and a
        'followers' dict that has a 'total' entry

    returns dict with keys
        artistId, artistName, artistGenres, artistPopularity -
            copied from 'id', 'name', 'genres' and 'popularity'
        artistFollowers - copied from artistObj['followers']['total']
    """
    # (output key, source key) pairs for the fields copied over unchanged
    copied = (
        ('artistId', 'id'),
        ('artistName', 'name'),
        ('artistGenres', 'genres'),
        ('artistPopularity', 'popularity'),
    )
    artist = dict((out_key, artistObj[src_key]) for out_key, src_key in copied)
    artist['artistFollowers'] = artistObj['followers']['total']
    return artist

In [12]:
track_props_df = pd.DataFrame(track_props)
artists_list = list(set(reduce(lambda x,y:x+y,track_props_df.trackArtistId.tolist())))
artist_data = []
cnt = 0
limit = 50
fetch = True
while fetch:
    results = sp.artists(artists_list[cnt*limit:min((cnt+1)*limit,len(artists_list))])['artists']
    for artist in results:
        artist_data.append(parse_artist_data(artist))
        print '.',
    cnt += 1
    if len(results)<limit:
        fetch = False
dump_json('data/artists_data.json',artist_data)

. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .


Next, let's gather the tracks' audio features using spotipy. This endpoint requires an authentication token. We first authenticate using the client id and client secret, together with the redirect_uri whitelisted on the app registration form.

In [None]:
#prep client credentials
# credentials.json must provide the keys 'clientId', 'clientSecret' and 'token'
with open('credentials.json','r') as f:
    credentials = json.load(f)
client_id = credentials['clientId']
client_secret = credentials['clientSecret']
token = credentials['token']
sCredentialsManager = SpotifyClientCredentials(client_id = client_id,
                                                client_secret = client_secret)
# The with-statement has already closed the file, so no explicit close();
# the two names are still dropped from the namespace as before.
del credentials, f

sp = spotipy.Spotify(client_credentials_manager = sCredentialsManager,auth = token)

In [270]:
def parse_audio_features(trackObj):
    """
    Keep a fixed set of audio-feature fields, renaming each key.

    trackObj - mapping that holds every name listed in `wanted` below

    returns dict whose keys are 'track' + name.capitalize() (for example
        'id' becomes 'trackId' and 'duration_ms' becomes 'trackDuration_ms')
        and whose values are copied from trackObj unchanged
    """
    wanted = (
        u'energy', u'liveness', u'tempo', u'speechiness', u'acousticness',
        u'instrumentalness', u'time_signature', u'danceability', u'key',
        u'duration_ms', u'loudness', u'valence', u'id', u'mode',
    )
    return dict(('track' + name.capitalize(), trackObj[name]) for name in wanted)

In [272]:
#get data
# Fetch the audio features of every id in spotifyIds and store the parsed
# fields in data/audio_features.json
limit = 50  # number of ids sent per sp.audio_features() call
audio_features = []
# Walk the id list by offset so that no request with an empty id list is
# sent when len(spotifyIds) is an exact multiple of `limit` (or zero).
for start in range(0, len(spotifyIds), limit):
    results = sp.audio_features(spotifyIds[start:start + limit])
    # Built as a list: map() is lazy on Python 3, which broke the old
    # len(results) check.
    # NOTE(review): entries that come back as None (the Spotify Web API
    # documents null for ids it has no features for) are skipped instead
    # of crashing in parse_audio_features -- confirm this is desired
    audio_features.extend([parse_audio_features(features)
                           for features in results
                           if features is not None])

dump_json('data/audio_features.json',audio_features)

In [3]:
## Manual search for some songs without spotifyID

In [4]:
# Rebind sp to a client constructed without any credentials or token
sp = spotipy.Spotify()

In [20]:
# Query Spotify search for the song title and keep only the
# ['tracks']['items'] part of the response
r = sp.search('Wildest Dreams')['tracks']['items']