# Setup

Before we can begin analyzing my music collection, we have to fetch it and clean it. We first import all the necessary packages to work.

In [1]:
import os, platform

import re
from pprint import pprint 

import pandas as pd
from pandas import Timestamp
import numpy as np
import itertools
from datetime import datetime
from pytz import timezone

import multiprocessing as mp

import seaborn as sns
import matplotlib.pyplot as plt

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

from tqdm import tqdm_notebook as tqdm

### Credentials

Establish my credentials for the Spotify API, and setup up an object to use for calls to the API.

In [2]:
spotify_id = '672b6ccc89154897bbafa579105f4124'
spotify_secret = 'bed129a78ded4e849eab9ca4fb3da1e3'
REQUEST_TIMEOUT = 4.0

client_credentials_manager = SpotifyClientCredentials(
    client_id = spotify_id, 
    client_secret = spotify_secret)
sp = spotipy.Spotify(
    client_credentials_manager = client_credentials_manager, 
    requests_timeout = REQUEST_TIMEOUT)

# Fetch Song Information

I have linked my Spotify account to my Last.FM account. Last.FM records each stream, and this log can be downloaded via https://benjaminbenben.com/lastfm-to-csv/.

Next, we load the raw data. Notice how there are some missing album titles and timestamps. This is likely just the result of a bad script pulling from Last.FM, so we'll have to fix that. Below the counts is a random sample of the data, just to get a feel of what is in there.

In [3]:
original_history = pd.read_csv('data/alexliebscher.csv', header=None, skiprows=1, names=['artist', 'album', 'track', 'timestamp'])
original_history.count()

artist       32702
album        32667
track        32702
timestamp    32540
dtype: int64

We will just backfill to take care of missing timestamp data. A very small percentage is missing, and I assume the missing values have a high probability of being similar to the song before.

In [4]:
original_history['timestamp'] = original_history['timestamp'].bfill()
original_history['timestamp'].count()

32702

The timestamps are also missing timestamp information, so we should add that to ensure our analysis reflects my local time. In this case, all timestamps are assumed to be UTC and are converted to US/Pacific, my local zone.

In [5]:
timezoned_history = original_history.copy()
timezoned_history['timestamp'] = pd.to_datetime(timezoned_history['timestamp'], utc=True)

The first song was recorded on December 18th, 2017 at roughly 7pm. This dataset covers the following 260 days after that.

In [6]:
history_max = timezoned_history['timestamp'].max()
history_min = timezoned_history['timestamp'].min()

print(history_min.tz_convert('US/Pacific'))
print(history_max - history_min)

2017-12-18 18:55:00-08:00
364 days 14:41:00


## Fetch full track data

Although we have artist, album, track, and timestamp for each stream, there's a lot more information that we can find. We choose to use the Spotify API, as it is reliable, easy to use, and offers a handful of quantitative features we otherwise wouldn't be able to assess.

In [7]:
delimeter_pattern = re.compile("[\{\}\[\]\(\)\#\'\"]")
classical_pattern = re.compile("((op\.?|no\.?)\s*\d{1,3}\s?)", re.IGNORECASE)
collections_pattern = re.compile("(^\d{1,3}\s*)")
stylizations_pattern = re.compile("[\,\-\_\&\*]\s?|\:\s")


def clean_query(track, artist, album=''):
    # remove (feat. some artist) for cleaner search
    track = track.lower()
    if " (feat" in track:
        track = track.split(" (feat")[0]
    elif " (with" in track:
        track = track.split(" (with")[0]
    elif " (&" in track:
        track = track.split(" (&")[0]
        
    # clean album names too
    album = album.lower()
    if "nan" == album:
        album = ""
    elif " (feat" in album:
        album = album.split(" (feat")[0]
    elif " (with" in album:
        album = album.split(" (with")[0]
    elif " (&" in album:
        album = album.split(" (&")[0]
        
    # compose a clean, simple query string
    query = str(track + ' ' + artist + ' ' + album).strip()
    
    query = delimeter_pattern.sub("", query) # remove various delimeter chars
    query, subs = classical_pattern.subn("", query) # remove common strings in classical track titles
                                                    # unfortunately modifies tracks such as Candy Shop 
                                                    # by 50 Cent to "candy shCent"
    if subs > 0:
        # classical music often starts with the number of pieces in
        # a collection ("12 Etudes, Op. 10: No.10 in C minor")
        query = collections_pattern.sub("", query)
        
    query = stylizations_pattern.sub(" ", query) # common stylizations in track/album names
    
    return query

def format_return_track(metadata, audio_features):
    # store a new track
    _track = dict({})
    
    _track['id'] = metadata['id']
    _track['name'] = metadata['name']
    _track['release'] = metadata['album']['release_date']
    _track['popularity'] = metadata['popularity']
    _track['explicit'] = int(metadata['explicit'])
    _track['artists'] = [a['id'] for a in metadata['artists']]
    _track['album'] = metadata['album']['name']

    _track['acousticness'] = audio_features['acousticness']
    _track['danceability'] = audio_features['danceability']
    _track['duration_ms'] = audio_features['duration_ms']
    _track['energy'] = audio_features['energy']
    _track['key'] = audio_features['key']
    _track['liveness'] = audio_features['liveness']
    _track['loudness'] = audio_features['loudness']
    _track['mode'] = audio_features['mode']
    _track['speechiness'] = audio_features['speechiness']
    _track['tempo'] = audio_features['tempo']
    _track['time_signature'] = audio_features['time_signature']
    _track['valence'] = audio_features['valence']
    
    return _track

def get_track_info(track, artist, album='', id_excl=False, verbose=0):
    '''
    With a track name and artist, and optionally an album name,
    search for a corresponding track via the Spotify API and
    build an object with possible descriptive data.
    
    Parameters
    ----------
    track : str
        The name of a track
    artist : str
        The name of the track's artist
    album : str, optional
        The name of the track's album
    id_excl : bool, optional
        Return only the track's Spotify ID
    verbose : int, optional
        Level of verbosity. 0 is no output
    
    Return
    ----------
    Descriptive track data, or just the track ID, or an empty
    dict if no data could be found for the specified track
    '''
    query = clean_query(track, artist, album)    
        
    # if the song exists in the Spotify catalog, fetch info
    try:
        if verbose >= 2: print('Query track: ' + query)
        meta = sp.search(q='track:' + query, type='track', limit=1)
        meta = meta['tracks']['items'][0]

        if not id_excl:
            features = sp.audio_features([meta['id']])[0]
            
    except Exception as e:
        
        # if the track could not be found, try once more without the album
        if album is not "":
            
            if verbose >= 2: print('Requery {} by {} without album'.format(track, artist))
                
            retry = get_track_info(track, artist)
            # if the track couldn't be found without the album, give up
            if retry:
                return retry
            
        if verbose >= 1:
            print('No data for {} by {}, query: {}\n'.format(track, artist, query))
            
        return {}

    if id_excl and meta['id']:
        return meta['id']
    
    # return the track information
    try:
        return format_return_track(meta, features)
    except TypeError:
        if verbose >= 2: print('Parameter missing for {} by {}'.format(track, artist))
    
    return {}

## Baseline multiprocessor efficiency

We time and record serial processing and multiprocessor functionality to estimate performance improvements. Let this be a simple measurement of how well we can do with multiprocessing when fetching track data from the API.

Extract a random sample of 50 tracks. We will use this to compare single processor efficiency with multiprocessor efficiency.

In [8]:
sample = timezoned_history.sample(50)

In [9]:
cpus = mp.cpu_count() # we'll make use of all CPUs, we use this later too

In [11]:
pbar = tqdm(total=len(sample))

def progress(*args):
    pbar.update()

def async_fetch(track, artist, album):
    '''
    Count and display track searches and timing
    
    Parameters
    ----------
    track : str
        The name of a track
    artist : str
        The name of the track's artist
    album : str
        The name of the track's album
    
    Return
    ----------
    Data about the track, if the track is found (otherwise, empty dict)
    '''
    return get_track_info(track, artist, album, verbose=1)

def serial(tracks):
    '''
    A serial processor for comparison's purpose (1 CPU, 1 process)
    
    Parameters
    ----------
    tracks : list
        A list of tracks to search
    
    Return
    ----------
    A list of track data in dicts
    '''
    return [get_track_info(str(t['track']), str(t['artist']), str(t['album']), verbose=1) for i, t in tracks.iterrows()]

def multiprocess(processes, tracks):  
    '''
    Multiprocessing to utilize all cores for comparison's purpose
    
    Parameters
    ----------
    processes : int
        The number of processes to create in parallel
    tracks : list
        A list of tracks to search
    
    Return
    ----------
    A list of track data in dicts
    '''
    pool = mp.Pool(processes=processes)
    results = [pool.apply_async(async_fetch,
                                args=(str(t['track']), str(t['artist']), str(t['album'])),
                                callback=progress) for i, t in tracks.iterrows()]
    results = [p.get() for p in results]
        
    return results

print('\n')
print('# of CPUs:\t{}'.format(cpus))
print('Python version:\t{}'.format(platform.python_version()))
print('Compiler:\t{}'.format(platform.python_compiler()))
print('System:\t\t{}'.format(platform.system()))
print('Release:\t{}'.format(platform.release()))
print('Machine:\t{}'.format(platform.machine()))
print('Processor:\t{}'.format(platform.processor()))
print('Interpreter:\t{}'.format(platform.architecture()[0]))
print('\n')

print('Testing Serial\n')
# Test and time serial()
s = datetime.now()
serial_temp = pd.DataFrame(serial(sample)).dropna()
serial_t = datetime.now() - s

print('Testing Multiprocessor\n')
# Test and time multiprocess()
s = datetime.now()
multi_temp = pd.DataFrame(multiprocess(cpus, sample)).dropna()
multi_t = datetime.now() - s

pbar.close()

A Jupyter Widget



# of CPUs:	4
Python version:	3.6.3
Compiler:	GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)
System:		Darwin
Release:	18.0.0
Machine:	x86_64
Processor:	i386
Interpreter:	64bit


Testing Serial

No data for I Wonder Why - 1999 Digital Remaster by Dion & The Belmonts, query: i wonder why  1999 digital remaster Dion  The Belmonts the best of dion  the belmonts

Testing Multiprocessor

No data for I Wonder Why - 1999 Digital Remaster by Dion & The Belmonts, query: i wonder why  1999 digital remaster Dion  The Belmonts the best of dion  the belmonts

retrying ...3secs
retrying ...3secs
retrying ...3secs
retrying ...3secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs



Process ForkPoolWorker-3:
Process ForkPoolWorker-4:
Process ForkPoolWorker-1:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/alex/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/alex/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/alex/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/alex/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/alex/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/alex/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/alex/anaconda3/lib/python3.6/multip

In [12]:
print('Serial Processing')
print('search ratio (found : expected): {}'.format(len(serial_temp)/len(sample)))
print('Time: {}'.format(serial_t))
print('\nMulti Processing')
print('search ratio (found : expected): {}'.format(len(multi_temp)/len(sample)))
print('Time: {}'.format(multi_t))
print('\n{0:.2f}x faster with multiprocess'.format(serial_t / multi_t))

Serial Processing
search ratio (found : expected): 0.98
Time: 0:00:20.836388

Multi Processing
search ratio (found : expected): 0.98
Time: 0:00:09.468030

2.20x faster with multiprocess


## Fetch all track data from Spotify

Use all 4 of my CPUs to fetch track information in parallel.

In [10]:
pbar = None

def progress(*args):
    pbar.update()

def async_fetch_real(track, artist, album, timestamp):
    '''
    Count and display track searches and timing
    
    Parameters
    ----------
    track : str
        The name of a track
    artist : str
        The name of the track's artist
    album : str
        The name of the track's album
    timestamp : str
        The timestamp of the track
    
    Return
    ----------
    Data about the track, if the track is found (otherwise, empty dict)    
    '''
    _t = get_track_info(track, artist, album, verbose=1)
    # re-attach the timestamp to the track data
    _t.update({
        'timestamp': timestamp
    })
            
    return _t

def multiprocess(processes, tracks):
    '''
    Multiprocessing to utilize all cores for all listening history
    
    Parameters
    ----------
    processes : int
        The number of processes to create in parallel
    tracks : list
        A list of tracks to search
    
    Return
    ----------
    A list of track data in dicts    
    '''
    pool = mp.Pool(processes=processes)
    results = [pool.apply_async(async_fetch_real, 
                                args=(str(t['track']), str(t['artist']), str(t['album']), Timestamp(t['timestamp']),),
                                callback=progress) for i, t in tracks.iterrows()]
    results = [p.get() for p in results]
    
    return results

def updateTracks(original, unique):
    '''
    Fetch track data via the Spotify API and save the compiled output to JSON
    
    Parameters
    ----------
    original : pandas.DataFrame
    unique : pandas.DataFrame
    '''
    print('Fetching {} tracks...'.format(len(unique)))
    
    temp = pd.DataFrame(multiprocess(cpus, unique)).dropna()
    multi_t = datetime.now() - s
            
    if not temp.empty:
        compiled = pd.concat([original, temp], ignore_index=True)
        compiled.to_json('data/history_comp.json')
        print('Saved complete history')
        
        print('Search ratio (found : expected): {}'.format(len(temp)/len(unique)))
        print('Total songs found: \t\t{}'.format(len(temp)))
        print('Total time:\t\t\t {}'.format(multi_t))
        print('Songs/sec fetched:\t\t {}'.format(len(unique)/multi_t.total_seconds()))
    else:
        print('No new track data found')

In [11]:
last_full_history = pd.read_json('data/history_comp.json') if os.path.isfile('data/history_comp.json') else pd.DataFrame(columns=['timestamp'])
last_full_history['timestamp'] = pd.to_datetime(last_full_history['timestamp'], utc=True)

s = datetime.now()

if last_full_history.empty or last_full_history['timestamp'].max() < timezoned_history['timestamp'].max():
    unique = timezoned_history[~timezoned_history['timestamp'].isin(last_full_history['timestamp'])]
    
    pbar = tqdm(total=len(unique))
    
    updateTracks(last_full_history, unique)
    
    pbar.close()
else:
    print('all records up to date')
    
# Saved complete history
# Search ratio (found : expected): 0.9925565416547381
# Total songs found: 		10401
# Total time:			 0:35:05.198987
# Songs/sec fetched:		 4.977676725435361

A Jupyter Widget

Fetching 4114 tracks...
retrying ...5secs
retrying ...5secs
retrying ...5secs
retrying ...5secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...4secs
retrying ...4secs
retrying ...4secs
retrying ...4secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...2secs
retrying ...2secs
retrying ...2secs
retrying ...2secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...4secs
retrying ...4secs
retrying ...4secs
retrying ...4secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...2secs
retrying ...2secs
retrying ...2secs
retrying ...2secs
retrying ...1secs
retr

No data for Sigamos Bailando by Gianluca Vacchi, query: sigamos bailando Gianluca Vacchi sigamos bailando


No data for Just Wanna Love You (feat. J. Balvin) by Cris Cab, query: just wanna love you Cris Cab just wanna love you
No data for Jaleo by Nicky Jam, query: jaleo Nicky Jam jaleo


No data for Just Wanna Love You (feat. J. Balvin) by Cris Cab, query: just wanna love you Cris Cab just wanna love you
No data for RA by Inna, query: ra Inna ra


No data for X - Remix by Nicky Jam, query: x  remix Nicky Jam x remix
No data for Dilema by Prince Royce, query: dilema Prince Royce five deluxe edition


No data for Adicto by Prince Royce, query: adicto Prince Royce adicto
No data for Monster In Me by Little Mix, query: monster in me Little Mix lm5 deluxe

No data for Why Are We So Broken (feat. blink-182) by Steve Aoki, query: why are we so broken Steve Aoki neon future iii

No data for A Summer Place by Andy Williams, query: a summer place Andy Williams moon river and other great movie t

No data for I Want It All by Arctic Monkeys, query: i want it all Arctic Monkeys am


No data for I Wanna Be Yours by Arctic Monkeys, query: i wanna be yours Arctic Monkeys am

No data for One for the Road by Arctic Monkeys, query: one for the road Arctic Monkeys am

No data for No. 1 Party Anthem by Arctic Monkeys, query: party anthem Arctic Monkeys am

No data for Snap Out of It by Arctic Monkeys, query: snap out of it Arctic Monkeys am
No data for In / Balance by San Scout, query: in / balance San Scout in / balance

No data for A Brighter Love by St. Lucia, query: a brighter love St. Lucia a brighter love / paradise is waiting


No data for Believe (Paul Meany Remix) by Romes, query: believe paul meany remix Romes believe paul meany remix

No data for Tongues by Joywave, query: tongues Joywave how do you feel now?

No data for Lonely Man of Winter by Sufjan Stevens, query: lonely man of winter Sufjan Stevens lonely man of winter
No data for Lonely Man of Winter - Doveman Mix feat. 

No data for Redbone by Childish Gambino, query: redbone Childish Gambino awaken my love!
No data for Riot by Childish Gambino, query: riot Childish Gambino awaken my love!



No data for zombies by Childish Gambino, query: zombies Childish Gambino awaken my love!

No data for Boogieman by Childish Gambino, query: boogieman Childish Gambino awaken my love!
No data for Me and Your Mama by Childish Gambino, query: me and your mama Childish Gambino awaken my love!

No data for Have Some Love by Childish Gambino, query: have some love Childish Gambino awaken my love!


No data for Can't Knock The Hustle by Weezer, query: cant knock the hustle Weezer cant knock the hustle
No data for Paradise by The Neighbourhood, query: paradise The Neighbourhood ever changing  ep

No data for Blood // Water by Grandson, query: blood // water Grandson blood / / water

No data for Can't Knock The Hustle by Weezer, query: cant knock the hustle Weezer cant knock the hustle


No data for Do Your Worst by Rival 

No data for Walking Trophy (feat. Trey Songz & Fabolous) - Remix by Hoodcelebrityy, query: walking trophy Hoodcelebrityy inna real life


No data for Only You (feat. WizKid Offset & J Balvin) by Metro Boomin, query: only you Metro Boomin not all heroes wear capes


No data for Kamikaze by lil Mosey, query: kamikaze lil Mosey northsbest
No data for Better Now by Post Malone, query: better now Post Malone beerbongs  bentleys
No data for Climax (feat. 6lack) by Young Thug, query: climax Young Thug on the rvn



No data for Better Now by Post Malone, query: better now Post Malone beerbongs  bentleys

No data for Keep It Close by BONES OWENS, query: keep it close BONES OWENS keep it close
No data for So Be It by Blackfoot Gypsies, query: so be it Blackfoot Gypsies handle it


No data for Keep It Close by BONES OWENS, query: keep it close BONES OWENS keep it close
No data for 18 Counties by Moreland & Arbuckle, query: 18 counties Moreland  Arbuckle flood


No data for Something’s Got a Hold 



No data for Angel From Montgomery - live by Susan Tedeschi, query: angel from montgomery  live Susan Tedeschi live from austin tx
No data for Zaïs: Overture - Live by Jean-Philippe Rameau, query: zaïs overture  live Jean Philippe Rameau une symphonie imaginaire

No data for In Memory of T-Bone by Ronnie Earl & The Broadcasters, query: in memory of t bone Ronnie Earl  The Broadcasters maxwell street


No data for Konzertstück Op.86 In F For 4 Horns And Orchestra: 1. Lebhaft - by Robert Schumann, query: konzertstück in f for 4 horns and orchestra 1. lebhaft  Robert Schumann schumann symphonies nos.1  4; konzertstück for 4 horns

No data for Ciaccona (Antonio Falconiero ca. 1585-1656) by Claudio Monteverdi, query: ciaccona antonio falconiero ca. 1585 1656 Claudio Monteverdi monteverdi  madrigali e lamenti
No data for In An 18th Century Drawing Room by Raymond Scott, query: in an 18th century drawing room Raymond Scott the music of raymond scott reckless nights and turkish twilights


No


No data for Poison by Brent Faiyaz, query: poison Brent Faiyaz poison


No data for GOT IT GOOD by Kaytranada, query: got it good Kaytranada 99.9%

No data for Spare Time by Eric Bellinger, query: spare time Eric Bellinger eventually mixtape
No data for Don't Choose by DVSN, query: dont choose DVSN dont choose

No data for Nobody Else But You by Trey Songz, query: nobody else but you Trey Songz nobody else but you
No data for Confidently Lost by Sabrina Claudio, query: confidently lost Sabrina Claudio confidently lost



No data for Deeper (feat. Beldina) [Bonus] by KYLE, query: deeper KYLE beautiful loser

No data for Hold Tight by Justin Bieber, query: hold tight Justin Bieber journals
No data for Ahora Dice by Chris Jeday, query: ahora dice Chris Jeday ahora dice


No data for All That Matters by Justin Bieber, query: all that matters Justin Bieber journals

No data for Business before Pleasure by George Akaeze & His Augmented hits, query: business before pleasure George Akaeze  Hi

No data for Open Wound (ODESZA Remix) by Ki:Theory, query: open wound odesza remix Ki:Theory kitty hawk remixed and extended

No data for Take You Down by Illenium, query: take you down Illenium take you down

No data for Tie Me Down (with Elley Duhé) by Gryffin, query: tie me down Gryffin tie me down


No data for B - Electric by Jaden Smith, query: b  electric Jaden Smith syre the electric album

No data for Break The Rules - ODESZA Remix by Charli XCX, query: break the rules  odesza remix Charli XCX break the rules remixes
No data for When I Get There by Big Wild, query: when i get there Big Wild invincible ep


No data for Boy by Odesza, query: boy Odesza a moment apart

No data for Bloomingdales by White Boiz, query: bloomingdales White Boiz neighborhood wonderful
No data for Know About Love by Donny, query: know about love Donny know about love / strip club blues


No data for Around Me by Brent Faiyaz, query: around me Brent Faiyaz lost

No data for How to Screw up Your Future a

No data for Brakelights by omar apollo, query: brakelights omar apollo brakelights


No data for Solstice by YEEK, query: solstice YEEK solstice

No data for I Do (End Credits) by Kevin Abstract, query: i do end credits Kevin Abstract american boyfriend a suburban love story
No data for All I Have - Intro by Snoh Aalegra, query: all i have  intro Snoh Aalegra feels


No data for Dressed Like Rappers by Isaiah Rashad, query: dressed like rappers Isaiah Rashad the suns tirade
No data for Tyrant (feat. Daniel Caesar) - Remix by Kali Uchis, query: tyrant Kali Uchis tyrant
No data for Oh! Sweet Nuthin' - 2015 Remastered by The Velvet Underground, query: oh! sweet nuthin  2015 remastered The Velvet Underground loaded re loaded 45th anniversary edition



No data for When The Music's Over - Remastered by The Doors, query: when the musics over  remastered The Doors strange days 50th anniversary expanded edition remastered

No data for Love Her Madly by The Doors, query: love her madly The Door


No data for Older - livefromspotifylondon by The Parcels, query: older  livefromspotifylondon The Parcels
No data for Lightenup by The Parcels, query: lightenup The Parcels lightenup

No data for Older - Mouse on Mars Remix by The Parcels, query: older  mouse on mars remix The Parcels hideout remixed

No data for Clockscared - livefromspotifylondon by The Parcels, query: clockscared  livefromspotifylondon The Parcels


No data for Gamesofluck - L'Impératrice Remix by The Parcels, query: gamesofluck  limpératrice remix The Parcels

No data for Tieduprightnow by The Parcels, query: tieduprightnow The Parcels tieduprightnow
No data for Hideout - Disco Despair Remix by The Parcels, query: hideout  disco despair remix The Parcels

No data for The Well-Tempered ClavierBWV 846: Prelude I in C Major by Johann Sebastian Bach, query: the well tempered clavierbwv 846 prelude i in c major Johann Sebastian Bach bach the well tempered clavier book 1 bwv 846 869
No data for KAYTRANADA_WAITIN_115 BPM


No data for Plans (feat. Brandyn Burnette) by Elefante, query: plans Elefante i am the elephante

No data for Black Ivory by Elefante, query: black ivory Elefante i am the elephante
No data for La Mala Maña by BANDA REAL, query: la mala maña BANDA REAL una parranda latina
No data for Everything You Want Me to Be by Cold Front, query: everything you want me to be Cold Front float around



No data for Wave(s) by Lewis Del Mar, query: waves Lewis Del Mar ep

No data for Candy Shop by 50 Cent, query: candy shCent the massacre

No data for artifacts_stbb375 by eli filosov [ p h i l o ], query: artifacts stbb375 eli filosov  p h i l o  2007
No data for Piano Concerto No. 2 in C minor Op. 50: II. Romanza: Andante con moto by Николай Карлович Метнер, query: piano concerto in c minor  ii. romanza andante con moto Николай Карлович Метнер rachmaninov s. piano concerto original 1926 version / medtner n. piano concerto 

No data for Number One by J Dilla, query: number one J Dilla number one
No d

No data for Next Time - Original Mix by Xilent, query: next time  original mix Xilent your system ep


No data for Afterlife (Dabin Remix) [feat. Echos] by Illenium, query: afterlife dabin remix feat. echos Illenium ashes remixes
No data for California Love - Original Version (Explicit) by 2Pac, query: california love  original version explicit 2Pac 2pac greatest hits explicit version


No data for Clockscared - livefromspotifylondon by The Parcels, query: clockscared  livefromspotifylondon The Parcels

No data for Ganja in my Brain by Ras Matthew, query: ganja in my brain Ras Matthew new singles ep
No data for Chelsea Dagger - radio edit by The Fratellis, query: chelsea dagger  radio edit The Fratellis chelsea dagger radio edit


No data for In My Room (feat. Ty Dolla $ign & Tyga) by DJ Mustard, query: in my room DJ Mustard blood for mercy

No data for Clockscared - livefromspotifylondon by The Parcels, query: clockscared  livefromspotifylondon The Parcels

No data for Keep Their Head

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Saved complete history
Search ratio (found : expected): 0.7967914438502673
Total songs found: 		3278
Total time:			 0:12:04.721103
Songs/sec fetched:		 5.676666490005604



In [12]:
artists = pd.read_json('data/artist_info.json')
artists.sample(5)

Unnamed: 0,artist,followers,genres,id,popularity
1366,Bishop Briggs,206707,[],0yb46jwm7gqbZXVXZQ8Z1e,73
6116,Trio Delfines,837,[],1qvmo3M2lo0qpzRhzNTgIm,26
1402,Daniel Adni,167,[],2KOWOrJ18MXvgBW2q5kG8o,23
7191,Menahan Street Band,58807,"[afrobeat, brass band, deep funk, funk, nu jaz...",1PryMSya1JnSAlcwYawCxp,59
6852,Pony Pony Run Run,29910,[french indie pop],6RXZaVIbZfdngXJOSGrVH3,38


In [13]:
full_history = pd.read_json('data/history_comp.json')
if full_history['timestamp'].max().tz is None:
        full_history['timestamp'] = pd.to_datetime(full_history['timestamp'], utc=True)

In [14]:
missing_artist_ids = []

for i, r in full_history.iterrows():
    for artist in r['artists']:
        if artist not in missing_artist_ids and not (artists['id'] == artist).any():
            missing_artist_ids.append(artist)
            
len(missing_artist_ids)

566

In [15]:
def get_artist_info(id):
    try:
        result = sp.artist(id)
    except:
        return {}
    
    return {'artist': result['name'],
            'id': result['id'], 
            'genres': np.array(result['genres']), 
            'popularity': result['popularity'], 
            'followers': result['followers']['total']}

In [16]:
pbar = None

def progress(*args):
    pbar.update()

cpus = mp.cpu_count()

def async_fetch_real(id):
    '''
    count and display artist searches and timing
    '''
    return get_artist_info(id)

def multiprocess(processes, ids):
    '''
    multiprocessing to utilize all cores
    '''
    pool = mp.Pool(processes=processes)
    results = [pool.apply_async(async_fetch_real, args=(str(i),), callback=progress) for i in ids]
    results = [p.get() for p in results]
    return results

pbar = tqdm(total=len(missing_artist_ids))

s = datetime.now()
artists_temp = pd.DataFrame(multiprocess(cpus, missing_artist_ids)).dropna()
multi_t = datetime.now() - s

pbar.close()

stitched_artists = artists.append(artists_temp, ignore_index=True)
stitched_artists = stitched_artists.drop_duplicates('id')

if not artists_temp.empty:
    stitched_artists.to_json('data/artist_info.json'.format(s.month, s.day))
    print('Saved artist info')
    
print('{} artists added (expected {})'.format(len(stitched_artists)-len(artists), len(artists_temp)))

A Jupyter Widget

retrying ...2secs
retrying ...2secs
retrying ...2secs
retrying ...2secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...5secs
retrying ...5secs
retrying ...5secs
retrying ...5secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...3secs
retrying ...3secs
retrying ...3secs
retrying ...3secs
retrying ...1secs
retrying ...1secs
retrying ...1secs

Saved artist info
566 artists added (expected 566)


The service to export Last.FM data overcounts the most recently listened to song, so I choose to keep the first quarter of instances and drop the remainder. This prevents an artificial skewing toward a song that shouldn't be the mode of the data set. Drawback: if the first song _really_ is the mode of the dataset, I unknowingly change that.

In [17]:
track_mode = full_history['id'].mode()[0]
L = list(full_history.loc[full_history['id'] == track_mode].index)
L = L[int(len(L)*0.25):]
print(full_history.loc[L[0]]['name'])
L

SUMMER


[19840,
 20114,
 21239,
 21580,
 22281,
 22365,
 22755,
 23370,
 23617,
 23622,
 23675,
 23819,
 24398,
 2493,
 25163,
 25260,
 25333,
 25494,
 26003,
 26156,
 26511,
 27481,
 27482,
 27570,
 28315,
 28778,
 293,
 30490,
 31721,
 336,
 4371,
 4679,
 4882,
 5346,
 5465,
 6350,
 6476,
 6596,
 6603,
 6743,
 8155,
 8456]

In [None]:
full_history = full_history.drop(index=L).reset_index()

In [21]:
full_history.to_json('data/history_comp.json')

# Mainstream Music

In [41]:
def fetch_mainstream(playlists):
    mainstream_music = []
    for playlist in playlists:
        for item in tqdm(sp.user_playlist_tracks(playlist[0], playlist[1])['items']):
            id_ = item['track']['id']

            features = sp.audio_features(id_)[0]
            
            try:
                mainstream_music.append(format_return_track(item['track'], features))
            except TypeError:
                print('Missing Data: ' + id_)
            
        print('Playlist complete')

    return mainstream_music

In [42]:
# Today's Top Hits & United States Top 50

playlists = [('spotify', '37i9dQZF1DX0s5kDXi1oC5'), ('spotify', '37i9dQZF1DXcBWIGoYBM5M')]

mainstream_music = pd.DataFrame(fetch_mainstream(playlists)).dropna()
mainstream_music.to_json('data/mainstream_music.json')
mainstream_music.sample(5)

A Jupyter Widget

Playlist complete


A Jupyter Widget

Missing Data: 1ZEm9cJC05rawV2tptNfTS
Missing Data: 04ZTP5KsCypmtCmQg5tH9R
Missing Data: 1NbGcdgwRHZ5rbPIT9hdR3
Missing Data: 3D1rlKmZdpYUeMtRRLNawc
Playlist complete


Unnamed: 0,acousticness,album,artists,danceability,duration_ms,energy,explicit,id,key,liveness,loudness,mode,name,popularity,release,speechiness,tempo,time_signature,valence
1,0.0991,Wild Ones,"[0jnsk9HBra6NMjO2oANoPY, 5WUlDfRSoLAfcVSX1WnrxN]",0.608,232947,0.86,0,1NpW5kyvO4XrNJ3rnfcNy3,5,0.262,-5.324,0,Wild Ones (feat. Sia),79,2012-06-22,0.0554,127.075,4,0.437
78,0.25,Jump (with Trippie Redd),"[0ZED1XzwlLHW4ZaG4lOT6m, 6Xgp2XMz1fhVYe7i6yNAax]",0.654,207857,0.547,1,2OvV4NjEBRE9v8Oo7QeUCq,10,0.0961,-6.598,1,Jump (with Trippie Redd),84,2018-05-04,0.127,173.981,4,0.507
67,0.0375,no tears left to cry,[66CXWjxzNUsdJxJ2JdwvnR],0.703,205947,0.696,0,5SxkdsY1ufZzoq9iXceLw9,0,0.274,-5.482,1,no tears left to cry,98,2018-04-20,0.0529,121.969,4,0.366
64,0.099,Invasion of Privacy,"[4kYSro6naA4h99UJvo89HB, 4q3ewBCX7sLwd24euuV69...",0.816,253390,0.726,1,58q2HKrzhC3ozto2nDdN4z,5,0.372,-3.998,0,I Like It,98,2018-04-06,0.129,136.048,4,0.65
90,0.0615,If You're Over Me,[5vBSrE1xujD2FXYRarbAXc],0.647,189000,0.853,0,01k24g94i1JvkFLQmVEdCd,8,0.062,-4.516,1,If You're Over Me,84,2018-05-10,0.0904,103.846,4,0.618


In [None]:
rapmap = pd.read_json('data/rapworldmap-artists.json')
rapmap.drop('youtube', axis=1, inplace=True)

for artist in rapmap.itertuples():
    rapmap.loc[artist.Index, 'city'] = artist.location['city']
    rapmap.loc[artist.Index, 'neighborhood'] = artist.location['neighborhood']
    rapmap.loc[artist.Index, 'lat'] = float(artist.location['coordinates'].split(',')[0])
    rapmap.loc[artist.Index, 'long'] = float(artist.location['coordinates'].split(',')[1])
    
    rapmap.loc[artist.Index, 'type_rapper'] = 1 if 'rapper' in artist.categories else 0
    rapmap.loc[artist.Index, 'type_singer'] = 1 if 'singer' in artist.categories else 0
    rapmap.loc[artist.Index, 'type_producer/dj'] = 1 if 'producer/dj' in artist.categories else 0
    rapmap.loc[artist.Index, 'type_producer/dj'] = 1 if 'producer / dj' in artist.categories else 0
    rapmap.loc[artist.Index, 'type_producer'] = 1 if 'producer' in artist.categories else 0
    rapmap.loc[artist.Index, 'type_group'] = 1 if 'group' in artist.categories else 0

    
rapmap.drop('location', axis=1, inplace=True)
rapmap.drop('categories', axis=1, inplace=True)