In [1]:
!pip3 install spotipy

In [77]:
!pip3 install -U scikit-learn

In [82]:
!pip3 install seaborn

Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.10.0-py3-none-any.whl (215 kB)
[K     |████████████████████████████████| 215 kB 5.8 MB/s eta 0:00:01
Installing collected packages: seaborn
Successfully installed seaborn-0.10.0


In [2]:
import sys
import json
import spotipy
import spotipy.util as util
import os
import pandas as pd
import numpy as np
import configparser

In [3]:
# Credentials live in config.ini (section [SPOTIFY]) so they stay out of the notebook.
config = configparser.ConfigParser()
if not config.read('config.ini'):
    # ConfigParser.read() silently skips missing files; fail here with a clear
    # message instead of a bare KeyError on the section lookup below.
    raise FileNotFoundError(
        "config.ini not found; it must contain a [SPOTIFY] section with "
        "username, client_id, client_secret and redirect_uri"
    )

# This is the scope that was effectively requested before: an earlier
# assignment of 'user-library-read user-top-read' was immediately overwritten.
scope = 'user-library-read'
username = config['SPOTIFY']['username']
client_id = config['SPOTIFY']['client_id']
client_secret = config['SPOTIFY']['client_secret']
redirect_uri = config['SPOTIFY']['redirect_uri']

token = util.prompt_for_user_token(username, scope,
                                   client_id=client_id,
                                   client_secret=client_secret,
                                   redirect_uri=redirect_uri)

# `sp` is None when no token was obtained; the fetch helpers check for that.
if token:
    print("got token")
    sp = spotipy.Spotify(auth=token)
    print("created client")
else:
    sp = None
    print("No token")

got token
created client


In [4]:
def getAllSongs(sp, redo=False):
    """
    Return the raw saved-track items of the current user.

    Results are cached in ``./data/my_songs.json``. When the cache exists and
    ``redo`` is false the cached list is returned and no API client is needed.
    Otherwise the saved tracks are fetched page by page (50 per request) and
    written to the cache once an empty page signals the end.

    Parameters
    ----------
    sp : spotify client or None
        Object exposing ``current_user_saved_tracks(limit, offset)``.
    redo : bool
        When true, ignore any existing cache and fetch again.

    Returns
    -------
    list or None
        List of raw items, or ``None`` when a fetch is required but ``sp`` is
        not initialized. If a response lacks ``'items'`` the items collected so
        far are returned and nothing is written to the cache.
    """
    my_songs_filepath = './data/my_songs.json'

    # Reading the cache needs no API client, so try it before checking `sp`.
    if not redo and os.path.exists(my_songs_filepath):
        print("loading existing json of my songs")
        with open(my_songs_filepath, 'r') as f:
            return json.load(f)

    if not sp:
        print("spotify client not initialized")
        return

    data = []
    offset = 0
    limit = 50
    print("getting data from spotify...")
    while True:
        results = sp.current_user_saved_tracks(limit=limit, offset=offset)
        if 'items' not in results:
            print("items missing from results, printing response")
            print(results)
            break

        print("fetched... offset: %s" % offset)
        if len(results['items']):
            data.extend(results['items'])
            offset += limit
        else:
            # An empty page means every saved track has been collected.
            print("got all songs... saving to json.")
            # Create the cache directory on first run so the write cannot fail.
            os.makedirs(os.path.dirname(my_songs_filepath), exist_ok=True)
            with open(my_songs_filepath, 'w') as json_file:
                json.dump(data, json_file)
            break
    return data

# Fields needed per song: uri, artist, album, track, popularity, release date (duration comes from the audio features)
def transformSongData(song_data):
    """
    Flatten raw saved-track items into one flat dict per song.

    Each output dict carries the track name and uri, the album name and
    release date, the first listed artist's name and uri, the track
    popularity, and the time the track was added.
    """
    def flatten(item):
        track = item['track']
        return {
            'track': track['name'],
            'uri': track['uri'],
            'album': track['album']['name'],
            # Only the first listed artist is kept.
            'artist': track['artists'][0]['name'],
            'artist_uri': track['artists'][0]['uri'],
            'popularity': track['popularity'],
            'release_date': track['album']['release_date'],
            'added_at': item['added_at'],
        }

    return [flatten(item) for item in song_data]

In [5]:
def getAudioFeatures(sp, song_data=[], redo=False):
    """
    Return the audio-feature records for the given saved tracks.

    Results are cached in ``data/audio_features.json``. When the cache exists
    and ``redo`` is false the cached list is returned and no API client is
    needed. Otherwise the features are requested in batches of 50 track ids
    and the combined list is written to the cache.

    Parameters
    ----------
    sp : spotify client or None
        Object exposing ``audio_features(track_ids)``.
    song_data : list
        Raw saved-track items; each must provide ``['track']['uri']``.
        The default list is only read, never mutated.
    redo : bool
        When true, ignore any existing cache and fetch again.

    Returns
    -------
    list or None
        List of feature dicts (empty when there is neither a cache nor song
        data), or ``None`` when a fetch is required but ``sp`` is not
        initialized. Null entries returned by the API are dropped.
    """
    audio_features_filepath = 'data/audio_features.json'

    # Reading the cache needs no API client, so try it before checking `sp`.
    if not redo and os.path.exists(audio_features_filepath):
        print("loading existing json of audio features")
        with open(audio_features_filepath, 'r') as f:
            return json.load(f)

    if not sp:
        print("spotify client not initialized")
        return

    data = []
    if not len(song_data):
        print("existing json of audio features doesn't exist and song data not provided.")
        return data

    print("fetching audio features from spotify...")

    # Chunk the songs so each request carries at most `batch_size` track ids.
    batch_size = 50
    for start in range(0, len(song_data), batch_size):
        batch = song_data[start:start + batch_size]
        # The track id is the last component of 'spotify:track:<id>'.
        track_ids = [song['track']['uri'].split(':')[-1] for song in batch]

        results = sp.audio_features(track_ids)
        print("fetched audio features... (%s/%s)" % (start, len(song_data)))
        # Skip null placeholders (tracks without features); they would break
        # building a DataFrame from the returned list later on.
        data.extend(entry for entry in (results or []) if entry is not None)

    print("got all audio features... saving to json.")
    # Create the cache directory on first run so the write cannot fail.
    os.makedirs(os.path.dirname(audio_features_filepath), exist_ok=True)
    with open(audio_features_filepath, 'w') as json_file:
        json.dump(data, json_file)

    return data

In [6]:
def getUniqueArtistsFromSongs(song_data):
    """
    Collect the distinct first-listed artists of the given songs.

    Returns a list of ``{'uri', 'name'}`` dicts, one per artist uri, in order
    of first appearance; the name comes from the first song seen for that uri.
    """
    unique = {}
    for item in song_data:
        lead_artist = item['track']['artists'][0]
        artist_uri = lead_artist['uri']
        if artist_uri in unique:
            continue
        unique[artist_uri] = dict(uri=artist_uri, name=lead_artist['name'])

    return [*unique.values()]

def getArtistData(sp, song_data=[], redo=False):
    """
    Return the raw artist records for the first-listed artists of the songs.

    Results are cached in ``data/artists.json``. When the cache exists and
    ``redo`` is false the cached list is returned and no API client is needed.
    Otherwise the unique artists are looked up in batches of 50 and the
    combined list is written to the cache.

    Parameters
    ----------
    sp : spotify client or None
        Object exposing ``artists(artist_ids)`` returning ``{'artists': [...]}``.
    song_data : list
        Raw saved-track items. The default list is only read, never mutated.
    redo : bool
        When true, ignore any existing cache and fetch again.

    Returns
    -------
    list or None
        List of artist dicts (empty when there is neither a cache nor song
        data), or ``None`` when a fetch is required but ``sp`` is not
        initialized. Null entries in a response are dropped.
    """
    artists_filepath = 'data/artists.json'

    # Reading the cache needs no API client, so try it before checking `sp`.
    if not redo and os.path.exists(artists_filepath):
        print("loading existing json of artists")
        with open(artists_filepath, 'r') as f:
            return json.load(f)

    if not sp:
        print("spotify client not initialized")
        return

    data = []
    if not len(song_data):
        print("existing json of artists doesn't exist and song data not provided.")
        return data

    print("fetching artists from spotify...")
    artists = getUniqueArtistsFromSongs(song_data)

    # Chunk the artists so each request carries at most `batch_size` ids.
    batch_size = 50
    for start in range(0, len(artists), batch_size):
        artist_ids = [artist['uri'] for artist in artists[start:start + batch_size]]

        results = sp.artists(artist_ids)

        print("fetched artists... (%s/%s)" % (start, len(artists)))
        # Skip null placeholders so later per-artist field lookups cannot fail on None.
        data.extend(entry for entry in results['artists'] if entry is not None)

    print("got all artists... saving to json.")
    # Create the cache directory on first run so the write cannot fail.
    os.makedirs(os.path.dirname(artists_filepath), exist_ok=True)
    with open(artists_filepath, 'w') as json_file:
        json.dump(data, json_file)

    return data

def transformArtistData(artist_data):
    """
    Flatten raw artist records and one-hot encode their genres.

    Returns a tuple ``(rows, genres)``: ``rows`` has one dict per artist with
    the genre list, uri, name and popularity plus a ``1`` entry keyed by each
    of the artist's genres; ``genres`` is the set of every genre seen.
    """
    all_genres = set()
    rows = []
    for artist in artist_data:
        artist_genres = artist['genres']
        all_genres.update(artist_genres)
        row = {
            'genres': artist_genres,
            'artist_uri': artist['uri'],
            'name': artist['name'],
            'artist_popularity': artist['popularity'],
        }
        # One flag per genre the artist has; absent genres are simply not set.
        row.update(dict.fromkeys(artist_genres, 1))
        rows.append(row)

    return rows, all_genres

In [7]:
# Load (or fetch) the saved tracks, flatten them, and tabulate one row per track.
song_data = getAllSongs(sp)
transformed_song_data = transformSongData(song_data)
song_df = pd.DataFrame(transformed_song_data)
song_df

loading existing json of my songs


Unnamed: 0,track,uri,album,artist,artist_uri,popularity,release_date,added_at
0,"Look So Good, Be So Good",spotify:track:4a0FM5gDxCenHiLS17Edvr,Freezing to Death,The Shivas,spotify:artist:2OZfuhYQm8IY95egVPC1U9,33,2010-04-01,2020-03-26T16:57:05Z
1,El Condor Pasa (If I Could),spotify:track:1eN42Q7IWRzRBq8eW2Y2TE,Bridge Over Troubled Water,Simon & Garfunkel,spotify:artist:70cRZdQywnSFp9pnc2WTCE,62,1970-01-26,2020-03-06T23:43:34Z
2,This Old House Is All I Have,spotify:track:4SzmBRbDVmi0z4Lnc6H1Za,2012 - 2017,Against All Logic,spotify:artist:0ngUeF0DGpYmPec80MqSi1,56,2018-02-17,2019-12-25T05:06:10Z
3,Got My Mind Set On You - 2004 Mix,spotify:track:3OeUlriM0EZHdWleJtjoVr,Cloud Nine,George Harrison,spotify:artist:7FIoB5PHdrMZVC3q2HE5MS,70,1987-11-02,2019-12-23T18:41:24Z
4,A Little Bit Of Everything,spotify:track:6wLMO8GUyJrZuBwnf4sgsL,Nothing Is Wrong,Dawes,spotify:artist:0CDUUM6KNRvgBFYIbWxJwV,56,2011-06-07,2019-12-12T20:51:31Z
...,...,...,...,...,...,...,...,...
6645,Whiskey Girls,spotify:track:00hf8ngxJLTabS1fw4b7Jt,20 Songs of Electric Light Orchestra,Electric Light Orchestra,spotify:artist:7jefIIksOi1EazgRTfW2Pk,0,2012-08-29,2014-06-10T16:02:10Z
6646,Hold On Tight,spotify:track:4ZAFSG9ge8Zy6w0oSOXgvw,20 Songs of Electric Light Orchestra,Electric Light Orchestra,spotify:artist:7jefIIksOi1EazgRTfW2Pk,0,2012-08-29,2014-06-10T16:02:10Z
6647,Strange Magic,spotify:track:6pyWK3X6WrSnEUss628VQP,20 Songs of Electric Light Orchestra,Electric Light Orchestra,spotify:artist:7jefIIksOi1EazgRTfW2Pk,0,2012-08-29,2014-06-10T16:02:10Z
6648,Sweet Talking Woman,spotify:track:0knz7XF2MvWEUdK2E8obBy,20 Songs of Electric Light Orchestra,Electric Light Orchestra,spotify:artist:7jefIIksOi1EazgRTfW2Pk,0,2012-08-29,2014-06-10T16:02:10Z


In [8]:
# Load (or fetch) the artists and tabulate them with one column per genre flag.
artist_data = getArtistData(sp, song_data)
transformed_artist_data, genres = transformArtistData(artist_data)
# Genres an artist does not have are missing from its row; fill those with 0.
artist_df = pd.DataFrame(transformed_artist_data).fillna(0)
artist_df.head(10)

loading existing json of audio features


Unnamed: 0,genres,artist_uri,name,artist_popularity,indie garage rock,neo-psychedelic,portland indie,classic rock,folk,folk rock,...,liedermacher,berlin school,gospel rap,drama,indie quebecois,alternative r&b,australian r&b,indie jazz,neo r&b,popping
0,"[indie garage rock, neo-psychedelic, portland ...",spotify:artist:2OZfuhYQm8IY95egVPC1U9,The Shivas,41,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[classic rock, folk, folk rock, melancholia, m...",spotify:artist:70cRZdQywnSFp9pnc2WTCE,Simon & Garfunkel,76,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[art pop, chamber psych, electronica, escape r...",spotify:artist:0ngUeF0DGpYmPec80MqSi1,Against All Logic,56,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[album rock, classic rock, folk rock, mellow g...",spotify:artist:7FIoB5PHdrMZVC3q2HE5MS,George Harrison,71,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[alternative country, deep new americana, indi...",spotify:artist:0CDUUM6KNRvgBFYIbWxJwV,Dawes,58,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,[retro soul],spotify:artist:5I6ni4YWY0WJUs6rFqufxT,The Main Squeeze,45,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"[downtempo, jazz boom bap, jazztronica, livetr...",spotify:artist:179BpmLkQCRIoU68Co80f5,Gramatik,62,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"[dreamo, indie pop, indie rock, modern rock]",spotify:artist:5wFXmYsg3KFJ8BDsQudJ4f,Manchester Orchestra,60,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"[indie folk, modern folk rock, piano rock, sto...",spotify:artist:1xVWSPiw5B0OduIC0DSu3V,Delta Spirit,45,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"[alternative hip hop, hip hop, rap, southern h...",spotify:artist:0Y4inQK6OespitzD6ijMwb,Freddie Gibbs,67,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Load (or fetch) the audio features and keep only the columns used downstream.
audio_features = getAudioFeatures(sp, song_data)
audio_columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'uri', 'duration_ms', 'time_signature',
]
audio_df = pd.DataFrame(audio_features)[audio_columns]

loading existing json of audio features


In [10]:
# Attach audio features (by track uri) and artist genres/popularity (by artist uri).
artist_columns = artist_df[['genres', 'artist_popularity', 'artist_uri']]
joined_df = (
    song_df
    .merge(audio_df, on='uri')
    .merge(artist_columns, on='artist_uri')
)
joined_df

Unnamed: 0,track,uri,album,artist,artist_uri,popularity,release_date,added_at,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,genres,artist_popularity
0,"Look So Good, Be So Good",spotify:track:4a0FM5gDxCenHiLS17Edvr,Freezing to Death,The Shivas,spotify:artist:2OZfuhYQm8IY95egVPC1U9,33,2010-04-01,2020-03-26T16:57:05Z,0.515,0.872,...,0.0324,0.212,0.712000,0.1420,0.480,118.803,359800,4,"[indie garage rock, neo-psychedelic, portland ...",41
1,El Condor Pasa (If I Could),spotify:track:1eN42Q7IWRzRBq8eW2Y2TE,Bridge Over Troubled Water,Simon & Garfunkel,spotify:artist:70cRZdQywnSFp9pnc2WTCE,62,1970-01-26,2020-03-06T23:43:34Z,0.330,0.214,...,0.0311,0.836,0.070100,0.1780,0.275,147.795,187040,4,"[classic rock, folk, folk rock, melancholia, m...",76
2,Kathy's Song,spotify:track:4Acofe9hICRvyBTP5hFNk0,Sounds Of Silence,Simon & Garfunkel,spotify:artist:70cRZdQywnSFp9pnc2WTCE,54,1966-01-17,2019-04-17T23:52:38Z,0.431,0.188,...,0.0310,0.900,0.000900,0.1220,0.487,102.039,197773,4,"[classic rock, folk, folk rock, melancholia, m...",76
3,I Am a Rock,spotify:track:0byOqNZN9ailhoORv5Ps0Z,Sounds Of Silence,Simon & Garfunkel,spotify:artist:70cRZdQywnSFp9pnc2WTCE,61,1966-01-17,2019-04-17T23:52:38Z,0.660,0.611,...,0.0392,0.423,0.000011,0.1070,0.749,113.898,169520,4,"[classic rock, folk, folk rock, melancholia, m...",76
4,"Mrs. Robinson - From ""The Graduate"" Soundtrack",spotify:track:0iOZM63lendWRTTeKhZBSC,Bookends,Simon & Garfunkel,spotify:artist:70cRZdQywnSFp9pnc2WTCE,76,1968-04-03,2018-10-02T16:39:40Z,0.606,0.457,...,0.0497,0.713,0.000025,0.0747,0.813,92.033,244027,4,"[classic rock, folk, folk rock, melancholia, m...",76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6645,Bad Company (Remastered Album Version),spotify:track:7LZyhNjhyXj27iEXUYQYHo,Bad Company,Bad Company,spotify:artist:5AEG63ajney2BoDXi0Vb84,4,1974,2014-07-10T04:45:15Z,0.565,0.494,...,0.0306,0.502,0.187000,0.0731,0.385,114.113,287040,4,"[album rock, blues rock, classic rock, country...",64
6646,The Way I Choose (Remastered Album Version),spotify:track:0TZIp9elVftVQAtIAWlc6N,Bad Company,Bad Company,spotify:artist:5AEG63ajney2BoDXi0Vb84,0,1974,2014-07-10T04:45:15Z,0.583,0.411,...,0.0296,0.097,0.000023,0.0526,0.412,122.571,303040,3,"[album rock, blues rock, classic rock, country...",64
6647,Movin' On (Remastered Album Version),spotify:track:6kO3DBWBBtqchQ9RWRhQD0,Bad Company,Bad Company,spotify:artist:5AEG63ajney2BoDXi0Vb84,0,1974,2014-07-10T04:45:15Z,0.662,0.835,...,0.0349,0.500,0.002730,0.1440,0.904,117.868,200467,4,"[album rock, blues rock, classic rock, country...",64
6648,Seagull (Remastered Album Version),spotify:track:1dl1MRNBqBfQoHONGFUmIN,Bad Company,Bad Company,spotify:artist:5AEG63ajney2BoDXi0Vb84,0,1974,2014-07-10T04:45:15Z,0.361,0.335,...,0.0307,0.358,0.000017,0.2240,0.416,100.794,243173,4,"[album rock, blues rock, classic rock, country...",64


In [11]:
# Derive the track length in minutes from the millisecond duration.
MS_PER_MINUTE = 60000
joined_df['duration_min'] = joined_df['duration_ms'] / MS_PER_MINUTE

In [12]:
# Persist the joined per-track table; index=False leaves the row index out of the file.
# NOTE(review): the 'genres' column holds Python lists, which a CSV stores as
# their string form - confirm downstream readers parse it back.
joined_df.to_csv('data/songs_with_features.csv', index=False)

In [13]:
# Summary statistics (count, mean, std, quartiles, min/max) of the joined table's numeric columns.
joined_df.describe()

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artist_popularity,duration_min
count,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0,6650.0
mean,30.190376,0.505176,0.62357,5.165564,-8.837362,0.709774,0.066125,0.296084,0.167453,0.220162,0.506533,120.947989,241972.5,3.879549,63.485263,4.032874
std,20.604524,0.153569,0.234191,3.5386,4.59091,0.453901,0.092409,0.312634,0.290538,0.192347,0.248319,29.419335,128110.9,0.461891,17.808177,2.135181
min,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5560.0,0.0,0.0,0.092667
25%,13.0,0.399,0.47,2.0,-10.70175,0.0,0.0327,0.022125,4e-05,0.1,0.317,98.79475,178779.8,4.0,55.0,2.979663
50%,33.0,0.511,0.657,5.0,-7.883,1.0,0.0409,0.167,0.00426,0.138,0.514,119.75,222960.0,4.0,67.0,3.716
75%,46.0,0.611,0.815,9.0,-5.83875,1.0,0.060175,0.53,0.182,0.283,0.706,138.55975,274460.2,4.0,75.0,4.574338
max,85.0,0.968,0.997,11.0,0.49,1.0,0.952,0.996,0.996,0.994,0.983,215.895,3816373.0,5.0,94.0,63.606217


In [14]:
# Write every genre seen across the artists, one per row. `genres` is a set,
# whose iteration order can differ between kernel runs, so sort it to make the
# exported file reproducible.
pd.DataFrame({'genres': sorted(genres)}).to_csv('data/my_genres.csv', index=False)

In [15]:
# Preview the first rows of the artist table with its per-genre flag columns.
artist_df.head()

Unnamed: 0,genres,artist_uri,name,artist_popularity,indie garage rock,neo-psychedelic,portland indie,classic rock,folk,folk rock,...,liedermacher,berlin school,gospel rap,drama,indie quebecois,alternative r&b,australian r&b,indie jazz,neo r&b,popping
0,"[indie garage rock, neo-psychedelic, portland ...",spotify:artist:2OZfuhYQm8IY95egVPC1U9,The Shivas,41,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[classic rock, folk, folk rock, melancholia, m...",spotify:artist:70cRZdQywnSFp9pnc2WTCE,Simon & Garfunkel,76,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[art pop, chamber psych, electronica, escape r...",spotify:artist:0ngUeF0DGpYmPec80MqSi1,Against All Logic,56,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[album rock, classic rock, folk rock, mellow g...",spotify:artist:7FIoB5PHdrMZVC3q2HE5MS,George Harrison,71,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[alternative country, deep new americana, indi...",spotify:artist:0CDUUM6KNRvgBFYIbWxJwV,Dawes,58,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Persist the artist table including its per-genre flag columns; index=False leaves the row index out.
artist_df.to_csv('data/artist_genre_one_hot.csv', index=False)

In [17]:
# Keep only the per-genre flag columns by dropping the descriptive artist columns.
non_genre_columns = ['artist_uri', 'name', 'genres', 'artist_popularity']
genre_one_hot = artist_df.drop(columns=non_genre_columns)

In [18]:
# One record per genre column: how many artists carry that genre flag.
genre_count_dict = [
    {'genre': genre_name, 'artist_count': genre_one_hot[genre_name].sum()}
    for genre_name in genre_one_hot.columns
]

In [19]:
# Tabulate the per-genre artist counts and show the 20 most common genres.
genre_df = pd.DataFrame(genre_count_dict)
genre_df.sort_values('artist_count', ascending=False).head(20)

Unnamed: 0,genre,artist_count
8,rock,161.0
39,modern rock,98.0
3,classic rock,98.0
25,indie rock,94.0
63,garage rock,70.0
24,indie pop,70.0
5,folk rock,68.0
72,blues rock,66.0
59,alternative rock,66.0
18,album rock,65.0


In [21]:
# List up to 50 artists flagged with the 'mellow gold' genre.
is_mellow_gold = artist_df['mellow gold'] == 1
artist_df.loc[is_mellow_gold, ['name']].head(50)

Unnamed: 0,name
1,Simon & Garfunkel
3,George Harrison
13,Graham Nash
16,Bob Dylan
25,Eric Clapton
82,Fleetwood Mac
90,Grateful Dead
96,The Outlaws
97,Donovan
111,Electric Light Orchestra
