In [6]:
import pandas as pd
import pickle
import requests as req
import time
import numpy as np
from sklearn.cluster import KMeans

In [3]:
# Raw artist table; later cells rely on its 'artist' and 'name' columns.
artists_data = pd.read_csv(filepath_or_buffer='artists.csv')

In [4]:
def extract_spotify_features(artists, token=None, session=None):
    """
    Query the Spotify search API once per artist and collect basic features.

    You'll need to go through the process of getting a Spotify API token if
    you want this to run in reasonable time. Tokens expire quickly, so none
    is stored in the notebook.

    Parameters
    ----------
    artists : anything accepted by ``pd.DataFrame``
        Must provide an ``artist`` column (used as the key of the returned
        dicts) and a ``name`` column (the text sent to the search endpoint).
    token : str, optional
        Spotify bearer token. When omitted, the ``SPOTIFY_TOKEN`` environment
        variable is used. If neither is set, requests are sent without an
        ``Authorization`` header and any resulting API errors are recorded
        in ``errors``.
    session : object, optional
        Object exposing a requests-style ``get(url, params=..., headers=...)``
        method (for example a ``requests.Session``). Defaults to the
        ``requests`` module itself.

    Returns
    -------
    tuple
        ``(spotify_features, errors, genres)`` where ``spotify_features`` maps
        artist -> ``{'genres', 'popularity', 'followers'}``, ``errors`` maps
        artist -> message for every failed lookup, and ``genres`` is the set
        of all genres seen across successful lookups.
    """
    import os  # local import: only needed to read the token from the environment

    # time.clock() was removed in Python 3.8 and measured CPU time on Unix,
    # which is misleading for a network-bound loop; use wall-clock time.
    start = time.time()

    errors = dict()
    spotify_features = dict()
    genres = set()

    if token is None:
        token = os.environ.get('SPOTIFY_TOKEN')
    headers = {'Authorization': 'Bearer {}'.format(token)} if token else {}

    http = req if session is None else session

    artists_df = pd.DataFrame(artists)
    # artist -> index of all names listed for that artist
    data = artists_df.set_index('name').groupby('artist').groups

    for artist, names in data.items():
        # Only the first listed name is searched for.
        payload = {'type': 'artist', 'limit': 1, 'q': 'artist:"{}"'.format(names[0])}

        resp = http.get('https://api.spotify.com/v1/search', params=payload, headers=headers).json()

        error = resp.get('error')
        if error:
            # The Web API nests the text as {'error': {'status', 'message'}};
            # reading resp['message'] always yielded None.
            if isinstance(error, dict):
                errors[artist] = error.get('message')
            else:
                errors[artist] = error
            continue

        matches = resp['artists'].get('items') or []
        if not matches:  # No matches found
            errors[artist] = 'No matches found.'
            continue

        spotify_artist = matches[0]
        genres.update(spotify_artist['genres'])

        spotify_features[artist] = {
            'genres': spotify_artist['genres'],
            'popularity': spotify_artist['popularity'],
            'followers': spotify_artist['followers']['total'],
        }

    print(time.time() - start)
    return spotify_features, errors, genres

In [174]:
# Placeholder features for every artist listed in `errors`.
# NOTE(review): `errors` (and `spotify_features` used below) are presumably the
# values returned by extract_spotify_features(); no visible cell assigns them.
default_features = dict(
    (failed_artist, {'genres': [], 'popularity': 0, 'followers': 0})
    for failed_artist in errors
)

In [175]:
# Start from the placeholder entries, then let real Spotify results win.
final_spotify_features = dict(default_features)
final_spotify_features.update(spotify_features)

In [178]:
# Pickle streams are binary: open with 'wb' (text mode fails on Python 3) and
# use a context manager so the handle is flushed and closed.
with open('spotify.pkl', 'wb') as spotify_file:
    pickle.dump(final_spotify_features, spotify_file)

In [10]:
# SECURITY: pickle.load can execute arbitrary code; only load a spotify.pkl
# that you produced yourself.
# Read in binary mode and close the handle deterministically.
with open('spotify.pkl', 'rb') as spotify_file:
    cached_spotify = pickle.load(spotify_file)

# Collect every genre mentioned by any cached artist.
genres = set()
for features in cached_spotify.values():
    genres.update(features['genres'])

In [16]:
def cluster_genres(genres, artists_data, from_spotify, n_clusters=20, random_state=None):
    """
    Cluster artists by their Spotify genres using a binary artist x genre matrix.

    Parameters
    ----------
    genres : sized iterable
        All known genres; each one becomes a column of the matrix.
    artists_data : anything accepted by ``pd.DataFrame``
        Must provide ``artist`` and ``name`` columns; every distinct ``artist``
        value becomes one row of the matrix.
    from_spotify : mapping
        artist -> dict holding a ``'genres'`` list. Every artist in
        ``artists_data`` must be present, and every listed genre must be in
        ``genres`` (a ``KeyError`` is raised otherwise).
    n_clusters : int, optional
        Number of KMeans clusters (default 20, the previously hard-coded value).
    random_state : int or None, optional
        Forwarded to ``KMeans``; pass an int for reproducible labels.

    Returns
    -------
    tuple
        ``(artist_map, classes)`` where ``artist_map`` maps artist -> row index
        and ``classes[row]`` is the cluster label assigned to that row.
    """
    # time.clock() was removed in Python 3.8; use wall-clock time instead.
    start = time.time()

    genre_map = {genre: i for i, genre in enumerate(genres)}

    artists_df = pd.DataFrame(artists_data)
    data = artists_df.set_index('name').groupby('artist').groups

    artist_map = {key: i for i, key in enumerate(data)}

    # One row per artist actually present. The row count used to be fixed at
    # 2000, which overflowed for larger inputs and added phantom all-zero
    # rows (distorting the clustering) for smaller ones.
    X = np.zeros(shape=(len(artist_map), len(genres)))

    for key, artist_id in artist_map.items():
        genre_ids = [genre_map[genre] for genre in from_spotify[key]['genres']]
        if genre_ids:
            X[artist_id, genre_ids] = 1

    clf = KMeans(n_clusters=n_clusters, random_state=random_state)
    classes = clf.fit_predict(X)

    print(time.time() - start)

    return artist_map, classes

In [17]:
# Cluster the cached Spotify genres; class_labels is indexed by the row ids stored in artist_map.
artist_map, class_labels = cluster_genres(
    genres=genres,
    artists_data=artists_data,
    from_spotify=cached_spotify,
)

2.29487


In [25]:
# Copy each artist's feature dict before adding 'genre_id'. A plain
# cached_spotify.copy() is shallow, so the inner dicts would be shared and
# cached_spotify itself would be modified by this cell.
features_from_spotify = {
    artist: dict(features) for artist, features in cached_spotify.items()
}
for artist, i in artist_map.items():
    features_from_spotify[artist]['genre_id'] = class_labels[i]

In [26]:
# Display the features as a table: one column per artist, one row per feature.
pd.DataFrame.from_dict(features_from_spotify)

Unnamed: 0,000d90ec-d64c-48a1-b775-e726fd240e9f,000fc734-b7e1-4a01-92d1-f544261b43f5,0019749d-ee29-4a5f-ab17-6bfa11deb969,0039c7ae-e1a7-4a7d-9b49-0cbc716821a6,004e5eed-e267-46ea-b504-54526f1f377d,00565b31-14a3-4913-bd22-385eb40dd13c,00a9f935-ba93-4fc8-a33a-993abe9c936b,00eeed6b-5897-4359-8347-b8cd28375331,0103c1cc-4a09-4a5d-a344-56ad99a77193,0110e63e-0a9b-4818-af8e-41e180c20b9a,...,ff6e677f-91dd-4986-a174-8db0474b1799,ff7f80cd-05c2-4068-a00e-fbfbd453d049,ff865aa0-4603-4f79-ae8b-8735332e2cfa,ff95eb47-41c4-4f7f-a104-cdc30f02e872,ff9deaae-da4f-42b7-a19e-36fedd3fc706,ffb18e19-64a4-4a65-b4ce-979e00c3c69d,ffb2d3e3-a4cc-48cf-8fb0-f2f846e9d7b9,ffb390b8-8df4-4b72-97d1-7b2fc008a452,ffe16bba-4d84-409b-8f22-5242c60b930f,ffe9ec08-6b6b-4993-9394-e280b429dbfd
followers,10035,104962,15492,686097,33032,65639,541219,154581,1546558,272825,...,1545681,35341,300535,179128,25705,86013,78746,184108,19156,75536
genre_id,5,6,4,1,2,18,2,18,10,1,...,11,5,11,5,11,5,5,10,8,5
genres,[british indie rock],"[alternative rock, art rock, britpop, chamber ...","[acid jazz, big beat, breakbeat, drum and bass...","[alternative rock, folk christmas, folk-pop, g...","[gothic metal, gothic symphonic metal, progres...","[album rock, crossover thrash, death metal, gl...","[finnish metal, gothic metal, gothic symphonic...","[glam metal, hard rock, metal, neo classical m...","[canadian pop, candy pop, dance pop, neo mello...","[chamber pop, dream pop, folk christmas, folk-...",...,"[neo mellow, pop rock]","[german metal, medieval rock, neue deutsche ha...","[ccm, christian alternative rock, christian ch...","[ambient, art rock, compositional ambient, dro...","[folk christmas, indie christmas, indie folk, ...","[compositional ambient, post rock]",[dwn trap],"[dance pop, emo, pop, pop punk, pop rap, post-...","[dreamo, emo, piano rock, pop punk]","[downtempo, new tribe, psychedelic trance, psy..."
popularity,29,52,34,66,41,48,63,52,72,62,...,75,44,61,64,38,54,66,55,38,46


In [27]:
# Pickle streams are binary: open with 'wb' (text mode fails on Python 3) and
# use a context manager so the handle is flushed and closed.
with open('features_from_spotify.pkl', 'wb') as features_file:
    pickle.dump(features_from_spotify, features_file)