# Collect additional musician data
We have collected a seed set of musicians from Wikipedia [here](collect_pages_from_wikipedia_by_category.py) and now we seek to expand the list.

Methods:
- Suggestions from music streaming service, i.e. [Spotify](https://developer.spotify.com/console/get-search-item/).
- Broad queries to [DBPedia](https://dbpedia.org/sparql), e.g. "all musicians from Colombia".

## Music streaming suggestions

In [42]:
import pandas as pd
spotify_auth_data = pd.read_csv('../../data/culture_metadata/spotify_auth.csv', header=None, index_col=0).loc[:, 1]
spotify_auth_token = spotify_auth_data.loc['auth_token']

First we need to look up all the musicians' IDs.

In [17]:
import requests
test_artist = 'shakira'
search_type = 'artist'
search_url = 'https://api.spotify.com/v1/search'
search_result = requests.get(search_url, params={'q' : test_artist, 'type' : search_type}, headers={'authorization' : f'Bearer {spotify_auth_token}'})
print(search_result)

<Response [200]>


In [18]:
search_result.json()

{'artists': {'href': 'https://api.spotify.com/v1/search?query=shakira&type=artist&offset=0&limit=20',
  'items': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0EmeFodog0BfCgMzAIvKQp'},
    'followers': {'href': None, 'total': 18151611},
    'genres': ['colombian pop', 'dance pop', 'latin', 'latin pop', 'pop'],
    'href': 'https://api.spotify.com/v1/artists/0EmeFodog0BfCgMzAIvKQp',
    'id': '0EmeFodog0BfCgMzAIvKQp',
    'images': [{'height': 640,
      'url': 'https://i.scdn.co/image/234f86923486e8ba3860def8518c01d5054aac1f',
      'width': 640},
     {'height': 320,
      'url': 'https://i.scdn.co/image/2577d225adf779981103c8b0dc75f198e4a55f31',
      'width': 320},
     {'height': 160,
      'url': 'https://i.scdn.co/image/caa3eae0ede56110b7c4b7694cfb20e754052f9d',
      'width': 160}],
    'name': 'Shakira',
    'popularity': 89,
    'type': 'artist',
    'uri': 'spotify:artist:0EmeFodog0BfCgMzAIvKQp'},
   {'external_urls': {'spotify': 'https://open.spotify.com/ar

In [20]:
print(search_result.status_code)

200


In [25]:
a = search_result.json()['artists']['items']
print(len(a))

20


Great! Let's restrict the results to exact name matches and take the most popular result (follower count).

In [70]:
from unidecode import unidecode
from time import sleep
def clean_txt(txt):
    return unidecode(txt.lower())
def collect_artist_data(artist_name, api, 
#                         search_url='https://api.spotify.com/v1/search', 
                        request_sleep_time=60,
                        artist_keys=['name', 'id', 'genres'],
                        SEARCH_LIMIT=20):
    """
    Collect all artist data from Spotify:
    follower count, genres, ID
    """
    clean_artist_name = clean_txt(artist_name)
    successful_search = False
    while(not successful_search):
        search_result = api.search(q=clean_artist_name, type='artist', limit=SEARCH_LIMIT)
        # TODO: status code
#         search_result = requests.get(search_url, params={'q' : clean_artist_name, 'type' : search_type}, headers={'authorization' : f'Bearer {auth_token}'})
#         sleep(10)
#         if(search_result.status_code == 200):
        successful_search = True
        # filter for name match
        search_result_data = search_result['artists']['items']
        valid_search_result_data = list(filter(lambda x: clean_txt(x['name'])==clean_artist_name, search_result_data))
        if(len(valid_search_result_data) > 0):
            # sort by followers
            valid_search_result_data = list(sorted(valid_search_result_data, key=lambda x: x['followers']['total'], reverse=True))
            # take top result
            matching_search_result = valid_search_result_data[0]
            artist_data = {k : matching_search_result[k] for k in artist_keys}
            artist_data['followers'] = matching_search_result['followers']['total']
        else:
            artist_data = {'name' : clean_artist_name}
#         elif(search_result.status_code == 429):
#             print(f'error, too many requests, sleeping for {request_sleep_time} sec')
#             sleep(request_sleep_time)
    return artist_data
def collect_all_artist_data(artist_names, api):
    artist_data_combined = []
    for artist_name in artist_names:
        print('processing artist %s'%(artist_name))
        artist_data = collect_artist_data(artist_name, api)
        artist_data = pd.Series(artist_data)
        artist_data_combined.append(artist_data)
    artist_data_combined = pd.concat(artist_data_combined, axis=1).transpose()
    return artist_data_combined

In [60]:
# we are going to use python wrapper for Spotify because otherwise we need to regenerate auth tokens
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy import Spotify
api_creds = SpotifyClientCredentials(client_id=spotify_auth_data.loc['client_id'], client_secret=spotify_auth_data.loc['client_secret'])
api = Spotify(client_credentials_manager=api_creds)

In [63]:
test_artist = 'shakira'
artist_data = collect_artist_data(test_artist, api)
print(artist_data)

{'name': 'Shakira', 'id': '0EmeFodog0BfCgMzAIvKQp', 'genres': ['colombian pop', 'dance pop', 'latin', 'latin pop', 'pop'], 'followers': 18151611}


In [64]:
# load wiki data
latin_american_wiki_en_artist_data = pd.read_csv('../../data/culture_metadata/latin_american_pop_musicians_en_wiki_data.tsv', sep='\t', index_col=False)
latin_american_wiki_es_artist_data = pd.read_csv('../../data/culture_metadata/latin_american_pop_musicians_es_wiki_data.tsv', sep='\t', index_col=False)
latin_american_wiki_artist_data = pd.concat([latin_american_wiki_en_artist_data, latin_american_wiki_es_artist_data], axis=0)
# add clean col
latin_american_wiki_artist_data = latin_american_wiki_artist_data.assign(**{
    'clean_name' : latin_american_wiki_artist_data.loc[:, 'name'].apply(clean_txt)
})
# drop duplicates
latin_american_wiki_artist_data.drop_duplicates('clean_name', inplace=True)
display(latin_american_wiki_artist_data.head())

Unnamed: 0,name,wiki_url,page_category,clean_name
0,Dorismar,/wiki/Dorismar,Argentine_pop_singers,dorismar
1,Emilia Mernes,/wiki/Emilia_Mernes,Argentine_pop_singers,emilia mernes
2,Lali Espósito,/wiki/Lali_Esp%C3%B3sito,Argentine_pop_singers,lali esposito
3,Laura Natalia Esquivel,/wiki/Laura_Natalia_Esquivel,Argentine_pop_singers,laura natalia esquivel
4,Valeria Gastaldi,/wiki/Valeria_Gastaldi,Argentine_pop_singers,valeria gastaldi


In [66]:
## old code: generating auth token
# generate auth token
# client_id = spotify_auth_data.loc['client_id']
# redirect_URI = spotify_auth_data.loc['redirect_URI']
# auth_request_URL = 'https://accounts.spotify.com/authorize'
# auth_request_result = requests.get(auth_request_URL, params={'client_id' : client_id, 'response_type' : 'code', 'redirect_uri' : redirect_URI})
# print(auth_request_result)

In [71]:
latin_american_artist_clean_names = latin_american_wiki_artist_data.loc[:, 'clean_name'].unique()
latin_american_artist_query_data = collect_all_artist_data(latin_american_artist_clean_names, api)

processing artist dorismar
processing artist emilia mernes
processing artist lali esposito
processing artist laura natalia esquivel
processing artist valeria gastaldi
processing artist isol
processing artist hilda lizarazu
processing artist marcela morelo
processing artist emanuel ortega
processing artist sofia reca
processing artist violeta rivas
processing artist benjamin rojas
processing artist julian serrano
processing artist silvia suller
processing artist tormenta
processing artist diego torres
processing artist trix
processing artist maxi trusso
processing artist sam alves
processing artist baby do brasil
processing artist bianca alencar
processing artist biel
processing artist lua blanco
processing artist roberto carlos
processing artist da lou
processing artist natalia damini
processing artist diana
processing artist felipe dylon
processing artist marjorie estiano
processing artist manu gavassi
processing artist latino
processing artist tania mara
processing artist mc sapao
pr

processing artist maria fernanda neil
processing artist nicki nicole
processing artist chico novarro
processing artist rosario ortega
processing artist agustina palma
processing artist natalie perez
processing artist abel pintos
processing artist lola ponce
processing artist nono pugliese
processing artist paolo ragone
processing artist santiago ramundo
processing artist geronimo rauch
processing artist martin ricca
processing artist nicolas riera
processing artist rizha
processing artist juan manuel rodil
processing artist sol rodriguez
processing artist rubi
processing artist claudia ruffinatti
processing artist pablo ruiz
processing artist oriana sabatini
processing artist raul sagan
processing artist paola sallustro
processing artist dolores sarmiento
processing artist belen scalella
processing artist noel schajris
processing artist ale sergi
processing artist eddie sierra
processing artist blanquita silvan
processing artist johny tedesco
processing artist tete
processing artist ti

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [78]:
import numpy as np
latin_american_artist_query_data = latin_american_artist_query_data.assign(**{
    'clean_name' : latin_american_artist_query_data.loc[:, 'name'].apply(lambda x: clean_txt(x))
})
valid_latin_american_artist_query_data = latin_american_artist_query_data[~latin_american_artist_query_data.loc[:, 'id'].apply(lambda x: type(x) is float and np.isnan(x))]
print('%d/%d valid queries'%(valid_latin_american_artist_query_data.shape[0], latin_american_artist_query_data.shape[0]))

424/526 valid queries


In [79]:
display(valid_latin_american_artist_query_data[valid_latin_american_artist_query_data.loc[:, 'clean_name']=='shakira'].head())

Unnamed: 0,followers,genres,id,name,clean_name
83,18151611,"[colombian pop, dance pop, latin, latin pop, pop]",0EmeFodog0BfCgMzAIvKQp,Shakira,shakira


OK! Now that we have the IDs, let's look for similar artists.

In [110]:
def get_similar_artists(artist_id, api, artist_keys=['name', 'id', 'genres'],):
    similar_artist_data_response = api.artist_related_artists(artist_id)
    similar_artists = similar_artist_data_response['artists']
    similar_artist_data_combined = []
    for similar_artist in similar_artists:
        artist_data = {k : similar_artist[k] for k in artist_keys}
        artist_data['followers'] = similar_artist['followers']['total']
        similar_artist_data_combined.append(pd.Series(artist_data))
    if(len(similar_artist_data_combined) > 0):
        similar_artist_data_combined = pd.concat(similar_artist_data_combined, axis=1).transpose()
    return similar_artist_data_combined
def get_similar_artists_all_ids(artist_ids, api):
    similar_artist_data_combined = []
    for artist_id in artist_ids:
        print('processing ID %s'%(artist_id))
        similar_artist_data = get_similar_artists(artist_id, api)
        if(len(similar_artist_data) > 0):
            similar_artist_data = similar_artist_data.assign(**{
                'similar_artist_query_id' : artist_id,
            })
            similar_artist_data_combined.append(similar_artist_data)
    similar_artist_data_combined = pd.concat(similar_artist_data_combined, axis=0)
    return similar_artist_data_combined

In [105]:
test_id = '0EmeFodog0BfCgMzAIvKQp'
a = get_similar_artists(test_id, api)
display(a.head())

Unnamed: 0,name,id,genres,followers
0,Paulina Rubio,1d6dwipPrsFSJVmFTTdFSS,"[dance pop, latin, latin arena pop, latin pop,...",1481320
1,Thalía,23wEWD21D4TPYiJugoXmYb,"[dance pop, latin, latin pop, mexican pop]",2385202
2,Belinda,5LeiVcEnsZcwc133TUhJNW,"[dance pop, latin, latin arena pop, latin pop,...",711735
3,Belanova,3oNy8cjBtJzLC07I70sklp,"[dance pop, latin, latin alternative, latin ar...",564632
4,Gloria Trevi,1Db5GsIoVWYktPoD2nnPZZ,"[latin, latin arena pop, latin pop, mexican pop]",1743462


In [107]:
a[~a.loc[:, 'id'].isin(valid_latin_american_artist_ids)]

Unnamed: 0,name,id,genres,followers
3,Belanova,3oNy8cjBtJzLC07I70sklp,"[dance pop, latin, latin alternative, latin ar...",564632
10,OV7,5zaT4Qu9otu6z4oyWjRqM2,"[dance pop, grupera, latin, latin arena pop, l...",765380
11,Alejandra Guzman,7Hf9AwMO37bSdxHb0FBGmO,"[grupera, latin, latin arena pop, latin pop, m...",1760414
12,Playa Limbo,6XmHtVhgpE33VHFEp2V1P8,"[dance pop, grupera, latin, latin arena pop, l...",1012881
13,La Quinta Estacion,7FZj349hdLfD6qzXkJLuAh,"[latin, latin arena pop, latin pop, pop, rock ...",1395141
15,Kabah,61hAcjvvUS6EXMpeeHwaDi,"[dance pop, grupera, latin, latin pop, mexican...",430941
16,Jesse & Joy,1mX1TWKpNxDSAH16LgDfiR,"[latin, latin arena pop, latin pop, mexican po...",3311158
17,Aleks Syntek,0r8toju2ecKaVtItkzAnNi,"[latin, latin alternative, latin arena pop, la...",719825
18,Ha*Ash,5xd2Tg7Zo8755eCy8Gxkp8,"[latin, latin arena pop, latin pop, mexican pop]",3865398
19,Miguel Bosé,7mWCSSOYqm4E9mB7V4ot6S,"[latin, latin pop, mexican pop, rock en espano...",1156090


In [111]:
valid_latin_american_artist_ids = valid_latin_american_artist_query_data.loc[:, 'id'].unique()
latin_american_similar_artists = get_similar_artists_all_ids(valid_latin_american_artist_ids, api)

processing ID 1O3jQItsBlBvLYmCmYfaIQ
processing ID 1imu37uW2KnEiLMAKmQVgn
processing ID 63xFjf6Ww22O5gHrBP79G9
processing ID 1DbNO9pwS4AzYbWX367Pvg
processing ID 3MRKKlBIXnTcwCoyeCqL6X
processing ID 37wvb2slf0x1R2W9xIUsWw
processing ID 3uJCc4nUdN85RXQd464mSR
processing ID 4FOiEKPQZ60hzEXm0Vgyg3
processing ID 5IxyAzEThg31lxtUWCcb2q
processing ID 0LALueHEQunQ2d61fXGeZh
processing ID 6n5eDbbT3ErAGFipwc9DEg
processing ID 1V7tGmh5DDSNE0VsUcorJL
processing ID 6GGy7nQ4GbJ7s6u2W94hgY
processing ID 2K5llYyoLmzpOcbVuOY1ls
processing ID 3NPdP88KulJtVGnJhEMHqs
processing ID 6SjLkpJ7cGqX5HWh23e74K
processing ID 1shvlaBEDzIJGYhXbMxKJY
processing ID 7fAKtXSdNInWAIf0jVUz65
processing ID 5GSVxDAx5DRZtGydYgE6Fv
processing ID 0Hr79Md4RZyKvoajkkTYII
processing ID 2sohJaAXhL0jr1SiMuzBDd
processing ID 3GfoqBEQHAxeWykLJYzIyH
processing ID 6xF3XdxG7VHRmQmTJxRqX8
processing ID 06EMbW4WO6U4fGNnKjeuI5
processing ID 5XN8KhxZVObkfRvTgsQFuN
processing ID 4I4iP1ZXPT2eLo3R4cBPig
processing ID 0rzVUhvOIVNUbY6uKCpGfb
p

processing ID 53EZ5ABP35N5oPGYTvjwU7
processing ID 6AvVNBiwAW7CXZPACAo2OB
processing ID 4uRB9m8Y8tHrW1lwvV7RJ1
processing ID 1Y99HOeRzRc27my6NJE3rE
processing ID 6HTUcOExehqydqa7C3usAa
processing ID 6GjbWWx5Zmmwc5lSF8095y
processing ID 3dmO4UJLNPyzMXwAk6Jn96
processing ID 4u906EKZ3K68xEq6H96OSq
processing ID 7nXYKGgZqPuiqAzbXrWLBQ
processing ID 0uqXDD3RrzFosE8bZVFriI
processing ID 0ePBngzq9aL6oB3S8fVF5p
processing ID 5BLxach4cxTAK3Pg0Nu2r2
processing ID 36ZqMC7zjLgKhxKc1l9g0X
processing ID 4nosUizPZKMeOXyIwmC4vq
processing ID 2uEjXyVi0SA1HPpj3zquXh
processing ID 6oKLWbs4OqvmeXnREk0flZ
processing ID 4QILOFSgMcb9gUF8GEkD3V
processing ID 1s1uqfcKS1MHmaLLtDrjfv
processing ID 0eykNIBJy8E8Cyd00Y5fqx
processing ID 2dXAUSS8KOMRulaMWbC2yn
processing ID 4JG8tWjcIHMtsBFz9IIlUL
processing ID 3YS7AYmRQooFmsiziBh3Wf
processing ID 3bxvahhx6gUAHFSZYHHMDt
processing ID 3feslCMBl9d4ym41eCpIzw
processing ID 3NmbU7646zRJZ4hXxHqyNT
processing ID 32x1uogH2zajP85pzZAtuE
processing ID 79TYiPyB5Mh5NzZPeTckCa
p

In [113]:
unique_latin_american_similar_artist_data = latin_american_similar_artists.drop_duplicates('id', inplace=False)
print('%d/%d unique similar artists'%(unique_latin_american_similar_artist_data.shape[0], latin_american_similar_artists.shape[0]))
# check overlap with original data
new_unique_latin_american_similar_artist_data = unique_latin_american_similar_artist_data[~unique_latin_american_similar_artist_data.loc[:, 'id'].isin(valid_latin_american_artist_ids)]
print('%d/%d new IDs'%(new_unique_latin_american_similar_artist_data.shape[0], unique_latin_american_similar_artist_data.shape[0]))

3703/7336 unique similar artists
3478/3703 new IDs


Great! Better coverage now. Let's see the genre distribution.

In [115]:
from functools import reduce
similar_artist_genres = list(reduce(lambda x,y: x+y, new_unique_latin_american_similar_artist_data.loc[:, 'genres'].values))
similar_artist_genre_counts = pd.Series(similar_artist_genres).value_counts()
display(similar_artist_genre_counts.head(20))

latin pop            194
latin                190
latin rock           144
rock en espanol      127
nueva cancion        114
tropical             109
mpb                  107
latin alternative     99
chilean rock          97
pagode                93
grupera               89
pop reggaeton         88
baile pop             86
argentine rock        84
chilean indie         81
pop romantico         78
rock nacional         77
funk carioca          76
spanish pop           74
argentine indie       70
dtype: int64

OK! This is a good sign. Most of the new artists that we've discovered have more Latin American genres.