### Import libraries

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from CREDENTIALS import CLIENT_ID, CLIENT_SECRET

-----
### Set Up Client Credentials

In [3]:
client_id = CLIENT_ID
client_secret = CLIENT_SECRET
client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


-------

### Define which artists retrieve the data

In [4]:
artists = ["Emicida", "Racionais MC's", "Tasha & Tracie"]

-----

### Get tracks from each artist

In [5]:
def getTracks(nArtists):
    '''
    Returns a dataframe with the following columns:
    artist_name, track_name, track_id, track_popularity, artist_id, album_name
    '''
    artist_name = []
    track_name = []
    track_popularity = []
    artist_id = []
    track_id = []
    album_name = []
    album_id = []
    for artist in nArtists:
        for i in range(0,1000,50):
            track_results = sp.search(q=f'artist: {artist}', type='track', limit=50, offset=i)
            for i, t in enumerate(track_results['tracks']['items']):
                if artist == t['artists'][0]['name']:
                    artist_name.append(t['artists'][0]['name'])
                    artist_id.append(t['artists'][0]['id'])
                    track_name.append(t['name'])
                    track_id.append(t['id'])
                    track_popularity.append(t['popularity'])
                    album_id.append(t['album']['id'])
                    album_name.append(t['album']['name'])

    results = pd.DataFrame({'track_id' : track_id, 'track_name': track_name,  'track_popularity': track_popularity , 'artist_id': artist_id, 'artist_name': artist_name, 'album_id': album_id, 'album_name': album_name})
    return results

track_df = getTracks(artists)


In [6]:
track_df.head()


Unnamed: 0,track_id,track_name,track_popularity,artist_id,artist_name,album_id,album_name
0,0JSux25Te5HYMSr2D64d02,Levanta e Anda,55,2d9LRvQJnAXRijqIJDDs2K,Emicida,57PWjWHzqzODblomXxnQca,O Glorioso Retorno de Quem Nunca Esteve Aqui
1,3XELYgcY0b9mGsJE28r4Fh,Triunfo (A Rua É Nóiz),45,2d9LRvQJnAXRijqIJDDs2K,Emicida,3ZA85zeudxFA1NGeb6avXS,Pra Quem Já Mordeu um Cachorro por Comida Até ...
2,4HniBnVyH2PPYRoQFJGRtY,AmarElo (Sample: Sujeito de Sorte - Belchior),27,2d9LRvQJnAXRijqIJDDs2K,Emicida,22ltKhPdKZ4IuFrna73xPG,AmarElo
3,3De0GCU6ono03UxXKzRmz3,Passarinhos,26,2d9LRvQJnAXRijqIJDDs2K,Emicida,593RZfvtz4IAPWZpELwqDB,"Sobre Crianças, Quadris, Pesadelos e Lições de..."
4,1d5Hpa1FxneKB3DIgH6OZc,Bonjour,28,2d9LRvQJnAXRijqIJDDs2K,Emicida,1NnQYrdDbqZxzTsNaQuDl9,Bonjour


------

### Get audio features for each track

In [7]:
def getAudioFeatures(ids):
  '''
  Returns a dataframe with the following columns:
  track_id, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness,
  liveness, valence, tempo, duration_ms, time_signature.
  '''
  track_id = []
  danceability = []
  energy = []
  key = []
  loudness = []
  mode = []
  speechiness = []
  acousticness = []
  instrumentalness = []
  liveness = []
  valence = []
  tempo = []
  duration_ms = []
  time_signature = []

  for t_id in ids:
    track_features = sp.audio_features(t_id)
    if t_id not in track_id:
      track_id.append(track_features[0]['id'])
      danceability.append(track_features[0]['danceability'])
      energy.append(track_features[0]['energy'])
      key.append(track_features[0]['key'])
      loudness.append(track_features[0]['loudness'])
      mode.append(track_features[0]['mode'])
      speechiness.append(track_features[0]['speechiness'])
      acousticness.append(track_features[0]['acousticness'])
      instrumentalness.append(track_features[0]['instrumentalness'])
      liveness.append(track_features[0]['liveness'])
      valence.append(track_features[0]['valence'])
      tempo.append(track_features[0]['tempo'])
      duration_ms.append(track_features[0]['duration_ms'])
      time_signature.append(track_features[0]['time_signature'])

  results = pd.DataFrame({'track_id': track_id, 'danceability': danceability, 'energy': energy, 'key': key, 'loudness': loudness, 'mode': mode, 'speechiness': speechiness, 'acousticness': acousticness, 'instrumentalness': instrumentalness, 'liveness': liveness, 'valence': valence, 'tempo': tempo, 'duration_ms': duration_ms, 'time_signature': time_signature })
  return results

track_features = getAudioFeatures(track_df['track_id'])

In [8]:
track_features.head()

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0JSux25Te5HYMSr2D64d02,0.488,0.931,9,-4.185,1,0.327,0.0907,0.0,0.446,0.823,87.33,150439,4
1,3XELYgcY0b9mGsJE28r4Fh,0.608,0.909,5,-5.439,0,0.342,0.173,0.0,0.85,0.865,92.172,209373,4
2,4HniBnVyH2PPYRoQFJGRtY,0.555,0.755,8,-6.346,1,0.364,0.0347,1.8e-05,0.0279,0.548,172.081,323187,4
3,3De0GCU6ono03UxXKzRmz3,0.747,0.677,1,-6.24,0,0.157,0.062,0.0,0.109,0.702,159.959,221627,4
4,1d5Hpa1FxneKB3DIgH6OZc,0.749,0.872,7,-4.982,1,0.218,0.222,0.0,0.607,0.715,100.896,212362,4


----

### Get albums for each track

In [9]:
def getAlbums(ids):
    '''
    Returns a dataframe with the following columns:
    album_id, album_name, album_type, total_tracks, release_date, release_date_precision, artist_id, artist_name.
    '''
    album_id = []
    album_name = []
    album_type = []
    total_tracks = []
    release_date = []
    release_date_precision = []
    artist_id = []
    artist_name = []

    for alb_id in ids:
        lst_alb_id = [alb_id]
        albums = sp.albums(lst_alb_id)
        if alb_id not in album_id:
            album_id.append(albums['albums'][0]['id'])
            album_name.append(albums['albums'][0]['name'])
            album_type.append(albums['albums'][0]['album_type'])
            total_tracks.append(albums['albums'][0]['total_tracks'])
            release_date.append(albums['albums'][0]['release_date'])
            release_date_precision.append(albums['albums'][0]['release_date_precision'])
            artist_id.append(albums['albums'][0]['artists'][0]['id'])
            artist_name.append(albums['albums'][0]['artists'][0]['name'])

    results = pd.DataFrame({'album_id': album_id, 'album_name': album_name, 'album_type': album_type, 'total_tracks': total_tracks, 'release_date': release_date, 'release_date_precision': release_date_precision, 'artist_id': artist_id, 'artist_name': artist_name})
    return results

album_df = getAlbums(track_df['album_id'])


In [10]:
album_df.head()

Unnamed: 0,album_id,album_name,album_type,total_tracks,release_date,release_date_precision,artist_id,artist_name
0,57PWjWHzqzODblomXxnQca,O Glorioso Retorno de Quem Nunca Esteve Aqui,album,14,2013,year,2d9LRvQJnAXRijqIJDDs2K,Emicida
1,3ZA85zeudxFA1NGeb6avXS,Pra Quem Já Mordeu um Cachorro por Comida Até ...,album,25,2009-05-01,day,2d9LRvQJnAXRijqIJDDs2K,Emicida
2,22ltKhPdKZ4IuFrna73xPG,AmarElo,album,11,2019-10-30,day,2d9LRvQJnAXRijqIJDDs2K,Emicida
3,593RZfvtz4IAPWZpELwqDB,"Sobre Crianças, Quadris, Pesadelos e Lições de...",album,14,2015-08-07,day,2d9LRvQJnAXRijqIJDDs2K,Emicida
4,1NnQYrdDbqZxzTsNaQuDl9,Bonjour,single,1,2015-02-23,day,2d9LRvQJnAXRijqIJDDs2K,Emicida


----

### Get artist information

In [11]:
def getArtistInfo(ids):
    '''
    Returns a dataframe with the following columns for each artist related to the albums in the album_df:
    artist_id, artist_name, popularity, genres, followers.
    '''
    artist_id = []
    artist_name = []
    artist_popularity = []
    artist_genres = []
    artist_followers = []

    for a_id in ids:
        artist = sp.artist(a_id)
        if a_id not in artist_id:
            artist_id.append(a_id)
            artist_name.append(artist['name'])
            artist_popularity.append(artist['popularity'])
            artist_genres.append(artist['genres'])
            artist_followers.append(artist['followers']['total'])

    results = pd.DataFrame({'artist_id': artist_id, 'artist_name': artist_name, 'popularity': artist_popularity, 'genres': artist_genres, 'followers': artist_followers})
    return results
    
artist_info = getArtistInfo(album_df['artist_id'])

In [12]:
display(artist_info)

Unnamed: 0,artist_id,artist_name,popularity,genres,followers
0,2d9LRvQJnAXRijqIJDDs2K,Emicida,62,"[afrofuturismo brasileiro, brazilian hip hop, ...",1806911
1,0LyfQWJT6nXafLPZqxe9Of,Various Artists,0,[],1503229
2,6rM2yY0GnVcOHMU5GD3y9E,Martinho Da Vila,56,"[bossa nova, mpb, pagode, samba, umbanda, velh...",592663
3,29CQLw9uLWsl8Qkz9holfr,Racionais MC's,65,"[boom bap brasileiro, brazilian hip hop, funk ...",4706843
4,5Gv1C1LY8pWiYcfcdjSNMT,Tasha & Tracie,54,"[funk carioca, rap feminino nacional]",61420


-----------
 ### Get Related Artists

In [13]:
def getRelatedArtists(artists):
    related_artist = []

    for ids in artists:
        artist_related = sp.artist_related_artists(ids)
        for j in range(1, len(artist_related['artists'])):
            related_artist.append(artist_related['artists'][j]['name'])
    
    results = pd.DataFrame({'related_artist_name': related_artist})
    return results

related_artists = getRelatedArtists(artist_info['artist_id'])

-----

### Left join into track_df and track_features dataframe

In [14]:
track_df = pd.merge(track_df, track_features, left_on='track_id', right_on='track_id')

In [15]:
track_df.head()

Unnamed: 0,track_id,track_name,track_popularity,artist_id,artist_name,album_id,album_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0JSux25Te5HYMSr2D64d02,Levanta e Anda,55,2d9LRvQJnAXRijqIJDDs2K,Emicida,57PWjWHzqzODblomXxnQca,O Glorioso Retorno de Quem Nunca Esteve Aqui,0.488,0.931,9,-4.185,1,0.327,0.0907,0.0,0.446,0.823,87.33,150439,4
1,3XELYgcY0b9mGsJE28r4Fh,Triunfo (A Rua É Nóiz),45,2d9LRvQJnAXRijqIJDDs2K,Emicida,3ZA85zeudxFA1NGeb6avXS,Pra Quem Já Mordeu um Cachorro por Comida Até ...,0.608,0.909,5,-5.439,0,0.342,0.173,0.0,0.85,0.865,92.172,209373,4
2,4HniBnVyH2PPYRoQFJGRtY,AmarElo (Sample: Sujeito de Sorte - Belchior),27,2d9LRvQJnAXRijqIJDDs2K,Emicida,22ltKhPdKZ4IuFrna73xPG,AmarElo,0.555,0.755,8,-6.346,1,0.364,0.0347,1.8e-05,0.0279,0.548,172.081,323187,4
3,3De0GCU6ono03UxXKzRmz3,Passarinhos,26,2d9LRvQJnAXRijqIJDDs2K,Emicida,593RZfvtz4IAPWZpELwqDB,"Sobre Crianças, Quadris, Pesadelos e Lições de...",0.747,0.677,1,-6.24,0,0.157,0.062,0.0,0.109,0.702,159.959,221627,4
4,1d5Hpa1FxneKB3DIgH6OZc,Bonjour,28,2d9LRvQJnAXRijqIJDDs2K,Emicida,1NnQYrdDbqZxzTsNaQuDl9,Bonjour,0.749,0.872,7,-4.982,1,0.218,0.222,0.0,0.607,0.715,100.896,212362,4


____

### Set data types

In [16]:
track_df['artist_name'] = track_df['artist_name'].astype("string")
track_df['track_name'] = track_df['track_name'].astype("string")
track_df['track_id'] = track_df['track_id'].astype("string")
track_df['artist_id'] = track_df['artist_id'].astype("string")
track_df['album_id'] = track_df['album_id'].astype("string")
track_df['album_name'] = track_df['album_name'].astype("string")
track_df['duration_ms'] = pd.to_numeric(track_df['duration_ms'])
track_df['instrumentalness'] = pd.to_numeric(track_df['instrumentalness'])
track_df['time_signature'] = track_df['time_signature'].astype("category")

# track_df.convert_dtypes().dtypes

track_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 346 entries, 0 to 345
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   track_id          346 non-null    string  
 1   track_name        346 non-null    string  
 2   track_popularity  346 non-null    int64   
 3   artist_id         346 non-null    string  
 4   artist_name       346 non-null    string  
 5   album_id          346 non-null    string  
 6   album_name        346 non-null    string  
 7   danceability      346 non-null    float64 
 8   energy            346 non-null    float64 
 9   key               346 non-null    int64   
 10  loudness          346 non-null    float64 
 11  mode              346 non-null    int64   
 12  speechiness       346 non-null    float64 
 13  acousticness      346 non-null    float64 
 14  instrumentalness  346 non-null    float64 
 15  liveness          346 non-null    float64 
 16  valence           346 non-

In [17]:
album_df['album_id'] = album_df['album_id'].astype("string")
album_df['album_name'] = album_df['album_name'].astype("string")
album_df['album_type'] = album_df['album_type'].astype("string")
album_df['release_date'] = pd.to_datetime(album_df['release_date'])
album_df['release_date_precision'] = album_df['release_date_precision'].astype("string")
album_df['artist_id'] = album_df['artist_id'].astype("string")
album_df['artist_name'] = album_df['artist_name'].astype("string")

# album_df.convert_dtypes().dtypes

album_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   album_id                71 non-null     string        
 1   album_name              71 non-null     string        
 2   album_type              71 non-null     string        
 3   total_tracks            71 non-null     int64         
 4   release_date            71 non-null     datetime64[ns]
 5   release_date_precision  71 non-null     string        
 6   artist_id               71 non-null     string        
 7   artist_name             71 non-null     string        
dtypes: datetime64[ns](1), int64(1), string(6)
memory usage: 4.6 KB


In [18]:
artist_info['artist_id'] = artist_info['artist_id'].astype("string")
artist_info['artist_name'] = artist_info['artist_name'].astype("string")
print(type(artist_info['genres'][0]))

# artist_info.convert_dtypes().dtypes

artist_info.info()


<class 'list'>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   artist_id    5 non-null      string
 1   artist_name  5 non-null      string
 2   popularity   5 non-null      int64 
 3   genres       5 non-null      object
 4   followers    5 non-null      int64 
dtypes: int64(2), object(1), string(2)
memory usage: 328.0+ bytes


In [19]:
related_artists['related_artist_name'] = related_artists['related_artist_name'].astype("string")


related_artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76 entries, 0 to 75
Data columns (total 1 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   related_artist_name  76 non-null     string
dtypes: string(1)
memory usage: 736.0 bytes


----

### Export to CSV

In [20]:
track_df.to_csv('track_df.csv')
album_df.to_csv('album_df.csv')
artist_info.to_csv('artist_info.csv')
related_artists.to_csv('related_artists.csv')