### Import libraries

In [130]:
import os

import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

### Set Up Client Credentials

In [131]:
# Read the Spotify API credentials from the environment instead of writing
# them into the notebook, so they are never saved or shared with it.
# Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before starting the kernel;
# a missing variable raises KeyError on the lines below.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']

# `sp` is the shared client used by every get* function in this notebook.
client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


### Define which artists to retrieve data for

In [132]:
# Artist names that are passed to getTracks as search terms.
artists = [
    "Racionais MC's",
    'Emicida',
    'Coruja BC1',
]

### Get tracks from each artist

In [133]:
def getTracks(nArtists):
    '''
    Search Spotify for tracks of each artist and collect the hits in a dataframe.

    Uses the module-level ``sp`` client. For every name in ``nArtists`` the
    track search is paged in steps of 50 (offsets 0 to 950); paging for that
    artist stops as soon as a page comes back with no items.

    Parameters:
    - nArtists: iterable of artist names (str) to search for

    Returns a dataframe with one row per search hit (hits are not
    de-duplicated) and the following columns:
    - track_id:
    - track_name:
    - track_popularity: the track's `popularity` value as returned by Spotify
    - artist_id: id of the first artist listed on the track
    - artist_name: name of the first artist listed on the track
    - album_id:
    - album_name: album name where the track was released

    An empty ``nArtists`` (or no hits at all) yields an empty dataframe that
    still has all of the columns above.
    '''
    columns = ['track_id', 'track_name', 'track_popularity', 'artist_id',
               'artist_name', 'album_id', 'album_name']
    page_size = 50     # items requested per search call
    max_offset = 1000  # paging covers offsets 0, 50, ..., 950

    rows = []
    for artist in nArtists:
        for offset in range(0, max_offset, page_size):
            # NOTE(review): Spotify's documented field filter is written
            # `artist:Name` (no space). The query is kept exactly as it was;
            # confirm whether the space after the colon is intended.
            page = sp.search(q=f'artist: {artist}', type='track', limit=page_size, offset=offset)
            items = page['tracks']['items']
            if not items:
                # Results exhausted for this artist: skip the remaining offsets.
                break
            for t in items:
                first_artist = t['artists'][0]
                rows.append({
                    'track_id': t['id'],
                    'track_name': t['name'],
                    'track_popularity': t['popularity'],
                    'artist_id': first_artist['id'],
                    'artist_name': first_artist['name'],
                    'album_id': t['album']['id'],
                    'album_name': t['album']['name'],
                })

    # Passing `columns` fixes the column order and keeps the schema when empty.
    return pd.DataFrame(rows, columns=columns)

# Build the track table for the configured artists (runs the Spotify searches in getTracks).
track_df = getTracks(artists)


In [134]:
# Preview the first rows of the track table.
track_df.head()

Unnamed: 0,track_id,track_name,track_popularity,artist_id,artist_name,album_id,album_name
0,6m8AgjfI28ER6odzMxmHtR,"Vida Loka, Pt. 1",71,29CQLw9uLWsl8Qkz9holfr,Racionais MC's,4HcPzKyKVtcZCwJgesoZWn,"Nada Como um Dia Após o Outro Dia, Vol. 1 & 2"
1,3ytXzEJFeVydFfmUhHvti8,Negro Drama,70,29CQLw9uLWsl8Qkz9holfr,Racionais MC's,4HcPzKyKVtcZCwJgesoZWn,"Nada Como um Dia Após o Outro Dia, Vol. 1 & 2"
2,4nTrxp4aH0g2yLVPkFmljF,"Vida Loka, Pt. 2",70,29CQLw9uLWsl8Qkz9holfr,Racionais MC's,4HcPzKyKVtcZCwJgesoZWn,"Nada Como um Dia Após o Outro Dia, Vol. 1 & 2"
3,5wZUvwWGKaZ6NG8yckZcTM,Jesus Chorou,68,29CQLw9uLWsl8Qkz9holfr,Racionais MC's,4HcPzKyKVtcZCwJgesoZWn,"Nada Como um Dia Após o Outro Dia, Vol. 1 & 2"
4,7wglwZzZoWUr8sOECwpu6L,Diário de um Detento,67,29CQLw9uLWsl8Qkz9holfr,Racionais MC's,1UzrzuOmZfBgXyS3pgKD10,Sobrevivendo no Inferno


### Get audio features for each track

In [135]:
def getAudioFeatures(ids):
    '''
    Fetch Spotify audio features for a collection of track ids.

    Uses the module-level ``sp`` client. Duplicate ids are removed first
    (first occurrence wins, order preserved) and the remaining ids are
    requested in batches of up to 100 per call. Entries that come back as
    ``None`` (no audio features returned for that id) are skipped.

    Parameters:
    - ids: iterable of track ids (e.g. the `track_id` column of the track table)

    Returns a dataframe with one row per track that has features and the
    following columns:
    - track_id: the `id` field of the returned features object
    - danceability:
    - energy:
    - key:
    - loudness:
    - mode:
    - speechiness:
    - acousticness:
    - instrumentalness:
    - liveness:
    - valence:
    - tempo:
    - duration_ms:
    - time_signature:

    Empty input yields an empty dataframe that still has all of these columns.
    '''
    feature_keys = ['danceability', 'energy', 'key', 'loudness', 'mode',
                    'speechiness', 'acousticness', 'instrumentalness',
                    'liveness', 'valence', 'tempo', 'duration_ms',
                    'time_signature']
    columns = ['track_id'] + feature_keys
    batch_size = 100  # maximum number of ids accepted per audio-features request

    # dict.fromkeys drops repeated ids in O(n) while keeping first-seen order.
    unique_ids = list(dict.fromkeys(ids))

    rows = []
    for start in range(0, len(unique_ids), batch_size):
        batch = unique_ids[start:start + batch_size]
        for features in sp.audio_features(batch) or []:
            if features is None:
                # No audio features for this track; skip it instead of crashing.
                continue
            row = {'track_id': features['id']}
            row.update((name, features[name]) for name in feature_keys)
            rows.append(row)

    return pd.DataFrame(rows, columns=columns)

# Fetch the audio features for every track id collected in track_df.
track_features = getAudioFeatures(track_df['track_id'])

In [136]:
# Preview the first rows of the audio-features table.
track_features.head()

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,6m8AgjfI28ER6odzMxmHtR,0.569,0.592,4,-8.557,1,0.39,0.544,0.0,0.79,0.752,82.38,303560,4
1,3ytXzEJFeVydFfmUhHvti8,0.858,0.733,7,-7.292,0,0.305,0.275,0.0,0.152,0.801,157.38,411867,4
2,4nTrxp4aH0g2yLVPkFmljF,0.655,0.653,1,-7.2,0,0.273,0.133,0.00813,0.203,0.433,82.1,350560,4
3,5wZUvwWGKaZ6NG8yckZcTM,0.796,0.824,11,-5.448,0,0.355,0.155,2.5e-05,0.355,0.75,167.926,471253,4
4,7wglwZzZoWUr8sOECwpu6L,0.903,0.526,10,-10.069,0,0.334,0.603,0.0,0.0973,0.766,87.005,451000,4


### Get albums for each track

In [137]:
def getAlbums(ids):
    '''
    Fetch album metadata from Spotify for a collection of album ids.

    Uses the module-level ``sp`` client. Duplicate ids are removed first
    (first occurrence wins, order preserved), so each album is requested only
    once even if many tracks share it; ids are sent in batches of up to 20
    per call. Entries that come back as ``None`` are skipped.

    Parameters:
    - ids: iterable of album ids (e.g. the `album_id` column of the track table)

    Returns a dataframe with one row per album and the following columns:
    - album_id:
    - album_name:
    - album_type:
    - total_tracks:
    - release_date: string as returned by Spotify
    - release_date_precision:
    - artist_id: id of the first artist listed on the album
    - artist_name: name of the first artist listed on the album

    Empty input yields an empty dataframe that still has all of these columns.
    '''
    columns = ['album_id', 'album_name', 'album_type', 'total_tracks',
               'release_date', 'release_date_precision', 'artist_id',
               'artist_name']
    batch_size = 20  # maximum number of ids accepted per several-albums request

    # dict.fromkeys drops repeated ids in O(n) while keeping first-seen order.
    unique_ids = list(dict.fromkeys(ids))

    rows = []
    for start in range(0, len(unique_ids), batch_size):
        response = sp.albums(unique_ids[start:start + batch_size])
        for album in response['albums']:
            if album is None:
                # Nothing returned for this id; skip it.
                continue
            first_artist = album['artists'][0]
            rows.append({
                'album_id': album['id'],
                'album_name': album['name'],
                'album_type': album['album_type'],
                'total_tracks': album['total_tracks'],
                'release_date': album['release_date'],
                'release_date_precision': album['release_date_precision'],
                'artist_id': first_artist['id'],
                'artist_name': first_artist['name'],
            })

    return pd.DataFrame(rows, columns=columns)

# Fetch album metadata for the album ids that appear in track_df.
album_df = getAlbums(track_df['album_id'])


In [138]:
# Preview the first rows of the album table.
album_df.head()

Unnamed: 0,album_id,album_name,album_type,total_tracks,release_date,release_date_precision,artist_id,artist_name
0,4HcPzKyKVtcZCwJgesoZWn,"Nada Como um Dia Após o Outro Dia, Vol. 1 & 2",album,21,2002,year,29CQLw9uLWsl8Qkz9holfr,Racionais MC's
1,1UzrzuOmZfBgXyS3pgKD10,Sobrevivendo no Inferno,album,12,1997-10-07,day,29CQLw9uLWsl8Qkz9holfr,Racionais MC's
2,2QMZRtm35gtG3ZJs0yl9EM,Raio X do Brasil,album,13,1993,year,29CQLw9uLWsl8Qkz9holfr,Racionais MC's
3,1hqniVJju4Nj9qGJR2ZeIR,Mil Faces de um Homem Leal (Marighella),single,1,2017-11-03,day,29CQLw9uLWsl8Qkz9holfr,Racionais MC's
4,1CYfSGAq6xQNF5V8CAeP7m,Cores & Valores,album,15,2014-11-26,day,29CQLw9uLWsl8Qkz9holfr,Racionais MC's


### Get artist information

In [163]:
def getArtistInfo(ids):
    '''
    Fetch artist details from Spotify for a collection of artist ids.

    Uses the module-level ``sp`` client. Duplicate ids are removed before any
    request is made (first occurrence wins, order preserved), so each artist
    is fetched exactly once. Artists whose popularity is 0 are left out of
    the result.

    Parameters:
    - ids: iterable of artist ids (e.g. the `artist_id` column of the album table)

    Returns a dataframe with one row per kept artist and the following columns:
    - artist_id: the id that was passed in
    - artist_name:
    - popularity:
    - genres: list of genre names
    - followers: total follower count

    Empty input yields an empty dataframe that still has all of these columns.
    '''
    columns = ['artist_id', 'artist_name', 'popularity', 'genres', 'followers']

    rows = []
    # dict.fromkeys drops repeated ids in O(n) while keeping first-seen order.
    for a_id in dict.fromkeys(ids):
        artist = sp.artist(a_id)
        if artist['popularity'] == 0:
            continue
        rows.append({
            'artist_id': a_id,
            'artist_name': artist['name'],
            'popularity': artist['popularity'],
            'genres': artist['genres'],
            'followers': artist['followers']['total'],
        })

    return pd.DataFrame(rows, columns=columns)
    
# album_df['artist_id'] holds the first artist listed on each album (see getAlbums),
# so this can include artists beyond the names in `artists`.
artist_info = getArtistInfo(album_df['artist_id'])

In [164]:
# Show the whole artist table.
display(artist_info)

Unnamed: 0,artist_id,artist_name,popularity,genres,followers
0,29CQLw9uLWsl8Qkz9holfr,Racionais MC's,70,"[boom bap brasileiro, brazilian hip hop, funk ...",4555125
1,2d9LRvQJnAXRijqIJDDs2K,Emicida,68,"[afrofuturismo brasileiro, brazilian hip hop, ...",1783506
2,6rM2yY0GnVcOHMU5GD3y9E,Martinho Da Vila,62,"[bossa nova, mpb, pagode, samba, umbanda, velh...",571688
3,6mw0OyFqwxCOmz1v3W3htO,Coruja Bc1,50,[brazilian hip hop],246882
4,5rTjH3aABAmPM5B6DZebZ7,Tiê,53,"[folk brasileiro, mpb, nova mpb]",678743


### Left join track_features into track_df

In [141]:
# Attach the audio features to each track. how='left' keeps every row of
# track_df, as the section title states; pd.merge defaults to an inner join,
# which would silently drop any track without a matching features row.
# NOTE: this rebinds track_df, so re-running the cell merges a second time.
track_df = pd.merge(track_df, track_features, on='track_id', how='left')

In [142]:
# Preview the track table with the audio-feature columns attached.
track_df.head()

Unnamed: 0,track_id,track_name,track_popularity,artist_id,artist_name,album_id,album_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,6m8AgjfI28ER6odzMxmHtR,"Vida Loka, Pt. 1",71,29CQLw9uLWsl8Qkz9holfr,Racionais MC's,4HcPzKyKVtcZCwJgesoZWn,"Nada Como um Dia Após o Outro Dia, Vol. 1 & 2",0.569,0.592,4,-8.557,1,0.39,0.544,0.0,0.79,0.752,82.38,303560,4
1,3ytXzEJFeVydFfmUhHvti8,Negro Drama,70,29CQLw9uLWsl8Qkz9holfr,Racionais MC's,4HcPzKyKVtcZCwJgesoZWn,"Nada Como um Dia Após o Outro Dia, Vol. 1 & 2",0.858,0.733,7,-7.292,0,0.305,0.275,0.0,0.152,0.801,157.38,411867,4
2,4nTrxp4aH0g2yLVPkFmljF,"Vida Loka, Pt. 2",70,29CQLw9uLWsl8Qkz9holfr,Racionais MC's,4HcPzKyKVtcZCwJgesoZWn,"Nada Como um Dia Após o Outro Dia, Vol. 1 & 2",0.655,0.653,1,-7.2,0,0.273,0.133,0.00813,0.203,0.433,82.1,350560,4
3,5wZUvwWGKaZ6NG8yckZcTM,Jesus Chorou,68,29CQLw9uLWsl8Qkz9holfr,Racionais MC's,4HcPzKyKVtcZCwJgesoZWn,"Nada Como um Dia Após o Outro Dia, Vol. 1 & 2",0.796,0.824,11,-5.448,0,0.355,0.155,2.5e-05,0.355,0.75,167.926,471253,4
4,7wglwZzZoWUr8sOECwpu6L,Diário de um Detento,67,29CQLw9uLWsl8Qkz9holfr,Racionais MC's,1UzrzuOmZfBgXyS3pgKD10,Sobrevivendo no Inferno,0.903,0.526,10,-10.069,0,0.334,0.603,0.0,0.0973,0.766,87.005,451000,4


### Set data types

In [143]:
# Identifier and name columns become pandas "string" dtype.
for text_column in ('artist_name', 'track_name', 'track_id', 'artist_id', 'album_id', 'album_name'):
    track_df[text_column] = track_df[text_column].astype("string")

# Make sure these two measurements are numeric.
for numeric_column in ('duration_ms', 'instrumentalness'):
    track_df[numeric_column] = pd.to_numeric(track_df[numeric_column])

# time_signature is stored as a categorical column.
track_df['time_signature'] = track_df['time_signature'].astype("category")

# Print the resulting schema.
track_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 424 entries, 0 to 423
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   track_id          424 non-null    string  
 1   track_name        424 non-null    string  
 2   track_popularity  424 non-null    int64   
 3   artist_id         424 non-null    string  
 4   artist_name       424 non-null    string  
 5   album_id          424 non-null    string  
 6   album_name        424 non-null    string  
 7   danceability      424 non-null    float64 
 8   energy            424 non-null    float64 
 9   key               424 non-null    int64   
 10  loudness          424 non-null    float64 
 11  mode              424 non-null    int64   
 12  speechiness       424 non-null    float64 
 13  acousticness      424 non-null    float64 
 14  instrumentalness  424 non-null    float64 
 15  liveness          424 non-null    float64 
 16  valence           424 non-

In [144]:
album_df['album_id'] = album_df['album_id'].astype("string")
album_df['album_name'] = album_df['album_name'].astype("string")
album_df['album_type'] = album_df['album_type'].astype("string")
# release_date mixes precisions (e.g. '2002' and '1997-10-07' in the preview
# above). On pandas >= 2.0, pd.to_datetime infers one format from the first
# value and raises on the others, so each value is parsed on its own with
# pd.Timestamp; partial dates resolve to the first day of the period.
# The outer to_datetime only guarantees a datetime64 column.
album_df['release_date'] = pd.to_datetime(album_df['release_date'].map(pd.Timestamp))
album_df['release_date_precision'] = album_df['release_date_precision'].astype("string")
album_df['artist_id'] = album_df['artist_id'].astype("string")
album_df['artist_name'] = album_df['artist_name'].astype("string")

# Print the resulting schema.
album_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   album_id                90 non-null     string        
 1   album_name              90 non-null     string        
 2   album_type              90 non-null     string        
 3   total_tracks            90 non-null     int64         
 4   release_date            90 non-null     datetime64[ns]
 5   release_date_precision  90 non-null     string        
 6   artist_id               90 non-null     string        
 7   artist_name             90 non-null     string        
dtypes: datetime64[ns](1), int64(1), string(6)
memory usage: 5.8 KB


In [145]:
# Identifier and name columns become pandas "string" dtype.
for text_column in ('artist_id', 'artist_name'):
    artist_info[text_column] = artist_info[text_column].astype("string")

# genres is left untouched; print the Python type stored in its first row.
first_genres = artist_info['genres'][0]
print(type(first_genres))

# Print the resulting schema.
artist_info.info()

<class 'list'>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   artist_id    7 non-null      string
 1   artist_name  7 non-null      string
 2   popularity   7 non-null      int64 
 3   genres       7 non-null      object
 4   followers    7 non-null      int64 
dtypes: int64(2), object(1), string(2)
memory usage: 408.0+ bytes


### Export to CSV

In [146]:
# Write each table to a CSV file in the working directory using to_csv defaults.
csv_exports = (
    ('track_df.csv', track_df),
    ('album_df.csv', album_df),
    ('artist_info.csv', artist_info),
)
for csv_name, table in csv_exports:
    table.to_csv(csv_name)