In [324]:
## Installing Spotify's API library in Python

# pip install spotipy

In [325]:
## Importing libraries

import sys
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from datetime import datetime

sys.path.append("/home/tabas/personal-dev/pyprojects")
import pipelines.utils.personal_env as penv

In [326]:
# Importing Spotify Credentials

CLIENT_ID = penv.spotify_client_id
CLIENT_SECRET = penv.spotify_client_secret

In [327]:
# Stablishing Spotify Authentication

auth_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [328]:
# Creating a list of all available markets on Spotify

markets = [
            "AD", "AE", "AG", "AL", #"AM", "AO", "AR", "AT", "AU", "AZ", "BA", "BB", "BD", 
            #"BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", 
            #"BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", 
            #"CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "ET", "FI", 
            #"FJ", "FM", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", 
            #"GW", "GY", "HK", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", 
            #"IT", "JM", "JO", "JP", "KE", "KG", "KH", "KI", "KM", "KN", "KR", "KW", "KZ", 
            #"LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", 
            #"MD", "ME", "MG", "MH", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", 
            #"MX", "MY", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NR", "NZ", "OM", 
            #"PA", "PE", "PG", "PH", "PK", "PL", "PR", "PS", "PT", "PW", "PY", "QA", "RO", 
            #"RS", "RW", "SA", "SB", "SC", "SE", "SG", "SI", "SK", "SL", "SM", "SN", "SR", 
            #"ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TO", "TR", "TT", "TV", 
            #"TW", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "VU", "WS", "XK", 
            #"ZA", "ZM", "ZW"
           ]

In [329]:
## Creating empty DataFrame to append API values after request

releases = pd.DataFrame()

In [330]:
## Creating loop to make GET Request
## The first request gets the list of new Albums released two weeks ago from each market defined above
## Then, it collects the ids of the Artists of each release and makes the second request, 
## which returns the Artist's data

for i in range(len(markets)):
    
    ## The Spotify only returns 50 values per request 
    # (the variables 'limit' and 'batchSize' helps Spotify not crash if the data exceeds )
    
    limit = 50
    offset = 0
    
    while offset < 1000:    # Spotify limit for Search Request is 1000
        
        ## Making GET request of the type search with the tag:'new', that returns the latest Albums

        newReleases = sp.search(q="tag:new", market=markets[i], type="album", limit=limit, offset=offset)
        newReleasesData = pd.DataFrame.from_dict(newReleases['albums']['items'])
        
        releases = pd.concat([releases, newReleasesData])
        releases['extractionTimestamp'] = datetime.today().strftime('%Y-%m-%d %X')
        
        # Incremental addition to offset to return the following pages of data
        
        offset=offset+limit
        
    print("Successfully got request from ", markets[i], "market")

Successfully got request from  AD market
Successfully got request from  AE market
Successfully got request from  AG market
Successfully got request from  AL market


In [331]:
# Returning table info

releases.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 0 to 49
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   album_type              4000 non-null   object
 1   total_tracks            4000 non-null   int64 
 2   is_playable             4000 non-null   bool  
 3   external_urls           4000 non-null   object
 4   href                    4000 non-null   object
 5   id                      4000 non-null   object
 6   images                  4000 non-null   object
 7   name                    4000 non-null   object
 8   release_date            4000 non-null   object
 9   release_date_precision  4000 non-null   object
 10  type                    4000 non-null   object
 11  uri                     4000 non-null   object
 12  artists                 4000 non-null   object
 13  extractionTimestamp     4000 non-null   object
dtypes: bool(1), int64(1), object(12)
memory usage: 441.4+ KB


In [332]:
## Treating some fields (renaming, extracting values, etc)

releases['spotify_url'] = pd.json_normalize(releases['external_urls'])

# Here, we are maintaining the id, uri and href in lists to facilitate in case we need to use them on the requests below

releases['artist_uri'] = releases['artists'].apply(lambda artists: [artist['uri'] for artist in artists])
releases['artist_href'] = releases['artists'].apply(lambda artists: [artist['href'] for artist in artists])
releases['artist_id'] = releases['artists'].apply(lambda artists: [artist['id'] for artist in artists])


In [333]:
# Dropping unnecessary and/or treated columns

releases = releases.drop(columns=['external_urls', 'artists'])

# Ordering columns

releases = releases[[
                    'id', 
                    'href', 
                    'uri', 
                    'spotify_url', 
                    'album_type', 
                    'total_tracks', 
                    'is_playable', 
                    'name', 
                    'release_date', 
                    'release_date_precision', 
                    'type', 
                    'artist_id', 
                    'artist_href', 
                    'artist_uri', 
                    'images', 
                    'extractionTimestamp']
            ]

In [334]:
# Removing duplicated release id 

releases = releases[releases.duplicated(subset='id') == False]

In [335]:
releases.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1439 entries, 0 to 42
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      1439 non-null   object
 1   href                    1439 non-null   object
 2   uri                     1439 non-null   object
 3   spotify_url             1439 non-null   object
 4   album_type              1439 non-null   object
 5   total_tracks            1439 non-null   int64 
 6   is_playable             1439 non-null   bool  
 7   name                    1439 non-null   object
 8   release_date            1439 non-null   object
 9   release_date_precision  1439 non-null   object
 10  type                    1439 non-null   object
 11  artist_id               1439 non-null   object
 12  artist_href             1439 non-null   object
 13  artist_uri              1439 non-null   object
 14  images                  1439 non-null   object
 15  extractionT

In [336]:
## Creating empty DataFrame to append API values after request

artists = pd.DataFrame()

In [337]:
## Making request to GET Artists' Data

# Here we're accessing the Artist ID to make the loop request below 
# We transforme the arrays into sets to remove duplicates, and then convert it back to lists, so it can be used
# on the API request
        
artistsList =  list(set(releases['artist_id'].explode()))

batchSize = 50 # Spotify limit for Artist Request is 50
        
for j in range(0, len(artistsList), batchSize):
    artistsBatch = artistsList[j:j + batchSize]
    artistsData = sp.artists(artistsBatch)
    artistsData = pd.DataFrame.from_dict(artistsData['artists'])  
    artists = pd.concat([artists, artistsData])
    artists['extractionTimestamp'] = datetime.today().strftime('%Y-%m-%d %X')
    
    print("Successfully got ", round(((j + batchSize)/len(artistsList)) * 100, 2), "% of Artists")

Successfully got  3.17 % of Artists
Successfully got  6.35 % of Artists
Successfully got  9.52 % of Artists
Successfully got  12.69 % of Artists
Successfully got  15.86 % of Artists
Successfully got  19.04 % of Artists
Successfully got  22.21 % of Artists
Successfully got  25.38 % of Artists
Successfully got  28.55 % of Artists
Successfully got  31.73 % of Artists
Successfully got  34.9 % of Artists
Successfully got  38.07 % of Artists
Successfully got  41.24 % of Artists
Successfully got  44.42 % of Artists
Successfully got  47.59 % of Artists
Successfully got  50.76 % of Artists
Successfully got  53.93 % of Artists
Successfully got  57.11 % of Artists
Successfully got  60.28 % of Artists
Successfully got  63.45 % of Artists
Successfully got  66.62 % of Artists
Successfully got  69.8 % of Artists
Successfully got  72.97 % of Artists
Successfully got  76.14 % of Artists
Successfully got  79.31 % of Artists
Successfully got  82.49 % of Artists
Successfully got  85.66 % of Artists
Succes

In [338]:
artists.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1576 entries, 0 to 25
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   external_urls        1576 non-null   object
 1   followers            1576 non-null   object
 2   genres               1576 non-null   object
 3   href                 1576 non-null   object
 4   id                   1576 non-null   object
 5   images               1576 non-null   object
 6   name                 1576 non-null   object
 7   popularity           1576 non-null   int64 
 8   type                 1576 non-null   object
 9   uri                  1576 non-null   object
 10  extractionTimestamp  1576 non-null   object
dtypes: int64(1), object(10)
memory usage: 147.8+ KB


In [339]:
## Returning Artists DataFrame

artists.head()

Unnamed: 0,external_urls,followers,genres,href,id,images,name,popularity,type,uri,extractionTimestamp
0,{'spotify': 'https://open.spotify.com/artist/0...,"{'href': None, 'total': 541}",[phonk],https://api.spotify.com/v1/artists/0BgU2NJhkpP...,0BgU2NJhkpP7crd7vvkNHC,[{'url': 'https://i.scdn.co/image/ab6761610000...,DXN CHVLX,45,artist,spotify:artist:0BgU2NJhkpP7crd7vvkNHC,2025-02-05 20:16:04
1,{'spotify': 'https://open.spotify.com/artist/3...,"{'href': None, 'total': 25598705}","[sertanejo, sertanejo universitário, arrocha, ...",https://api.spotify.com/v1/artists/3p7PcrEHaaK...,3p7PcrEHaaKLJnPUGOtRlT,[{'url': 'https://i.scdn.co/image/ab6761610000...,Henrique & Juliano,84,artist,spotify:artist:3p7PcrEHaaKLJnPUGOtRlT,2025-02-05 20:16:04
2,{'spotify': 'https://open.spotify.com/artist/0...,"{'href': None, 'total': 7672}",[],https://api.spotify.com/v1/artists/02J1zj1QPj7...,02J1zj1QPj7nlJiKGX6wgY,[{'url': 'https://i.scdn.co/image/ab6761610000...,Mello Santana,50,artist,spotify:artist:02J1zj1QPj7nlJiKGX6wgY,2025-02-05 20:16:04
3,{'spotify': 'https://open.spotify.com/artist/2...,"{'href': None, 'total': 2733}",[funk],https://api.spotify.com/v1/artists/2xp6N2DMAAO...,2xp6N2DMAAOytqSkMQOdjD,[{'url': 'https://i.scdn.co/image/ab6761610000...,DJ G4P ORIGINAL,38,artist,spotify:artist:2xp6N2DMAAOytqSkMQOdjD,2025-02-05 20:16:04
4,{'spotify': 'https://open.spotify.com/artist/1...,"{'href': None, 'total': 19778533}","[bhajan, hindi pop, bollywood, devotional, desi]",https://api.spotify.com/v1/artists/1tqysapcCh1...,1tqysapcCh1lWEAc9dIFpa,[{'url': 'https://i.scdn.co/image/ab6761610000...,Jubin Nautiyal,74,artist,spotify:artist:1tqysapcCh1lWEAc9dIFpa,2025-02-05 20:16:04


In [340]:
## Treating some fields (renaming, exploding the dicts, etc)

artists['spotify_url'] = pd.json_normalize(artists['external_urls'])
artists['followers'] = pd.json_normalize(artists['followers'], max_level=1)['total']
artists['genres'] = artists["genres"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)


In [341]:
# Dropping unnecessary and/or treated columns

artists = artists.drop(columns=['external_urls'])

# Ordering columns

artists = artists[[
                    'id', 
                    'href', 
                    'uri', 
                    'spotify_url', 
                    'type',
                    'name', 
                    'followers', 
                    'popularity', 
                    'genres', 
                    'images', 
                    'extractionTimestamp']
]

In [342]:
# Removing duplicated release id 

artists = artists[artists.duplicated(subset='id') == False]

In [343]:
artists.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1576 entries, 0 to 25
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   1576 non-null   object
 1   href                 1576 non-null   object
 2   uri                  1576 non-null   object
 3   spotify_url          1576 non-null   object
 4   type                 1576 non-null   object
 5   name                 1576 non-null   object
 6   followers            1576 non-null   int64 
 7   popularity           1576 non-null   int64 
 8   genres               1576 non-null   object
 9   images               1576 non-null   object
 10  extractionTimestamp  1576 non-null   object
dtypes: int64(2), object(9)
memory usage: 147.8+ KB


In [344]:
## Let's get more data on this script
## Now that we have New Releases information, we can make a request to collect further information about albums and tracks
## Note that a new release can be an album or a single.
## So, to get more data, we're going to make a Request to return more informations about albums (like tracks, for example)
## And then, we are going to make a final request to return information about all tracks (single releases and tracks from the albums)

In [345]:
## Creating empty DataFrame to append API values after request

albums = pd.DataFrame()

In [346]:
# Again, using set and list to make the Album request below

albumsList = list(set(releases['id']))

# Redefining batchSize variable

batchSize = 20 # Spotify limit for Album Request is 20

for k in range(0, len(albumsList), batchSize):
    
    albumsData = sp.albums(albumsList[k:k + batchSize])
    albumsData = pd.DataFrame.from_dict(albumsData['albums'])
    albums = pd.concat([albums, albumsData])
    albums['extractionTimestamp'] = datetime.today().strftime('%Y-%m-%d %X')
    
    print("Successfully got ", round((k + batchSize) / len(albumsList) * 100, 2), "% of albums")

Successfully got  1.39 % of albums
Successfully got  2.78 % of albums
Successfully got  4.17 % of albums
Successfully got  5.56 % of albums
Successfully got  6.95 % of albums
Successfully got  8.34 % of albums
Successfully got  9.73 % of albums
Successfully got  11.12 % of albums
Successfully got  12.51 % of albums
Successfully got  13.9 % of albums
Successfully got  15.29 % of albums
Successfully got  16.68 % of albums
Successfully got  18.07 % of albums
Successfully got  19.46 % of albums
Successfully got  20.85 % of albums
Successfully got  22.24 % of albums
Successfully got  23.63 % of albums
Successfully got  25.02 % of albums
Successfully got  26.41 % of albums
Successfully got  27.8 % of albums
Successfully got  29.19 % of albums
Successfully got  30.58 % of albums
Successfully got  31.97 % of albums
Successfully got  33.36 % of albums
Successfully got  34.75 % of albums
Successfully got  36.14 % of albums
Successfully got  37.53 % of albums
Successfully got  38.92 % of albums
S

In [347]:
albums.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1439 entries, 0 to 18
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   album_type              1439 non-null   object
 1   total_tracks            1439 non-null   int64 
 2   available_markets       1439 non-null   object
 3   external_urls           1439 non-null   object
 4   href                    1439 non-null   object
 5   id                      1439 non-null   object
 6   images                  1439 non-null   object
 7   name                    1439 non-null   object
 8   release_date            1439 non-null   object
 9   release_date_precision  1439 non-null   object
 10  type                    1439 non-null   object
 11  uri                     1439 non-null   object
 12  artists                 1439 non-null   object
 13  tracks                  1439 non-null   object
 14  copyrights              1439 non-null   object
 15  external_id

In [348]:
albums.head()

Unnamed: 0,album_type,total_tracks,available_markets,external_urls,href,id,images,name,release_date,release_date_precision,type,uri,artists,tracks,copyrights,external_ids,genres,label,popularity,extractionTimestamp
0,single,1,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",{'spotify': 'https://open.spotify.com/album/2I...,https://api.spotify.com/v1/albums/2IM7G40eSeFr...,2IM7G40eSeFrRZ5ITPVJgk,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Sacred Alignment (Wind),2025-02-04,day,album,spotify:album:2IM7G40eSeFrRZ5ITPVJgk,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/2I...,"[{'text': '2025 Yzalune', 'type': 'C'}, {'text...",{'upc': '5056826836373'},[],Yzalune,0,2025-02-05 20:16:39
1,single,1,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",{'spotify': 'https://open.spotify.com/album/5y...,https://api.spotify.com/v1/albums/5yOtAcU6AkGj...,5yOtAcU6AkGjqjiutHxawu,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Slow Motion,2025-01-24,day,album,spotify:album:5yOtAcU6AkGjqjiutHxawu,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/5y...,"[{'text': '© 2025 Def Jam Recordings, a divisi...",{'upc': '00602475842965'},[],"EP Entertainment, LLC / Def Jam",51,2025-02-05 20:16:39
2,single,1,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",{'spotify': 'https://open.spotify.com/album/7G...,https://api.spotify.com/v1/albums/7Gjl4Unb6gNK...,7Gjl4Unb6gNKMXDjyANuxJ,[{'url': 'https://i.scdn.co/image/ab67616d0000...,SENTIDO COMÚN,2025-01-30,day,album,spotify:album:7Gjl4Unb6gNKMXDjyANuxJ,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/7G...,[{'text': '© 2025 Kristoman LLC distributed by...,{'upc': '5021732625717'},[],WEA Latina,36,2025-02-05 20:16:39
3,single,1,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",{'spotify': 'https://open.spotify.com/album/7m...,https://api.spotify.com/v1/albums/7mK6mlzHXsEZ...,7mK6mlzHXsEZTSmrXT5aIJ,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Então Toma Piranha,2025-01-26,day,album,spotify:album:7mK6mlzHXsEZTSmrXT5aIJ,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/7m...,"[{'text': '2025 Melhor do funk', 'type': 'C'},...",{'upc': '790092397941'},[],Melhor do funk,0,2025-02-05 20:16:39
4,single,2,"[AE, AM, AU, AZ, BD, BG, BH, BI, BN, BT, BW, B...",{'spotify': 'https://open.spotify.com/album/4q...,https://api.spotify.com/v1/albums/4q8OpncDw3Ho...,4q8OpncDw3HoDB3czBNYQi,[{'url': 'https://i.scdn.co/image/ab67616d0000...,slash 011 - Getting Ready For The Party (Instr...,2025-02-05,day,album,spotify:album:4q8OpncDw3HoDB3czBNYQi,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/4q...,[{'text': '2025 Armada Music B.V. under exclus...,{'upc': '8718522525605'},[],slash,0,2025-02-05 20:16:39


In [349]:
## Treating some fields (renaming, extracting values, etc)

albums['spotify_url'] = pd.json_normalize(albums['external_urls'])

albums['artist_id'] = albums['artists'].apply(lambda artists: [artist['id'] for artist in artists])
albums['artist_href'] = albums['artists'].apply(lambda artists: [artist['href'] for artist in artists])
albums['artist_uri'] = albums['artists'].apply(lambda artists: [artist['uri'] for artist in artists])

albums['track_id'] = pd.json_normalize(albums['tracks'])['items'].apply(lambda x: [track['id'] for track in x] if isinstance(x, list) else [])
albums['track_href'] = pd.json_normalize(albums['tracks'])['items'].apply(lambda x: [track['href'] for track in x] if isinstance(x, list) else [])
albums['track_uri'] = pd.json_normalize(albums['tracks'])['items'].apply(lambda x: [track['uri'] for track in x] if isinstance(x, list) else [])

In [350]:
# Removing unnecessary and/or treated columns

#albums = albums.drop(columns=['external_urls', 'artists', 'tracks', 'genres']) 

# Ordering columns

albums = albums[[
                'id', 
                'href',
                'uri',
                'spotify_url',
                'album_type',
                'total_tracks',  
                'name', 
                'available_markets',
                'release_date', 
                'release_date_precision', 
                'type', 
                'artist_id', 
                'artist_href', 
                'artist_uri', 
                'track_id',
                'track_href',
                'track_uri',
                'popularity',
                'label',
                'copyrights',
                'external_ids',                
                'images', 
                'extractionTimestamp']
            ]


In [351]:
# Removing duplicated release di 

albums = albums[albums.duplicated(subset='id') == False]

In [394]:
albums.head()

Unnamed: 0,id,href,uri,spotify_url,album_type,total_tracks,name,available_markets,release_date,release_date_precision,...,artist_uri,track_id,track_href,track_uri,popularity,label,copyrights,external_ids,images,extractionTimestamp
0,2IM7G40eSeFrRZ5ITPVJgk,https://api.spotify.com/v1/albums/2IM7G40eSeFr...,spotify:album:2IM7G40eSeFrRZ5ITPVJgk,https://open.spotify.com/album/2IM7G40eSeFrRZ5...,single,1,Sacred Alignment (Wind),"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",2025-02-04,day,...,[spotify:artist:50cM5Lb34I3OgF5jaNCILA],[4GQPTzGeT9W2Xx9mlqlVM5],[https://api.spotify.com/v1/tracks/4GQPTzGeT9W...,[spotify:track:4GQPTzGeT9W2Xx9mlqlVM5],0,Yzalune,"[{'text': '2025 Yzalune', 'type': 'C'}, {'text...",{'upc': '5056826836373'},[{'url': 'https://i.scdn.co/image/ab67616d0000...,2025-02-05 20:16:39
1,5yOtAcU6AkGjqjiutHxawu,https://api.spotify.com/v1/albums/5yOtAcU6AkGj...,spotify:album:5yOtAcU6AkGjqjiutHxawu,https://open.spotify.com/album/5yOtAcU6AkGjqji...,single,1,Slow Motion,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",2025-01-24,day,...,[spotify:artist:2wUjUUtkb5lvLKcGKsKqsR],[5zuV1H77nGNalXohUCsjTY],[https://api.spotify.com/v1/tracks/5zuV1H77nGN...,[spotify:track:5zuV1H77nGNalXohUCsjTY],51,"EP Entertainment, LLC / Def Jam","[{'text': '© 2025 Def Jam Recordings, a divisi...",{'upc': '00602475842965'},[{'url': 'https://i.scdn.co/image/ab67616d0000...,2025-02-05 20:16:39
2,7Gjl4Unb6gNKMXDjyANuxJ,https://api.spotify.com/v1/albums/7Gjl4Unb6gNK...,spotify:album:7Gjl4Unb6gNKMXDjyANuxJ,https://open.spotify.com/album/7Gjl4Unb6gNKMXD...,single,1,SENTIDO COMÚN,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",2025-01-30,day,...,[spotify:artist:1pQWsZQehhS4wavwh7Fnxd],[43gQjpu2eXUsbDhW8UPjYA],[https://api.spotify.com/v1/tracks/43gQjpu2eXU...,[spotify:track:43gQjpu2eXUsbDhW8UPjYA],36,WEA Latina,[{'text': '© 2025 Kristoman LLC distributed by...,{'upc': '5021732625717'},[{'url': 'https://i.scdn.co/image/ab67616d0000...,2025-02-05 20:16:39
3,7mK6mlzHXsEZTSmrXT5aIJ,https://api.spotify.com/v1/albums/7mK6mlzHXsEZ...,spotify:album:7mK6mlzHXsEZTSmrXT5aIJ,https://open.spotify.com/album/7mK6mlzHXsEZTSm...,single,1,Então Toma Piranha,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",2025-01-26,day,...,"[spotify:artist:47wsfaEWJwhWd9bs04Oc1g, spotif...",[59XT49MfvYoBj4WEQlNd5a],[https://api.spotify.com/v1/tracks/59XT49MfvYo...,[spotify:track:59XT49MfvYoBj4WEQlNd5a],0,Melhor do funk,"[{'text': '2025 Melhor do funk', 'type': 'C'},...",{'upc': '790092397941'},[{'url': 'https://i.scdn.co/image/ab67616d0000...,2025-02-05 20:16:39
4,4q8OpncDw3HoDB3czBNYQi,https://api.spotify.com/v1/albums/4q8OpncDw3Ho...,spotify:album:4q8OpncDw3HoDB3czBNYQi,https://open.spotify.com/album/4q8OpncDw3HoDB3...,single,2,slash 011 - Getting Ready For The Party (Instr...,"[AE, AM, AU, AZ, BD, BG, BH, BI, BN, BT, BW, B...",2025-02-05,day,...,[spotify:artist:0UMs6dTf23FC2fHc40fXNS],"[5vO723Rml9bx16DJdmT7fd, 1TvTTsjY8xg6WIR12WMUak]",[https://api.spotify.com/v1/tracks/5vO723Rml9b...,"[spotify:track:5vO723Rml9bx16DJdmT7fd, spotify...",0,slash,[{'text': '2025 Armada Music B.V. under exclus...,{'upc': '8718522525605'},[{'url': 'https://i.scdn.co/image/ab67616d0000...,2025-02-05 20:16:39


In [353]:
## Now, we are going to get all tracks data
## Spotify provides general informations about Tracks, Audio Features and Audio Analysis

In [354]:
## Creating dataframe to append data

tracks = pd.DataFrame()

In [395]:
# Creating list of Track ids

tracksList = list(set(albums['track_id'].explode()))

# Redefining batchSize variable

batchSize = 50 # Spotify limit for Track Request is 50

for k in range(0, len(tracksList), batchSize):
    tracksData = sp.tracks(tracksList[k:k + batchSize])
    tracksData = pd.DataFrame.from_dict(tracksData['tracks'])
    
    tracks = pd.concat([tracks, tracksData])
    
    print("Successfully got ", round((k + batchSize) / len(tracksList) * 100, 2), "% of tracks")

Successfully got  44.25 % of tracks
Successfully got  88.5 % of tracks
Successfully got  132.74 % of tracks


In [396]:
tracks['artist_id'] = tracks['artists'].apply(lambda artists: [artist['id'] for artist in artists])
tracks['artist_href'] = tracks['artists'].apply(lambda artists: [artist['href'] for artist in artists])
tracks['artist_uri'] = tracks['artists'].apply(lambda artists: [artist['uri'] for artist in artists])

tracks['album_id'] = pd.json_normalize(tracks['album'])['id']
tracks['album_href'] = pd.json_normalize(tracks['album'])['href']
tracks['album_uri'] = pd.json_normalize(tracks['album'])['uri']

Unnamed: 0,album,artists,available_markets,disc_number,duration_ms,explicit,external_ids,external_urls,href,id,is_local,name,popularity,preview_url,track_number,type,uri
0,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,374533,False,{'isrc': 'FR50X2311402'},{'spotify': 'https://open.spotify.com/track/5H...,https://api.spotify.com/v1/tracks/5HPQjyOIkUjz...,5HPQjyOIkUjzHUE3sNCCLn,False,"Va, dal furor portata, KV 21/19c",8,,2,track,spotify:track:5HPQjyOIkUjzHUE3sNCCLn
1,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,221453,False,{'isrc': 'FR50X2311403'},{'spotify': 'https://open.spotify.com/track/1k...,https://api.spotify.com/v1/tracks/1kVKu0S9wgT3...,1kVKu0S9wgT3PwrJKqschS,False,"Si mostra la sorte, KV 209",7,,3,track,spotify:track:1kVKu0S9wgT3PwrJKqschS
2,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,122173,False,{'isrc': 'DEE862401559'},{'spotify': 'https://open.spotify.com/track/1H...,https://api.spotify.com/v1/tracks/1H8y3fRvAWMZ...,1H8y3fRvAWMZsVMMCSmVIq,False,Sherlock Holmes Suite: Part 1,35,,14,track,spotify:track:1H8y3fRvAWMZsVMMCSmVIq
3,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,792013,False,{'isrc': 'FR50X2311405'},{'spotify': 'https://open.spotify.com/track/2H...,https://api.spotify.com/v1/tracks/2H5aMHrDkF1l...,2H5aMHrDkF1lsFsiWeDcGq,False,"Se al labbro mio non credi, KV 295: Adagio - A...",6,,5,track,spotify:track:2H5aMHrDkF1lsFsiWeDcGq
4,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,153600,True,{'isrc': 'USUM72414748'},{'spotify': 'https://open.spotify.com/track/7b...,https://api.spotify.com/v1/tracks/7bgfvzEIufDU...,7bgfvzEIufDUgwhqIBrmBb,False,Free Rico,49,,1,track,spotify:track:7bgfvzEIufDUgwhqIBrmBb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",2,121960,False,{'isrc': 'DEE862401574'},{'spotify': 'https://open.spotify.com/track/6T...,https://api.spotify.com/v1/tracks/6TvPj1MnVYOh...,6TvPj1MnVYOhVtz6NFBCjt,False,Kung Fu Panda Suite: Part 1,34,,6,track,spotify:track:6TvPj1MnVYOhVtz6NFBCjt
9,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,103615,True,{'isrc': 'BK16P2400631'},{'spotify': 'https://open.spotify.com/track/5e...,https://api.spotify.com/v1/tracks/5elievkIC8te...,5elievkIC8tezuAzsBUm7o,False,Colo Colocando [Slowed],1,,2,track,spotify:track:5elievkIC8tezuAzsBUm7o
10,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,145955,True,{'isrc': 'USUM72414755'},{'spotify': 'https://open.spotify.com/track/0j...,https://api.spotify.com/v1/tracks/0j7d0CqZOkGP...,0j7d0CqZOkGPwAYloJTz79,False,Right Now,48,,4,track,spotify:track:0j7d0CqZOkGPwAYloJTz79
11,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,180062,True,{'isrc': 'USUM72413770'},{'spotify': 'https://open.spotify.com/track/6Y...,https://api.spotify.com/v1/tracks/6Y4tU3DeeWzG...,6Y4tU3DeeWzGa0eXpkVhKp,False,The Streets,47,,2,track,spotify:track:6Y4tU3DeeWzGa0eXpkVhKp


In [15]:
## Importing Credentials from Google Cloud

from google.cloud import storage
from google.oauth2 import service_account

CREDENTIALS = service_account.Credentials.from_service_account_file(penv.bq_path)
STORAGE = storage.Client(credentials=CREDENTIALS)

In [16]:
# Acessing Bucket Path

bucket = STORAGE.get_bucket(penv.bucket_path)

In [17]:
# Getting currentTimestamp

currentTimestamp = datetime.today().strftime('%Y-%m-%d %X')

# Adding currentTimestamp on file name, so it doesn't overwrite itself. 
# Also, it helps keep track on incremental models

file_name = f"spotify_api_test_data__{currentTimestamp}"

In [19]:
## Defining a function called avro_df_prep to prepare the dataframe for the Avro format

def avro_df_prep():

    # pip install fastavro

    from fastavro import writer, parse_schema

    # Converting all columns to string, because Avro doesn't support object type

    columns_to_convert = [  # Lista de colunas definidas no esquema Avro
        'album_type', 'external_urls', 'href',
       'id', 'images', 'name', 'release_date', 'release_date_precision',
       'type', 'uri', 'artists', 'restrictions']

    df[columns_to_convert] = df[columns_to_convert].astype(str)

    # Declaring dataframe schema

    schema = {
        'name': 'spotify'
        , 'type': 'record'
        , 'fields': [
                        {'name': 'album_type', 'type': 'string'}, 
                        {'name': 'total_tracks', 'type': 'int'}, 
                        {'name': 'is_playable', 'type': 'boolean'}, 
                        {'name': 'external_urls', 'type': 'string'},
                        {'name': 'id', 'type': 'string'},
                        {'name': 'images', 'type': 'string'},  
                        {'name': 'name', 'type': 'string'}, 
                        {'name': 'release_date', 'type': 'string'}, 
                        {'name': 'release_date_precision', 'type': 'string'}, 
                        {'name': 'href', 'type': 'string'}, 
                        {'name': 'type', 'type': 'string'}, 
                        {'name': 'uri', 'type': 'string'}, 
                        {'name': 'artists', 'type': 'string'}, 
                        {'name': 'restrictions', 'type': 'string'}, 
                    ]
    }

    parsed_schema = parse_schema(schema)
    records = df.to_dict('records')

    # Writing an Avro file on 'archive' directory

    with open(f'/home/tabas/personal-dev/pyprojects/pipelines/archive/{file_name}.avro', 'wb') as out:
       writer(out, parsed_schema, records)
            

In [None]:
avro_df_prep()

In [None]:
## Writing Dataframe to Bucket folder with desired file format 

file_formats = [
                'csv'
                , 'parquet'
                , 'json'
                , 'orc'
                , 'avro'
]

for i in range(len(file_formats)):
    
    blob = bucket.blob(f"{penv.bucket_folder}/{file_name}.{file_formats[i]}")
    
    if file_formats[i] == 'csv':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.to_csv(), '/text/csv')
        print("Sucessfully written in ", file_formats[i])
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'parquet':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.to_parquet(), '/text/plain')
        print("Sucessfully written in ", file_formats[i])
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'json':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.to_json(orient='table'), '/text/plain')
        print("Sucessfully written in ", file_formats[i])
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'orc':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.reset_index().to_orc(index=None), '/text/plain')
        print("Sucessfully written in ", file_formats[i])        
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'avro':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        avro_df_prep()
        blob.upload_from_filename(f'/home/tabas/personal-dev/pyprojects/pipelines/archive/{file_name}.avro', 'wb', '/text/plain')
        print("Sucessfully written in ", file_formats[i])        
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))