In [1]:
## Installing Spotify's API library in Python

# pip install spotipy

In [2]:
## Importing libraries

import sys
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from datetime import datetime

sys.path.append("/home/tabas/personal-dev/pyprojects")
import pipelines.utils.personal_env as penv
import pipelines.utils.common as common

In [3]:
# Importing Spotify Credentials

CLIENT_ID = penv.spotify_client_id
CLIENT_SECRET = penv.spotify_client_secret

In [4]:
# Stablishing Spotify Authentication

auth_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [5]:
# Creating a list of all available markets on Spotify

markets = [
            "AD", "AE", "AG", "AL", #"AM", "AO", "AR", "AT", "AU", "AZ", "BA", "BB", "BD", 
            #"BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", 
            #"BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", 
            #"CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "ET", "FI", 
            #"FJ", "FM", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", 
            #"GW", "GY", "HK", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", 
            #"IT", "JM", "JO", "JP", "KE", "KG", "KH", "KI", "KM", "KN", "KR", "KW", "KZ", 
            #"LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", 
            #"MD", "ME", "MG", "MH", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", 
            #"MX", "MY", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NR", "NZ", "OM", 
            #"PA", "PE", "PG", "PH", "PK", "PL", "PR", "PS", "PT", "PW", "PY", "QA", "RO", 
            #"RS", "RW", "SA", "SB", "SC", "SE", "SG", "SI", "SK", "SL", "SM", "SN", "SR", 
            #"ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TO", "TR", "TT", "TV", 
            #"TW", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "VU", "WS", "XK", 
            #"ZA", "ZM", "ZW"
           ]

### Getting latest releases

In [6]:
## Creating empty DataFrame to append API values after request

releases = pd.DataFrame()

In [8]:
## Creating loop to make GET Request
## The first request gets the list of new Albums released two weeks ago from each market defined above
## Then, it collects the ids of the Artists of each release and makes the second request, 
## which returns the Artist's data

for i in range(len(markets)):
    
    ## The Spotify only returns 50 values per request 
    # (the variables 'limit' and 'batchSize' helps Spotify not crash if the data exceeds )
    
    limit = 50
    offset = 0
    
    while offset < 1000:    # Spotify limit for Search Request is 1000
        
        ## Making GET request of the type search with the tag:'new', that returns the latest Albums

        newReleases = sp.search(q="tag:new", market=markets[i], type="album", limit=limit, offset=offset)
        newReleasesData = pd.DataFrame.from_dict(newReleases['albums']['items'])
        
        releases = pd.concat([releases, newReleasesData])
        releases['extractionTimestamp'] = datetime.today().strftime('%Y-%m-%d %X')
        
        # Incremental addition to offset to return the following pages of data
        
        offset=offset+limit
        
    print("Successfully got request from ", markets[i], "market")

Successfully got request from  AD market
Successfully got request from  AE market
Successfully got request from  AG market
Successfully got request from  AL market


In [9]:
# Returning table info

releases.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7000 entries, 0 to 49
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   album_type              7000 non-null   object
 1   total_tracks            7000 non-null   int64 
 2   is_playable             7000 non-null   bool  
 3   external_urls           7000 non-null   object
 4   href                    7000 non-null   object
 5   id                      7000 non-null   object
 6   images                  7000 non-null   object
 7   name                    7000 non-null   object
 8   release_date            7000 non-null   object
 9   release_date_precision  7000 non-null   object
 10  type                    7000 non-null   object
 11  uri                     7000 non-null   object
 12  artists                 7000 non-null   object
 13  extractionTimestamp     7000 non-null   object
dtypes: bool(1), int64(1), object(12)
memory usage: 772.5+ KB


In [10]:
## Treating some fields (renaming, extracting values, etc)

releases['spotify_url'] = releases['external_urls'].apply(lambda x: x['spotify'] if isinstance(x, dict) else None)

# Here, we are maintaining the id, uri and href in lists to facilitate in case we need to use them on the requests below

releases['artist_uri'] = releases['artists'].apply(lambda artists: [artist['uri'] for artist in artists])
releases['artist_href'] = releases['artists'].apply(lambda artists: [artist['href'] for artist in artists])
releases['artist_id'] = releases['artists'].apply(lambda artists: [artist['id'] for artist in artists])


In [11]:
# Dropping unnecessary and/or treated columns

releases = releases.drop(columns=['external_urls', 'artists'])

# Ordering columns

releases = releases[[
                    'id', 
                    'href', 
                    'uri', 
                    'spotify_url', 
                    'album_type', 
                    'total_tracks', 
                    'is_playable', 
                    'name', 
                    'release_date', 
                    'release_date_precision', 
                    'type', 
                    'artist_id', 
                    'artist_href', 
                    'artist_uri', 
                    'images', 
                    'extractionTimestamp']
            ]

In [12]:
# Removing duplicated release id 

releases = releases[releases.duplicated(subset='id') == False]

In [13]:
releases.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1350 entries, 0 to 19
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      1350 non-null   object
 1   href                    1350 non-null   object
 2   uri                     1350 non-null   object
 3   spotify_url             1350 non-null   object
 4   album_type              1350 non-null   object
 5   total_tracks            1350 non-null   int64 
 6   is_playable             1350 non-null   bool  
 7   name                    1350 non-null   object
 8   release_date            1350 non-null   object
 9   release_date_precision  1350 non-null   object
 10  type                    1350 non-null   object
 11  artist_id               1350 non-null   object
 12  artist_href             1350 non-null   object
 13  artist_uri              1350 non-null   object
 14  images                  1350 non-null   object
 15  extractionT

### Getting artists 

In [14]:
## Creating empty DataFrame to append API values after request

artists = pd.DataFrame()

In [15]:
## Making request to GET Artists' Data

# Here we're accessing the Artist ID to make the loop request below 
# We transforme the arrays into sets to remove duplicates, and then convert it back to lists, so it can be used
# on the API request
        
artistsList =  list(set(releases['artist_id'].explode()))

batchSize = 50 # Spotify limit for Artist Request is 50
        
for j in range(0, len(artistsList), batchSize):
    artistsBatch = artistsList[j:j + batchSize]
    artistsData = sp.artists(artistsBatch)
    artistsData = pd.DataFrame.from_dict(artistsData['artists'])  
    artists = pd.concat([artists, artistsData])
    artists['extractionTimestamp'] = datetime.today().strftime('%Y-%m-%d %X')
    
    print("Successfully got ", round(((j + batchSize)/len(artistsList)) * 100, 2), "% of Artists")

Successfully got  2.72 % of Artists
Successfully got  5.45 % of Artists
Successfully got  8.17 % of Artists
Successfully got  10.9 % of Artists
Successfully got  13.62 % of Artists
Successfully got  16.35 % of Artists
Successfully got  19.07 % of Artists
Successfully got  21.8 % of Artists
Successfully got  24.52 % of Artists
Successfully got  27.25 % of Artists
Successfully got  29.97 % of Artists
Successfully got  32.7 % of Artists
Successfully got  35.42 % of Artists
Successfully got  38.15 % of Artists
Successfully got  40.87 % of Artists
Successfully got  43.6 % of Artists
Successfully got  46.32 % of Artists
Successfully got  49.05 % of Artists
Successfully got  51.77 % of Artists
Successfully got  54.5 % of Artists
Successfully got  57.22 % of Artists
Successfully got  59.95 % of Artists
Successfully got  62.67 % of Artists
Successfully got  65.4 % of Artists
Successfully got  68.12 % of Artists
Successfully got  70.84 % of Artists
Successfully got  73.57 % of Artists
Successful

In [16]:
artists.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1835 entries, 0 to 34
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   external_urls        1835 non-null   object
 1   followers            1835 non-null   object
 2   genres               1835 non-null   object
 3   href                 1835 non-null   object
 4   id                   1835 non-null   object
 5   images               1835 non-null   object
 6   name                 1835 non-null   object
 7   popularity           1835 non-null   int64 
 8   type                 1835 non-null   object
 9   uri                  1835 non-null   object
 10  extractionTimestamp  1835 non-null   object
dtypes: int64(1), object(10)
memory usage: 172.0+ KB


In [17]:
## Returning Artists DataFrame

artists.head()

Unnamed: 0,external_urls,followers,genres,href,id,images,name,popularity,type,uri,extractionTimestamp
0,{'spotify': 'https://open.spotify.com/artist/1...,"{'href': None, 'total': 6}",[],https://api.spotify.com/v1/artists/1Xtrmi0VIsV...,1Xtrmi0VIsVC39iCWjukKl,[{'url': 'https://i.scdn.co/image/ab67616d0000...,SHAITO,2,artist,spotify:artist:1Xtrmi0VIsVC39iCWjukKl,2025-02-07 13:50:28
1,{'spotify': 'https://open.spotify.com/artist/5...,"{'href': None, 'total': 40964}",[german hip hop],https://api.spotify.com/v1/artists/5WeAkTAbC4T...,5WeAkTAbC4TnkTYGI0ifTr,[{'url': 'https://i.scdn.co/image/ab6761610000...,AssiH,52,artist,spotify:artist:5WeAkTAbC4TnkTYGI0ifTr,2025-02-07 13:50:28
2,{'spotify': 'https://open.spotify.com/artist/4...,"{'href': None, 'total': 139983}","[medieval metal, medieval, folk rock, folk met...",https://api.spotify.com/v1/artists/4tGxWZaAkt5...,4tGxWZaAkt50t9VZxSxOGZ,[{'url': 'https://i.scdn.co/image/ab6761610000...,Versengold,50,artist,spotify:artist:4tGxWZaAkt50t9VZxSxOGZ,2025-02-07 13:50:28
3,{'spotify': 'https://open.spotify.com/artist/6...,"{'href': None, 'total': 42774}",[sovietwave],https://api.spotify.com/v1/artists/6oiinpr7kwR...,6oiinpr7kwRmyWKVDTZgRT,[{'url': 'https://i.scdn.co/image/ab6761610000...,AP$ENT,47,artist,spotify:artist:6oiinpr7kwRmyWKVDTZgRT,2025-02-07 13:50:28
4,{'spotify': 'https://open.spotify.com/artist/0...,"{'href': None, 'total': 19252}","[agronejo, funk pop, brazilian pop]",https://api.spotify.com/v1/artists/0Otd1ReHJtV...,0Otd1ReHJtVAzwAuRj09Zg,[{'url': 'https://i.scdn.co/image/ab6761610000...,HITMAKER,65,artist,spotify:artist:0Otd1ReHJtVAzwAuRj09Zg,2025-02-07 13:50:28


In [18]:
## Treating some fields (renaming, exploding the dicts, etc)

artists['spotify_url'] = artists['external_urls'].apply(lambda x: x['spotify'] if isinstance(x, dict) else None)
artists['followers'] = artists['followers'].apply(lambda x: x['total'] if isinstance(x, dict) else None)


In [19]:
# Dropping unnecessary and/or treated columns

artists = artists.drop(columns=['external_urls'])

# Ordering columns

artists = artists[[
                    'id', 
                    'href', 
                    'uri', 
                    'spotify_url', 
                    'type',
                    'name', 
                    'followers', 
                    'popularity', 
                    'genres', 
                    'images', 
                    'extractionTimestamp']
]

In [20]:
# Removing duplicated release id 

artists = artists[artists.duplicated(subset='id') == False]

In [21]:
artists.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1835 entries, 0 to 34
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   1835 non-null   object
 1   href                 1835 non-null   object
 2   uri                  1835 non-null   object
 3   spotify_url          1835 non-null   object
 4   type                 1835 non-null   object
 5   name                 1835 non-null   object
 6   followers            1835 non-null   int64 
 7   popularity           1835 non-null   int64 
 8   genres               1835 non-null   object
 9   images               1835 non-null   object
 10  extractionTimestamp  1835 non-null   object
dtypes: int64(2), object(9)
memory usage: 172.0+ KB


### Getting albums 

In [22]:
## Let's get more data on this script
## Now that we have New Releases information, we can make a request to collect further information about albums and tracks
## Note that a new release can be an album or a single.
## So, to get more data, we're going to make a Request to return more informations about albums (like tracks, for example)
## And then, we are going to make a final request to return information about all tracks (single releases and tracks from the albums)

In [23]:
## Creating empty DataFrame to append API values after request

albums = pd.DataFrame()

In [24]:
# Again, using set and list to make the Album request below

albumsList = list(set(releases['id']))

# Redefining batchSize variable

batchSize = 20 # Spotify limit for Album Request is 20

for k in range(0, len(albumsList), batchSize):
    
    albumsData = sp.albums(albumsList[k:k + batchSize])
    albumsData = pd.DataFrame.from_dict(albumsData['albums'])
    albums = pd.concat([albums, albumsData])
    albums['extractionTimestamp'] = datetime.today().strftime('%Y-%m-%d %X')
    
    print("Successfully got ", round((k + batchSize) / len(albumsList) * 100, 2), "% of albums")

Successfully got  1.48 % of albums
Successfully got  2.96 % of albums
Successfully got  4.44 % of albums
Successfully got  5.93 % of albums
Successfully got  7.41 % of albums
Successfully got  8.89 % of albums
Successfully got  10.37 % of albums
Successfully got  11.85 % of albums
Successfully got  13.33 % of albums
Successfully got  14.81 % of albums
Successfully got  16.3 % of albums
Successfully got  17.78 % of albums
Successfully got  19.26 % of albums
Successfully got  20.74 % of albums
Successfully got  22.22 % of albums
Successfully got  23.7 % of albums
Successfully got  25.19 % of albums
Successfully got  26.67 % of albums
Successfully got  28.15 % of albums
Successfully got  29.63 % of albums
Successfully got  31.11 % of albums
Successfully got  32.59 % of albums
Successfully got  34.07 % of albums
Successfully got  35.56 % of albums
Successfully got  37.04 % of albums
Successfully got  38.52 % of albums
Successfully got  40.0 % of albums
Successfully got  41.48 % of albums
S

In [25]:
albums.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1350 entries, 0 to 9
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   album_type              1350 non-null   object
 1   total_tracks            1350 non-null   int64 
 2   available_markets       1350 non-null   object
 3   external_urls           1350 non-null   object
 4   href                    1350 non-null   object
 5   id                      1350 non-null   object
 6   images                  1350 non-null   object
 7   name                    1350 non-null   object
 8   release_date            1350 non-null   object
 9   release_date_precision  1350 non-null   object
 10  type                    1350 non-null   object
 11  uri                     1350 non-null   object
 12  artists                 1350 non-null   object
 13  tracks                  1350 non-null   object
 14  copyrights              1350 non-null   object
 15  external_ids

In [26]:
albums.head()

Unnamed: 0,album_type,total_tracks,available_markets,external_urls,href,id,images,name,release_date,release_date_precision,type,uri,artists,tracks,copyrights,external_ids,genres,label,popularity,extractionTimestamp
0,single,1,"[AE, AM, AU, AZ, BD, BG, BH, BI, BN, BT, BW, B...",{'spotify': 'https://open.spotify.com/album/6P...,https://api.spotify.com/v1/albums/6PmwuEVcqaW1...,6PmwuEVcqaW1UH54ixrFMc,[{'url': 'https://i.scdn.co/image/ab67616d0000...,хотел,2025-02-07,day,album,spotify:album:6PmwuEVcqaW1UH54ixrFMc,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/6P...,"[{'text': '2025 eclipse media', 'type': 'C'}, ...",{'upc': '3617390084199'},[],eclipse media,0,2025-02-07 13:50:56
1,single,1,"[AE, AM, AU, AZ, BD, BG, BH, BI, BN, BT, BW, B...",{'spotify': 'https://open.spotify.com/album/1P...,https://api.spotify.com/v1/albums/1Pb2Tpw2Jc9Y...,1Pb2Tpw2Jc9YrfhEVefIcy,[{'url': 'https://i.scdn.co/image/ab67616d0000...,moments,2025-02-07,day,album,spotify:album:1Pb2Tpw2Jc9YrfhEVefIcy,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/1P...,"[{'text': '2025 leadwave, under exclusive lice...",{'upc': '1963622893766'},[],leadwave,0,2025-02-07 13:50:56
2,single,2,"[AE, AM, AU, AZ, BD, BG, BH, BI, BN, BT, BW, B...",{'spotify': 'https://open.spotify.com/album/4D...,https://api.spotify.com/v1/albums/4DSuRiOJnsan...,4DSuRiOJnsand0Rg6q4iR9,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Faded,2025-02-07,day,album,spotify:album:4DSuRiOJnsand0Rg6q4iR9,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/4D...,"[{'text': '2025 DXRTYLVND', 'type': 'C'}, {'te...",{'upc': '3617668607563'},[],DXRTYLVND,0,2025-02-07 13:50:56
3,single,2,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",{'spotify': 'https://open.spotify.com/album/6N...,https://api.spotify.com/v1/albums/6NTARpim6395...,6NTARpim6395oKlxlLDW7c,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Spotify Singles,2025-01-29,day,album,spotify:album:6NTARpim6395oKlxlLDW7c,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/6N...,[{'text': '(P) 2025 Sony Music Entertainment U...,{'upc': '196872677799'},[],RCA Records Label,59,2025-02-07 13:50:56
4,single,4,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",{'spotify': 'https://open.spotify.com/album/6K...,https://api.spotify.com/v1/albums/6KvS0wryQ6S9...,6KvS0wryQ6S97FevM2z2JW,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Montagem Superdimesional 1.0,2025-02-06,day,album,spotify:album:6KvS0wryQ6S97FevM2z2JW,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/6K...,"[{'text': '2025 Launch13', 'type': 'C'}, {'tex...",{'upc': '021865587352'},[],Launch13,0,2025-02-07 13:50:56


In [27]:
## Treating some fields (renaming, extracting values, etc)

albums['spotify_url'] = albums['external_urls'].apply(lambda x: x['spotify'] if isinstance(x, dict) else None)

albums['artist_id'] = albums['artists'].apply(lambda artists: [artist['id'] for artist in artists])
albums['artist_href'] = albums['artists'].apply(lambda artists: [artist['href'] for artist in artists])
albums['artist_uri'] = albums['artists'].apply(lambda artists: [artist['uri'] for artist in artists])

## To extract the track id we'll have to:

# 1) Extract the key 'items' inside the dictionary

items = albums['tracks'].apply(lambda x: x['items'] if isinstance(x, dict) else [])

# 2) Extract the id inside the list

albums['track_id'] = items.apply(lambda x: [track['id'] for track in x] if isinstance(x, list) else [])
albums['track_href'] = items.apply(lambda x: [track['href'] for track in x] if isinstance(x, list) else [])
albums['track_uri'] = items.apply(lambda x: [track['uri'] for track in x] if isinstance(x, list) else [])

In [28]:
# Removing unnecessary and/or treated columns

albums = albums.drop(columns=['external_urls', 'artists', 'tracks', 'genres']) 

# Ordering columns

albums = albums[[
                'id', 
                'href',
                'uri',
                'spotify_url',
                'album_type',
                'total_tracks',  
                'name', 
                'available_markets',
                'release_date', 
                'release_date_precision', 
                'type', 
                'artist_id', 
                'artist_href', 
                'artist_uri', 
                'track_id',
                'track_href',
                'track_uri',
                'popularity',
                'label',
                'copyrights',
                'external_ids',                
                'images', 
                'extractionTimestamp']
            ]


In [29]:
# Removing duplicated release di 

albums = albums[albums.duplicated(subset='id') == False]

In [30]:
albums.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1350 entries, 0 to 9
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      1350 non-null   object
 1   href                    1350 non-null   object
 2   uri                     1350 non-null   object
 3   spotify_url             1350 non-null   object
 4   album_type              1350 non-null   object
 5   total_tracks            1350 non-null   int64 
 6   name                    1350 non-null   object
 7   available_markets       1350 non-null   object
 8   release_date            1350 non-null   object
 9   release_date_precision  1350 non-null   object
 10  type                    1350 non-null   object
 11  artist_id               1350 non-null   object
 12  artist_href             1350 non-null   object
 13  artist_uri              1350 non-null   object
 14  track_id                1350 non-null   object
 15  track_href  

### Getting Tracks

In [31]:
## Now, we are going to get all tracks data
## Spotify provides general informations about Tracks, Audio Features and Audio Analysis

In [32]:
## Creating dataframe to append data

tracks = pd.DataFrame()

In [33]:
# Creating list of Track ids

tracksList = list(set(albums['track_id'].explode()))

# Redefining batchSize variable

batchSize = 50 # Spotify limit for Track Request is 50

for l in range(0, len(tracksList), batchSize):
    tracksData = sp.tracks(tracksList[l:l + batchSize])
    tracksData = pd.DataFrame.from_dict(tracksData['tracks'])
    
    tracks = pd.concat([tracks, tracksData])
    
    print("Successfully got ", round((l + batchSize) / len(tracksList) * 100, 2), "% of tracks")

Successfully got  1.09 % of tracks
Successfully got  2.18 % of tracks
Successfully got  3.27 % of tracks
Successfully got  4.36 % of tracks
Successfully got  5.45 % of tracks
Successfully got  6.54 % of tracks
Successfully got  7.63 % of tracks
Successfully got  8.72 % of tracks
Successfully got  9.81 % of tracks
Successfully got  10.91 % of tracks
Successfully got  12.0 % of tracks
Successfully got  13.09 % of tracks
Successfully got  14.18 % of tracks
Successfully got  15.27 % of tracks
Successfully got  16.36 % of tracks
Successfully got  17.45 % of tracks
Successfully got  18.54 % of tracks
Successfully got  19.63 % of tracks
Successfully got  20.72 % of tracks
Successfully got  21.81 % of tracks
Successfully got  22.9 % of tracks
Successfully got  23.99 % of tracks
Successfully got  25.08 % of tracks
Successfully got  26.17 % of tracks
Successfully got  27.26 % of tracks
Successfully got  28.35 % of tracks
Successfully got  29.44 % of tracks
Successfully got  30.53 % of tracks
Suc

In [34]:
tracks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4585 entries, 0 to 34
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   album              4585 non-null   object
 1   artists            4585 non-null   object
 2   available_markets  4585 non-null   object
 3   disc_number        4585 non-null   int64 
 4   duration_ms        4585 non-null   int64 
 5   explicit           4585 non-null   bool  
 6   external_ids       4585 non-null   object
 7   external_urls      4585 non-null   object
 8   href               4585 non-null   object
 9   id                 4585 non-null   object
 10  is_local           4585 non-null   bool  
 11  name               4585 non-null   object
 12  popularity         4585 non-null   int64 
 13  preview_url        0 non-null      object
 14  track_number       4585 non-null   int64 
 15  type               4585 non-null   object
 16  uri                4585 non-null   object
dtypes:

In [35]:
tracks.head()

Unnamed: 0,album,artists,available_markets,disc_number,duration_ms,explicit,external_ids,external_urls,href,id,is_local,name,popularity,preview_url,track_number,type,uri
0,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,166153,True,{'isrc': 'GXBAV2593145'},{'spotify': 'https://open.spotify.com/track/7a...,https://api.spotify.com/v1/tracks/7ayiyKpksc3C...,7ayiyKpksc3Cug0Jad91Oa,False,Zn Batedora - Super Slowed,0,,3,track,spotify:track:7ayiyKpksc3Cug0Jad91Oa
1,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AE, AM, AU, AZ, BD, BG, BH, BI, BN, BT, BW, B...",1,169625,False,{'isrc': 'CA5KR2513338'},{'spotify': 'https://open.spotify.com/track/38...,https://api.spotify.com/v1/tracks/386aqNtiEmvL...,386aqNtiEmvLX3XSDzOT7w,False,Montagem Secret Place - Ultra Slowed,0,,4,track,spotify:track:386aqNtiEmvLX3XSDzOT7w
2,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AE, AM, AU, AZ, BD, BG, BH, BI, BN, BT, BW, B...",1,169524,False,{'isrc': 'FR2X42527188'},{'spotify': 'https://open.spotify.com/track/0i...,https://api.spotify.com/v1/tracks/0iVomfljYfob...,0iVomfljYfobwGrDchEo8K,False,Svirači,0,,12,track,spotify:track:0iVomfljYfobwGrDchEo8K
3,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",1,137319,True,{'isrc': 'USUYG1598788'},{'spotify': 'https://open.spotify.com/track/5e...,https://api.spotify.com/v1/tracks/5eIUx4oIjcnV...,5eIUx4oIjcnVttj9WUsp17,False,Rap War,0,,10,track,spotify:track:5eIUx4oIjcnVttj9WUsp17
4,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,198750,False,{'isrc': 'CYA112400032'},{'spotify': 'https://open.spotify.com/track/6M...,https://api.spotify.com/v1/tracks/6M5JL2JSOjLI...,6M5JL2JSOjLIZQa0Q5enm9,False,I Follow Rivers (feat. Oaks),60,,1,track,spotify:track:6M5JL2JSOjLIZQa0Q5enm9


In [36]:
tracks['spotify_url'] = tracks['external_urls'].apply(lambda x: x['spotify'] if isinstance(x, dict) else None)


tracks['artists_id'] = tracks['artists'].apply(lambda artists: [artist['id'] for artist in artists])
tracks['artists_href'] = tracks['artists'].apply(lambda artists: [artist['href'] for artist in artists])
tracks['artists_uri'] = tracks['artists'].apply(lambda artists: [artist['uri'] for artist in artists])

tracks['album_id'] = tracks['album'].apply(lambda x: x['id'] if isinstance(x, dict) else [])
tracks['album_href'] = tracks['album'].apply(lambda x: x['href'] if isinstance(x, dict) else [])
tracks['album_uri'] = tracks['album'].apply(lambda x: x['uri'] if isinstance(x, dict) else [])

In [37]:
tracks = tracks.drop(columns = ['album', 'artists', 'external_urls'])

tracks = tracks[[
                'id',
                'href',
                'uri',
                'spotify_url',
                'artists_id',
                'artists_href',
                'artists_uri',
                'album_id',
                'album_href',
                'album_uri',
                'available_markets',
                'disc_number',
                'duration_ms',
                'explicit',
                'external_ids',]           
    ]

### Unloading Data

In [38]:
## Importing Credentials from Google Cloud

from google.cloud import storage
from google.oauth2 import service_account

CREDENTIALS = service_account.Credentials.from_service_account_file(penv.bq_path)
STORAGE = storage.Client(credentials=CREDENTIALS)

In [39]:
## Defining variables to execute unload_data()

origin = 'spotify'
bucket = STORAGE.get_bucket(penv.bucket_path)
bucket_folder = penv.bucket_folder+"/sp"
file_formats = ['parquet']
df_dict = {
        'releases': releases,
        'artists': artists,
        'albums': albums,
        'tracks': tracks    
}

In [40]:
## Writing Dataframe to Bucket folder with desired file format 

common.unload_data(file_formats, df_dict, bucket, bucket_folder)


Begin at:  2025-02-07 13:51:58
Sucessfully written  releases  in  parquet
End at:  2025-02-07 13:51:59

Begin at:  2025-02-07 13:51:59
Sucessfully written  artists  in  parquet
End at:  2025-02-07 13:52:00

Begin at:  2025-02-07 13:52:00
Sucessfully written  albums  in  parquet
End at:  2025-02-07 13:52:01

Begin at:  2025-02-07 13:52:01
Sucessfully written  tracks  in  parquet
End at:  2025-02-07 13:52:03


In [41]:
## This section is focused on understanding the data to find if further treatment is necessary
## Throuhgout the notebook, I've made some treatments base on what I've seen executiong the commands here in the notebook
## , but now I'm going to bring each dataframe (releases, artists, albums, tracks) to BigQuery and do some querying there

In [42]:
""" Releases (before removing duplicated values)

The releases data has the same structure as the album's data. Although the later has additional info, such as popularity
  , available markets, label, copyrights, and others.

Run made 06/02/2025 10:05

- album_type: 'single', 'album', 'compilation';
- release_date: minimum value is 2 weeks before today, maximum value is one day ahead;
- release_date_precision: take only value 'day';
- type: take only value 'album';


- Querying the distinct data, we get 1428 unique releas1es and 1507 unique artists;
SELECT 
  COUNT(DISTINCT id) AS qtd_release
  , COUNT(DISTINCT element) AS qtd_artists
FROM df_releases` 
  , UNNEST(artist_id.list) AS art_id
LIMIT 1000

- Checking if duplicated ids have indeed all the same values on all the columns id = '0hYiEucmpqclKoXxkBiIPS' as example) 
SELECT 
  *
FROM `tabas-dw-stg.stg_juliana.test_releases` 
  , UNNEST(artist_id.list) AS art_id
WHERE id = '0hYiEucmpqclKoXxkBiIPS' 

"""

" Releases (before removing duplicated values)\n\nThe releases data has the same structure as the album's data. Although the later has additional info, such as popularity\n  , available markets, label, copyrights, and others.\n\nRun made 06/02/2025 10:05\n\n- album_type: 'single', 'album', 'compilation';\n- release_date: minimum value is 2 weeks before today, maximum value is one day ahead;\n- release_date_precision: take only value 'day';\n- type: take only value 'album';\n\n\n- Querying the distinct data, we get 1428 unique releas1es and 1507 unique artists;\nSELECT \n  COUNT(DISTINCT id) AS qtd_release\n  , COUNT(DISTINCT element) AS qtd_artists\nFROM df_releases` \n  , UNNEST(artist_id.list) AS art_id\nLIMIT 1000\n\n- Checking if duplicated ids have indeed all the same values on all the columns id = '0hYiEucmpqclKoXxkBiIPS' as example) \nSELECT \n  *\nFROM `tabas-dw-stg.stg_juliana.test_releases` \n  , UNNEST(artist_id.list) AS art_id\nWHERE id = '0hYiEucmpqclKoXxkBiIPS' \n\n"

In [43]:
""" Artists 

Run made 06/02/2025 10:07

- genres: list of values - initially, I treated this field extracting the values from the list and maintaining 
        them solely on comma separated values, but on BigQuery it's possible to unnest values easily, so I 
        changed it back to original format;
- type: take only value 'artist';
- followers: total followers the artist has on Spotify profile - initially, the amount was divergent from what it was
        being shown on Spotify's page. I used json_normalize() to treat this field, but on investigation I've discovered 
        that it was atributing the wrong values. Using lambda function, it worked;
- popularity: goes from 0 to 100 - the artist's popularity is calculated from the popularity of all the artist's tracks.

- After running artists dataframe, we realize that it returned 1507 artists (as expected from above).

"""

" Artists \n\nRun made 06/02/2025 10:07\n\n- genres: list of values - initially, I treated this field extracting the values from the list and maintaining \n        them solely on comma separated values, but on BigQuery it's possible to unnest values easily, so I \n        changed it back to original format;\n- type: take only value 'artist';\n- followers: total followers the artist has on Spotify profile - initially, the amount was divergent from what it was\n        being shown on Spotify's page. I used json_normalize() to treat this field, but on investigation I've discovered \n        that it was atributing the wrong values. Using lambda function, it worked;\n- popularity: goes from 0 to 100 - the artist's popularity is calculated from the popularity of all the artist's tracks.\n\n- After running artists dataframe, we realize that it returned 1507 artists (as expected from above).\n\n"

In [44]:
""" Albums 

Run made 06/02/2025 11:07

- track_ids: Due to same issue of json_normalize() mentioned above, this field was returning a lot of duplicated
            values. Once fixed with lambda functions, it is consistent.

- After running artists dataframe, we realize that it returned 1428 albums (as expected from above).

"""

' Albums \n\nRun made 06/02/2025 11:07\n\n- track_ids: Due to same issue of json_normalize() mentioned above, this field was returning a lot of duplicated\n            values. Once fixed with lambda functions, it is consistent.\n\n- After running artists dataframe, we realize that it returned 1428 albums (as expected from above).\n\n'