In [117]:
## Installing Spotify's API library in Python

# pip install spotipy

In [118]:
## Importing libraries

import sys
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from datetime import datetime

sys.path.append("/home/tabas/personal-dev/pyprojects")
import pipelines.utils.personal_env as penv

In [119]:
# Importing Spotify Credentials

CLIENT_ID = penv.spotify_client_id
CLIENT_SECRET = penv.spotify_client_secret

In [120]:
# Stablishing Spotify Authentication

auth_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [121]:
# Creating a list of all available markets on Spotify

markets = [
            "AD", "AE", "AG", "AL", "AM", "AO", "AR", "AT", "AU", "AZ", "BA", "BB", "BD", 
            "BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", 
            "BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", 
            "CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "ET", "FI", 
            "FJ", "FM", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", 
            "GW", "GY", "HK", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", 
            "IT", "JM", "JO", "JP", "KE", "KG", "KH", "KI", "KM", "KN", "KR", "KW", "KZ", 
            "LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", 
            "MD", "ME", "MG", "MH", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", 
            "MX", "MY", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NR", "NZ", "OM", 
            "PA", "PE", "PG", "PH", "PK", "PL", "PR", "PS", "PT", "PW", "PY", "QA", "RO", 
            "RS", "RW", "SA", "SB", "SC", "SE", "SG", "SI", "SK", "SL", "SM", "SN", "SR", 
            "ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TO", "TR", "TT", "TV", 
            "TW", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "VU", "WS", "XK", 
            "ZA", "ZM", "ZW"
           ]

In [122]:
## Creating empty DataFrame to append API values after request

albums = pd.DataFrame()
artists = pd.DataFrame()

In [None]:
## Creating loop to make GET Request
## The first request gets the list of new Albums released two weeks ago from each market defined above
## Then, it collects the ids of the Artists of each release and makes the second request, 
## which returns the Artist's data

for i in range(len(markets)):
    
    ## The Spotify only returns 50 values per request 
    # (the variables 'limit' and 'batchSize' helps Spotify not crash if the data exceeds )
    
    limit = 50
    offset = 0
    
    while offset < 1000:    # Spotify limit for Search Request is 1000
        
        ## Making GET request of the type search with the tag:'new', that returns the latest Albums

        newAlbums = sp.search(q="tag:new", market=markets[i], type="album", limit=limit, offset=offset)
        newAlbumData = pd.DataFrame.from_dict(newAlbums['albums']['items'])
        
        albums = pd.concat([albums, newAlbumData])
        albums['extractionTimestamp'] = datetime.today().strftime('%Y-%m-%d %X')
        
        # Here we're accessing the Artist ID to make the loop request below 
        
        artistsList =  newAlbumData['artists'].apply(lambda artists: [artist['id'] for artist in artists]).explode()
        batchSize = 50 # Spotify limit for Artist Request is 50

        for j in range(0, len(artistsList), batchSize):
                    
                artistsBatch = artistsList[j:j + batchSize]
                artistsData = sp.artists(artistsBatch.to_list())
                artistsData = pd.DataFrame.from_dict(artistsData['artists'])  
                artists = pd.concat([artists, artistsData])
                artists['extractionTimestamp'] = datetime.today().strftime('%Y-%m-%d %X')
        
        # Incremental addition to offset to return the following pages of data
        
        offset=offset+limit
        
    print("Successfully got request from ", markets[i], "market")

Successfully got request from  AD market
Successfully got request from  AE market


In [116]:
## Returning Album DataFrame
albums.head()

Unnamed: 0,album_type,total_tracks,is_playable,external_urls,href,id,images,name,release_date,release_date_precision,type,uri,artists,extractionTimestamp,image_url,album_href,artist_id,artist_name,album_uri,artist_uri
0,album,14,True,{'spotify': 'https://open.spotify.com/album/1F...,https://api.spotify.com/v1/albums/1FrWQryCHNC9...,1FrWQryCHNC95W7JwpKKiK,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",LowFlyer (Deluxe Edition),2025-01-30,day,artist,spotify:album:1FrWQryCHNC95W7JwpKKiK,[{'external_urls': {'spotify': 'https://open.s...,2025-01-29 22:43:49,https://i.scdn.co/image/ab67616d0000b2733337d6...,https://api.spotify.com/v1/albums/1FrWQryCHNC9...,22dFwJoRBV51ue5TGnC7Dt,Beny Jr,spotify:album:1FrWQryCHNC95W7JwpKKiK,spotify:artist:22dFwJoRBV51ue5TGnC7Dt
1,single,1,True,{'spotify': 'https://open.spotify.com/album/7C...,https://api.spotify.com/v1/albums/7CogroBOyfLM...,7CogroBOyfLMRTFvTR3AUX,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Many Men,2025-01-29,day,artist,spotify:album:7CogroBOyfLMRTFvTR3AUX,[{'external_urls': {'spotify': 'https://open.s...,2025-01-29 22:43:49,https://i.scdn.co/image/ab67616d00001e023337d6...,https://api.spotify.com/v1/albums/7CogroBOyfLM...,3J1MhhyXLJRNRZVrx11Lbf,Cyril Kamer,spotify:album:7CogroBOyfLMRTFvTR3AUX,spotify:artist:3J1MhhyXLJRNRZVrx11Lbf
2,album,12,True,{'spotify': 'https://open.spotify.com/album/5J...,https://api.spotify.com/v1/albums/5JXTdkPCK5xF...,5JXTdkPCK5xFywH7ROSLec,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",ESCUCHA / Zapada EN VIVO en UN POCO DE RUIDO,2025-01-28,day,artist,spotify:album:5JXTdkPCK5xFywH7ROSLec,[{'external_urls': {'spotify': 'https://open.s...,2025-01-29 22:43:49,https://i.scdn.co/image/ab67616d000048513337d6...,https://api.spotify.com/v1/albums/5JXTdkPCK5xF...,7yIp2QRLkQ6loIrupimiri,Pinky SD,spotify:album:5JXTdkPCK5xFywH7ROSLec,spotify:artist:7yIp2QRLkQ6loIrupimiri
3,single,1,True,{'spotify': 'https://open.spotify.com/album/3V...,https://api.spotify.com/v1/albums/3VLwV4o2WGAR...,3VLwV4o2WGARLpANnoPe57,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",But I'll Keep Trying,2025-01-29,day,artist,spotify:album:3VLwV4o2WGARLpANnoPe57,[{'external_urls': {'spotify': 'https://open.s...,2025-01-29 22:43:49,https://i.scdn.co/image/ab67616d0000b273d56489...,https://api.spotify.com/v1/albums/3VLwV4o2WGAR...,2Mt2vBBEckrvXtg0JldwZ0,Un Poco de Ruido,spotify:album:3VLwV4o2WGARLpANnoPe57,spotify:artist:2Mt2vBBEckrvXtg0JldwZ0
4,single,3,True,{'spotify': 'https://open.spotify.com/album/6Z...,https://api.spotify.com/v1/albums/6ZxZ4cvU5Fc3...,6ZxZ4cvU5Fc3IYmA1k8XNz,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",You're All I Got / So Long,2025-01-29,day,artist,spotify:album:6ZxZ4cvU5Fc3IYmA1k8XNz,[{'external_urls': {'spotify': 'https://open.s...,2025-01-29 22:43:49,https://i.scdn.co/image/ab67616d00001e02d56489...,https://api.spotify.com/v1/albums/6ZxZ4cvU5Fc3...,2U4caDhaqjDdjVg9Uyp0bp,Escucha!,spotify:album:6ZxZ4cvU5Fc3IYmA1k8XNz,spotify:artist:2U4caDhaqjDdjVg9Uyp0bp


In [109]:
## Treating some fields (renaming, exploding the dicts, etc)

albums['image_url'] = pd.json_normalize(albums['images'].explode())['url']
albums['album_href'] = albums['href'] 
albums['artist_id'] = pd.json_normalize(albums['artists'].explode())['id']
albums['artist_name'] = pd.json_normalize(albums['artists'].explode())['name']
albums['type'] = pd.json_normalize(albums['artists'].explode())['type']
albums['album_uri'] = albums['uri'] 
albums['artist_uri'] = pd.json_normalize(albums['artists'].explode())['uri']

In [111]:
## Returning Artists DataFrame
artists.head()

Unnamed: 0,external_urls,followers,genres,href,id,images,name,popularity,type,uri,extractionTimestamp
0,{'spotify': 'https://open.spotify.com/artist/2...,"{'href': None, 'total': 1658997}",[drill],https://api.spotify.com/v1/artists/22dFwJoRBV5...,22dFwJoRBV51ue5TGnC7Dt,[{'url': 'https://i.scdn.co/image/ab6761610000...,Beny Jr,75,artist,spotify:artist:22dFwJoRBV51ue5TGnC7Dt,2025-01-29 22:43:48
1,{'spotify': 'https://open.spotify.com/artist/3...,"{'href': None, 'total': 474743}",[drill],https://api.spotify.com/v1/artists/3J1MhhyXLJR...,3J1MhhyXLJRNRZVrx11Lbf,[{'url': 'https://i.scdn.co/image/ab6761610000...,Cyril Kamer,61,artist,spotify:artist:3J1MhhyXLJRNRZVrx11Lbf,2025-01-29 22:43:48
2,{'spotify': 'https://open.spotify.com/artist/7...,"{'href': None, 'total': 59062}","[cuarteto, cumbia]",https://api.spotify.com/v1/artists/7yIp2QRLkQ6...,7yIp2QRLkQ6loIrupimiri,[{'url': 'https://i.scdn.co/image/ab6761610000...,Pinky SD,77,artist,spotify:artist:7yIp2QRLkQ6loIrupimiri,2025-01-29 22:43:48
3,{'spotify': 'https://open.spotify.com/artist/2...,"{'href': None, 'total': 240241}","[cuarteto, cumbia]",https://api.spotify.com/v1/artists/2Mt2vBBEckr...,2Mt2vBBEckrvXtg0JldwZ0,[{'url': 'https://i.scdn.co/image/ab6761610000...,Un Poco de Ruido,77,artist,spotify:artist:2Mt2vBBEckrvXtg0JldwZ0,2025-01-29 22:43:48
4,{'spotify': 'https://open.spotify.com/artist/2...,"{'href': None, 'total': 111239}",[cumbia],https://api.spotify.com/v1/artists/2U4caDhaqjD...,2U4caDhaqjDdjVg9Uyp0bp,[{'url': 'https://i.scdn.co/image/ab6761610000...,Escucha!,46,artist,spotify:artist:2U4caDhaqjDdjVg9Uyp0bp,2025-01-29 22:43:48


In [112]:
artists.head()


Unnamed: 0,external_urls,followers,genres,href,id,images,name,popularity,type,uri,extractionTimestamp,image_url
0,https://open.spotify.com/artist/22dFwJoRBV51ue...,1658997,drill,https://api.spotify.com/v1/artists/22dFwJoRBV5...,22dFwJoRBV51ue5TGnC7Dt,[{'url': 'https://i.scdn.co/image/ab6761610000...,Beny Jr,75,artist,spotify:artist:22dFwJoRBV51ue5TGnC7Dt,2025-01-29 22:43:48,https://i.scdn.co/image/ab6761610000e5eb807f22...
1,https://open.spotify.com/artist/3J1MhhyXLJRNRZ...,474743,drill,https://api.spotify.com/v1/artists/3J1MhhyXLJR...,3J1MhhyXLJRNRZVrx11Lbf,[{'url': 'https://i.scdn.co/image/ab6761610000...,Cyril Kamer,61,artist,spotify:artist:3J1MhhyXLJRNRZVrx11Lbf,2025-01-29 22:43:48,https://i.scdn.co/image/ab67616100005174807f22...
2,https://open.spotify.com/artist/7yIp2QRLkQ6loI...,59062,"cuarteto, cumbia",https://api.spotify.com/v1/artists/7yIp2QRLkQ6...,7yIp2QRLkQ6loIrupimiri,[{'url': 'https://i.scdn.co/image/ab6761610000...,Pinky SD,77,artist,spotify:artist:7yIp2QRLkQ6loIrupimiri,2025-01-29 22:43:48,https://i.scdn.co/image/ab6761610000f178807f22...
3,https://open.spotify.com/artist/2Mt2vBBEckrvXt...,240241,"cuarteto, cumbia",https://api.spotify.com/v1/artists/2Mt2vBBEckr...,2Mt2vBBEckrvXtg0JldwZ0,[{'url': 'https://i.scdn.co/image/ab6761610000...,Un Poco de Ruido,77,artist,spotify:artist:2Mt2vBBEckrvXtg0JldwZ0,2025-01-29 22:43:48,https://i.scdn.co/image/ab6761610000e5eb78be17...
4,https://open.spotify.com/artist/2U4caDhaqjDdjV...,111239,cumbia,https://api.spotify.com/v1/artists/2U4caDhaqjD...,2U4caDhaqjDdjVg9Uyp0bp,[{'url': 'https://i.scdn.co/image/ab6761610000...,Escucha!,46,artist,spotify:artist:2U4caDhaqjDdjVg9Uyp0bp,2025-01-29 22:43:48,https://i.scdn.co/image/ab6761610000517478be17...


In [None]:
## Treating some fields (renaming, exploding the dicts, etc)

artists['genres'] = artists["genres"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)
artists['external_urls'] = pd.json_normalize(artists['external_urls'], max_level=1)['spotify']
artists['followers'] = pd.json_normalize(artists['followers'], max_level=1)['total']
artists['image_url'] = pd.json_normalize(artists['images'].explode())['url']

In [15]:
## Importing Credentials from Google Cloud

from google.cloud import storage
from google.oauth2 import service_account

CREDENTIALS = service_account.Credentials.from_service_account_file(penv.bq_path)
STORAGE = storage.Client(credentials=CREDENTIALS)

In [16]:
# Acessing Bucket Path

bucket = STORAGE.get_bucket(penv.bucket_path)

In [17]:
# Getting currentTimestamp

currentTimestamp = datetime.today().strftime('%Y-%m-%d %X')

# Adding currentTimestamp on file name, so it doesn't overwrite itself. 
# Also, it helps keep track on incremental models

file_name = f"spotify_api_test_data__{currentTimestamp}"

In [19]:
## Defining a function called avro_df_prep to prepare the dataframe for the Avro format

def avro_df_prep():

    # pip install fastavro

    from fastavro import writer, parse_schema

    # Converting all columns to string, because Avro doesn't support object type

    columns_to_convert = [  # Lista de colunas definidas no esquema Avro
        'album_type', 'external_urls', 'href',
       'id', 'images', 'name', 'release_date', 'release_date_precision',
       'type', 'uri', 'artists', 'restrictions']

    df[columns_to_convert] = df[columns_to_convert].astype(str)

    # Declaring dataframe schema

    schema = {
        'name': 'spotify'
        , 'type': 'record'
        , 'fields': [
                        {'name': 'album_type', 'type': 'string'}, 
                        {'name': 'total_tracks', 'type': 'int'}, 
                        {'name': 'is_playable', 'type': 'boolean'}, 
                        {'name': 'external_urls', 'type': 'string'},
                        {'name': 'id', 'type': 'string'},
                        {'name': 'images', 'type': 'string'},  
                        {'name': 'name', 'type': 'string'}, 
                        {'name': 'release_date', 'type': 'string'}, 
                        {'name': 'release_date_precision', 'type': 'string'}, 
                        {'name': 'href', 'type': 'string'}, 
                        {'name': 'type', 'type': 'string'}, 
                        {'name': 'uri', 'type': 'string'}, 
                        {'name': 'artists', 'type': 'string'}, 
                        {'name': 'restrictions', 'type': 'string'}, 
                    ]
    }

    parsed_schema = parse_schema(schema)
    records = df.to_dict('records')

    # Writing an Avro file on 'archive' directory

    with open(f'/home/tabas/personal-dev/pyprojects/pipelines/archive/{file_name}.avro', 'wb') as out:
       writer(out, parsed_schema, records)
            

In [35]:
avro_df_prep()

: 

In [None]:
## Writing Dataframe to Bucket folder with desired file format 

file_formats = [
                'csv'
                , 'parquet'
                , 'json'
                , 'orc'
                , 'avro'
]

for i in range(len(file_formats)):
    
    blob = bucket.blob(f"{penv.bucket_folder}/{file_name}.{file_formats[i]}")
    
    if file_formats[i] == 'csv':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.to_csv(), '/text/csv')
        print("Sucessfully written in ", file_formats[i])
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'parquet':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.to_parquet(), '/text/plain')
        print("Sucessfully written in ", file_formats[i])
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'json':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.to_json(orient='table'), '/text/plain')
        print("Sucessfully written in ", file_formats[i])
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'orc':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.reset_index().to_orc(index=None), '/text/plain')
        print("Sucessfully written in ", file_formats[i])        
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'avro':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        avro_df_prep()
        blob.upload_from_filename(f'/home/tabas/personal-dev/pyprojects/pipelines/archive/{file_name}.avro', 'wb', '/text/plain')
        print("Sucessfully written in ", file_formats[i])        
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))

## Increase in rows
#1st result

- Begin at:  2025-01-24 21:38:26
- Sucessfully written in  csv
- End at:  2025-01-24 21:38:48

- Begin at:  2025-01-24 21:38:48
- Sucessfully written in  parquet
- End at:  2025-01-24 21:38:51

- Begin at:  2025-01-24 21:38:51
- Sucessfully written in  json
- End at:  2025-01-24 21:39:17

- Begin at:  2025-01-24 21:39:17
- Sucessfully written in  orc
- End at:  2025-01-24 21:39:37

- Begin at:  2025-01-24 21:39:37
- Erro

#2nd result

- Begin at:  2025-01-24 23:04:22
- Sucessfully written in  csv
- End at:  2025-01-24 23:04:45
- Begin at:  2025-01-24 23:04:45
- Sucessfully written in  parquet
- End at:  2025-01-24 23:04:48
- Begin at:  2025-01-24 23:04:48
- Sucessfully written in  json
- End at:  2025-01-24 23:05:18
- Begin at:  2025-01-24 23:05:18
- Sucessfully written in  orc
- End at:  2025-01-24 23:05:39
- Begin at:  2025-01-24 23:05:39
- Sucessfully written in  avro
-End at:  2025-01-24 23:05:58


### 3rd
Begin at:  2025-01-27 14:59:22
Sucessfully written in  csv
End at:  2025-01-27 15:00:12
Begin at:  2025-01-27 15:00:12
Sucessfully written in  parquet
End at:  2025-01-27 15:00:19
Begin at:  2025-01-27 15:00:19
Sucessfully written in  json
End at:  2025-01-27 15:01:16
Begin at:  2025-01-27 15:01:16
Sucessfully written in  orc
End at:  2025-01-27 15:02:01
Begin at:  2025-01-27 15:02:01
Sucessfully written in  avro
End at:  2025-01-27 15:02:48


### 4th

Begin at:  2025-01-27 15:04:02
Sucessfully written in  csv
End at:  2025-01-27 15:05:40
Begin at:  2025-01-27 15:05:40
Sucessfully written in  parquet
End at:  2025-01-27 15:05:53
Begin at:  2025-01-27 15:05:53
Sucessfully written in  json
End at:  2025-01-27 15:07:37
Begin at:  2025-01-27 15:07:37
Sucessfully written in  orc
End at:  2025-01-27 15:09:00
Begin at:  2025-01-27 15:09:00
Sucessfully written in  avro
End at:  2025-01-27 15:10:29