In [117]:
## Installing Spotify's API library in Python

# pip install spotipy

In [63]:
## Importing libraries

import sys
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from datetime import datetime

sys.path.append("/home/tabas/personal-dev/pyprojects")
import pipelines.utils.personal_env as penv

In [64]:
# Importing Spotify Credentials

CLIENT_ID = penv.spotify_client_id
CLIENT_SECRET = penv.spotify_client_secret

In [65]:
# Stablishing Spotify Authentication

auth_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [131]:
# Creating a list of all available markets on Spotify

markets = [
            "AD", "AE", "AG", "AL", #"AM", "AO", "AR", "AT", "AU", "AZ", "BA", "BB", "BD", 
            #"BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", 
            #"BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", 
            #"CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "ET", "FI", 
            #"FJ", "FM", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", 
            #"GW", "GY", "HK", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", 
            #"IT", "JM", "JO", "JP", "KE", "KG", "KH", "KI", "KM", "KN", "KR", "KW", "KZ", 
            #"LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", 
            #"MD", "ME", "MG", "MH", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", 
            #"MX", "MY", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NR", "NZ", "OM", 
            #"PA", "PE", "PG", "PH", "PK", "PL", "PR", "PS", "PT", "PW", "PY", "QA", "RO", 
            #"RS", "RW", "SA", "SB", "SC", "SE", "SG", "SI", "SK", "SL", "SM", "SN", "SR", 
            #"ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TO", "TR", "TT", "TV", 
            #"TW", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "VU", "WS", "XK", 
            #"ZA", "ZM", "ZW"
           ]

In [132]:
## Creating empty DataFrame to append API values after request

releases = pd.DataFrame()

In [134]:
## Creating loop to make GET Request
## The first request gets the list of new Albums released two weeks ago from each market defined above
## Then, it collects the ids of the Artists of each release and makes the second request, 
## which returns the Artist's data

for i in range(len(markets)):
    
    ## The Spotify only returns 50 values per request 
    # (the variables 'limit' and 'batchSize' helps Spotify not crash if the data exceeds )
    
    limit = 50
    offset = 0
    
    while offset < 1000:    # Spotify limit for Search Request is 1000
        
        ## Making GET request of the type search with the tag:'new', that returns the latest Albums

        newReleases = sp.search(q="tag:new", market=markets[i], type="album", limit=limit, offset=offset)
        newReleasesData = pd.DataFrame.from_dict(newReleases['albums']['items'])
        
        releases = pd.concat([releases, newReleasesData])
        releases['extractionTimestamp'] = datetime.today().strftime('%Y-%m-%d %X')
        
        # Incremental addition to offset to return the following pages of data
        
        offset=offset+limit
        
    print("Successfully got request from ", markets[i], "market")

Successfully got request from  AD market
Successfully got request from  AE market
Successfully got request from  AG market
Successfully got request from  AL market


In [135]:
releases.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4050 entries, 0 to 49
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   album_type              4050 non-null   object
 1   total_tracks            4050 non-null   int64 
 2   is_playable             4050 non-null   bool  
 3   external_urls           4050 non-null   object
 4   href                    4050 non-null   object
 5   id                      4050 non-null   object
 6   images                  4050 non-null   object
 7   name                    4050 non-null   object
 8   release_date            4050 non-null   object
 9   release_date_precision  4050 non-null   object
 10  type                    4050 non-null   object
 11  uri                     4050 non-null   object
 12  artists                 4050 non-null   object
 13  extractionTimestamp     4050 non-null   object
dtypes: bool(1), int64(1), object(12)
memory usage: 446.9+ KB


In [136]:
## Returning Album DataFrame
releases.head()

Unnamed: 0,album_type,total_tracks,is_playable,external_urls,href,id,images,name,release_date,release_date_precision,type,uri,artists,extractionTimestamp
0,single,1,True,{'spotify': 'https://open.spotify.com/album/7x...,https://api.spotify.com/v1/albums/7xvvdbJMMemW...,7xvvdbJMMemW3p8CMnwwl3,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",MANSON,2025-01-30,day,album,spotify:album:7xvvdbJMMemW3p8CMnwwl3,[{'external_urls': {'spotify': 'https://open.s...,2025-01-31 15:21:34
1,single,1,True,{'spotify': 'https://open.spotify.com/album/3l...,https://api.spotify.com/v1/albums/3lMzoJHRGENY...,3lMzoJHRGENYD7FTa6Dvkj,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Muzica sa cânte,2025-01-31,day,album,spotify:album:3lMzoJHRGENYD7FTa6Dvkj,[{'external_urls': {'spotify': 'https://open.s...,2025-01-31 15:21:34
2,single,1,True,{'spotify': 'https://open.spotify.com/album/5S...,https://api.spotify.com/v1/albums/5S1yN6tfAihj...,5S1yN6tfAihjD7OIE2l68n,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Omul potrivit,2025-01-31,day,album,spotify:album:5S1yN6tfAihjD7OIE2l68n,[{'external_urls': {'spotify': 'https://open.s...,2025-01-31 15:21:34
3,single,1,True,{'spotify': 'https://open.spotify.com/album/5N...,https://api.spotify.com/v1/albums/5N5DahPumBZH...,5N5DahPumBZHZ7aGcuSSsB,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Hai lume cu Chiuitul,2025-01-31,day,album,spotify:album:5N5DahPumBZHZ7aGcuSSsB,[{'external_urls': {'spotify': 'https://open.s...,2025-01-31 15:21:34
4,single,1,True,{'spotify': 'https://open.spotify.com/album/1q...,https://api.spotify.com/v1/albums/1qerYFWbBF5T...,1qerYFWbBF5TIfmJRuCbfn,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Un frate daca ai,2025-01-31,day,album,spotify:album:1qerYFWbBF5TIfmJRuCbfn,[{'external_urls': {'spotify': 'https://open.s...,2025-01-31 15:21:34


In [137]:
## Treating some fields (renaming, exploding the dicts, etc)

releases['image_url'] = pd.json_normalize(releases['images'].explode())['url']
releases['album_href'] = releases['href'] 
releases['artist_id'] = releases['artists'].apply(lambda artists: [artist['id'] for artist in artists])
releases['artist_name'] = releases['artists'].apply(lambda artists: [artist['name'] for artist in artists])
releases['album_type'] = releases['type']
releases['artist_type'] = releases['artists'].apply(lambda artists: [artist['type'] for artist in artists])
releases['album_uri'] = releases['uri'] 
releases['artist_uri'] = releases['artists'].apply(lambda artists: [artist['uri'] for artist in artists])
releases['artist_href'] = releases['artists'].apply(lambda artists: [artist['href'] for artist in artists])

In [138]:
releases.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4050 entries, 0 to 49
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   album_type              4050 non-null   object
 1   total_tracks            4050 non-null   int64 
 2   is_playable             4050 non-null   bool  
 3   external_urls           4050 non-null   object
 4   href                    4050 non-null   object
 5   id                      4050 non-null   object
 6   images                  4050 non-null   object
 7   name                    4050 non-null   object
 8   release_date            4050 non-null   object
 9   release_date_precision  4050 non-null   object
 10  type                    4050 non-null   object
 11  uri                     4050 non-null   object
 12  artists                 4050 non-null   object
 13  extractionTimestamp     4050 non-null   object
 14  image_url               4050 non-null   object
 15  album_href 

In [139]:
## Creating empty DataFrame to append API values after request

artists = pd.DataFrame()

In [140]:
## Making request to GET Artists' Data

# Here we're accessing the Artist ID to make the loop request below 
# We transforme the arrays into sets to remove duplicates, and then convert it back to lists, so it can be used
# on the API request
        
artistsList =  list(set(releases['artist_id'].explode()))

batchSize = 50 # Spotify limit for Artist Request is 50
        
for j in range(0, len(artistsList), batchSize):
    artistsBatch = artistsList[j:j + batchSize]
    artistsData = sp.artists(artistsBatch)
    artistsData = pd.DataFrame.from_dict(artistsData['artists'])  
    artists = pd.concat([artists, artistsData])
    artists['extractionTimestamp'] = datetime.today().strftime('%Y-%m-%d %X')
    
    print("Successfully got ", round(((j + batchSize)/len(artistsList)) * 100, 2), "% of Artists")

Successfully got  2.48 % of Artists
Successfully got  4.97 % of Artists
Successfully got  7.45 % of Artists
Successfully got  9.93 % of Artists
Successfully got  12.41 % of Artists
Successfully got  14.9 % of Artists
Successfully got  17.38 % of Artists
Successfully got  19.86 % of Artists
Successfully got  22.34 % of Artists
Successfully got  24.83 % of Artists
Successfully got  27.31 % of Artists
Successfully got  29.79 % of Artists
Successfully got  32.27 % of Artists
Successfully got  34.76 % of Artists
Successfully got  37.24 % of Artists
Successfully got  39.72 % of Artists
Successfully got  42.2 % of Artists
Successfully got  44.69 % of Artists
Successfully got  47.17 % of Artists
Successfully got  49.65 % of Artists
Successfully got  52.14 % of Artists
Successfully got  54.62 % of Artists
Successfully got  57.1 % of Artists
Successfully got  59.58 % of Artists
Successfully got  62.07 % of Artists
Successfully got  64.55 % of Artists
Successfully got  67.03 % of Artists
Successf

In [141]:
artists.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2014 entries, 0 to 13
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   external_urls        2014 non-null   object
 1   followers            2014 non-null   object
 2   genres               2014 non-null   object
 3   href                 2014 non-null   object
 4   id                   2014 non-null   object
 5   images               2014 non-null   object
 6   name                 2014 non-null   object
 7   popularity           2014 non-null   int64 
 8   type                 2014 non-null   object
 9   uri                  2014 non-null   object
 10  extractionTimestamp  2014 non-null   object
dtypes: int64(1), object(10)
memory usage: 188.8+ KB


In [142]:
## Returning Artists DataFrame
artists.head()

Unnamed: 0,external_urls,followers,genres,href,id,images,name,popularity,type,uri,extractionTimestamp
0,{'spotify': 'https://open.spotify.com/artist/2...,"{'href': None, 'total': 16865}",[german hip hop],https://api.spotify.com/v1/artists/2INm4LtQ2hL...,2INm4LtQ2hL7qsmmnSFk8D,[{'url': 'https://i.scdn.co/image/ab6761610000...,RAQI,48,artist,spotify:artist:2INm4LtQ2hL7qsmmnSFk8D,2025-01-31 15:22:22
1,{'spotify': 'https://open.spotify.com/artist/5...,"{'href': None, 'total': 60256}",[],https://api.spotify.com/v1/artists/5Hx084y0lku...,5Hx084y0lkukp4MHLFmj3S,[{'url': 'https://i.scdn.co/image/ab6761610000...,Monty Datta,51,artist,spotify:artist:5Hx084y0lkukp4MHLFmj3S,2025-01-31 15:22:22
2,{'spotify': 'https://open.spotify.com/artist/3...,"{'href': None, 'total': 54}",[],https://api.spotify.com/v1/artists/3M8J5JPD9gY...,3M8J5JPD9gYNkVjgBfnWO3,[{'url': 'https://i.scdn.co/image/ab6761610000...,DJ Milton,12,artist,spotify:artist:3M8J5JPD9gYNkVjgBfnWO3,2025-01-31 15:22:22
3,{'spotify': 'https://open.spotify.com/artist/6...,"{'href': None, 'total': 101353}","[schlager, schlagerparty]",https://api.spotify.com/v1/artists/6A9b0JlSJF0...,6A9b0JlSJF0KxCddzOTGiA,[{'url': 'https://i.scdn.co/image/ab6761610000...,Höhner,52,artist,spotify:artist:6A9b0JlSJF0KxCddzOTGiA,2025-01-31 15:22:22
4,{'spotify': 'https://open.spotify.com/artist/1...,"{'href': None, 'total': 21835}","[funk, brazilian funk]",https://api.spotify.com/v1/artists/1X842P1hBNR...,1X842P1hBNRnwsuy0e3DLS,[{'url': 'https://i.scdn.co/image/ab6761610000...,Dj Pikeno Mpc,47,artist,spotify:artist:1X842P1hBNRnwsuy0e3DLS,2025-01-31 15:22:22


In [143]:
## Treating some fields (renaming, exploding the dicts, etc)

artists['genres'] = artists["genres"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)
artists['external_urls'] = pd.json_normalize(artists['external_urls'], max_level=1)['spotify']
artists['followers'] = pd.json_normalize(artists['followers'], max_level=1)['total']
artists['image_url'] = pd.json_normalize(artists['images'].explode())['url']

In [144]:
## Let's get more data on this script
## Now that we have New Releases information, we can make a request to collect further information about albums and tracks
## Note that a new release can be an album or a single.
## So, to get more data, we're going to make a Request to return more informations about albums (like tracks, for example)
## And then, we are going to make a final request to return information about all tracks (single releases and tracks from the albums)

In [145]:
## Creating empty DataFrame to append API values after request

albums =pd.DataFrame()

In [147]:
albumsList = list(set(releases['id']))

# Redefining batchSize variable

batchSize = 20 # Spotify limit for Album Request is 20

for k in range(0, len(albumsList), batchSize):
    albumsData = sp.albums(albumsList[k:k + batchSize])
    albumsData = pd.DataFrame.from_dict(albumsData['albums'])
    
    albums = pd.concat([albums, albumsData])
    
    print("Successfully got ", round((k + batchSize) / len(albumsList) * 100, 2), "% of albums")

Successfully got  1.41 % of albums
Successfully got  2.81 % of albums
Successfully got  4.22 % of albums
Successfully got  5.62 % of albums
Successfully got  7.03 % of albums
Successfully got  8.43 % of albums
Successfully got  9.84 % of albums
Successfully got  11.24 % of albums
Successfully got  12.65 % of albums
Successfully got  14.05 % of albums
Successfully got  15.46 % of albums
Successfully got  16.87 % of albums
Successfully got  18.27 % of albums
Successfully got  19.68 % of albums
Successfully got  21.08 % of albums
Successfully got  22.49 % of albums
Successfully got  23.89 % of albums
Successfully got  25.3 % of albums
Successfully got  26.7 % of albums
Successfully got  28.11 % of albums
Successfully got  29.52 % of albums
Successfully got  30.92 % of albums
Successfully got  32.33 % of albums
Successfully got  33.73 % of albums
Successfully got  35.14 % of albums
Successfully got  36.54 % of albums
Successfully got  37.95 % of albums
Successfully got  39.35 % of albums
S

In [148]:
albums

Unnamed: 0,album_type,total_tracks,available_markets,external_urls,href,id,images,name,release_date,release_date_precision,type,uri,artists,tracks,copyrights,external_ids,genres,label,popularity
0,album,12,"[AE, AM, AU, AZ, BD, BG, BH, BI, BN, BT, BW, B...",{'spotify': 'https://open.spotify.com/album/2b...,https://api.spotify.com/v1/albums/2bnnrhhq45z8...,2bnnrhhq45z8MEfLbW04zo,[{'url': 'https://i.scdn.co/image/ab67616d0000...,"Jeg vil bare gerne være et godt menneske, men ...",2025-01-31,day,album,spotify:album:2bnnrhhq45z8MEfLbW04zo,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/2b...,[{'text': '(P) 2025 Sony Music Entertainment D...,{'upc': '196872792393'},[],Sony Music Entertainment,0
1,album,20,"[AE, AM, AU, AZ, BD, BG, BH, BI, BN, BT, BW, C...",{'spotify': 'https://open.spotify.com/album/2t...,https://api.spotify.com/v1/albums/2tTUU4nsZ2jj...,2tTUU4nsZ2jjEgT7gNyCvE,[{'url': 'https://i.scdn.co/image/ab67616d0000...,The Blue Notebooks (20 Year Edition),2025-01-31,day,album,spotify:album:2tTUU4nsZ2jjEgT7gNyCvE,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/2t...,"[{'text': '© 2025 Deutsche Grammophon GmbH, Be...",{'upc': '00028948663354'},[],Deutsche Grammophon (DG),0
2,single,1,"[AE, AM, AU, AZ, BD, BG, BH, BI, BN, BT, BW, B...",{'spotify': 'https://open.spotify.com/album/5j...,https://api.spotify.com/v1/albums/5jVhsN3Pm3BC...,5jVhsN3Pm3BCszZgpzqd0X,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Trophäe (Live aus Berlin),2025-01-31,day,album,spotify:album:5jVhsN3Pm3BCszZgpzqd0X,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/5j...,"[{'text': '2024 superpolrecords', 'type': 'C'}...",{'upc': '3617667797869'},[],superpolrecords,0
3,single,1,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",{'spotify': 'https://open.spotify.com/album/5s...,https://api.spotify.com/v1/albums/5sw2zAQbGlgu...,5sw2zAQbGlgu8AlhARzgWn,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Senta Com o Popô,2025-01-31,day,album,spotify:album:5sw2zAQbGlgu8AlhARzgWn,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/5s...,"[{'text': '2025 Tropa dos Hitmados', 'type': '...",{'upc': '790092484238'},[],Tropa dos Hitmados,0
4,single,1,"[AE, AM, AU, AZ, BD, BG, BH, BI, BN, BT, BW, B...",{'spotify': 'https://open.spotify.com/album/2T...,https://api.spotify.com/v1/albums/2TYsZMFqcENa...,2TYsZMFqcENa8toCUKenFw,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Fall Of The Celestials,2025-01-31,day,album,spotify:album:2TYsZMFqcENa8toCUKenFw,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/2T...,[{'text': '2025 Q-dance Music B.V. / Q-dance R...,{'upc': '8719244866205'},[],Q-dance Records,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,single,1,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",{'spotify': 'https://open.spotify.com/album/3L...,https://api.spotify.com/v1/albums/3LMUEL6JF8xi...,3LMUEL6JF8xiYhxOxNPy9H,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Kamar Ke Rog,2025-01-30,day,album,spotify:album:3LMUEL6JF8xiYhxOxNPy9H,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/3L...,"[{'text': 'Wave Music', 'type': 'C'}, {'text':...",{'upc': '6161125616294'},[],Wave Music,0
19,single,1,"[AE, AM, AU, AZ, BD, BG, BH, BI, BN, BT, BW, B...",{'spotify': 'https://open.spotify.com/album/2F...,https://api.spotify.com/v1/albums/2FMZfDpMhD6Z...,2FMZfDpMhD6ZCzb1eolrYU,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Gözlerinden Gözlerine,2025-01-31,day,album,spotify:album:2FMZfDpMhD6ZCzb1eolrYU,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/2F...,"[{'text': '2025 Eva Records', 'type': 'C'}, {'...",{'upc': '3617669015541'},[],Eva Yapım & Eva Records,0
0,single,1,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",{'spotify': 'https://open.spotify.com/album/0Q...,https://api.spotify.com/v1/albums/0Q6gW46EI0qd...,0Q6gW46EI0qdnDDReYTXCz,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Giro Pela Praia,2025-01-23,day,album,spotify:album:0Q6gW46EI0qdnDDReYTXCz,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/0Q...,"[{'text': '2025 Sonar Music', 'type': 'C'}, {'...",{'upc': '3617668988365'},[],Sonar Music,40
1,album,13,"[AE, AM, AU, AZ, BD, BG, BH, BI, BN, BT, BW, B...",{'spotify': 'https://open.spotify.com/album/6N...,https://api.spotify.com/v1/albums/6NgdgRdxc6sp...,6NgdgRdxc6spifVLe8KXRM,[{'url': 'https://i.scdn.co/image/ab67616d0000...,I am the Blueprint,2025-01-31,day,album,spotify:album:6NgdgRdxc6spifVLe8KXRM,[{'external_urls': {'spotify': 'https://open.s...,{'href': 'https://api.spotify.com/v1/albums/6N...,"[{'text': '(P) 2025 JTON Music, under exclusiv...",{'upc': '196872802849'},[],Bu Vision,0


AttributeError: 'dict' object has no attribute 'explode'

In [15]:
## Importing Credentials from Google Cloud

from google.cloud import storage
from google.oauth2 import service_account

CREDENTIALS = service_account.Credentials.from_service_account_file(penv.bq_path)
STORAGE = storage.Client(credentials=CREDENTIALS)

In [16]:
# Acessing Bucket Path

bucket = STORAGE.get_bucket(penv.bucket_path)

In [17]:
# Getting currentTimestamp

currentTimestamp = datetime.today().strftime('%Y-%m-%d %X')

# Adding currentTimestamp on file name, so it doesn't overwrite itself. 
# Also, it helps keep track on incremental models

file_name = f"spotify_api_test_data__{currentTimestamp}"

In [19]:
## Defining a function called avro_df_prep to prepare the dataframe for the Avro format

def avro_df_prep():

    # pip install fastavro

    from fastavro import writer, parse_schema

    # Converting all columns to string, because Avro doesn't support object type

    columns_to_convert = [  # Lista de colunas definidas no esquema Avro
        'album_type', 'external_urls', 'href',
       'id', 'images', 'name', 'release_date', 'release_date_precision',
       'type', 'uri', 'artists', 'restrictions']

    df[columns_to_convert] = df[columns_to_convert].astype(str)

    # Declaring dataframe schema

    schema = {
        'name': 'spotify'
        , 'type': 'record'
        , 'fields': [
                        {'name': 'album_type', 'type': 'string'}, 
                        {'name': 'total_tracks', 'type': 'int'}, 
                        {'name': 'is_playable', 'type': 'boolean'}, 
                        {'name': 'external_urls', 'type': 'string'},
                        {'name': 'id', 'type': 'string'},
                        {'name': 'images', 'type': 'string'},  
                        {'name': 'name', 'type': 'string'}, 
                        {'name': 'release_date', 'type': 'string'}, 
                        {'name': 'release_date_precision', 'type': 'string'}, 
                        {'name': 'href', 'type': 'string'}, 
                        {'name': 'type', 'type': 'string'}, 
                        {'name': 'uri', 'type': 'string'}, 
                        {'name': 'artists', 'type': 'string'}, 
                        {'name': 'restrictions', 'type': 'string'}, 
                    ]
    }

    parsed_schema = parse_schema(schema)
    records = df.to_dict('records')

    # Writing an Avro file on 'archive' directory

    with open(f'/home/tabas/personal-dev/pyprojects/pipelines/archive/{file_name}.avro', 'wb') as out:
       writer(out, parsed_schema, records)
            

In [35]:
avro_df_prep()

: 

In [None]:
## Writing Dataframe to Bucket folder with desired file format 

file_formats = [
                'csv'
                , 'parquet'
                , 'json'
                , 'orc'
                , 'avro'
]

for i in range(len(file_formats)):
    
    blob = bucket.blob(f"{penv.bucket_folder}/{file_name}.{file_formats[i]}")
    
    if file_formats[i] == 'csv':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.to_csv(), '/text/csv')
        print("Sucessfully written in ", file_formats[i])
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'parquet':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.to_parquet(), '/text/plain')
        print("Sucessfully written in ", file_formats[i])
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'json':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.to_json(orient='table'), '/text/plain')
        print("Sucessfully written in ", file_formats[i])
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'orc':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.reset_index().to_orc(index=None), '/text/plain')
        print("Sucessfully written in ", file_formats[i])        
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'avro':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        avro_df_prep()
        blob.upload_from_filename(f'/home/tabas/personal-dev/pyprojects/pipelines/archive/{file_name}.avro', 'wb', '/text/plain')
        print("Sucessfully written in ", file_formats[i])        
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))