In [1]:
## Installing Spotify's API library in Python

# pip install spotipy

In [2]:
## Importing libraries

import sys
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from datetime import datetime

sys.path.append("/home/tabas/personal-dev/pyprojects")
import pipelines.utils.personal_env as penv

In [3]:
# Importing Spotify Credentials

CLIENT_ID = penv.spotify_client_id
CLIENT_SECRET = penv.spotify_client_secret

In [4]:
# Stablishing Spotify Authentication

auth_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [5]:
# Creating a list of all available markets on Spotify

markets = [
            "AD", "AE", "AG", "AL", #"AM", "AO", "AR", "AT", "AU", "AZ", "BA", "BB", "BD", 
            #"BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW", "BY", 
            #"BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", 
            #"CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "ET", "FI", 
            #"FJ", "FM", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", 
            #"GW", "GY", "HK", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IQ", "IS", 
            #"IT", "JM", "JO", "JP", "KE", "KG", "KH", "KI", "KM", "KN", "KR", "KW", "KZ", 
            #"LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", 
            #"MD", "ME", "MG", "MH", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV", "MW", 
            #"MX", "MY", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NR", "NZ", "OM", 
            #"PA", "PE", "PG", "PH", "PK", "PL", "PR", "PS", "PT", "PW", "PY", "QA", "RO", 
            #"RS", "RW", "SA", "SB", "SC", "SE", "SG", "SI", "SK", "SL", "SM", "SN", "SR", 
            #"ST", "SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TO", "TR", "TT", "TV", 
            #"TW", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "VU", "WS", "XK", 
            #"ZA", "ZM", "ZW"
           ]

In [6]:
## Creating empty DataFrame to append API values after request

releases = pd.DataFrame()

In [7]:
## Creating loop to make GET Request
## The first request gets the list of new Albums released two weeks ago from each market defined above
## Then, it collects the ids of the Artists of each release and makes the second request, 
## which returns the Artist's data

for i in range(len(markets)):
    
    ## The Spotify only returns 50 values per request 
    # (the variables 'limit' and 'batchSize' helps Spotify not crash if the data exceeds )
    
    limit = 50
    offset = 0
    
    while offset < 1000:    # Spotify limit for Search Request is 1000
        
        ## Making GET request of the type search with the tag:'new', that returns the latest Albums

        newReleases = sp.search(q="tag:new", market=markets[i], type="album", limit=limit, offset=offset)
        newReleasesData = pd.DataFrame.from_dict(newReleases['albums']['items'])
        
        releases = pd.concat([releases, newReleasesData])
        releases['extractionTimestamp'] = datetime.today().strftime('%Y-%m-%d %X')
        
        # Incremental addition to offset to return the following pages of data
        
        offset=offset+limit
        
    print("Successfully got request from ", markets[i], "market")

Successfully got request from  AD market
Successfully got request from  AE market
Successfully got request from  AG market
Successfully got request from  AL market


In [8]:
releases.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 0 to 49
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   album_type              4000 non-null   object
 1   total_tracks            4000 non-null   int64 
 2   is_playable             4000 non-null   bool  
 3   external_urls           4000 non-null   object
 4   href                    4000 non-null   object
 5   id                      4000 non-null   object
 6   images                  4000 non-null   object
 7   name                    4000 non-null   object
 8   release_date            4000 non-null   object
 9   release_date_precision  4000 non-null   object
 10  type                    4000 non-null   object
 11  uri                     4000 non-null   object
 12  artists                 4000 non-null   object
 13  extractionTimestamp     4000 non-null   object
dtypes: bool(1), int64(1), object(12)
memory usage: 441.4+ KB


In [9]:
## Returning Album DataFrame
releases.head()

Unnamed: 0,album_type,total_tracks,is_playable,external_urls,href,id,images,name,release_date,release_date_precision,type,uri,artists,extractionTimestamp
0,single,1,True,{'spotify': 'https://open.spotify.com/album/7x...,https://api.spotify.com/v1/albums/7x2bjtxejjCz...,7x2bjtxejjCzg64hRa5f5Y,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Amor Clandestino (Acústica),2025-02-05,day,album,spotify:album:7x2bjtxejjCzg64hRa5f5Y,[{'external_urls': {'spotify': 'https://open.s...,2025-02-05 12:59:46
1,single,2,True,{'spotify': 'https://open.spotify.com/album/4n...,https://api.spotify.com/v1/albums/4nRdzHLndZ5z...,4nRdzHLndZ5zRaJvYDn1IM,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Electric Green Lambo,2025-02-04,day,album,spotify:album:4nRdzHLndZ5zRaJvYDn1IM,[{'external_urls': {'spotify': 'https://open.s...,2025-02-05 12:59:46
2,single,1,True,{'spotify': 'https://open.spotify.com/album/2e...,https://api.spotify.com/v1/albums/2evlCGKSXmun...,2evlCGKSXmun0ByXPY5BY2,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Milk & Honey,2025-02-04,day,album,spotify:album:2evlCGKSXmun0ByXPY5BY2,[{'external_urls': {'spotify': 'https://open.s...,2025-02-05 12:59:46
3,single,2,True,{'spotify': 'https://open.spotify.com/album/0c...,https://api.spotify.com/v1/albums/0c7AnOZzrNxS...,0c7AnOZzrNxSSNwEotCL8m,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Omorfo Mou,2025-02-05,day,album,spotify:album:0c7AnOZzrNxSSNwEotCL8m,[{'external_urls': {'spotify': 'https://open.s...,2025-02-05 12:59:46
4,single,1,True,{'spotify': 'https://open.spotify.com/album/5t...,https://api.spotify.com/v1/albums/5tfCvNOUuhzc...,5tfCvNOUuhzck0IWU9XaUu,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",La talla,2025-02-05,day,album,spotify:album:5tfCvNOUuhzck0IWU9XaUu,[{'external_urls': {'spotify': 'https://open.s...,2025-02-05 12:59:46


In [10]:
## Treating some fields (renaming, exploding the dicts, etc)

releases['image_url'] = pd.json_normalize(releases['images'].explode())['url']
releases['album_href'] = releases['href'] 
releases['artist_id'] = releases['artists'].apply(lambda artists: [artist['id'] for artist in artists])
releases['artist_name'] = releases['artists'].apply(lambda artists: [artist['name'] for artist in artists])
releases['album_type'] = releases['type']
releases['artist_type'] = releases['artists'].apply(lambda artists: [artist['type'] for artist in artists])
releases['album_uri'] = releases['uri'] 
releases['artist_uri'] = releases['artists'].apply(lambda artists: [artist['uri'] for artist in artists])
releases['artist_href'] = releases['artists'].apply(lambda artists: [artist['href'] for artist in artists])

In [11]:
releases.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 0 to 49
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   album_type              4000 non-null   object
 1   total_tracks            4000 non-null   int64 
 2   is_playable             4000 non-null   bool  
 3   external_urls           4000 non-null   object
 4   href                    4000 non-null   object
 5   id                      4000 non-null   object
 6   images                  4000 non-null   object
 7   name                    4000 non-null   object
 8   release_date            4000 non-null   object
 9   release_date_precision  4000 non-null   object
 10  type                    4000 non-null   object
 11  uri                     4000 non-null   object
 12  artists                 4000 non-null   object
 13  extractionTimestamp     4000 non-null   object
 14  image_url               4000 non-null   object
 15  album_href 

In [12]:
## Creating empty DataFrame to append API values after request

artists = pd.DataFrame()

In [13]:
## Making request to GET Artists' Data

# Here we're accessing the Artist ID to make the loop request below 
# We transforme the arrays into sets to remove duplicates, and then convert it back to lists, so it can be used
# on the API request
        
artistsList =  list(set(releases['artist_id'].explode()))

batchSize = 50 # Spotify limit for Artist Request is 50
        
for j in range(0, len(artistsList), batchSize):
    artistsBatch = artistsList[j:j + batchSize]
    artistsData = sp.artists(artistsBatch)
    artistsData = pd.DataFrame.from_dict(artistsData['artists'])  
    artists = pd.concat([artists, artistsData])
    artists['extractionTimestamp'] = datetime.today().strftime('%Y-%m-%d %X')
    
    print("Successfully got ", round(((j + batchSize)/len(artistsList)) * 100, 2), "% of Artists")

Successfully got  3.35 % of Artists
Successfully got  6.7 % of Artists
Successfully got  10.05 % of Artists
Successfully got  13.4 % of Artists
Successfully got  16.74 % of Artists
Successfully got  20.09 % of Artists
Successfully got  23.44 % of Artists
Successfully got  26.79 % of Artists
Successfully got  30.14 % of Artists
Successfully got  33.49 % of Artists
Successfully got  36.84 % of Artists
Successfully got  40.19 % of Artists
Successfully got  43.54 % of Artists
Successfully got  46.89 % of Artists
Successfully got  50.23 % of Artists
Successfully got  53.58 % of Artists
Successfully got  56.93 % of Artists
Successfully got  60.28 % of Artists
Successfully got  63.63 % of Artists
Successfully got  66.98 % of Artists
Successfully got  70.33 % of Artists
Successfully got  73.68 % of Artists
Successfully got  77.03 % of Artists
Successfully got  80.38 % of Artists
Successfully got  83.72 % of Artists
Successfully got  87.07 % of Artists
Successfully got  90.42 % of Artists
Succe

In [14]:
artists.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1493 entries, 0 to 42
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   external_urls        1493 non-null   object
 1   followers            1493 non-null   object
 2   genres               1493 non-null   object
 3   href                 1493 non-null   object
 4   id                   1493 non-null   object
 5   images               1493 non-null   object
 6   name                 1493 non-null   object
 7   popularity           1493 non-null   int64 
 8   type                 1493 non-null   object
 9   uri                  1493 non-null   object
 10  extractionTimestamp  1493 non-null   object
dtypes: int64(1), object(10)
memory usage: 140.0+ KB


In [15]:
## Returning Artists DataFrame
artists.head()

Unnamed: 0,external_urls,followers,genres,href,id,images,name,popularity,type,uri,extractionTimestamp
0,{'spotify': 'https://open.spotify.com/artist/0...,"{'href': None, 'total': 54490}",[malayalam pop],https://api.spotify.com/v1/artists/0udTsqVsPij...,0udTsqVsPijUXCJgdVKYWp,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Shafi Kollam,40,artist,spotify:artist:0udTsqVsPijUXCJgdVKYWp,2025-02-05 13:00:28
1,{'spotify': 'https://open.spotify.com/artist/2...,"{'href': None, 'total': 5}",[kannada pop],https://api.spotify.com/v1/artists/2df2v0cykuf...,2df2v0cykufkVCZP0TNn9f,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Pramod Jois,7,artist,spotify:artist:2df2v0cykufkVCZP0TNn9f,2025-02-05 13:00:28
2,{'spotify': 'https://open.spotify.com/artist/1...,"{'href': None, 'total': 33616}",[phonk],https://api.spotify.com/v1/artists/1iKfUld50BO...,1iKfUld50BOVBIgHy3GlGt,[{'url': 'https://i.scdn.co/image/ab6761610000...,DJ CEREJASS,64,artist,spotify:artist:1iKfUld50BOVBIgHy3GlGt,2025-02-05 13:00:28
3,{'spotify': 'https://open.spotify.com/artist/1...,"{'href': None, 'total': 89628}","[phonk, brazilian funk]",https://api.spotify.com/v1/artists/1M02J1PLobt...,1M02J1PLobtQYok93DEvVa,[{'url': 'https://i.scdn.co/image/ab6761610000...,slxughter,58,artist,spotify:artist:1M02J1PLobtQYok93DEvVa,2025-02-05 13:00:28
4,{'spotify': 'https://open.spotify.com/artist/2...,"{'href': None, 'total': 15}",[mollywood],https://api.spotify.com/v1/artists/2cPcXiURP23...,2cPcXiURP23fMWNxpO7042,[{'url': 'https://i.scdn.co/image/ab67616d0000...,Lejin Chemmani,12,artist,spotify:artist:2cPcXiURP23fMWNxpO7042,2025-02-05 13:00:28


In [16]:
## Treating some fields (renaming, exploding the dicts, etc)

artists['genres'] = artists["genres"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)
artists['external_urls'] = pd.json_normalize(artists['external_urls'], max_level=1)['spotify']
artists['followers'] = pd.json_normalize(artists['followers'], max_level=1)['total']
artists['image_url'] = pd.json_normalize(artists['images'].explode())['url']

In [16]:
## Let's get more data on this script
## Now that we have New Releases information, we can make a request to collect further information about albums and tracks
## Note that a new release can be an album or a single.
## So, to get more data, we're going to make a Request to return more informations about albums (like tracks, for example)
## And then, we are going to make a final request to return information about all tracks (single releases and tracks from the albums)

In [17]:
## Creating empty DataFrame to append API values after request

albums = pd.DataFrame()

In [18]:
albumsList = list(set(releases['id']))

# Redefining batchSize variable

batchSize = 20 # Spotify limit for Album Request is 20

for k in range(0, len(albumsList), batchSize):
    albumsData = sp.albums(albumsList[k:k + batchSize])
    albumsData = pd.DataFrame.from_dict(albumsData['albums'])
    
    albums = pd.concat([albums, albumsData])
    
    print("Successfully got ", round((k + batchSize) / len(albumsList) * 100, 2), "% of albums")

Successfully got  1.41 % of albums
Successfully got  2.83 % of albums
Successfully got  4.24 % of albums
Successfully got  5.66 % of albums
Successfully got  7.07 % of albums
Successfully got  8.49 % of albums
Successfully got  9.9 % of albums
Successfully got  11.32 % of albums
Successfully got  12.73 % of albums
Successfully got  14.14 % of albums
Successfully got  15.56 % of albums
Successfully got  16.97 % of albums
Successfully got  18.39 % of albums
Successfully got  19.8 % of albums
Successfully got  21.22 % of albums
Successfully got  22.63 % of albums
Successfully got  24.05 % of albums
Successfully got  25.46 % of albums
Successfully got  26.87 % of albums
Successfully got  28.29 % of albums
Successfully got  29.7 % of albums
Successfully got  31.12 % of albums
Successfully got  32.53 % of albums
Successfully got  33.95 % of albums
Successfully got  35.36 % of albums
Successfully got  36.78 % of albums
Successfully got  38.19 % of albums
Successfully got  39.6 % of albums
Suc

In [36]:
albums.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1414 entries, 0 to 13
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   album_type              1414 non-null   object
 1   total_tracks            1414 non-null   int64 
 2   available_markets       1414 non-null   object
 3   external_urls           1414 non-null   object
 4   href                    1414 non-null   object
 5   id                      1414 non-null   object
 6   images                  1414 non-null   object
 7   name                    1414 non-null   object
 8   release_date            1414 non-null   object
 9   release_date_precision  1414 non-null   object
 10  type                    1414 non-null   object
 11  uri                     1414 non-null   object
 12  artists                 1414 non-null   object
 13  tracks                  1414 non-null   object
 14  copyrights              1414 non-null   object
 15  external_id

In [32]:
albumsExploded = pd.json_normalize(albums['tracks'])

tracksExploded = pd.DataFrame()

for i in range(len(albumsExploded)):
    tracks = pd.json_normalize(albumsExploded['items'][i][0])
    tracks['index'] = i
    tracksData = pd.concat([tracks, tracksExploded], axis=0)

In [43]:
trackList = list(set(tracksData['id']))

In [51]:
tracks = pd.DataFrame()

In [52]:
# Redefining batchSize variable

batchSize = 50 # Spotify limit for Track Request is 50

for k in range(0, len(trackList), batchSize):
    tracksData = sp.tracks(trackList[k:k + batchSize])
    tracksData = pd.DataFrame.from_dict(tracksData['tracks'])
    
    tracks = pd.concat([tracks, tracksData])
    
    print("Successfully got ", round((k + batchSize) / len(trackList) * 100, 2), "% of tracks")

Successfully got  3.54 % of tracks
Successfully got  7.09 % of tracks
Successfully got  10.63 % of tracks
Successfully got  14.17 % of tracks
Successfully got  17.72 % of tracks
Successfully got  21.26 % of tracks
Successfully got  24.81 % of tracks
Successfully got  28.35 % of tracks
Successfully got  31.89 % of tracks
Successfully got  35.44 % of tracks
Successfully got  38.98 % of tracks
Successfully got  42.52 % of tracks
Successfully got  46.07 % of tracks
Successfully got  49.61 % of tracks
Successfully got  53.15 % of tracks
Successfully got  56.7 % of tracks
Successfully got  60.24 % of tracks
Successfully got  63.78 % of tracks
Successfully got  67.33 % of tracks
Successfully got  70.87 % of tracks
Successfully got  74.42 % of tracks
Successfully got  77.96 % of tracks
Successfully got  81.5 % of tracks
Successfully got  85.05 % of tracks
Successfully got  88.59 % of tracks
Successfully got  92.13 % of tracks
Successfully got  95.68 % of tracks
Successfully got  99.22 % of tra

In [55]:
tracks

Unnamed: 0,album,artists,available_markets,disc_number,duration_ms,explicit,external_ids,external_urls,href,id,is_local,name,popularity,preview_url,track_number,type,uri
0,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,144019,False,{'isrc': 'GBMJG2503072'},{'spotify': 'https://open.spotify.com/track/4j...,https://api.spotify.com/v1/tracks/4jdeBQ5lvgLa...,4jdeBQ5lvgLalFlNgVOgtP,False,Vibrations Élevées (Rain),0,,1,track,spotify:track:4jdeBQ5lvgLalFlNgVOgtP
1,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,200076,False,{'isrc': 'USRC12400873'},{'spotify': 'https://open.spotify.com/track/6G...,https://api.spotify.com/v1/tracks/6GTTgyBHSqYL...,6GTTgyBHSqYLbJfMWNwPVU,False,"Dear Me - From The Original Documentary ""Diane...",57,,1,track,spotify:track:6GTTgyBHSqYLbJfMWNwPVU
2,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,131720,False,{'isrc': 'DEE862400447'},{'spotify': 'https://open.spotify.com/track/0V...,https://api.spotify.com/v1/tracks/0VRtOnSPYL8u...,0VRtOnSPYL8u2U8FW07kqN,False,"Mendelssohn Variation (From Violin Concerto, O...",8,,1,track,spotify:track:0VRtOnSPYL8u2U8FW07kqN
3,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,171857,True,{'isrc': 'BKOUP2500012'},{'spotify': 'https://open.spotify.com/track/54...,https://api.spotify.com/v1/tracks/546JIYIWmy5l...,546JIYIWmy5lKMhWYVJrxG,False,Mês de Fevereiro,0,,1,track,spotify:track:546JIYIWmy5lKMhWYVJrxG
4,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,127995,False,{'isrc': 'GBMJG2503210'},{'spotify': 'https://open.spotify.com/track/04...,https://api.spotify.com/v1/tracks/04DRNTmxTal2...,04DRNTmxTal280tYEgoZmb,False,Liquid Horizon (Rain),8,,1,track,spotify:track:04DRNTmxTal280tYEgoZmb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,203134,False,{'isrc': 'GBCFB2401153'},{'spotify': 'https://open.spotify.com/track/36...,https://api.spotify.com/v1/tracks/36sARCGegSJc...,36sARCGegSJcQp8cXNswGH,False,DNA ∞,0,,1,track,spotify:track:36sARCGegSJcQp8cXNswGH
7,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,110586,False,{'isrc': 'DEN961830312'},{'spotify': 'https://open.spotify.com/track/6G...,https://api.spotify.com/v1/tracks/6Gvnd4HTG7XL...,6Gvnd4HTG7XL0B1ew2GaVR,False,Aria variata (alla maniera italiana) in A Mino...,0,,1,track,spotify:track:6Gvnd4HTG7XL0B1ew2GaVR
8,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,179777,False,{'isrc': 'GXFCP2500004'},{'spotify': 'https://open.spotify.com/track/4w...,https://api.spotify.com/v1/tracks/4wm7PC1t648A...,4wm7PC1t648ATT1pd8Zltr,False,Cash Only - Andrea Oliva Remix,46,,1,track,spotify:track:4wm7PC1t648ATT1pd8Zltr
9,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,246707,False,{'isrc': 'INB942520919'},{'spotify': 'https://open.spotify.com/track/3T...,https://api.spotify.com/v1/tracks/3TdYqOBjK6M5...,3TdYqOBjK6M5QmFnf4tjeV,False,Hamro Hau Haliya Uhe Dil Ke,0,,1,track,spotify:track:3TdYqOBjK6M5QmFnf4tjeV


In [15]:
## Importing Credentials from Google Cloud

from google.cloud import storage
from google.oauth2 import service_account

CREDENTIALS = service_account.Credentials.from_service_account_file(penv.bq_path)
STORAGE = storage.Client(credentials=CREDENTIALS)

In [16]:
# Acessing Bucket Path

bucket = STORAGE.get_bucket(penv.bucket_path)

In [17]:
# Getting currentTimestamp

currentTimestamp = datetime.today().strftime('%Y-%m-%d %X')

# Adding currentTimestamp on file name, so it doesn't overwrite itself. 
# Also, it helps keep track on incremental models

file_name = f"spotify_api_test_data__{currentTimestamp}"

In [19]:
## Defining a function called avro_df_prep to prepare the dataframe for the Avro format

def avro_df_prep():

    # pip install fastavro

    from fastavro import writer, parse_schema

    # Converting all columns to string, because Avro doesn't support object type

    columns_to_convert = [  # Lista de colunas definidas no esquema Avro
        'album_type', 'external_urls', 'href',
       'id', 'images', 'name', 'release_date', 'release_date_precision',
       'type', 'uri', 'artists', 'restrictions']

    df[columns_to_convert] = df[columns_to_convert].astype(str)

    # Declaring dataframe schema

    schema = {
        'name': 'spotify'
        , 'type': 'record'
        , 'fields': [
                        {'name': 'album_type', 'type': 'string'}, 
                        {'name': 'total_tracks', 'type': 'int'}, 
                        {'name': 'is_playable', 'type': 'boolean'}, 
                        {'name': 'external_urls', 'type': 'string'},
                        {'name': 'id', 'type': 'string'},
                        {'name': 'images', 'type': 'string'},  
                        {'name': 'name', 'type': 'string'}, 
                        {'name': 'release_date', 'type': 'string'}, 
                        {'name': 'release_date_precision', 'type': 'string'}, 
                        {'name': 'href', 'type': 'string'}, 
                        {'name': 'type', 'type': 'string'}, 
                        {'name': 'uri', 'type': 'string'}, 
                        {'name': 'artists', 'type': 'string'}, 
                        {'name': 'restrictions', 'type': 'string'}, 
                    ]
    }

    parsed_schema = parse_schema(schema)
    records = df.to_dict('records')

    # Writing an Avro file on 'archive' directory

    with open(f'/home/tabas/personal-dev/pyprojects/pipelines/archive/{file_name}.avro', 'wb') as out:
       writer(out, parsed_schema, records)
            

In [35]:
avro_df_prep()

: 

In [None]:
## Writing Dataframe to Bucket folder with desired file format 

file_formats = [
                'csv'
                , 'parquet'
                , 'json'
                , 'orc'
                , 'avro'
]

for i in range(len(file_formats)):
    
    blob = bucket.blob(f"{penv.bucket_folder}/{file_name}.{file_formats[i]}")
    
    if file_formats[i] == 'csv':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.to_csv(), '/text/csv')
        print("Sucessfully written in ", file_formats[i])
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'parquet':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.to_parquet(), '/text/plain')
        print("Sucessfully written in ", file_formats[i])
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'json':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.to_json(orient='table'), '/text/plain')
        print("Sucessfully written in ", file_formats[i])
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'orc':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        blob.upload_from_string(df.reset_index().to_orc(index=None), '/text/plain')
        print("Sucessfully written in ", file_formats[i])        
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))
        
    if file_formats[i] == 'avro':
        print("Begin at: ", datetime.today().strftime('%Y-%m-%d %X'))
        avro_df_prep()
        blob.upload_from_filename(f'/home/tabas/personal-dev/pyprojects/pipelines/archive/{file_name}.avro', 'wb', '/text/plain')
        print("Sucessfully written in ", file_formats[i])        
        print("End at: ", datetime.today().strftime('%Y-%m-%d %X'))