In this notebook, we will add Spotify data about the artists. There are a few things we may want to see:
1. How long before the "start" of the tour was their latest album/EP release?
2. What is their artist popularity / follower count?

In [1]:
# import packages
import pandas as pd
import requests
import os
import json
import time

# Spotify API credentials, read from the environment (each one is None when its variable is unset)
CLIENT_ID = os.environ.get("CLIENT_ID")
CLIENT_SECRET = os.environ.get("CLIENT_SECRET")



## Functions

In [2]:
# function to search spotify
def search_spotify(search_term, type, auth_headers, limit=1):
    '''
    Search Spotify and return a flat summary of the first hit.

    Parameters
    ----------
    search_term : str
        Free-text query, e.g. an artist name. It is URL-encoded by requests.
    type : str
        One of "artist", "album", "track".
    auth_headers : dict
        Headers sent with the request (the bearer-token Authorization header).
    limit : int, default 1
        Number of results requested from the API; only the first is used.

    Returns
    -------
    dict
        Always contains "type", "artist_name" and "artist_id".
        - "artist": also "popularity", "genres", "followers" (the total count).
        - "album":  also "album_name", "album_id".
        - "track":  also "album_name", "album_id", "track_name", "track_id",
                    "popularity", "duration_ms".

    Raises
    ------
    ValueError
        If `type` is not one of the supported values.
    requests.HTTPError
        If the API answers with an error status (e.g. expired token, rate limit).
    IndexError
        If the search returns no items.
    '''
    # NOTE: `type` shadows the builtin; the name is kept so keyword callers keep working.
    if type not in ("artist", "album", "track"):
        raise ValueError(f"Unsupported search type {type!r}; expected 'artist', 'album' or 'track'")

    # Send the query through `params` so it is percent-encoded. Concatenating it into the
    # URL cut names off at "&" or "#" (e.g. "Nick Cave & The Bad Seeds" searched "Nick Cave ").
    response = requests.get("https://api.spotify.com/v1/search",
                            params={"q": search_term, "type": type, "limit": limit},
                            headers=auth_headers)
    # Surface HTTP errors here instead of as a KeyError on the error body below.
    response.raise_for_status()

    items = response.json()[type + "s"]["items"]
    if not items:
        raise IndexError(f"No {type} found on Spotify for {search_term!r}")
    output = items[0]

    if type == "artist":
        output_formatted = {"type":type,
                            "artist_name":output["name"],
                            "artist_id":output["id"],
                            "popularity":output["popularity"],
                            "genres":output["genres"],
                            "followers":output["followers"]["total"]}
    elif type == "album":
        # only the first credited artist is reported
        output_formatted = {"type":type,
                            "artist_name":output["artists"][0]["name"],
                            "artist_id":output["artists"][0]["id"],
                            "album_name":output["name"],
                            "album_id":output["id"]}
    else:  # "track"
        # only the first credited artist is reported
        output_formatted = {"type":type,
                            "artist_name":output["artists"][0]["name"],
                            "artist_id":output["artists"][0]["id"],
                            "album_name":output["album"]["name"],
                            "album_id":output["album"]["id"],
                            "track_name":output["name"],
                            "track_id":output["id"],
                            "popularity":output["popularity"],
                            "duration_ms":output["duration_ms"]}

    return output_formatted

# get most recent albums
def get_artists_album(uri, auth_headers):
    '''
    Fetch an artist's albums (include_groups=album) from the Spotify Web API.

    Parameters
    ----------
    uri : str
        Value placed in the /artists/{...}/albums path; callers pass the Spotify artist id.
    auth_headers : dict
        Headers sent with the request (the bearer-token Authorization header).

    Returns
    -------
    dict
        The decoded JSON body of this single request; pagination is not followed.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    '''
    response = requests.get(f"https://api.spotify.com/v1/artists/{uri}/albums?include_groups=album", headers=auth_headers)
    # Without this check an error body (e.g. rate limiting) was returned as if it were an
    # album listing, so the caller could not tell that the request had failed.
    response.raise_for_status()
    return json.loads(response.content)


In [3]:
# load the filtered artist table (later cells read its artist_name and artist_mbid columns)
artist_info = pd.read_csv(filepath_or_buffer="../data/processed/artist_info_filtered.csv")

## Code

In [4]:
# Set up authorization for the Spotify API: exchange the client id/secret for an
# access token and build the Authorization header used by every request below.

# Fail early with a readable message; os.getenv returns None for unset variables.
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError("CLIENT_ID and CLIENT_SECRET environment variables must be set")

## authorize
auth_url = 'https://accounts.spotify.com/api/token'
data = {
    'grant_type': 'client_credentials',
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
}
auth_response = requests.post(auth_url, data=data)
# A rejected request used to surface as `TypeError` from "Bearer " + None;
# raise the HTTP error itself instead.
auth_response.raise_for_status()
access_token = auth_response.json()['access_token']
headers = {"Authorization":"Bearer " + access_token}

In [5]:
# search Spotify for every artist and keep the artist_mbid next to each result
spotify_info = []

for name, mbid in zip(artist_info["artist_name"], artist_info["artist_mbid"]):
    match = search_spotify(name, "artist", headers)
    match["artist_mbid"] = mbid
    spotify_info.append(match)
    # brief pause between API calls
    time.sleep(0.1)

In [6]:
# one row per searched artist, one column per key of the search result
spotify_info_df = pd.DataFrame(data=spotify_info)

In [7]:
# save the artist-level Spotify data without the row index
spotify_info_df.to_csv(path_or_buf="../data/source/spotify/spotify_artist_info.csv", index=False)

In [8]:
# cycle through and get most recent albums
# NOTE: this rebinds `artist_info` (until now the artist DataFrame read from CSV) to a
# list of raw album payloads, so the artist-search cell cannot be re-run after this one
# without reloading the CSV first.
artist_info = []

for i in range(len(spotify_info_df)):
    try:
        output = get_artists_album(spotify_info_df.loc[i, "artist_id"],headers)
        # An API error body has neither "items" nor "total" and would break the cells
        # below, so count it as a failure for this artist instead of storing it.
        if "items" not in output or "total" not in output:
            raise ValueError(f"unexpected response: {output}")
        artist_info.append(output)
    except Exception as err:
        # `Exception` (not a bare except) so Ctrl-C still stops the loop; the reason is
        # printed so failures can be diagnosed.
        print(f'Issue with {spotify_info_df.loc[i, "artist_name"]}: {err}')
    # brief pause between API calls
    time.sleep(0.1)

In [9]:
# artists whose reported album total exceeds 20
# (the name is taken from the first credited artist of the first returned album)
[
    {"artist": albums["items"][0]["artists"][0]["name"], "total": albums["total"]}
    for albums in artist_info
    if albums["total"] > 20
]

[{'artist': "Joe Russo's Almost Dead", 'total': 24},
 {'artist': 'The String Cheese Incident', 'total': 45},
 {'artist': 'My Morning Jacket', 'total': 25},
 {'artist': 'Trey Anastasio', 'total': 23},
 {'artist': 'Greensky Bluegrass', 'total': 21},
 {'artist': 'Cory Wong', 'total': 28},
 {'artist': 'Jack White', 'total': 21},
 {'artist': 'The Mountain Goats', 'total': 34},
 {'artist': 'Iron Maiden', 'total': 28},
 {'artist': 'PJ Harvey', 'total': 27},
 {'artist': 'New Found Glory', 'total': 21},
 {'artist': 'Nick Cave & The Bad Seeds', 'total': 24},
 {'artist': 'Jethro Tull', 'total': 63},
 {'artist': 'The The', 'total': 21}]

In [13]:
# artists with no albums
# These payloads have no album item to read a name from, so the artist id is recovered
# from the payload's href (the path segment right after "/artists/") and the name is looked
# up in spotify_info_df. Parsing the path segment avoids depending on the exact query
# string ("offset=0&limit=20&...") that the href happens to carry.
[
    {
        "artist": spotify_info_df.loc[
            spotify_info_df["artist_id"] == artist["href"].split("/artists/", 1)[1].split("/", 1)[0],
            "artist_name",
        ].iloc[0],
        "total": artist["total"],
    }
    for artist in artist_info
    if artist["total"] == 0
]

[{'artist': 'Myles Smith', 'total': 0}]

In [14]:
# save the raw album payloads as indented JSON
with open("../data/source/spotify/albums_info_spotify.json", mode="w") as out_file:
    json.dump(artist_info, fp=out_file, indent=4)

In [37]:
# Flatten the album payloads into one row per album: artist id, the artist's total album
# count, album name and release date. The latest release can then be derived from these
# rows. Artists whose payload has no albums are collected in `no_info` instead.
artist_info_detail = []
no_info = []
for artist in artist_info:
    # The artist id is the path segment right after "/artists/" in the payload's href;
    # taking the segment avoids depending on the exact query string the href carries.
    artist_id = artist["href"].split("/artists/", 1)[1].split("/", 1)[0]

    if len(artist["items"])>0:
        for album in artist["items"]:
            artist_info_detail.append({"artist_id":artist_id,
                                       "total_albums":artist["total"],
                                       "album_name":album["name"],
                                       "album_date":album["release_date"]})
    else:
        no_info.append(artist_id)

In [38]:
# write the per-album rows to CSV without the row index
album_detail_df = pd.DataFrame(artist_info_detail)
album_detail_df.to_csv("../data/source/spotify/album_detail.csv", index=False)

## Clean Apify data

In [24]:
# Load the monthly-listeners export.
# The encoding is passed explicitly because open() otherwise uses the platform's locale
# encoding. This assumes the export is UTF-8, the usual encoding for JSON files
# (TODO confirm for this file).
with open("../data/source/dataset_spotify-monthly-listeners_2025-12-26_19-07-23-350_appify.json", 'r', encoding="utf-8") as json_file:
    monthly_listeners = json.load(json_file)

In [27]:
# keep only the artist id and the monthly-listener count from each record
extra_spotify = [
    {"artist_id": record["artist_id"], "monthly_listeners": record["monthlyListeners"]}
    for record in monthly_listeners
]

In [30]:
# write the monthly-listener rows to CSV without the row index
listeners_df = pd.DataFrame(extra_spotify)
listeners_df.to_csv("../data/processed/appify_info.csv", index=False)