# Getting Spotify playlist data 

This Notebook has the intention to do some basic analysis from my favorite artists from spotify for my application website

## Initialization

In [1]:
import json
import os
import re
import time
from collections import Counter, defaultdict

import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from requests.auth import HTTPBasicAuth

In [2]:
os.makedirs("src/assets/analysis/", exist_ok=True);

In [4]:
# Spotify API parameters
SPOTIFY_AUTH_URL     = 'https://accounts.spotify.com/api/token'
SPOTIFY_API_URL = 'https://api.spotify.com/v1'

SPOTIFY_PLAYLIST_ID = os.getenv('PLAYLIST_ID');

## API Auth
Define the Spotify development project secrets as credentials to request from endpoints. For basic usage, a simple auth method is sufficient. However, to add or remove songs from a playlist, for example, an oauth via link is required - which is not needed in this application. More information can be found in [Spotify's API docs](https://developer.spotify.com/documentation/web-api).

In [5]:
client_id = os.getenv('CLIENT_ID');
client_secret = os.getenv('CLIENT_SECRET');

In [6]:
# Make a POST request to the Spotify Accounts service
response = requests.post(SPOTIFY_AUTH_URL, {
    'grant_type': 'client_credentials'
}, auth=HTTPBasicAuth(client_id, client_secret))

# Convert the response to JSON
data = response.json()

# Your new access token
access_token = data['access_token']

# Update your headers with the new access token
headers = {
    'Authorization': f'Bearer {access_token}'
}


## Playlist data

First, all playlist tracks are loaded in batches and formatted by album (as album covers are displayed on the website). The track IDs are stored separately for the latter analysis.

In [7]:
def fetchPlaylistTracks(url, headers):
    formatted_data = []
    tracks = []

    while url:
        response = requests.get(url, headers=headers)
        data = response.json()

        for item in data['items']:
            track = item['track']
            album = track['album']

            tracks.append(track['id'])
            # Check if the album already exists in formatted_data
            existing_album = next((item for item in formatted_data if item['id'] == album['id']), None)

            if existing_album:
                # If the album exists, add the track to the album's tracks
                if track['id'] not in existing_album['tracks']:
                    existing_album['tracks'].append(track['id'])
            else:
                # If the album does not exist, create a new entry for the album
                formatted_album = {
                    'cover': album['images'][0]['url'].removeprefix('https://i.scdn.co/image/ab67616d0000b273'),
                    'id': album['id'],
                    'name': album['name'],
                    'artists': [{'name': artist['name'], 'id': artist['id']} for artist in album['artists']],
                    'release': album['release_date'],
                    'tracks': [track['id']]
                }

                # Add the constructed album/track data to your list
                formatted_data.append(formatted_album)

        url = data.get('next')  # URL for the next set of items, if any

    return formatted_data, tracks


In [8]:
albums, tracks = fetchPlaylistTracks(f'{SPOTIFY_API_URL}/playlists/{SPOTIFY_PLAYLIST_ID}/tracks', headers);

# stats
print("Albums:" + str(len(albums)));
print("Songs:" + str(len(tracks)))

Albums:331
Songs:999


In [9]:
with open("export-spotify-covers.json", "w") as outfile:
    json.dump(albums, outfile)

## Artist occurence & genres

The artist profile data is loaded to get the genre information (what genre an artist belongs to), but also to get more information about the artist, like the name and also the link to their profile picture.

In [10]:
# Function to process albums and update stats for favorite artists
def getArtistStats(albums, stats):
    for album in albums:
        album_artist_ids = [artist['id'] for artist in album['artists']]

        for artist_id in album_artist_ids:

            if artist_id not in stats:
                stats[artist_id] = {
                    'img': '',
                    'tracks': len(album['tracks']),
                    'albums': 1,
                    'name': ''
                }
            else:
                stats[artist_id]['tracks'] += len(album['tracks'])
                stats[artist_id]['albums'] += 1

# Function to fetch artist data
def fetch_artist_data(artist_id):
    response = requests.get(f'https://api.spotify.com/v1/artists/{artist_id}', headers=headers)

    if response.status_code == 200:
        return response.json()

    elif (response.status_code == 429):
      raise Exception("Exceeded API rate limit")

    else:
        return None

In [11]:
artists_stats = dict();

# Process albums data structure here
getArtistStats(albums, artists_stats)

genre_track_count = Counter()  # Using a Counter to easily sum values for the same key


# Assuming artistStats is defined somewhere with artist IDs and their track counts
for artist_id in artists_stats:
    artist_data = fetch_artist_data(artist_id)  # Ensure headers is defined with your auth token

    if not artist_data:
        continue

    genres = artist_data.get('genres', [])
    artistTracksCount = artists_stats[artist_id]['tracks']

    # Incrementing track count for each genre found for the artist
    for genre in genres:
        genre_track_count[genre] += artistTracksCount

    # Optionally update image and name if present
    if 'images' in artist_data and artist_data['images']:
        artists_stats[artist_id]['img'] = artist_data['images'][0]['url'].replace('https://i.scdn.co/image/ab6761610000e5eb', '')
        artists_stats[artist_id]['name'] = artist_data.get("name", '')

In [12]:
# add percentage
artists_stats = {artist_id: {**details, 'occurrence': format(((details['tracks'] / len(tracks)) * 100), ".2f")} for artist_id, details in artists_stats.items()}
favorite_artists = sorted(artists_stats.items(), key=lambda x: x[1]['occurrence'], reverse=True)[:5];

print(favorite_artists)
print(dict(genre_track_count.most_common(10)))

[('0k70gnDBLPirCltbTzoxuM', {'img': 'https://i.scdn.co/image/ab6761610000e5ebad25a4c036cdc21fe95986ad', 'tracks': 35, 'albums': 7, 'name': 'Faber', 'occurrence': '3.50'}), ('23xqmJEN3oVxwzqtNIyR5m', {'img': 'https://i.scdn.co/image/ab6761610000e5ebbcf79c131f4ddb3635dd60bb', 'tracks': 28, 'albums': 6, 'name': 'AnnenMayKantereit', 'occurrence': '2.80'}), ('5fNCrHCBzuRdM8AbVajVh0', {'img': 'https://i.scdn.co/image/ab6761610000e5eb45a6963fbc20ede928638715', 'tracks': 25, 'albums': 7, 'name': 'Lance Butters', 'occurrence': '2.50'}), ('2XbRunIT35jrB8HRsISPgT', {'img': 'https://i.scdn.co/image/ab6761610000e5eb65e8d02a37ce4b2371caf085', 'tracks': 25, 'albums': 6, 'name': 'Tristan Brusch', 'occurrence': '2.50'}), ('0r0R5nIjDY04TfxRM10Bcb', {'img': 'https://i.scdn.co/image/ab6761610000e5eb489fdf3f409eed0c27751c64', 'tracks': 25, 'albums': 5, 'name': 'Alligatoah', 'occurrence': '2.50'})]
{'german pop': 278, 'german indie': 210, 'german hip hop': 158, 'german alternative rap': 129, 'pop': 91, 'ant

### Generalised genres

Since many genres are very specific and cannot be visualised in a meaningful way, they have to be grouped, e.g. German rap, Swiss rap are all in the category rap. Some genres like " antidetusch " cannot be grouped and are put into the category "other". The counts of this category are stored separately to be able to list the "other" genres.

In [13]:
# Updated list of general genres with some additional groupings
general_genres = [
    'pop', 'indie', 'hip hop', 'rock', 'edm', 'jazz', 'reggae', 'singer-songwriter',
    'ambient', 'experimental', 'electronic', 'folk', 'alternative', 'punk', 'ska', 'metal',
    'classical', 'country', 'soul', 'blues', 'r&b', 'comedy', 'grunge', 'new wave'
]

# Initialize Counters for the general genres and unmatched genres
general_genre_counts = Counter()
unmatched_genre_counts = Counter()  # New counter for unmatched genres

# Broaden matching rules and handle special cases
def match_general_genre(specific_genre, general_genres):
    # Normalize and consolidate similar genres
    specific_genre = specific_genre.lower()
    if 'electro' in specific_genre or 'dance' in specific_genre:
        return 'electronic'
    if 'hip hop' in specific_genre or 'rap' in specific_genre:
        return 'hip hop/rap'
    if 'motown' in specific_genre or 'soul' in specific_genre or 'r&b' in specific_genre:
        return 'soul/r&b'
    if 'neue deutsche welle' in specific_genre or 'german' in specific_genre:
        return 'alternative'
    if 'grunge' in specific_genre or 'post-grunge' in specific_genre:
        return 'grunge'
    if 'new wave' in specific_genre or 'new romantic' in specific_genre:
        return 'new wave'
    if 'comedy' in specific_genre or 'parody' in specific_genre or 'comic' in specific_genre:
        return 'comedy'

    # Check for matches in general genres
    for general_genre in general_genres:
        if general_genre in specific_genre:
            return general_genre

    # Track unmatched genres
    unmatched_genre_counts[specific_genre] += 1
    return 'other'

# Categorize each specific genre
for specific_genre, count in genre_track_count.items():
    general_genre = match_general_genre(specific_genre, general_genres)
    general_genre_counts[general_genre] += count
    if general_genre == 'other':
        unmatched_genre_counts[specific_genre] += count

general_genre_counts = sorted(general_genre_counts.items(), key=lambda x: x[1], reverse=True)

In [14]:
# Print the updated general genre counts
print(dict(general_genre_counts))

# Print the top 5 "other" genres with counts
print(dict(unmatched_genre_counts.most_common(5)))


{'alternative': 697, 'hip hop/rap': 662, 'pop': 274, 'other': 177, 'rock': 149, 'indie': 147, 'electronic': 48, 'edm': 21, 'comedy': 18, 'punk': 11, 'singer-songwriter': 10, 'ambient': 9, 'experimental': 7, 'new wave': 7, 'jazz': 6, 'soul/r&b': 5, 'grunge': 5, 'folk': 5, 'metal': 2}
{'antideutsche': 69, 'house': 15, 'stutter house': 15, 'permanent wave': 15, 'sunnlensk tonlist': 12}


## Track's audio features

The Spotify API can be used to retrieve a track's features. This data includes danceability, valence, speechiness and so on. This information for each song is retrieved in batches and then averaged to give a score for the whole dataset.

In [15]:

# Deduplicate the track IDs in case the same track appears more than once
unique_track_ids = tracks

print(unique_track_ids)
# Dictionary to hold audio features for each track, along with popularity
audio_features_data = {}

# Spotify limits the number of track IDs you can request at once to 100
batch_size = 100
for i in range(0, len(unique_track_ids), batch_size):

    batch = unique_track_ids[i:i+batch_size]

    response = requests.get((f'{SPOTIFY_API_URL}/audio-features/'), headers=headers, params={'ids': ','.join(batch)})
    data = response.json()

    # For each track, fetch its audio features
    for feature in data['audio_features']:
        if feature:  # Check if the feature is not None
            track_id = feature['id']

            # Remove 'track_href' and 'analysis_url' from the feature data
            feature.pop('track_href', None)  # Use pop to remove 'track_href', if it exists
            feature.pop('analysis_url', None)  # Use pop to remove 'analysis_url', if it exists
            feature.pop('uri', None)  # Use pop to remove 'analysis_url', if it exists

            audio_features_data[track_id] = feature

# Now, 'audio_features_data' contains both the audio features and popularity for each track
print(audio_features_data)

# Initialize a dictionary to hold the sum of each feature
features_sum = {key: 0 for key in audio_features_data[list(audio_features_data.keys())[0]].keys() if key != 'type' and key != 'id' and key != 'uri' and key != 'track_href' and key != 'analysis_url' and key != 'duration_ms' and key != 'time_signature'}

# Sum up all the features
for features in audio_features_data.values():
    for key, value in features.items():
        if key in features_sum:
            features_sum[key] += value

# Calculate the average
features_avg = {key: (value / len(audio_features_data)) for key, value in features_sum.items()}

['7ruRqXvFzjBmXi5Rp3D0XP', '1hUT7kFty1Sgfngm51vHPO', '0vOGbYyA5ktPzHQwZL8Ig7', '2wW3yIdBoOJ2qI03k1itjJ', '2qhG2ccN8D1LmXwIqFDwzn', '0FTGWwMiZG7Bww8Ox10WTl', '1EZXfymJrWi0hi9lvug4Do', '7KHZ8dYC0Mt8DDrVcF6kd4', '0XzLZsv7L9eZeqLGb03fKe', '4wEDActEN5ZyJi0DK8bCtf', '2fl0B0OaXjWbjHCQFx2O8W', '5HelvHUpgmjB6i91wp0syw', '22Z5lOqrkHsp5TGWxmoNeY', '2P9OKiFiDDwHcMkdGVuqcd', '6maG9OGek8E3dEwp0BwFjT', '3sdhpra6cF3DCQfi2ZJrtQ', '3xJfxlKsfgJ74Wud1KXWlR', '6dK8Giqc65HHkhG2vnI7Cb', '5qvImv7TRpDhlcLZZmDruU', '21azelPRm5CTJR29UEqUQq', '0PHZPjZsidVYmmNyYCPQBm', '7q08I7aOt87YE7tTLMsJW3', '6Dma0t0hOe6Bd6u5YRKF3n', '6UdNDWZtkdAvaAZ9TRzkUF', '1o8kO8YyMWRAeb9bE2AIxO', '2xpbLSWc4JxsROu3g98Jnn', '50nfwKoDiSYg8zOCREWAm5', '096eU6V7NOWbfgIz7r153d', '3OZpMiuR0lZkODDWkd6ELs', '45wMBGri1PORPjM9PwFfrS', '4mtwqruaUbam6BCQ1892Mt', '2TeUFXLBuG1CCRB2dUyJkL', '3hEFg7qm2MFx9kODgTJKsJ', '4TlbZgqxhJ6uN3tpVyjw90', '3jGYt4gshAbEIt2GQ02P9e', '1rKdC0qqdgOGkQnIZky7Ww', '5a1oLH15OThDOWdmhlvYxh', '0XIfbb3y8Ltx58T1s2FpvK', '4KUyldIKTn

In [16]:
# speechiness: 14% --> more music less text
# 'valence': 46.94867990654203 % --> tracks with low valence sound more negative (e.g. sad, depressed, angry)
# 'danceability': 62.282593457943904, 'energy': 60.837155373831706
print(features_avg)

## Playlist duration

In [17]:

# Function to fetch data for multiple tracks at once
def fetch_tracks_data(track_ids):
    track_ids_str = ','.join(track_ids)
    response = requests.get(f'https://api.spotify.com/v1/tracks?ids={track_ids_str}', headers=headers)
    if response.status_code == 200:
        return response.json()['tracks']
    else:
        return []

playlist_duration = 0

# Create batches of up to 50 track IDs
batch_size = 50
for i in range(0, len(tracks), batch_size):
    batch = unique_track_ids[i:i+batch_size]
    track_data = fetch_tracks_data(batch)
    for track in track_data:
        if track:  # Ensure the track data is not None
            playlist_duration += track.get('duration_ms', 0)




{'danceability': 0.6172487462387146, 'energy': 0.6159181043129384, 'key': 5.320962888665998, 'loudness': -7.724303911735208, 'mode': 0.559679037111334, 'speechiness': 0.13753711133400204, 'acousticness': 0.297226126479438, 'instrumentalness': 0.0383473000100301, 'liveness': 0.2069755265797393, 'valence': 0.4729438314944837, 'tempo': 120.15809227683054}


## Release date grouping


In [19]:
# Initialize a defaultdict to count albums per decade
decade_counts = defaultdict(int)

for details in albums:
    # Extract year from the release_date
    date = pd.to_datetime(details['release'])

    # Convert year to its corresponding decade
    year = date.year
    decade = (year // 10) * 10

    # Count albums per decade
    decade_counts[decade] += len(details['tracks'])

sorted_counts = sorted(decade_counts.items())

print(dict(sorted_counts));

{1960: 5, 1970: 9, 1980: 7, 1990: 18, 2000: 38, 2010: 459, 2020: 461}


# Export resulting data

In [23]:
# Constructing the data structure
dataResult = {
    "update_time": int(time.time()),
    "stats": {
        "albums": len(albums),
        "tracks": len(tracks),
        "artists": len(artists_stats),
        "duration": playlist_duration/60000
    },

    "genres": {
        "general": dict(general_genre_counts), #dict(general_genre_counts),
        "detailed": dict(genre_track_count.most_common(10))
    },

    "features": dict(features_avg),
    "artists": dict(favorite_artists),
    "decades": dict(sorted_counts)

}

In [22]:
with open("export-spotify-analysis.json", "w") as outfile:
    json.dump(dataResult, outfile)