# Dependencies

In [None]:
# Spotify API 
from spotipy.oauth2 import SpotifyClientCredentials #To access authorised Spotify data
import spotipy

# Data Science Libraries
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

# Miscellaneous Libraries
from tqdm import tqdm
from dotenv import load_dotenv
import os
import time

# Pull the variables defined in a local .env file into the process environment
load_dotenv()

# Spotify app credentials are read from the environment instead of being hard-coded
client_id, client_secret = os.getenv("CLIENT_ID"), os.getenv("CLIENT_SECRET")

# Build the credentials manager and the shared API client used by every cell below
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Extracting Songs and their Features

Obtaining all the tracks from an artist

In [None]:
def _rows_with_audio_features(artist, album_song_pairs):
    '''
    Builds one row per (album, song) pair: the artist name, album name, album URI,
    song name and album release date, followed by the song's audio-feature values.

    Audio features are requested in batches instead of one request per song.
    Songs for which Spotify returns no audio features (None) are left out.
    '''
    rows = []
    # The audio-features endpoint accepts at most 100 tracks per request
    batch_size = 100
    for start in range(0, len(album_song_pairs), batch_size):
        batch = album_song_pairs[start:start + batch_size]
        features_batch = sp.audio_features([song['uri'] for _, song in batch]) or []
        for (album, song), features in zip(batch, features_batch):
            # Spotify answers None for tracks it has no analysis for
            if features is None:
                continue
            rows.append(
                [artist, album['name'], album['uri'], song['name'], album['release_date']]
                + list(features.values())
            )
    return rows


def artist_tracks(artist, request_pause=8):
    '''
    Takes an artist name, iterates through their Spotify albums, skips
    duplicate albums, then collects all the tracks in those albums (plus the
    artist's own tracks returned by a track search) in a list of lists.

    Parameters
    ----------
    artist : str
        Artist name to look up on Spotify.
    request_pause : float, optional
        Seconds to sleep after each processed album and after the final track
        search, to stay under Spotify's rate limit. Defaults to 8.

    Returns
    -------
    list of list
        One list per track: [artist, album name, album URI, track name,
        release date] followed by the values of the track's audio features.
        The result may contain the same track more than once.

    Raises
    ------
    IndexError
        If the artist search returns no result.
    '''

    # Get the artist URI (a unique ID) with a real artist search; the name is the
    # whole query and the search type/limit are passed as arguments
    matches = sp.search(q=artist, type='artist', limit=1)['artists']['items']
    if not matches:
        raise IndexError(f"No Spotify artist found for {artist!r}")
    artist_uri = matches[0]['uri']

    # Each list in this list will be a track and its features
    tracks = []

    # Spotify has a lot of duplicate albums; remember the ones already processed
    seen_albums = set()

    # Albums are paged 50 at a time for artists with more than 50
    page_size = 50
    offset = 0

    while True:
        # Note the album_type = 'album'. This discounts singles, compilations and collaborations.
        # Every page is requested exactly once and reused for both the stop check and the loop.
        album_page = sp.artist_albums(artist_uri, album_type='album', limit=page_size, offset=offset)['items']
        if not album_page:
            break

        for album in tqdm(album_page):
            # An album is identified by its credited artists, its name and its release date
            album_key = (
                tuple(credited['name'] for credited in album['artists']),
                album['name'],
                album['release_date'],
            )
            if album_key in seen_albums:
                continue
            seen_albums.add(album_key)

            # For every song on the album, get its descriptors and features and add them to the tracklist
            songs = sp.album_tracks(album['uri'])['items']
            tracks.extend(_rows_with_audio_features(artist, [(album, song) for song in songs]))
            time.sleep(request_pause)

        # Go through the next page of albums
        offset += page_size

    # Albums alone miss songs that were only released on their own (singles).
    # A track search adds those; it will repeat songs already collected from albums,
    # so the caller is expected to drop duplicates afterwards.
    found = sp.search(q=artist, limit=50, type='track')['tracks']['items']
    own_tracks = [
        (item['album'], item)
        for item in found
        if item['album']['artists'][0]['uri'] == artist_uri
    ]
    tracks.extend(_rows_with_audio_features(artist, own_tracks))
    time.sleep(request_pause)

    return tracks

Function that creates the final dataframe of tracks and audio features for a single artist

In [None]:
def df_tracks(tracklist, feature_columns=None):
    '''
    This method takes the output of artist_tracks (i.e. list of lists),
    and organizes all the data into a dataframe.

    Parameters
    ----------
    tracklist : list of list
        One list per track: artist, album name, album URI, track name and
        release date, followed by the track's audio-feature values.
    feature_columns : list of str, optional
        Labels for the audio-feature values, in the order they appear in each
        row. When omitted, the labels are taken from the keys of an
        audio-features lookup on a fixed reference track (one Spotify request).

    Returns
    -------
    pandas.DataFrame
        One row per unique (artist, track, release_date), without instrumental
        or version variants, with identifier columns first and auditory
        features last. The original row index is kept (not reset).
    '''

    descriptor_columns = ['artist', 'album_name', 'album_uri', 'track', 'release_date']

    if feature_columns is None:
        # Rows hold the feature values positionally, so the labels come from the
        # keys Spotify returns for a reference track
        feature_columns = list(sp.audio_features('7tr2za8SQg2CI8EDgrdtNl')[0].keys())

    # Creating the dataframe
    df = pd.DataFrame(data=tracklist, columns=descriptor_columns + list(feature_columns))

    # Renaming columns and dropping duplicate songs (new frames instead of in-place edits)
    df = df.rename(columns={'uri': 'song_uri'})
    df = df.drop_duplicates(subset=['artist', 'track', 'release_date'], keep='first')

    # Filtering songs that are instrumental or a language variant.
    # Whole-word matching keeps titles such as "Forever" or "VERY NICE", which merely
    # contain the letters "ver"; na=False keeps rows whose track name is missing.
    variant_pattern = r'\b(?:inst|instrumental|ver|version)\b'
    is_variant = df['track'].str.contains(variant_pattern, case=False, na=False)
    df = df[~is_variant]

    # Reorder the columns to have identifiers first, auditory features last
    cols = ['artist', 'album_name', 'album_uri', 'track', 'release_date', 'id', 'song_uri', 'track_href',
     'analysis_url', 'type', 'danceability', 'energy', 'key',  'loudness', 'mode', 'speechiness',
     'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']

    return df[cols]



Function that combines the 2 functions above

In [None]:
def df_creator(artist):
    '''
    Builds the track/audio-feature dataframe for a single artist by collecting
    the artist's tracks with artist_tracks and handing them to df_tracks
    '''
    return df_tracks(artist_tracks(artist))

# Creating the Dataframes

In [None]:
# Spotify answers with a 429 (rate limit) error when requests arrive too quickly, and
# after a 429 the API stays unusable for a long while. The artists are therefore fetched
# one after another with a pause in between; every artist gets its own dataframe and the
# dataframes are concatenated at the end.

artist_names = [
    'NewJeans', 'ITZY', 'BLACKPINK', 'BTS', 'Stray Kids', 'TWICE',
    'SEVENTEEN', 'LE SSERAFIM', 'GIDLE', 'aespa', 'IVE', 'MAMAMOO',
    'Red Velvet', 'NCT 127', 'STAYC', 'MOMOLAND', 'Girls Generation',
]

artist_dfs = []
for position, name in enumerate(artist_names):
    # Pause between two artists (no pause before the first or after the last)
    if position > 0:
        time.sleep(8)
    artist_dfs.append(df_creator(name))

# Keep the numbered names, in the same artist order, for the cells that follow
(df1, df2, df3, df4, df5, df6, df7, df8, df9,
 df10, df11, df12, df13, df14, df15, df16, df17) = artist_dfs



Some data cleaning and data transformation

In [None]:
# Stack the per-artist dataframes on top of each other and renumber the rows from 0
training_df = pd.concat(
    [df1, df2, df3, df4, df5, df6, df7, df8, df9,
     df10, df11, df12, df13, df14, df15, df16, df17],
    axis=0,
).reset_index(drop=True)

# Columns 10 through 20 hold the 11 features of a song
features = training_df.columns[10:21]
# Map each of those feature columns to float64 so they can all be cast in a single call
dtype_conversion_dict = dict.fromkeys(features, 'float64')

training_df = training_df.astype(dtype_conversion_dict)
training_df.dtypes

In [None]:
# Save the combined dataframe as training_df.csv (relative path); the row index is written too
training_df.to_csv(path_or_buf='training_df.csv')

# Debugging Issues

In [None]:
# This code was used for debugging. It is wrapped in a string literal, so none of it is executed.
# The snippet looks up the artist URI for "Stayc", prints the album-artist URI and name of every
# track returned by a track search, and collects the names of the tracks whose album artist
# matches that URI.

"""
tracks = []
query = "q=Stayc&type=artist&limit=1"
artist_uri = sp.search(q=query)['tracks']['items'][0]['album']['artists'][0]['uri']
print(f"This is the artist uri: {artist_uri}")

#query = "IVE"#&type=artist&limit=1"


for item in sp.search(q="Stayc", limit=50, type='track')['tracks']['items']:
    print(item['album']['artists'][0]['uri'] + "   " + item['name'])
    if item['album']['artists'][0]['uri'] == artist_uri:
        print("YES")
        tracks.append(item['name'])

print(tracks)

"""