In [None]:
# Spotify API 
from spotipy.oauth2 import SpotifyClientCredentials #To access authorised Spotify data
import spotipy

# Data Science Libraries
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

# Miscellaneous Libraries
from tqdm import tqdm
from dotenv import load_dotenv
import os
import time

# Pull variables from a local .env file into the process environment
load_dotenv()

# Spotify app credentials, read from the environment
client_id, client_secret = os.getenv("CLIENT_ID"), os.getenv("CLIENT_SECRET")

# Client-credentials manager first, then the API client that the later cells call as `sp`
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Debugging Audio Features

In [None]:
# Resolve the artist URI with a genuine artist search. Passing a URL-style string
# ("q=...&type=artist&limit=1") as `q` only changes the search text: the request
# is still a track search, so the URI came from whichever track ranked first.
artist_hits = sp.search(q="NewJeans", type='artist', limit=1)['artists']['items']
artist_uri = artist_hits[0]['uri']
print(f"This is the artist uri: {artist_uri}")

query = "NewJeans"

# Track URIs (not titles) so the list can be handed straight to sp.audio_features()
tracks = []

for item in sp.search(q=query, limit=50, type='track')['tracks']['items']:
    album_artist_uri = item['album']['artists'][0]['uri']
    print(album_artist_uri + "   " + item['name'])
    # Keep only tracks whose album is credited first to the artist we resolved above
    if album_artist_uri == artist_uri:
        print("YES")
        tracks.append(item['uri'])

#print(tracks)




In [None]:
# Request audio features for every entry in `tracks`; as the last expression of the
# cell, the response is shown as the cell output.
# NOTE(review): audio_features resolves track IDs / URIs / URLs - confirm `tracks`
# holds those rather than track titles.
sp.audio_features(tracks)

# Obtaining Audio Features

In [None]:
def _rows_with_features(artist, album_track_pairs):
    '''
    Turn (album, track) dictionary pairs into output rows: the five descriptors
    (artist, album name, album uri, track name, release date) followed by the
    values of the track's audio-features dictionary.
    '''
    rows = []

    # The audio-features endpoint accepts at most 100 tracks per request, so batch the lookups
    for start in range(0, len(album_track_pairs), 100):
        batch = album_track_pairs[start:start + 100]
        feature_sets = sp.audio_features([song['uri'] for _, song in batch])

        for (album, song), features in zip(batch, feature_sets):
            # Spotify answers None for tracks it has no analysis for; skip them instead of crashing
            if features is None:
                continue
            rows.append(
                [artist, album['name'], album['uri'], song['name'], album['release_date']]
                + list(features.values())
            )

    return rows


def artist_tracks(artist, pause_seconds=8):
    
    '''
    Takes a single artist name, iterates through that artist's Spotify albums,
    skips duplicate albums, then collects every track on those albums (plus the
    artist's tracks found through a track search) as a list of lists.

    Parameters:
        artist: the artist name to search for
        pause_seconds: seconds to sleep after each album and after the track
            search, to stay gentle on the API (defaults to 8)

    Returns:
        A list of rows, each being
        [artist, album name, album uri, track name, release date] followed by
        the track's audio-feature values in the order the API returned them.

    Raises:
        ValueError: if the artist search returns no match
    '''
    
    # Each list in this list will be a track and its features
    tracks = []
    
    # Get the artist URI (a unique ID) through a real artist search
    artist_hits = sp.search(q=artist, type='artist', limit=1)['artists']['items']
    if not artist_hits:
        raise ValueError(f"No Spotify artist found for {artist!r}")
    artist_uri = artist_hits[0]['uri']

    # Spotify has a lot of duplicate albums; remember the ones already processed
    seen_albums = set()
    
    # The starting point of our loop of albums for those artists with more than 50
    offset = 0
    
    # Note the album_type = 'album'. This discounts singles, compilations and collaborations
    while True:
        
        # One request per page of (up to) 50 albums
        albums = sp.artist_albums(artist_uri, album_type='album', limit=50, offset=offset)['items']
        if not albums:
            break
        
        for album in tqdm(albums):

            # An album is identified by its credited artists, its name and its release date
            album_key = (
                tuple(credited['name'] for credited in album['artists']),
                album['name'],
                album['release_date'],
            )

            # Only continue if that album has not been processed yet
            if album_key in seen_albums:
                continue
            seen_albums.add(album_key)

            # For every song on the album, get its descriptors and features and add to the tracklist
            songs = sp.album_tracks(album['uri'])['items']
            tracks.extend(_rows_with_features(artist, [(album, song) for song in songs]))
            time.sleep(pause_seconds)
        
        # Go through the next 50 albums (otherwise we'll get an infinite while loop)
        offset += 50

    # HOT FIX
    # Since we are only looking at songs within an album, we are missing out on songs that are released on their own.
    # For example, NewJeans only has 1 album (their remix album), but all of their top hits are singles.
    # This will definitely cause duplicates for other artists, but duplicates are handled later on.

    candidates = sp.search(q=artist, limit=50, type='track')['tracks']['items']
    own_tracks = [
        (item['album'], item)
        for item in candidates
        if item['album']['artists'][0]['uri'] == artist_uri
    ]
    tracks.extend(_rows_with_features(artist, own_tracks))
    time.sleep(pause_seconds)

    return tracks

In [None]:
def df_tracks(tracklist, feature_columns=None):
    '''
    This method takes the output of artist_tracks (i.e. list of lists),
    and organizes all the data into a dataframe

    Parameters:
        tracklist: list of rows; each row holds artist, album name, album uri,
            track name and release date followed by the audio-feature values
        feature_columns: optional list naming the audio-feature values in the
            order they appear in each row. When omitted, the names are taken
            from an audio-features lookup of a reference track (one API call).

    Returns:
        A dataframe with duplicate songs removed, instrumental / version
        variants filtered out, and identifier columns ordered before the
        auditory features.
    '''

    descriptor_columns = ['artist', 'album_name', 'album_uri', 'track', 'release_date']

    if feature_columns is None:
        # Ask the API for the key order of an audio-features response, using a reference track
        feature_columns = list(sp.audio_features('7tr2za8SQg2CI8EDgrdtNl')[0].keys())

    # Creating the dataframe
    df = pd.DataFrame(data=tracklist, columns=descriptor_columns + list(feature_columns))

    # Renaming columns and dropping duplicate songs (no in-place mutation)
    df = (
        df.rename(columns={'uri': 'song_uri'})
          .drop_duplicates(subset=['artist', 'track', 'release_date'], keep='first')
    )

    # Filtering songs that are instrumental or a language variant.
    # Whole-word match only: a plain substring test for 'inst' / 'ver' also
    # throws away titles such as "Forever" or "Instinct".
    variant_marker = r'\b(?:inst|instrumental|ver|version)\b'
    is_variant = df['track'].str.contains(variant_marker, case=False, regex=True, na=False)
    df = df[~is_variant]

    # Reorder the columns to have identifiers first, auditory features last
    cols = ['artist', 'album_name', 'album_uri', 'track', 'release_date', 'id', 'song_uri', 'track_href',
     'analysis_url', 'type', 'danceability', 'energy', 'key',  'loudness', 'mode', 'speechiness',
     'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']

    return df[cols]



In [None]:
def training_df_creator(artists, pause_seconds=8):
    '''
    Builds one dataframe holding the tracks of every artist in `artists`.

    For each artist the tracklist from artist_tracks is turned into a dataframe
    by df_tracks; the per-artist frames are concatenated row-wise at the end.

    Parameters:
        artists: iterable of artist names
        pause_seconds: seconds to sleep after each artist (defaults to 8)

    Returns:
        The concatenated dataframe (original per-artist indexes are kept), or
        an empty dataframe when `artists` is empty.
    '''

    # Collect the per-artist frames and concatenate once, instead of re-copying
    # a growing dataframe on every iteration
    frames = []

    for artist in artists:
        track_list = artist_tracks(artist)
        frames.append(df_tracks(track_list))
        time.sleep(pause_seconds)

    # pd.concat refuses an empty list, so mirror the "nothing collected" result explicitly
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, axis=0)

In [None]:
# Active roster: NewJeans only
kpop_artists = ["NewJeans"]

# Larger roster, currently disabled:
# kpop_artists = [
#     'NewJeans', 'ITZY', 'BTS', 'BLACKPINK', 'Stray Kids', 'TWICE', 'SEVENTEEN',
#     'LE SSERAFIM', '(G)I-DLE', 'aespa', 'IVE', 'NMIXX', 'MAMAMOO', 'EXO',
#     'Red Velvet', 'NCT 127', 'STAYC', 'MOMOLAND', "Girls' Generation", 'Weeekly', 'BIGBANG',
# ]

# Build the training table for the active roster
training_df = training_df_creator(kpop_artists)

In [None]:
# Column positions 10 through 20 are taken as the 11 features of a song
features = training_df.columns[10:21]

# Map every one of those feature columns to float64
dtype_conversion_dict = dict.fromkeys(features, 'float64')

# Renumber the rows from zero and apply the float conversion
training_df = training_df.reset_index(drop=True).astype(dtype_conversion_dict)

# Show the resulting dtypes as the cell output
training_df.dtypes

# Data Exploration

In [None]:
# Preview the first rows of the training table
training_df.head()

In [None]:
# Print a summary of the training table: columns, non-null counts and dtypes
training_df.info()

In [None]:
# Descriptive statistics of the training table's columns
training_df.describe()

In [None]:
# The 11 song features sit at column positions 10 through 20
features = training_df.columns[10:21]

# A single 15x15 figure that will hold the panels on a 4x3 grid
fig = plt.figure(figsize=(15, 15))

# One histogram (with a KDE curve) per feature, filling the grid slot by slot
for slot, feature in enumerate(features, start=1):
    ax = fig.add_subplot(4, 3, slot)
    sns.histplot(data=training_df[feature], kde=True, ax=ax)

fig.tight_layout()
plt.show()

