# Init & Login

In [1]:
import time, pickle, os
now = time.time
from math import ceil
from random import randrange, choice, random
from time import sleep
from pprint import pprint
from datetime import datetime

import spotipy
import spotipy.util as util
from IPython.display import clear_output

## Client Info ##
CLIENT_ID     = ""
CLIENT_SECRET = ""
CLIENT_SCOPE  = "user-follow-modify playlist-modify-private playlist-modify-public"
USER_NAME     = "31ytgsr7wdmiaroy77msqpiupdsi"
REDIR_URI     = "https://github.com/jwatson-CO-edu/yt_shuffle_so_good"
AUTH_URL      = 'https://accounts.spotify.com/api/token'
BASE_URL      = 'https://api.spotify.com/v1/'
## API Info ##
_RESPONSE_LIMIT =  100
_MAX_OFFSET     = 1000
_T_LOGIN_S      = 50 * 60.0
tLastAuth       = 0.0

with open( "../keys/spot_ID.txt" , 'r' ) as f:
    CLIENT_ID = f.readlines()[0].strip()

with open( "../keys/spot_SECRET.txt" , 'r' ) as f:
    CLIENT_SECRET = f.readlines()[0].strip()

token = None
spot  = None

def check_API_token():
    global tLastAuth, token, _T_LOGIN_S, spot
    tNow    = now()
    elapsed = tNow - tLastAuth
    if elapsed >= _T_LOGIN_S:
        token = util.prompt_for_user_token(
            username      = USER_NAME,
            scope         = CLIENT_SCOPE,
            client_id     = CLIENT_ID,
            client_secret = CLIENT_SECRET,
            redirect_uri  = REDIR_URI
        )
        spot = spotipy.Spotify( auth = token )
        print( token )
        clear_output( wait = True )
        sleep( 2 )
        print( "TOKEN OBTAINED" )
        tLastAuth = tNow
    else:
        print( f"TOKEN STILL VALID, AGE: {elapsed/60.0} MINUTES" )

In [2]:
check_API_token()

TOKEN OBTAINED


## Playlists

In [3]:
playlist = {
    'study01' : "0a2qoe6S7lYeZ6nlhZdA0v",
    'study02' : "6gbtR2cBq5PvkghidCvvGk",
    'study03' : "3o3lN2qntdEV7UKTuuC77K",
    'study04' : "41sFSisljvBDMBXtpp5NIw",
    'study05' : "02iS5AFGp8YVuUUqcQf8ys",
    'study06' : "6KI7A4MWrSM7EyKRUjxIi1",
    'study07' : "3V055Md2JdrUT8tX0af7di",
    'study08' : "0tspdJlwSgiyf2O9PO6QaP",
    'study09' : "5mHRBFoQtYy2izeZ66pG95",
    'study10' : "3832xeKGEOAXFJqE4K8kIq",
    'study11' : "65MXR4dubPL9t0P4dgTWvn",
    'study12' : "0ecSAfnD4CulIVnLt26ukI",
    'study13' : "7K9ucByFRgDuZk8KMHeJkL",
}

backfill = "0v26bHydUxcGC5EbMlkjzG"
_N_BKFL  = 400


## Data

In [4]:
_FILTER_TYPES = [ 'album', 'artist', 'track', 'year', 'upc', 'tag:hipster', 'tag:new', 'isrc', 'genre', ]
_SEARCH_TYPES = [ "album", "artist", "playlist", "track", "show", "episode", "audiobook", ]
_N_MAX_SEARCH = 50
_N_DEF_SEARCH = 10
_YEAR_PADDING =  5
_STALE_TIME_S = 60.0 * 60 * 24 * 31
_MIN_LEN_S    = 60.0 + 45.0
_DATA_DIR     = "data/"
_DATA_PREFIX  = "Study-Music-Data_"
_DATA_POSTFIX = ".pkl"
_NULL_GENRE   = "Music"

In [5]:
data = {
    'time'     : now()  , # Data Structure Creation Time
    'playlists': dict() , # Study Playlist Info
    'collectID': set([]), # Currently accepted track IDs
    'artists'  : dict() , # Study Artist Info
    'queries'  : dict() , # Queries made during music searches
    'genres'   : dict() , # Study Genre Info
    # 2024-08-11: Track info does NOT contain play count
}
timestamp = datetime.now().strftime( '%Y-%m-%dT%H:%M:%S' )
outFilNam = _DATA_PREFIX + timestamp + _DATA_POSTFIX
outPath   = os.path.join( 'data/', outFilNam )

# Query Functions

In [6]:

def fetch_entire_playlist( playlist_ID ):
    """ Get infodump on all plalist tracks """
    plTracks = []
    trCount  = 0
    response = spot.user_playlist_tracks(
        CLIENT_ID, 
        playlist_ID, 
        fields = 'items,uri,name,id,total', 
        limit  = _RESPONSE_LIMIT
    )
    Ntracks = response['total']
    while 1:
        trCount += len(response['items'])
        plTracks.extend( response['items'] )
        
        if trCount >= Ntracks:
            break
    
        response = spot.user_playlist_tracks(
            CLIENT_ID, 
            playlist_ID, 
            fields = 'items,uri,name,id,total', 
            limit  = _RESPONSE_LIMIT,
            offset = trCount
        )
    return plTracks


def load_music_database( dataDir = _DATA_DIR ):
    """ Find the latest music database, test for freshness, and set current db if fresh """
    global data
    dbFiles = [os.path.join( dataDir, f ) for f in os.listdir( dataDir ) if (_DATA_PREFIX in str(f))]
    if len( dbFiles ):
        dbFiles.sort( reverse = True )
        with open( dbFiles[0], 'rb' ) as f:
            db = pickle.load( f )
        if ((data['time'] - db['time']) <= _STALE_TIME_S):
            data.update( db )
            return dbFiles[0]
    return None


def populate_playlist_data( dataDct, plDict, pause_s = 1.0 ):
    """ Gather data across specified playlists """
    print( "\n### READ MUSIC COLLECTION ###\n" )
    nuDB = load_music_database()
    if nuDB is not None:
        print( f"Found current collection data at {nuDB}!" )
    else:
        for plName_i, plID_i in plDict.items():
            print( plName_i, '-', plID_i, '...' )
            dataDct['playlists'][ plName_i ] = {
                'ID'    : plID_i,
                'tracks': fetch_entire_playlist( plID_i ),
            }
            # pprint( dataDct['playlists'][ plName_i ]['tracks'][0] )
            # return None
            plSet_i = set([item['track']['id'] for item in dataDct['playlists'][ plName_i ]['tracks']])
            dataDct['collectID'] = dataDct['collectID'].union( plSet_i )
    
            for track_j in dataDct['playlists'][ plName_i ]['tracks']:
    
                # pprint( track_j )
                for artist_k in track_j['track']['artists']:
                    artistID_j = artist_k['id']
                    if artistID_j not in dataDct['artists']:
                        dataDct['artists'][ artistID_j ] = { 
                            'name'    : track_j['track']['artists'][0]['name'], 
                            'count'   : 1, 
                            'releases': [track_j['track']['album']['release_date'],], 
                        }
                    else:
                        dataDct['artists'][ artistID_j ]['count'   ] += 1
                        dataDct['artists'][ artistID_j ]['releases'].append( track_j['track']['album']['release_date'] )
    
            sleep( pause_s )
    
    print( "\n### COMPLETE ###\n" )
    

def search_artist_within_era( artistName, releaseDate, 
                              db = None, N = _N_MAX_SEARCH, yearPadding = _YEAR_PADDING, pause_s = 0.5 ):
    """ Return `N` tracks within `yearPadding` of `trackDict` and by the same artist """
    rtnLs = list()
    query = "artist%3A" + str( artistName ).replace( " ", "%20")
    try:
        rYear = int( str( releaseDate )[:4] )
    except Exception:
        rYear = 2024
    bYear   = rYear - yearPadding
    eYear   = rYear + yearPadding
    years   = list( range( bYear, eYear+1 ) )
    miniLim = max( int(N/(eYear - bYear)), 1 )
    Nloop   = int(N / miniLim * 2)
    for i in range( Nloop ):
        iYear = choice( years )
        qry_i = query + "%20year%3A" + str( iYear )
        print( f"Search: {qry_i}" )

        if (db is not None):
            if (qry_i in db['queries']):
                ofst = db['queries'][ qry_i ]
                db['queries'][ qry_i ] += miniLim
            else:
                ofst = 0
                db['queries'][ qry_i ] = miniLim
        else:
            ofst = 0
        
        res = spot.search( qry_i, 
                           limit  = miniLim, 
                           offset = min( ofst, _MAX_OFFSET ), 
                           type   = 'track' )
        tracks_i = [item['id'] for item in res['tracks']['items']]
        
        if (db is not None):
            tracks_ii = list()
            for trk_j in tracks_i:
                if trk_j not in db['collectID']:
                    tracks_ii.append( trk_j )
            tracks_i = tracks_ii[:]

        rem = N - len( rtnLs )
        if len( tracks_i ) > rem:
            rtnLs.extend( tracks_i[:rem] )
            return rtnLs
        else:
            rtnLs.extend( tracks_i )
            sleep( pause_s )
    return rtnLs


def save_music_database( dataDct ):
    """ Pickle `dataDct` to store current music collection data as well as search activity """
    print( f"About to write {outPath} ..." )
    with open( outPath, 'wb' ) as f:
        pickle.dump( dataDct, f )
    print( "COMPLETE!" )


def choose_N_artist_year_pairs_from_db( N, db ):
    """ Fetch `N` random (<Artist>, <Date>) pairs from the `db` for searching """
    rtnPairs = list()
    artList  = list( db['artists'].keys() )
    for i in range(N):
        artKey_i = choice( artList )
        # pprint( db['artists'][ artKey_i ] )
        artist_i = db['artists'][ artKey_i ]['name']
        rlYear_i = choice( db['artists'][ artKey_i ]['releases'] )
        rtnPairs.append( (artist_i, rlYear_i,) )
    return rtnPairs


def basic_new_music_search_01( db, Ntot, Mper = 5, pause_s = 0.125 ):
    """ Choose random `db` entries as search queries, Return a list of `Ntot` tracks consisting of `Mper` entries for each artist """  
    rtnLst   = list()
    searches = choose_N_artist_year_pairs_from_db( int( ceil( Ntot/Mper ) )*2, db )
    addSet   = set([])
    for (art_i, rel_i) in searches:
        print( f"\tSearch, Artist: {art_i}, Around Year: {rel_i}" )
        rem    = Ntot - len( rtnLst )
        Mper   = min( Mper, rem )
        trks   = search_artist_within_era( art_i, rel_i, db, N = Mper, yearPadding = 3, pause_s = 0.5 )
        trks_i = list()
        for trk in trks:
            if trk not in addSet:
                trks_i.append( trk )
                addSet.add( trk )
        rem = Ntot - len( rtnLst )
        if rem > len( trks_i ):
            rtnLst.extend( trks_i )
        else:
            rtnLst.extend( trks_i[ :rem ] )
            break
        sleep( pause_s )
    return rtnLst


def get_playlist_length( playlist_ID ):
    """ Get the number of total tracks in the playlist """
    response = spot.user_playlist_tracks(
        CLIENT_ID, 
        playlist_ID, 
        fields = 'items,uri,name,id,total', 
        limit  = _RESPONSE_LIMIT
    )
    return response['total']


def refill_playlist_with_new_tracks( plID, db, Ntot = 400, Mper = 5 ):
    """ Top off the playlist with new tracks """
    plLen = get_playlist_length( plID )
    if Ntot > plLen:
        nRem    = Ntot - plLen
        print( f"About to add {nRem} tracks ..." )
        addTrks = basic_new_music_search_01( db, nRem, Mper )
        result  = spot.user_playlist_add_tracks( CLIENT_ID, plID, addTrks )
        print( result )
    else:
        print( "No room for new tracks!" )


def scrub_short_and_explicit_tracks( plID ):
    """ Remove short (<1:45) and explicit (guaranteed vocal) songs """
    tracks = fetch_entire_playlist( plID )

    # 3. For every track j in playlist, do
    j = 0
    while j < len( tracks ):
        track_j   = tracks[j]
        # pprint( track_j )
        # break
        p_dump_j  = False
        trackID_j = track_j['track']['id']
        len_s_j   = track_j['track']['duration_ms']/1000.0
        explc_j   = track_j['track']['explicit']

        if ((len_s_j < _MIN_LEN_S) or explc_j):
            res = spot.playlist_remove_specific_occurrences_of_items( 
                plID, 
                [{'uri': trackID_j, 'positions':[j,]},]
            )
            print( "\tRemove:", trackID_j, j, res )
            tracks.pop(j)
        else:
            j += 1


def scrub_and_refill_playlist_with_new_tracks( plID, db, Ntot = 400, Mper = 5, pause_s = 1.0 ):
    """ Remove suspect tracks and refill backfill in a loop until full """
    scrub_short_and_explicit_tracks( plID )
    plLen = get_playlist_length( plID )
    if Ntot > plLen:
        print( f"\n########## About to top off backfill with {Ntot-plLen} tracks! ##########\n" )
        i = 0
        while plLen < Ntot:
            i += 1
            sleep( pause_s )
            print( f"\n##### Iteration {i} #####\n" )
            refill_playlist_with_new_tracks( plID, db, Ntot, Mper )
            scrub_short_and_explicit_tracks( plID )
            plLen = get_playlist_length( plID )
            
        print( f"\n########## COMPLETE after {i} iterations! ##########\n" )
    else:
        print( f"\n########## NO additional backfill required! ##########\n" )
            
        

# Search Version 01, Query by Existing Artist and Year

## Read Study Music Collection

In [7]:
# populate_playlist_data( data, playlist, pause_s = 1.0 )

## Add New Tracks

In [8]:
# refill_playlist_with_new_tracks( backfill, data, Ntot = _N_BKFL, Mper = 5 )
# scrub_and_refill_playlist_with_new_tracks( backfill, data, Ntot = 400, Mper = 5, pause_s = 1.0 )

In [9]:
# save_music_database( data )

In [10]:
# scrub_short_and_explicit_tracks( backfill )

# Helper Functions

In [11]:

########## CONTAINER FUNCTIONS #####################################################################

def sort_keys_by_value( dct, reverse = True ):
    """ Return a list of keys sorted by their (numeric) values """
    srtLst = list()
    for k, v in dct.items():
        srtLst.append( [v,k,] )
    srtLst.sort( key = lambda x: x[0], reverse = reverse )
    return [pair[1] for pair in srtLst] 



########## STRING ANALYSIS #########################################################################


def levenshtein_dist( s1, s2 ):
    """ Get the edit distance between two strings """
    # Author: Salvador Dali, https://stackoverflow.com/a/32558749
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    distances = range(len(s1) + 1)
    for i2, c2 in enumerate(s2):
        distances_ = [i2+1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
        distances = distances_
    return distances[-1]



########## STATS & SAMPLING ########################################################################


def total_pop( odds ):
    """ Sum over all categories in the prior odds """
    total = 0
    for k in odds:
        total += odds[k]
    return total


def normalize_dist( odds_ ):
    """ Normalize the distribution so that the sum equals 1.0 """
    total  = total_pop( odds_ )
    rtnDst = dict()
    for k in odds_:
        rtnDst[k] = odds_[k] / total
    return rtnDst


def roll_outcome( odds ):
    """ Get a random outcome from the distribution """
    oddsNorm = normalize_dist( odds )
    distrib  = []
    outcome  = []
    total    = 0.0
    for o, p in oddsNorm.items():
        total += p
        distrib.append( total )
        outcome.append( o )
    roll = random()
    for i, p in enumerate( distrib ):
        if roll <= p:
            return outcome[i]
    return None


# Micro-Genre Extraction

In [16]:
from uuid import uuid4

import enchant
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from scipy.spatial import cKDTree

_DBS_EPSILON  =  5.0
_DBS_MIN_MMBR =  5
_ARTIST_Q_LIM = 50


def fetch_entire_playlist_with_audio_features( playlist_ID ):
    """ Get maximum infodump on all playlist tracks """
    plTracks = []
    trCount  = 0
    
    response = spot.user_playlist_tracks(
        CLIENT_ID, 
        playlist_ID, 
        fields = 'items,uri,name,id,total', 
        limit  = _RESPONSE_LIMIT
    )
    resTracks = response['items']
    Ntracks   = response['total']

    resIDs = [item['track']['id'] for item in resTracks]

    resFeatrs = spot.audio_features( resIDs )
    for i, track_i in enumerate( resTracks ):
        track_i.update( resFeatrs[i] )
    
    while 1:
        trCount += len( resTracks )
        plTracks.extend( resTracks )
        
        if trCount >= Ntracks:
            break
    
        response = spot.user_playlist_tracks(
            CLIENT_ID, 
            playlist_ID, 
            fields = 'items,uri,name,id,total', 
            limit  = _RESPONSE_LIMIT
        )
        resTracks = response['items']
    
        resIDs = [item['track']['id'] for item in resTracks]
    
        resFeatrs = spot.audio_features( resIDs )
        for i, track_i in enumerate( resTracks ):
            track_i.update( resFeatrs[i] )
    return plTracks


def get_track_vector( track ):
    """ Express the track characteristics as a vector """
    return np.array([
        track['acousticness'],
        track['danceability'],
        track['duration_ms'] / 1000.0 / _MIN_LEN_S,
        track['energy'],
        track['instrumentalness'],
        track['liveness'],
        track['loudness'],
        track['speechiness'],
        track['tempo'],
        track['valence'],
    ])


def get_tracks_as_vectors( tracks ):
    """ Convert all tracks to vectors """
    Mrows  = len( tracks )
    Ncols  = len( get_track_vector( tracks[0] ) )
    rtnMtx = np.zeros( (Mrows, Ncols,) ) 
    for i, trk in enumerate( tracks ):
        rtnMtx[i,:] = get_track_vector( trk )
    return rtnMtx


def vector_distance_to_genre( qVec, genreDct ):
    """ Get Euclidean distance between `qVec` and the nearest track vector of `genreDct` """
    return genreDct['kdTree'].query( qVec )[0]
    

def track_distance_to_genre( qTrack, genreDct ):
    """ Get Euclidean distance between `qTrack` and the nearest track vector of `genreDct` """
    return vector_distance_to_genre( get_track_vector( qTrack ), genreDct )


def remove_disallowed_entries_by_key( dct ):
    """ Remove segments unsuitable for a genre name """
    blocked = [ 'soundtrack', 'for', ]
    qKeys   = list( dct.keys() )
    qLen    = len( qKeys )
    delSet  = set([])
    difFrac = 0.25
    for i, k in enumerate( qKeys ):
        kLo     = str(k).lower()
        removed = False
        # Remove short keys
        if (len( kLo.strip() ) <= 2) and (k in dct):
            del dct[k]
            removed = True
        # Remove keys with apostrophees
        elif ("'" in kLo) and (k in dct):
            del dct[k]
            removed = True
        # Remove keys too similar to the blocked list
        else:
            for blkd in blocked:
                if ((levenshtein_dist( kLo, blkd ) / len( blkd )) < difFrac) and (k in dct):
                    del dct[k]
                    removed = True
                    break
        # If the current key is suitable, Then remove all following keys that are too similar
        if not removed:
            for j in range( i+1, qLen ):
                key_j    = qKeys[j]
                key_jLo  = str( key_j ).lower()
                fracRght = levenshtein_dist( kLo, key_jLo ) / len( key_jLo )
                fracLeft = levenshtein_dist( kLo, key_jLo ) / len( kLo     )
                if (fracRght < difFrac) or (fracLeft < difFrac):
                    if key_j in dct:
                        del dct[ key_j ]
                        

def repair_keys( dct ):
    """ Replace keys that have characters we don't like """
    badChars = [ '(', ')' ]
    dctKeys  = list( dct.keys() )
    for key in dctKeys:
        nuKey = str( key )
        p_bad = False
        for ch in badChars:
            if ch in key:
                nuKey.replace( ch, '' )
                p_bad = True
        if p_bad and (key in dct):
            dct[ nuKey ] = dct[ key ]
            del dct[ key ]


def Proper_Namify( namStr ):
    """ Capitalize every split string, and reassemble """
    namSeg = str( namStr ).split()
    Nseg   = len( namSeg )
    rtnNam = ""
    for i, seg in enumerate( namSeg ):
        rtnNam += seg[0].upper() + seg[1:].lower()
        if i+1 < Nseg:
            rtnNam += ' '
    return rtnNam
    

def extract_and_generate_genre_names( genreDct ):
    """ Extract Spotify genre and Generate local genre """
    mainDist = dict()
    loclDist = dict()
    artSet   = set([])
    englishD = enchant.Dict( "en_US" )
    segments = list()
    # Get artist info and local genre candidate substrings
    for track in genreDct['tracks']:
        # Gather artist IDs
        for artist in track['track']['album']['artists']:
            artSet.add( artist['id'] )
            segments.extend( artist['name'].split() )
        # Gather naming strings
        segments.extend( track['track']['album']['name'].split() )
        segments.extend( track['track']['name'].split() )
    qSegmnts = segments[:]
    nglshSeg = list()
    Norig    = len( segments )
    for qSeg in qSegmnts:
        segments.extend( englishD.suggest( qSeg ) ) # https://stackoverflow.com/a/3789057
    for i, seg in enumerate( segments ):
        if i < Norig:
            value = 1.0
        else:
            value = 0.5
        if seg in loclDist:
            loclDist[ seg ] += value
        else:
            loclDist[ seg ]  = value
    
    repair_keys( loclDist )
    remove_disallowed_entries_by_key( loclDist )
    # pprint( loclDist )

    # Extract Spotify genre from the artist set
    artSetLs = list( artSet )
    Nartists = len( artSetLs )
    artQList = list()
    if Nartists <= _ARTIST_Q_LIM:
        artQList.append( artSetLs )
    else:
        bgn = 0
        end = 0
        while end < Nartists:
            bgn = end
            end = min( end+_ARTIST_Q_LIM, Nartists )
            artQList.append( artSetLs[bgn:end] )

    for qArtLs in artQList:
        response = spot.artists( qArtLs )
        for artist in response['artists']:
            for spGenre in artist['genres']:
                if spGenre not in mainDist:
                    mainDist[ spGenre ]  = 1
                else:
                    mainDist[ spGenre ] += 1

    if len( mainDist ):
        topSpGenre = sort_keys_by_value( mainDist, reverse = True )[0]
    else:
        topSpGenre = _NULL_GENRE

    genreDct['nameSpot'] = topSpGenre

    loclDist = normalize_dist( loclDist )
    mainDist = normalize_dist( mainDist )
    genreDct['nameDist'] = mainDist

    localName = ""
    namLast   = roll_outcome( mainDist )
    if namLast is None:
        namLast = "Music"
        P_last    = 1.0
        running   = True
    else:
        P_last    = mainDist[ namLast ]
        running   = (random() < P_last)
    
    localName += Proper_Namify( roll_outcome( loclDist ) ) + '-' + Proper_Namify( namLast )

    while running:
        namLast = roll_outcome( loclDist )
        P_last  = loclDist[ namLast ]
        running = random() < P_last
        localName += ' ' + Proper_Namify( namLast ) 

    genreDct['nameLocal'] = localName

    print( f"{genreDct['nameLocal']} | {genreDct['nameSpot']} | {genreDct['nameDist']}" )
    
        


def generate_genres_from_track_list( tracks ):
    """ Use DBSCAN to generate clusters based on track vectors, Give them names, Then return as a `dict` """
    # NOTE: This function assumes that `tracks` was built using `fetch_entire_playlist_with_audio_features`                

    print( f"\n########## Extract genre info from {len(tracks)} tracks! ##########\n" )

    ### Compute Clusters ###
    trkVecs = get_tracks_as_vectors( tracks )
    clustrs = DBSCAN( eps = _DBS_EPSILON, min_samples = _DBS_MIN_MMBR ).fit( trkVecs )
    genres  = dict()
    for i, trk_i in enumerate( tracks ):
        lbl_i = clustrs.labels_[i]
        if (lbl_i not in genres):
            genres[ lbl_i ] = {
                'nameSpot' : None, # --- Most prominent Spotify genre across all artists
                'nameDist' : None, # --- Discrete distribution of Spotify genre across all artists
                'nameLocal': None, # --- Humorous (semi-)unique name given to micro-genre
                'tracks'   : [trk_i,], # Tracks that belong to this micro-genre
                'len'      : 1, # ------ Number of identified tracks in the micro-genre
                'vectors'  : None, # --- Vector representation of the tracks
                'origins'  : list(), # Playlist(s) that tracks come from
            }
        else:
            genres[ lbl_i ]['tracks'].append( trk_i )
            genres[ lbl_i ]['len'   ] += 1

    # Erase outliers
    if -1 in genres:
        del genres[-1]
    print( f"Identified {len(genres)} genres in this collection of {len(tracks)} tracks!" )
    # Generate unique keys for genres
    rtnGenres = dict()
    for k, v in genres.items():
        rtnGenres[ str( uuid4() ) ] = v

    ### Compute Center and kdTree for genre ###
    for gnre in rtnGenres.values():
        gnre['vectors'] = get_tracks_as_vectors( gnre['tracks'] )
        cntr = np.mean( gnre['vectors'], axis = 0 )
        dim  = len( cntr )
        for i in range( gnre['len'] ):
            pnt_i   = gnre['vectors'][i,:]
            dist_i  = np.linalg.norm( np.subtract( cntr, pnt_i ) )
            alpha_i = np.exp( -dist_i )
            cntr    = cntr * (1.0 - alpha_i) + pnt_i * alpha_i
        gnre['center'] = cntr # 2024-08-16: This is probably guaranteed to be inside the convex hull
        gnre['kdTree'] = cKDTree( gnre['vectors'] )

        # Generate micro-genre names {Spotify, Distribution, Local}
        extract_and_generate_genre_names( gnre )
    
    print( f"\n########## Genre extraction COMPLETE! ##########\n" )

    return rtnGenres


def fetch_collection_with_audio_features( dataDct, plDct, pause_s = 3.0 ):
    """ Get maximum infodump on all playlists """

    for plName_i, plID_i in plDct.items():
        print( plName_i, '-', plID_i, '...' )
        tracks_i = fetch_entire_playlist_with_audio_features( plID_i )
        dataDct['playlists'][ plName_i ] = {
            'ID'    : plID_i,
            'tracks': tracks_i,
            'len'   : len( tracks_i ),
        }
        sleep( pause_s )


def extract_micro_genres_from_collection( dataDct ):
    """ Extract and merge micro-genres from the entire collection defined by `dataDct` """
    for plName_i, playls_i in dataDct['playlists'].items():
        print( f"\n### Playlist: {plName_i}, {playls_i['ID']} ###\n" )
        bgn_i    = now()
        tracks_i = playls_i['tracks']
        gnres_i  = generate_genres_from_track_list( tracks_i )
        for gnre_j in gnres_i.values():
            gnre_j['origins'] = [playls_i['ID'],]
        dataDct['genres'].update( gnres_i )
        dur_i = now() - bgn_i
        print( f"\nGenre generation from {plName_i} took {dur_i} seconds!\n" )
        check_API_token() # 2024-08-21: This process can take a while on my home machine



## Micro-Genre Creation

In [13]:
fetch_collection_with_audio_features( data, playlist, pause_s = 3.0 )

study01 - 0a2qoe6S7lYeZ6nlhZdA0v ...
study02 - 6gbtR2cBq5PvkghidCvvGk ...
study03 - 3o3lN2qntdEV7UKTuuC77K ...
study04 - 41sFSisljvBDMBXtpp5NIw ...
study05 - 02iS5AFGp8YVuUUqcQf8ys ...
study06 - 6KI7A4MWrSM7EyKRUjxIi1 ...
study07 - 3V055Md2JdrUT8tX0af7di ...
study08 - 0tspdJlwSgiyf2O9PO6QaP ...
study09 - 5mHRBFoQtYy2izeZ66pG95 ...
study10 - 3832xeKGEOAXFJqE4K8kIq ...
study11 - 65MXR4dubPL9t0P4dgTWvn ...
study12 - 0ecSAfnD4CulIVnLt26ukI ...
study13 - 7K9ucByFRgDuZk8KMHeJkL ...


In [17]:
extract_micro_genres_from_collection( data )


### Playlist: study01, 0a2qoe6S7lYeZ6nlhZdA0v ###


########## Extract genre info from 400 tracks! ##########

Identified 8 genres in this collection of 400 tracks!
Cost-Violao | bossa nova | {'bossa nova': 0.125, 'brazilian jazz': 0.125, 'jazz guitar': 0.125, 'samba-jazz': 0.125, 'violao': 0.125, 'classic japanese jazz': 0.125, 'japanese jazz': 0.125, 'speedrun': 0.125}
True-Samba-jazz Wan | chillsynth | {'chillsynth': 0.21739130434782608, 'spacewave': 0.06521739130434782, 'synthwave': 0.10869565217391304, 'bboy': 0.021739130434782608, 'hammond organ': 0.021739130434782608, 'bossa nova': 0.043478260869565216, 'brazilian jazz': 0.043478260869565216, 'samba-jazz': 0.043478260869565216, 'boston metal': 0.021739130434782608, 'modern hard rock': 0.021739130434782608, 'post-metal': 0.021739130434782608, 'progressive doom': 0.021739130434782608, 'sludge metal': 0.021739130434782608, 'stoner metal': 0.021739130434782608, 'stoner rock': 0.021739130434782608, 'filter house': 0.0217391304347826

In [18]:
save_music_database( data )

About to write data/Study-Music-Data_2024-08-22T09:13:39.pkl ...
COMPLETE!


## Merge Micro-Genres into Mini-Genres

In [None]:

def merge_micro_genres_in_db( db ):
    """ Attempt to merge similar genres in the `db` """

    print( f"########## Attempt to merge similar genres in the music database ##########" )
    
    ### Init ###
    Ngenres = len( db['genres'] )
    IDs     = list()
    trees   = list()
    pntLsts = list()
    
    # Gather track vectors
    for k, v in db['genres']:
        IDs.append( k )
        trees.append( v['kdTree'] )
        pntLsts.append( v['vectors'] )
    
    ### Search for merge candidates ###
    mergeLst = list()
    for i, gID_i in enumerate( IDs ):
        pts_i = pntLsts[i]
        for j in range( i+1, Ngenres ):
            gID_j = IDs[j]
            dif_j = list()
            for pnt_k in pts_i:
                dif_j.append( kdt_j.query( pnt_k )[0] )
            if np.mean( dif_j ) < _DBS_EPSILON:
                found = False
                for mrgSet in mergeLst:
                    if ((gID_i in mrgSet) or (gID_j in mrgSet)):
                        mrgSet.add( gID_i )
                        mrgSet.add( gID_j )
                        found = True
                        break
                if not found:
                    mergeLst.append( set([gID_i, gID_j,]) )

    print( f"There are {len(mergeLst)} merge jobs to perform!" )
    
    ### Perform all merge jobs ###
    for mrgJob in mergeLst:
        mDct = {
            'nameSpot' : None, # --- Most prominent Spotify genre across all artists
            'nameDist' : dict(), # --- Discrete distribution of Spotify genre across all artists
            'nameLocal': None, # --- Humorous (semi-)unique name given to micro-genre
            'tracks'   : list(), # Tracks that belong to this micro-genre
            'len'      : 0, # ------ Number of identified tracks in the micro-genre
            'vectors'  : None, # --- Vector representation of the tracks
            'kdTree'   : None, # --- Spatial tree for 'vectors'
            'origins'  : list(), # Playlist(s) that tracks come from
        }
        namSplt = list()
        keys_i = list( mrgJob )
        print( "\nMerge:" )
        for j, key_ij in keys_i:
            if j > 0:
                print( "\t\t-and-" )
            print( f"\t{db['genres'][ key_ij ]['nameLocal']}" )
            
        for mID in mrgJob:
            if mID in db['genres']:
                # Local name components
                namSplt.append( [db['genres'][ mID ]['len'], db['genres'][ mID ]['nameLocal'].split(),] )
                # Tracks
                mDct['tracks'].extend( db['genres'][ mID ]['tracks'] )
                # Vectors
                if mDct['vectors'] is None:
                    mDct['vectors'] = db['genres'][ mID ]['vectors']
                else:
                    mDct['vectors'] = np.vstack( (mDct['vectors'], db['genres'][ mID ]['vectors'],) )
                # Len
                mDct['len'] += db['genres'][ mID ]['len']
                # Origins
                mDct['origins'].extend( db['genres'][ mID ]['origins'] )
                # Name Distribution
                for k, v in db['genres'][ mID ]['nameDist']:
                    if k in mDct['nameDist']:
                        mDct['nameDist'][k] += v * db['genres'][ mID ]['len']
                    else:
                        mDct['nameDist'][k]  = v * db['genres'][ mID ]['len']
                # Delete the merged genre
                del db['genres'][ mID ]
        # Recalc the spatial tree
        mDct['kdTree'] = cKDTree( mDct['vectors'] )
        # Normalize Name Distribution
        mDct['nameDist'] = normalize_dist( mDct['nameDist'] )
        # Choose the top Spotify genre
        if len( mDct['nameDist'] ):
            topSpGenre = sort_keys_by_value( mainDist, reverse = True )[0]
        else:
            topSpGenre = _NULL_GENRE
        mDct['nameSpot'] = topSpGenre
        # Construct a new local name
        lenLst = [item[0] for item in namSplt]
        lenMax = max( lenLst )
        lclNam = ""
        for j in range( lenMax ):
            dice = dict()
            for k, lst_k in enumerate( namSplt ):
                len_k = lenLst[k]
                if j < len_k:
                    dice[ namSplt[k][j] ] = len_k
            token = roll_outcome( dice )
            if j > 0:
                lclNam += ' '
            lclNam += token
        mDct['nameLocal'] = lclNam

        # Store the merged mini-genre
        nuID = str( uuid4() )
        db['genres'][ nuID ] = mDct
        print( f"Merge Complete!: New Mini-Genre {mDct['nameLocal']} ({nuID}) created with {mDct['len']} tracks!" )


def move_mini_genre_outliers_to_better_homes( db ):
    """ Attempt to rehome outlier tracks that were collected during the micro-genre creation and merge """

    ### Search for split candidates ###
    Ngenres = len( db['genres'] )
    IDs     = list()
    trees   = list()
    pntLsts = list()
    dstLsts = list()

    # Gather track vectors
    for k, v in db['genres']:
        IDs.append( k )
        trees.append( v['kdTree'] )
        pntLsts.append( v['vectors'] )

    # Evaluate spread within each mini-genre
    for i, pts_i in enumerate( pntLsts ):
        avgDist = list()
        for j, pnt_j in enumerate( pts_i ):
            dists = list()
            for k, pnt_k in enumerate( pts_i ):
                dists.append( np.linalg.norm( np.subtract( pnt_j, pnt_k ) ) )
            avgDist.append( np.mean( dists ) )
        dstLsts.append( avgDist )

    ## Evaluate relative closeness of every point in a genre to every other genre ##
    # For every mini-genre, do
    for i, gID_i in enumerate( IDs ):
        pts_i = pntLsts[i]
        # For every vector in the mini-genre, do
        for j, pnt_j in enumerate( pts_i ):
            dst_j = dstLsts[i][j]
            dMn_j = 1e6
            gnr_j = None
            # For every other mini-genre, Search for the shortest dist
            for k, gID_k in enumerate( IDs ):
                if i != k:
                    tre_k  = db['genres'][ gID_k ]['kdTree']
                    dst_jk = tre_k.query( pnt_j )[0]
                    if ((dst_jk < dst_j) and (dst_jk < dMn_j)):
                        dMn_j = dst_jk
                        gnr_j = gID_k
            # If a new home was found, then move
            if gnr_j is not None:
                print( f"Moving track {i} of {gID_i} --to-> {gnr_j}" )
                trk_j = db['genres'][ gID_i ]['tracks'][i]
                db['genres'][ gID_i ]['tracks'].pop(i)
                db['genres'][ gnr_j ]['tracks'].append( trk_j )
                db['genres'][ gnr_j ]['changed'] = True
                db['genres'][ gID_i ]['changed'] = True
                
    # For every mini-genre, Recalc vectors if it has changed
    for i, gID_i in enumerate( IDs ):
        if (('changed' in db['genres'][ gID_i ]) and db['genres'][ gID_i ]['changed']):
            # FIXME, START HERE: RECALC VECTOR SHIT
                
            
                        
                


        
        

In [None]:
# FIXME: MERGE GENRES
# FIXME: STORE GENRES

# Search Version 02, Graded by Mini-Genre Proximity

In [None]:
# FIXME: EVALUATE ADDING AUDIO ANALYSIS FEATURES

In [None]:
# tracks = fetch_entire_playlist_with_audio_features( backfill )
# generate_genres_from_track_list( tracks )

# pprint( tracks[0] )

# print( get_track_vector( tracks[0] ) )

In [None]:
# trkData = get_tracks_as_vectors( tracks )

In [None]:
# pca = PCA( n_components = 10 )

# pca.fit( trkData )
# print( pca.explained_variance_ )
# for comp in pca.components_:
#     comp_i = np.abs( comp )
#     print( np.argmax( comp_i ) )
# print( pca.get_params()          )

In [None]:

# print( len(  ) )
# print( np.max( clustering.labels_ ) )