`python3.11 -m pip install jupyterlab PyEnchant scipy numpy spotipy scikit-learn --user`

# Init & Login

In [1]:
import time, pickle, os
from math import ceil
from random import randrange, choice, random
from time import sleep
from pprint import pprint
from uuid import uuid4

import enchant
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from scipy.spatial import cKDTree

from ShuffleSoGood import ( SpotifyClient, update_music_database, load_music_database, 
                            save_music_database, genre_vector_ops, now )

sc = SpotifyClient()
        

TOKEN OBTAINED


In [2]:
# Ask the client to check its API token; the recorded output below reports validity and token age
sc.check_API_token()

TOKEN STILL VALID, AGE: 0.03 MINUTES


## Playlists

In [3]:

# Collection playlists, keyed by a local name -> playlist ID string.
# Passed as `plDct` to `fetch_collection_with_audio_features` when the database is regenerated.
playlist = {
    'study01' : "0a2qoe6S7lYeZ6nlhZdA0v",
    'study02' : "6gbtR2cBq5PvkghidCvvGk",
    'study03' : "3o3lN2qntdEV7UKTuuC77K",
    'study04' : "41sFSisljvBDMBXtpp5NIw",
    'study05' : "02iS5AFGp8YVuUUqcQf8ys",
    'study06' : "6KI7A4MWrSM7EyKRUjxIi1",
    'study07' : "3V055Md2JdrUT8tX0af7di",
    'study08' : "0tspdJlwSgiyf2O9PO6QaP",
    'study09' : "5mHRBFoQtYy2izeZ66pG95",
    'study10' : "3832xeKGEOAXFJqE4K8kIq",
    'study11' : "65MXR4dubPL9t0P4dgTWvn",
    'study12' : "0ecSAfnD4CulIVnLt26ukI",
    'study13' : "7K9ucByFRgDuZk8KMHeJkL",
    'study14' : "0v26bHydUxcGC5EbMlkjzG",
    'study15' : "6SqlfurCBP7eeMOojaDNtS",
}

# Review playlists, keyed by a local name -> playlist ID string.
# Passed as `rvDct` to `fetch_collection_with_audio_features`; `_FILL_PL_ID` also reads 'zh_Over'.
review = {
    'zf_Over' : "5TtKaKCouyJp7Hhtu4YlYm",
    'zg_Over' : "5qX1Tq3IQ74iSgrlMfRhty",
    'zh_Over' : "1bwXMYoRgEFWebWZ9ZPBqs",
}


## Session Database

* FILTER TYPES: {'album', 'artist', 'track', 'year', 'upc', 'tag:hipster', 'tag:new', 'isrc', 'genre',}
* SEARCH TYPES: {"album", "artist", "playlist", "track", "show", "episode", "audiobook",}

In [4]:
##### Session Database Params #####
# Number of seconds in one day
_MOD_T_DAY_S  = 60.0 * 60 * 24
# 31 days expressed in seconds; NOTE(review): not referenced in the visible cells, presumably a staleness cutoff -- confirm
_STALE_TIME_S = _MOD_T_DAY_S * 31
# Fallback genre name used when no Spotify genre can be determined
_NULL_GENRE   = "Music"
# Directory handed to `load_music_database`
_DATA_DIR     = "data/"
# Prefix handed to `init_session_database` and `load_music_database`
_DATA_PREFIX  = "Study-Music-Data_"

In [5]:
from ShuffleSoGood import init_session_database
# Creates the session-wide `data` store that the functions below reach through `global data`;
# `outPath` is later handed to `save_music_database`.
# NOTE(review): the meaning of `timestamp` and `outFilNam` is assumed from their names -- confirm in ShuffleSoGood
data, timestamp, outFilNam, outPath = init_session_database( _DATA_PREFIX )

## Settings

In [6]:

## General Params ##
# NOTE(review): `_N_BACKFILL` and `_N_RM_CHUNK` are not referenced in the visible cells -- confirm their users
_N_BACKFILL  = 400
# Number of tracks per clustering chunk in `extract_micro_genres_from_collection`
_N_GN_CHUNK  = 100 # 50 # 200
_N_RM_CHUNK  = 125 #500
# Upper bound on the number of extra name parts appended in `extract_and_generate_genre_names`
_NAME_PT_LIM =   6

## Search 01 Params ##
# Default `N` for `search_artist_within_era`
_N_MAX_SEARCH = 50
# NOTE(review): not referenced in the visible cells
_N_DEF_SEARCH = 10
# Default `yearPadding` for `search_artist_within_era`
_YEAR_PADDING =  5

## DBSCAN Params ##
# `eps` passed to DBSCAN; also the numerator of the merge threshold in `merge_micro_genres_in_db`
_DBS_EPSILON  =  0.7500 # 0.200 # 0.300 # 0.400 # 0.500 # 0.750
# `min_samples` passed to DBSCAN
_DBS_MIN_MMBR =  3

## Mini-Genre Params ##
# Genres merge when the mean nearest-neighbor distance is below `_DBS_EPSILON/_MRG_D_FACTOR`
_MRG_D_FACTOR =  1.25 # 1.0 # 1.5 # 2.0 #3.0
# NOTE(review): not referenced in the visible cells
_MIN_GNR_MMBR =  7
# Stand-in "very large" value used where a genre has no vectors
_ONE_BILLION  = 1e9

## Query History Keys ##
# NOTE(review): none of these keys are referenced in the visible cells; presumably keys into `data['queries']` -- confirm
_NU_REL_Q_KEY = "NewReleases:Albums" # Query Key for New Releases
_FEAT_PL_Q_KY = "Featured:Playlists" # Query Key for Featured Playlists
_ART_Q_PREFIX = "ArtistTopTracks:" # - Query Key Prefix for Artist Top Tracks




# Function Selection

In [7]:
# When True, the cell below replaces `data` with the result of `load_music_database`
_LOAD_DB_PKL = False
# When True, playlists are re-fetched, the vector spread is printed, the database is saved, and micro-genres are rebuilt
_REGEN_DBASE = False
# NOTE(review): not referenced in the visible cells; presumably toggles a backfill search -- confirm
_SEARCH_BKFL = False
# NOTE(review): not referenced in the visible cells; presumably the playlist that receives new tracks -- confirm
_FILL_PL_ID  = review['zh_Over']
# When True, the "Audio Analysis Test" cell runs
_TEST_ADV    = True

# Query Functions

In [8]:
def get_playlist_length( playlist_ID ):
    """ Return the `total` field of the playlist-tracks response for `playlist_ID` """
    # Same request the paging code uses; only the 'total' field of the reply is read here
    request = dict(
        fields = 'items,uri,name,id,total',
        limit  = sc._RESPONSE_LIMIT,
    )
    return sc.spot.user_playlist_tracks( sc.CLIENT_ID, playlist_ID, **request )['total']
    

# Search Version 01, Query by Existing Artist and Year

In [9]:

def search_artist_within_era( artistName, releaseDate, 
                              N = _N_MAX_SEARCH, yearPadding = _YEAR_PADDING, pause_s = 0.5 ):
    """ Return up to `N` tracks by `artistName` released within `yearPadding` years of `releaseDate`

    artistName : Artist name used for the `artist` search filter
    releaseDate: Date whose first four characters are read as the year; falls back to 2024 if unparsable
    N          : Maximum number of tracks to return
    yearPadding: Years searched on either side of the release year (0 searches only that year)
    pause_s    : Seconds to sleep after every search request

    Each query's running offset is stored in `data['queries']` so repeated calls page deeper,
    and tracks whose ID is already in `data['collectID']` are dropped from the results.
    """
    global data
    rtnLs = list()
    # NOTE(review): the query is percent-encoded by hand; confirm the client does not URL-encode it a second time
    query = "artist%3A" + str( artistName ).replace( " ", "%20")
    try:
        rYear = int( str( releaseDate )[:4] )
    except ValueError:
        rYear = 2024
    yearPadding = abs( yearPadding ) # A negative padding would otherwise give an empty list of years
    bYear   = rYear - yearPadding
    eYear   = rYear + yearPadding
    years   = list( range( bYear, eYear+1 ) )
    span    = max( eYear - bYear, 1 ) # `yearPadding == 0` would otherwise divide by zero
    miniLim = max( int(N/span), 1 )
    Nloop   = int(N / miniLim * 2)
    for i in range( Nloop ):
        iYear = choice( years )
        qry_i = query + "%20year%3A" + str( iYear )
        print( f"Search: {qry_i}" )

        # Continue from where the last identical query stopped, and advance the stored offset
        ofst = 0
        if (data is not None):
            ofst = data['queries'].get( qry_i, 0 )
            data['queries'][ qry_i ] = ofst + miniLim
        
        # NOTE(review): `_MAX_OFFSET` is not defined in the visible cells -- confirm it exists before this runs
        res = sc.spot.search( qry_i, 
                           limit  = miniLim, 
                           offset = min( ofst, _MAX_OFFSET ), 
                           type   = 'track' )
        sleep( pause_s )
        
        tracks_i = res['tracks']['items']
        
        # Keep only tracks that are not already part of the collection
        if (data is not None):
            tracks_i = [trk_j for trk_j in tracks_i if trk_j['id'] not in data['collectID']]

        rem = N - len( rtnLs )
        if len( tracks_i ) > rem:
            rtnLs.extend( tracks_i[:rem] )
            return rtnLs
        rtnLs.extend( tracks_i )
    return rtnLs


def choose_N_artist_year_pairs_from_db( N ):
    """ Fetch `N` random (<Artist>, <Date>) pairs from the `data` for searching

    Only artists that have a stored name are eligible, since the name becomes the search term.
    Returns an empty list when there are no eligible artists.
    A date is drawn from the artist's recorded releases, or '2024' when none are recorded.
    """
    global data
    # Entries created as genre-lookup placeholders start with `name = None`; they cannot be searched by name
    artList = [artKey for artKey, entry in data['artists'].items() if entry.get( 'name' )]
    print( f"How Many Artists?: {len(artList)}" )
    rtnPairs = list()
    if not artList:
        return rtnPairs # `choice` raises on an empty sequence
    for _ in range(N):
        entry_i    = data['artists'][ choice( artList ) ]
        releases_i = entry_i.get( 'releases' ) or list()
        rlYear_i   = choice( releases_i ) if len( releases_i ) else '2024'
        rtnPairs.append( (entry_i['name'], rlYear_i,) )
    return rtnPairs


def basic_new_music_search_01( Ntot, Mper = 5, pause_s = 0.125 ):
    """ Search around randomly chosen (artist, date) pairs from `data` until `Ntot` distinct tracks are gathered, asking for at most `Mper` per artist """  
    global data
    found   = []
    seenIDs = set()
    # Draw twice as many pairs as strictly needed, since some searches come back short
    nPairs  = 2 * int( ceil( Ntot/Mper ) )
    for artist, relDate in choose_N_artist_year_pairs_from_db( nPairs ):
        print( f"\tSearch, Artist: {artist}, Around Year: {relDate}" )
        needed = Ntot - len( found )
        Mper   = min( Mper, needed )
        # Keep only tracks not already picked during this call
        fresh  = []
        for cand in search_artist_within_era( artist, relDate, N = Mper, yearPadding = 3, pause_s = 0.5 ):
            if cand['id'] in seenIDs:
                continue
            fresh.append( cand )
            seenIDs.add( cand['id'] )
        if len( fresh ) >= needed:
            found.extend( fresh[ :needed ] )
            break
        found.extend( fresh )
        sleep( pause_s )
        sc.check_API_token()
    return found


# Helper Functions

In [10]:

########## CONTAINER FUNCTIONS #####################################################################

def sort_keys_by_value( dct, reverse = True ):
    """ List the keys of `dct` ordered by their (numeric) values, largest first unless `reverse` is False """
    ranked = sorted( dct.items(), key = lambda kv: kv[1], reverse = reverse )
    return [key for key, _ in ranked]



########## STRING ANALYSIS #########################################################################
from collections import deque

def levenshtein_dist( s1, s2 ):
    """ Get the edit distance between two strings

    Counts the minimum number of single-character insertions, deletions, and substitutions
    that turn `s1` into `s2`. Two empty strings have distance 0.
    """
    # Author: Salvador Dali, https://stackoverflow.com/a/32558749
    # Keep `s1` as the shorter string so each row is as short as possible
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    # prevRow[k] = distance between the first k chars of `s1` and the prefix of `s2` consumed so far
    prevRow = list( range( len(s1) + 1 ) )
    for i2, c2 in enumerate( s2 ):
        currRow = [i2 + 1]
        for i1, c1 in enumerate( s1 ):
            if c1 == c2:
                currRow.append( prevRow[i1] )
            else:
                currRow.append( 1 + min( prevRow[i1], prevRow[i1 + 1], currRow[-1] ) )
        # Each row must build on the one before it; without this hand-off the result is wrong
        prevRow = currRow
    return prevRow[-1]


########## STATS & SAMPLING ########################################################################


def total_pop( odds ):
    """ Add up the weights of every category in `odds` """
    return sum( odds[category] for category in odds )


def normalize_dist( odds_ ):
    """ Return a new dict holding each weight of `odds_` divided by the sum of all weights """
    denom = total_pop( odds_ )
    return { category: odds_[category] / denom for category in odds_ }


def roll_outcome( odds ):
    """ Get a random outcome from the distribution

    odds: dict of outcome -> non-negative weight (normalized internally)
    Returns one key of `odds` drawn in proportion to its weight, or `None` when `odds` is empty.
    """
    oddsNorm = normalize_dist( odds )
    roll     = random()
    total    = 0.0
    lastSeen = None
    for outcome, prob in oddsNorm.items():
        if prob <= 0.0:
            continue # A zero-weight outcome must never be drawn
        total += prob
        if roll < total:
            return outcome
        lastSeen = outcome
    # Rounding can leave the final cumulative sum a hair under 1.0; the roll then belongs to the last outcome
    return lastSeen


# Search Version 03, More Dimensions from Audio Analysis

In [11]:
from collections import defaultdict
from ShuffleSoGood import get_tracks_as_vectors, _V_NUM_FEATURES


def zip_dict_sorted_by_decreasing_value( dct ):
    """ List the (key, value) pairs of `dct`, largest value first """
    return sorted( dct.items(), key = lambda pair: pair[1], reverse = True )


"""
'segments': [{'confidence': 0.0,
               'duration': 0.20726,
               'loudness_end': 0.0,
               'loudness_max': -18.031,
               'loudness_max_time': 0.02426,
               'loudness_start': -24.279,
               'pitches': [0.004,
                           0.005,
                           0.01,
                           0.15,
                           1.0,
                           0.131,
                           0.009,
                           0.003,
                           0.008,
                           0.002,
                           0.002,
                           0.015],
               'start': 0.0,
               'timbre': [39.821,
                          28.025,
                          -99.44,
                          -42.135,
                          58.998,
                          -47.306,
                          7.856,
                          12.332,
                          -19.522,
                          13.344,
                          -3.278,
                          10.72]},
"""
    
    
def audio_analysis_summary( trkAudioDct ):
    """ Provide a summary of the audio analysis in order to provide extra sound dimensions

    trkAudioDct: audio-analysis dict with 'sections', 'bars', and 'beats' lists
    Returns a dict with:
        "SECT_DUR"     : mean section duration
        "BAR_DUR"      : mean bar duration
        "KEY_SPREAD"   : summed absolute difference between the (up to) three most heavily weighted keys
        "BEAT_STRENGTH": mean beat confidence
    Any mean taken over an empty list is reported as 0.0 instead of raising.
    """
    # FIXME: USE TIMBRE INFORMATION!

    def mean_or_zero( values ):
        """ Arithmetic mean of `values`, or 0.0 when there are none """
        return (sum( values ) / len( values )) if len( values ) else 0.0

    ## Get Key Spread && Section Length ##
    trkKeys = defaultdict( int )
    sctDurs = list()
    for sctn in trkAudioDct['sections']:
        sctDurs.append( sctn['duration'] )
        # Sections confidently flagged with mode 1 are shifted half a step so they rank as a separate key
        mode    = -0.5 if ((sctn['mode'] == 1) and (sctn['mode_confidence'] > 0.5)) else 0.0
        wght_i  = sctn['confidence'] * sctn['key_confidence']
        trkKeys[ sctn['key']+mode ] += wght_i
    SECT_DUR = mean_or_zero( sctDurs ) # Scalar for how often the song "shifts"
    # (key, weight) pairs, heaviest first
    keyRank  = sorted( trkKeys.items(), key = lambda pair: pair[1], reverse = True )
    # Scalar for how much keys "move"
    if len( keyRank ) >= 3:
        KEY_SPREAD = abs( keyRank[0][0] - keyRank[1][0] ) + abs( keyRank[1][0] - keyRank[2][0] ) 
    elif len( keyRank ) == 2:
        KEY_SPREAD = abs( keyRank[0][0] - keyRank[1][0] )
    else:
        KEY_SPREAD = 0.0
    
    ## Get Bar Duration ##
    # Possibly analogous to tempo
    BAR_DUR = mean_or_zero( [bar['duration'] for bar in trkAudioDct['bars']] )

    ## Get Beat Strength ##
    # Use confidence in beats as an overall measure of how "sharp" the beats are
    BEAT_STRENGTH = mean_or_zero( [beat['confidence'] for beat in trkAudioDct['beats']] )

    return {
        "SECT_DUR"      : SECT_DUR,
        "BAR_DUR"       : BAR_DUR,
        "KEY_SPREAD"    : KEY_SPREAD,
        "BEAT_STRENGTH" : BEAT_STRENGTH,
    }
    

def fetch_entire_playlist_with_audio_features( playlist_ID, pause_s = 0.05 ):
    """ Get maximum infodump on all playlist tracks

    playlist_ID: ID of the playlist to download
    pause_s    : Seconds to sleep between paged requests (one tenth of it between audio-analysis requests)

    Works in three passes: (1) page through the playlist items, (2) fetch audio features in chunks,
    (3) fetch the audio analysis of every track and store its summary on the track.
    Returns the list of playlist items, each updated in place with its audio-features fields plus
    'sectionDur', 'barDuration', 'keySpread', and 'beatSharp'.
    Raises `RuntimeError` if an audio-features reply does not line up with the requested IDs.
    """
    # 2024-11-03: Summarized the audio analysis

    Ntracks   = get_playlist_length( playlist_ID )
    resTracks = list()

    ## Pass 1: Basic playlist items ##
    while len( resTracks ) < Ntracks:

        lim = min( sc._RESPONSE_LIMIT, Ntracks-len( resTracks ) )

        response = sc.spot.user_playlist_tracks(
            sc.CLIENT_ID, 
            playlist_ID, 
            fields = 'items,uri,name,id,total', 
            limit  = lim,
            offset = len( resTracks )
        )
        
        sleep( pause_s )
        print('>',end=' ',flush=True)

        # An empty page means there is nothing more to fetch; without this the loop never ends
        if not len( response['items'] ):
            break
        resTracks += response['items']

    print( f"\tObtained basic track info for {len(resTracks)} tracks!" )

    ## Pass 2: Audio features, requested in chunks ##
    # Size the remaining passes by what was actually received, which may be fewer than the reported total
    Ntracks   = len( resTracks )
    resIDs    = [item['track']['id'] for item in resTracks]
    resFeatrs = list()

    for bgn in range( 0, Ntracks, sc._RESPONSE_LIMIT ):

        idChunk  = resIDs[ bgn : bgn+sc._RESPONSE_LIMIT ]
        response = sc.spot.audio_features( idChunk )
        sleep( pause_s )

        # Features are matched to tracks by position, so a short or missing reply cannot be used
        if (response is None) or (len( response ) != len( idChunk )):
            raise RuntimeError( f"Audio features reply does not match the {len(idChunk)} requested tracks at offset {bgn}" )
        resFeatrs += response

        print('>',end=' ',flush=True)

    print( f"\tObtained detailed track info for {len(resFeatrs)} tracks!" )

    ## Pass 3: Audio analysis summary per track ##
    for i, track_i in enumerate( resTracks ):
        # NOTE(review): assumes every features entry is a dict; confirm whether the API can return `None` for a track
        track_i.update( resFeatrs[i] )
        # The 'id' key read here is supplied by the features entry merged in above
        details   = sc.spot.audio_analysis( track_i['id'] )
        sleep( pause_s/10.0 )
        audioSmry = audio_analysis_summary( details )
        track_i['sectionDur']  = audioSmry["SECT_DUR"]
        track_i['barDuration'] = audioSmry["BAR_DUR"]
        track_i['keySpread']   = audioSmry["KEY_SPREAD"]
        track_i['beatSharp']   = audioSmry["BEAT_STRENGTH"]
        if( (i+1)%sc._RESPONSE_LIMIT == 0 ):
            print('>',end=' ',flush=True)

    print( f"\tObtained detailed audio info for {len(resFeatrs)} tracks!" )

    return resTracks


def assign_vectors_to_tracks( tracks ):
    """ Attach each row of `get_tracks_as_vectors( tracks )` to the matching track under the 'vector' key """
    for idx, vector in enumerate( get_tracks_as_vectors( tracks ) ):
        tracks[idx]['vector'] = vector
    print( f"Stored vectors for {len(tracks)} tracks!" )


def fetch_collection_with_audio_features( plDct, rvDct = None, pause_s = 3.0, renewSets = True ):
    """ Get maximum infodump on all playlists

    plDct    : dict of name -> playlist ID for the collection; results go to `data['playlists']`
    rvDct    : optional dict of name -> playlist ID for review lists; results go to `data['review']`
    pause_s  : Seconds to sleep after each playlist
    renewSets: When True, `data['collectID']` is emptied before fetching

    Also catalogs every artist credited on a collection track in `data['artists']`
    (name, appearance count, and release dates).
    """
    global data

    # NOTE(review): only 'collectID' is reset here; confirm whether 'reviewID' should be renewed as well
    if renewSets:
        data['collectID'] = set([])
    
    print( "##### Get Collection Data #####" )
    for plName_i, plID_i in plDct.items():
        print( plName_i, '-', plID_i, '...' )
        tracks_i = fetch_entire_playlist_with_audio_features( plID_i )
        assign_vectors_to_tracks( tracks_i )
        data['playlists'][ plName_i ] = {
            'ID'    : plID_i,
            'tracks': tracks_i,
            'len'   : len( tracks_i ),
        }
        data['collectID'] = data['collectID'].union( set([trk['track']['id'] for trk in tracks_i]) )

        # Catalog artists from collection tracks (Req'd for (Sub-)Search 01)
        for track_j in tracks_i:
            relDate_j = track_j['track']['album']['release_date']
            for artist_k in track_j['track']['artists']:
                artistID_k = artist_k['id']
                if artistID_k not in data['artists']:
                    data['artists'][ artistID_k ] = { 
                        # Each ID is stored under its own artist's name, not the track's first-listed artist
                        'name'    : artist_k['name'], 
                        'count'   : 1, 
                        'releases': [relDate_j,], 
                    }
                else:
                    data['artists'][ artistID_k ]['count'   ] += 1
                    data['artists'][ artistID_k ]['releases'].append( relDate_j )
        
        sleep( pause_s )

    if rvDct is not None:
        print( "\n##### Get Review Data #####" )
        for plName_i, plID_i in rvDct.items():
            print( plName_i, '-', plID_i, '...' )
            tracks_i = fetch_entire_playlist_with_audio_features( plID_i )
            assign_vectors_to_tracks( tracks_i )
            data['review'][ plName_i ] = {
                'ID'    : plID_i,
                'tracks': tracks_i,
                'len'   : len( tracks_i ),
            }
            data['reviewID'] = data['reviewID'].union( set([trk['track']['id'] for trk in tracks_i]) )
            sleep( pause_s )

    print( "\n##### Complete #####" )


def analyze_db_vector_spread( ):
    """ Find the per-feature minimum and maximum over every stored track vector, print them, and return them

    The returned array has two rows: row 0 holds the minima and row 1 the maxima.
    """
    global data
    # Start from sentinels that any real value will replace
    bounds = np.ones( (2, _V_NUM_FEATURES,) )
    bounds[0,:] *=  1e6
    bounds[1,:] *= -1e6
    # Collection and review lists are scanned together
    everyList = dict()
    everyList.update( data['playlists'] )
    everyList.update( data['review'   ] )
    for entry in everyList.values():
        if not len( entry['tracks'] ):
            continue
        vecs = get_tracks_as_vectors( entry['tracks'] )
        lows = np.min( vecs, axis = 0 )
        tops = np.max( vecs, axis = 0 )
        for col in range( _V_NUM_FEATURES ):
            bounds[0,col] = lows[col] if (lows[col] < bounds[0,col]) else bounds[0,col]
            bounds[1,col] = tops[col] if (tops[col] > bounds[1,col]) else bounds[1,col]
    for col in range( _V_NUM_FEATURES ):
        print( f"Feature {col+1}, Span: {bounds[1,col] - bounds[0,col]}, Min: {bounds[0,col]}" )
    return bounds
    

In [12]:
# Optionally replace the fresh session database with one loaded from `_DATA_DIR`
if _LOAD_DB_PKL:
    data = load_music_database( data, _DATA_DIR, _DATA_PREFIX, forceLoad = 1 );
    

In [13]:
# Optionally re-download every collection and review playlist into `data`
if _REGEN_DBASE:
    fetch_collection_with_audio_features( playlist, review, pause_s = 3.0 )
    

In [14]:
# After a regeneration: print the per-feature spread (for manual scaling updates), then save the database
if _REGEN_DBASE:
    analyze_db_vector_spread(  );
    save_music_database( data, outPath )

# Audio Analysis Test

In [15]:
# Smoke test: download one playlist, then print the raw audio analysis of its first track and the summary
if _TEST_ADV:
    from pprint import pprint
    plTracks = fetch_entire_playlist_with_audio_features( playlist['study01'] )
    track    = plTracks[0]
    details  = sc.spot.audio_analysis( track['id'] )
    pprint( details )
    pprint( audio_analysis_summary( details ) )

> > > > 	Obtained basic track info for 400 tracks!
> > > > 	Obtained detailed track info for 400 tracks!
> > > > 	Obtained detailed audio info for 400 tracks!
{'bars': [{'confidence': 0.468, 'duration': 1.6651, 'start': 0.02346},
          {'confidence': 0.519, 'duration': 1.67591, 'start': 1.68856},
          {'confidence': 0.353, 'duration': 1.67899, 'start': 3.36447},
          {'confidence': 0.412, 'duration': 1.68212, 'start': 5.04345},
          {'confidence': 0.8, 'duration': 1.68041, 'start': 6.72557},
          {'confidence': 0.361, 'duration': 1.67592, 'start': 8.40598},
          {'confidence': 0.639, 'duration': 1.68096, 'start': 10.0819},
          {'confidence': 0.719, 'duration': 1.67707, 'start': 11.76286},
          {'confidence': 0.483, 'duration': 1.67465, 'start': 13.43993},
          {'confidence': 0.539, 'duration': 1.68066, 'start': 15.11458},
          {'confidence': 0.322, 'duration': 1.67617, 'start': 16.79524},
          {'confidence': 0.52, 'duration': 1.675

# Segment Micro-Genres && Give Them Names

In [16]:

def remove_disallowed_entries_by_key( dct ):
    """ Remove segments unsuitable for a genre name

    Mutates `dct` in place and returns nothing. Keys are visited in a snapshot of their original order:
      * keys of two characters or fewer (after stripping) are deleted,
      * keys containing an apostrophe are deleted,
      * keys whose edit distance to a blocked word is under 25% of that word's length are deleted,
      * for every key that survives, all LATER keys within 25% edit distance of it
        (relative to either string's length) are deleted as near-duplicates.
    All comparisons use the lowercased string form of the key.
    """
    # 2024-09-02: THERE ARE RACIAL SLURS IN THE PyEnchant MODULE! WHY?!? (Also: HITLER)
    blocked = ['soundtrack', 'for', 'nigger', 'gypsy', 'Hitler', 'klan',] 
    qKeys   = list( dct.keys() ) # Snapshot, so `dct` can be modified while we walk the keys
    qLen    = len( qKeys )
    delSet  = set([]) # NOTE(review): never read or written below
    difFrac = 0.25 # Keys closer than this fraction of a string's length count as "too similar"
    for i, k in enumerate( qKeys ):
        kLo     = str(k).lower()
        removed = False
        # Remove short keys
        if (len( kLo.strip() ) <= 2) and (k in dct):
            del dct[k]
            removed = True
        # Remove keys with apostrophes
        elif ("'" in kLo) and (k in dct):
            del dct[k]
            removed = True
        # Remove keys too similar to the blocked list
        else:
            for blkd in blocked:
                if ((levenshtein_dist( kLo, blkd ) / len( blkd )) < difFrac) and (k in dct):
                    del dct[k]
                    removed = True
                    break
        # If the current key is suitable, Then remove all following keys that are too similar
        # NOTE: `removed` is only set when this iteration deleted `k`; a key already deleted as an
        #       earlier key's near-duplicate still reaches this loop
        if not removed:
            for j in range( i+1, qLen ):
                key_j    = qKeys[j]
                if (key_j in dct):
                    key_jLo  = str( key_j ).lower()
                    distance = levenshtein_dist( kLo, key_jLo )
                    # Distance is judged relative to each string's own length; either side may trigger removal
                    fracRght = distance / len( key_jLo )
                    fracLeft = distance / len( kLo     )
                    if (fracRght < difFrac) or (fracLeft < difFrac):
                        if key_j in dct:
                            del dct[ key_j ]
                        

def repair_keys( dct ):
    """ Replace keys that have characters we don't like

    Mutates `dct` in place: every key containing '(', ')', ':' or '"' is re-stored under the same
    text with those characters stripped, keeping its value. If the cleaned key already exists its
    value is overwritten; if nothing is left after cleaning, the entry is dropped.
    """
    badChars = ['(', ')', ':', '"']
    for key in list( dct.keys() ):
        oldKey = str( key )
        nuKey  = oldKey
        for ch in badChars:
            # `str.replace` returns a new string; the result has to be kept
            nuKey = nuKey.replace( ch, '' )
        if nuKey == oldKey:
            continue
        value = dct.pop( key )
        # A key made only of bad characters has no usable text left
        if len( nuKey ):
            dct[ nuKey ] = value


def Proper_Namify( namStr ):
    """ Split on whitespace, give each piece a leading capital and lowercase remainder, and rejoin with single spaces """
    pieces = str( namStr ).split()
    return ' '.join( piece[0].upper() + piece[1:].lower() for piece in pieces )
    

def extract_and_generate_genre_names( genreDct, pause_s = 0.25 ):
    """ Extract Spotify genre and Generate local genre

    genreDct: micro-genre dict with a 'tracks' list; receives 'nameSpot', 'nameDist', and 'nameLocal'
    pause_s : Seconds to sleep after each artist lookup request

    'nameSpot'  is the genre listed most often across the album artists (or `_NULL_GENRE`).
    'nameDist'  is the normalized distribution of those genres.
    'nameLocal' is a randomly assembled name drawn from words in artist/album/track names and their
                spelling suggestions, joined to one genre drawn from 'nameDist'.
    Artist names and genres fetched here are cached in `data['artists']`.
    """
    global data
    mainDist = dict()
    loclDist = dict()
    artSet   = set([])
    englishD = enchant.Dict( "en_US" )
    segments = list()
    # Get artist info and local genre candidate substrings
    for track in genreDct['tracks']:
        # Gather artist IDs
        for artist in track['track']['album']['artists']:
            artSet.add( artist['id'] )
            segments.extend( artist['name'].split() )
        # Gather naming strings
        segments.extend( track['track']['album']['name'].split() )
        segments.extend( track['track']['name'].split() )
    Norig = len( segments )
    for qSeg in segments[:Norig]:
        segments.extend( englishD.suggest( qSeg ) ) # https://stackoverflow.com/a/3789057
    # Words taken from the names count fully, spelling suggestions count half
    for i, seg in enumerate( segments ):
        value = 1.0 if (i < Norig) else 0.5
        loclDist[ seg ] = loclDist.get( seg, 0.0 ) + value
    
    repair_keys( loclDist )
    remove_disallowed_entries_by_key( loclDist )

    # Extract Spotify genre from the artist set, looking artists up in limited-size chunks
    # NOTE(review): `_ARTIST_Q_LIM` is not defined in the visible cells -- confirm it exists before this runs
    artSetLs = list( artSet )
    for bgn in range( 0, len( artSetLs ), _ARTIST_Q_LIM ):
        qArtLs = artSetLs[ bgn : bgn+_ARTIST_Q_LIM ]

        # Only artists without cached genres need a request
        nuLst = list()
        for qArtist in qArtLs:
            if (qArtist not in data['artists']):
                data['artists'][ qArtist ] = { 
                    'name'    : None, 
                    'count'   : 0, 
                    'releases': list(), 
                    'genres'  : list(), 
                }
            if ('genres' not in data['artists'][ qArtist ]):
                data['artists'][ qArtist ]['genres'] = list()
            if (len( data['artists'][ qArtist ]['genres'] ) == 0):
                nuLst.append( qArtist )

        if len( nuLst ):
            sc.check_API_token() # 2024-09-01: WHY DOES IT KEEP FAILING HERE IT HAS NOT BEEN AN HOUR
            response = sc.spot.artists( nuLst )
            for artist in response['artists']:
                if artist is None:
                    continue # No usable entry came back for this artist
                qArtist = artist['id']
                data['artists'][ qArtist ]['name'  ] = artist['name'  ]
                data['artists'][ qArtist ]['genres'] = artist['genres']
            sleep( pause_s )

        for qArtist in qArtLs:
            for spGenre in data['artists'][ qArtist ]['genres']:
                mainDist[ spGenre ] = mainDist.get( spGenre, 0 ) + 1

    if len( mainDist ):
        topSpGenre = sort_keys_by_value( mainDist, reverse = True )[0]
    else:
        topSpGenre = _NULL_GENRE

    genreDct['nameSpot'] = topSpGenre

    loclDist = normalize_dist( loclDist )
    mainDist = normalize_dist( mainDist )
    genreDct['nameDist'] = mainDist

    # Last name part: one Spotify genre drawn from the distribution
    namLast = roll_outcome( mainDist )
    if namLast is None:
        namLast = _NULL_GENRE
        running = True
    else:
        P_last  = max( 0.333, mainDist[ namLast ] )
        running = (random() < P_last)
    
    # First name part: one local word; fall back when every candidate word was filtered out
    namFrst = roll_outcome( loclDist )
    if namFrst is None:
        namFrst = _NULL_GENRE
    localName = Proper_Namify( namFrst ) + '-' + Proper_Namify( namLast )

    # Optionally keep appending local words, up to `_NAME_PT_LIM` of them
    nPart = 0
    while running:
        namLast = roll_outcome( loclDist )
        if namLast is None:
            break # No local words are available to append
        P_last  = max( 0.333, loclDist[ namLast ] )
        running = random() < min( P_last*10.0, 0.95 )
        localName += ' ' + Proper_Namify( namLast ) 
        nPart += 1
        if nPart >= _NAME_PT_LIM:
            break

    genreDct['nameLocal'] = localName

    print( f"{genreDct['nameLocal']} | {genreDct['nameSpot']} | {genreDct['nameDist']}" )


def generate_genres_from_track_list( tracks ):
    """ Cluster the track vectors with DBSCAN, turn every non-outlier cluster into a named genre `dict`, and return them keyed by fresh UUID strings """
    # NOTE: This function assumes that `tracks` was built using `fetch_entire_playlist_with_audio_features`                
    global data
    print( f"\n########## Extract genre info from {len(tracks)} tracks! ##########\n" )

    ### Compute Clusters ###
    model   = DBSCAN( eps = _DBS_EPSILON, min_samples = _DBS_MIN_MMBR ).fit( get_tracks_as_vectors( tracks ) )
    byLabel = dict()
    for idx, track in enumerate( tracks ):
        label = model.labels_[idx]
        if label not in byLabel:
            byLabel[ label ] = {
                'nameSpot' : None, # --- Most prominent Spotify genre across all artists
                'nameDist' : None, # --- Discrete distribution of Spotify genre across all artists
                'nameLocal': None, # --- Humorous (semi-)unique name given to micro-genre
                'tracks'   : list(), #-- Tracks that belong to this micro-genre
                'len'      : 0, # ------ Number of identified tracks in the micro-genre
                'vectors'  : None, # --- Vector representation of the tracks
                'origins'  : list(), # Playlist(s) that tracks come from
            }
        byLabel[ label ]['tracks'].append( track )
        byLabel[ label ]['len'   ] += 1

    # Label -1 is the group of outliers; discard it
    byLabel.pop( -1, None )
    print( f"Identified {len(byLabel)} genres in this collection of {len(tracks)} tracks!" )

    # Swap the cluster labels for unique string keys
    rtnGenres = { str( uuid4() ): genre for genre in byLabel.values() }

    ### Per-genre vector ops, then naming {Spotify, Distribution, Local} ###
    for genre in rtnGenres.values():
        genre_vector_ops( genre )
        extract_and_generate_genre_names( genre )
    
    print( f"\n########## Genre extraction COMPLETE! ##########\n" )

    return rtnGenres


def extract_micro_genres_from_collection( eraseExisting = True ):
    """ Build micro-genres for every playlist in `data['playlists']`, one chunk of `_N_GN_CHUNK` tracks at a time, and store them in `data['genres']` """
    global data
    
    if eraseExisting:
        data['genres'] = dict()
    
    for plName, plEntry in data['playlists'].items():
        print( f"\n### Playlist: {plName}, {plEntry['ID']} ###\n" )
        started  = now()
        plTracks = plEntry['tracks']
        nTracks  = len( plTracks )
        for bgn in range( 0, nTracks, _N_GN_CHUNK ):
            end = min( bgn+_N_GN_CHUNK, nTracks )
            print( f"\n## Chunk[ {bgn}:{end} ] ##\n" )
            
            chunkGenres = generate_genres_from_track_list( plTracks[ bgn:end ] )
            
            # Record which playlist every new genre came from
            for genre in chunkGenres.values():
                genre['origins'] = [plEntry['ID'],]
                
            data['genres'].update( chunkGenres )
            sc.check_API_token() # 2024-08-21: This process can take a while on my home machine
            
        elapsed = now() - started
        print( f"\nGenre generation from {plName} took {elapsed/60.0} minutes!\n" )
        

## Micro-Genre Creation

In [17]:
import cProfile

# Rebuild the micro-genres whenever the database was regenerated
if _REGEN_DBASE:
    # Profiling variant, kept for reference: cProfile.run("extract_micro_genres_from_collection( data )")
    extract_micro_genres_from_collection( )

## Merge Micro-Genres into Mini-Genres

In [18]:


def merge_micro_genres_in_db( ):
    """ Attempt to merge similar genres in the `data`

    Genre A is linked to a later genre B when the mean distance from A's vectors to their nearest
    neighbor in B's tree is below `_DBS_EPSILON/_MRG_D_FACTOR`. Linked genres are grouped
    transitively, so every genre belongs to at most one merge job. Each job replaces its members in
    `data['genres']` with a single new entry (fresh UUID key) holding the combined tracks, vectors,
    origins, a length-weighted genre distribution, and a name assembled from the members' names.
    """
    global data
    print( f"########## Attempt to merge similar genres in the music database ##########" )
    
    ### Init ###
    IDs     = list()
    trees   = list()
    pntLsts = list()
    
    # Gather track vectors
    for k, v in data['genres'].items():
        IDs.append( k )
        trees.append( v['kdTree'] )
        pntLsts.append( v['vectors'] )
    Ngenres   = len( IDs )
    mergeDist = _DBS_EPSILON/_MRG_D_FACTOR
    
    ### Search for merge candidates ###
    mergeLst = list()
    for i, gID_i in enumerate( IDs ):
        pts_i = pntLsts[i]
        if (pts_i is None) or (len( pts_i ) <= 1):
            continue
        for j in range( i+1, Ngenres ):
            kdt_j = trees[j]
            if kdt_j is None:
                continue # A genre without a tree cannot be measured against
            gID_j = IDs[j]
            dif_j = [kdt_j.query( pnt_k )[0] for pnt_k in pts_i]
            if (np.mean( dif_j ) < mergeDist):
                # Union every existing group that touches either genre, so groups never overlap
                linked = [mrgSet for mrgSet in mergeLst if ((gID_i in mrgSet) or (gID_j in mrgSet))]
                merged = set([gID_i, gID_j,])
                for mrgSet in linked:
                    merged |= mrgSet
                mergeLst = [mrgSet for mrgSet in mergeLst if all( (mrgSet is not lnk) for lnk in linked )]
                mergeLst.append( merged )

    print( f"There are {len(mergeLst)} merge jobs to perform!" )
    
    ### Perform all merge jobs ###
    for mrgJob in mergeLst:
        keys_i = [mID for mID in mrgJob if mID in data['genres']]
        if len( keys_i ) < 2:
            continue # Nothing to merge; never store an empty or single-member "merge"
        mDct = {
            'nameSpot' : None, # --- Most prominent Spotify genre across all artists
            'nameDist' : dict(), # --- Discrete distribution of Spotify genre across all artists
            'nameLocal': None, # --- Humorous (semi-)unique name given to micro-genre
            'tracks'   : list(), # Tracks that belong to this micro-genre
            'len'      : 0, # ------ Number of identified tracks in the micro-genre
            'vectors'  : None, # --- Vector representation of the tracks
            'kdTree'   : None, # --- Spatial tree for 'vectors'
            'origins'  : list(), # Playlist(s) that tracks come from
        }
        namSplt = list()
        print( "\nMerge:" )
        for j, key_ij in enumerate( keys_i ):
            if j > 0:
                print( "\t\t-and-" )
            print( f"\t{data['genres'][ key_ij ]['nameLocal']}" )
            
        for mID in keys_i:
            src = data['genres'][ mID ]
            # Local name components
            namSplt.append( [src['len'], str( src['nameLocal'] or _NULL_GENRE ).split(),] )
            # Tracks
            mDct['tracks'].extend( src['tracks'] )
            # Vectors
            if src['vectors'] is not None:
                if mDct['vectors'] is None:
                    mDct['vectors'] = src['vectors']
                else:
                    mDct['vectors'] = np.vstack( (mDct['vectors'], src['vectors'],) )
            # Len
            mDct['len'] += src['len']
            # Origins
            mDct['origins'].extend( src['origins'] )
            # Name Distribution, weighted by how many tracks each member contributes
            for k, v in (src['nameDist'] or dict()).items():
                mDct['nameDist'][k] = mDct['nameDist'].get( k, 0.0 ) + v * src['len']
            # Delete the merged genre
            del data['genres'][ mID ]
        if (mDct['vectors'] is not None) and (len( mDct['vectors'] ) > 1):
            # Recalc the spatial tree
            mDct['kdTree'] = cKDTree( mDct['vectors'], balanced_tree = False, compact_nodes = False )
        # Normalize Name Distribution
        mDct['nameDist'] = normalize_dist( mDct['nameDist'] )
        # Choose the top Spotify genre
        if len( mDct['nameDist'] ):
            topSpGenre = sort_keys_by_value( mDct['nameDist'], reverse = True )[0]
        else:
            topSpGenre = _NULL_GENRE
        mDct['nameSpot'] = topSpGenre
        
        # Construct a new local name: for each word position, draw one member's word weighted by member size
        lenLst = [len( item[1] ) for item in namSplt]
        if len( lenLst ) > 0:
            lenMax = max( lenLst )
            lclNam = ""
            for j in range( lenMax ):
                dice = dict()
                for (mag_k, lst_k) in namSplt:
                    if j < len( lst_k ):
                        dice[ lst_k[j] ] = mag_k
                token = roll_outcome( dice )
                if j > 0:
                    lclNam += ' '
                lclNam += (token if (token is not None) else _NULL_GENRE)
            mDct['nameLocal'] = lclNam
        else:
            mDct['nameLocal'] = _NULL_GENRE

        # Store the merged mini-genre
        nuID = str( uuid4() )
        data['genres'][ nuID ] = mDct
        print( f"Merge Complete!: New Mini-Genre {mDct['nameLocal']} ({nuID}) created with {mDct['len']} tracks!" )


def move_mini_genre_outliers_to_better_homes( ):
    """ Attempt to rehome outlier tracks that were collected during the micro-genre creation and merge 

    Works on the global `data['genres']` in three passes:
        1. For every vector of every mini-genre, compute its mean distance to all vectors of the same mini-genre.
        2. For every vector, query the kd-tree of every OTHER mini-genre; when the nearest foreign vector is closer 
           than that mean distance, move the track to the closest such mini-genre.
        3. Call `genre_vector_ops` on every mini-genre, print a report, and delete every mini-genre whose 
           'len' is below `_MIN_GNR_MMBR`.

    Returns nothing; `data['genres']` is modified in place.
    """
    global data
    
    ### Search for split candidates ###
    # NOTE(review): `Ngenres` and `trees` are filled in but never read again in this function
    Ngenres = len( data['genres'] )
    IDs     = list()
    trees   = list()
    pntLsts = list()
    dstLsts = list()

    # Gather the ID, kd-tree, and vector collection of every mini-genre (parallel lists, same order)
    for k, v in data['genres'].items():
        IDs.append( k )
        trees.append( v['kdTree'] )
        pntLsts.append( v['vectors'] )

    # Evaluate spread within each mini-genre: mean distance from each point to all points (itself included)
    for i, pts_i in enumerate( pntLsts ):
        avgDist = list()
        if pts_i is not None:
            for j, pnt_j in enumerate( pts_i ):
                dists = list()
                for k, pnt_k in enumerate( pts_i ):
                    dists.append( np.linalg.norm( np.subtract( pnt_j, pnt_k ) ) )
                avgDist.append( np.mean( dists ) )
            dstLsts.append( avgDist )
        else:
            # Placeholder keeps `dstLsts` aligned with `IDs`; never indexed because such genres are skipped below
            dstLsts.append( _ONE_BILLION )

    ## Evaluate relative closeness of every point in a genre to every other genre ##
    # For every mini-genre, do
    for i, gID_i in enumerate( IDs ):
        if pntLsts[i] is None:
            continue
        # Work on copies so that popping moved entries does not disturb the stored vectors
        # NOTE(review): `.tolist()` implies 'vectors' is a NumPy array here -- confirm
        pts_i = pntLsts[i][:].tolist()
        dst_i = dstLsts[i][:]
        # For every vector in the mini-genre, do
        # (manual index: `j` only advances when the current entry stays, because a move pops index `j`)
        j = 0 
        while (j < len( pts_i )):
            pnt_j = pts_i[j]
            dst_j = dst_i[j]
            dMn_j = 1e6   # Best (smallest) foreign distance found so far
            gnr_j = None  # ID of the mini-genre that owns that best distance
            # For every other mini-genre, Search for the shortest dist
            for k, gID_k in enumerate( IDs ):
                if i != k:
                    tre_k  = data['genres'][ gID_k ]['kdTree']
                    if tre_k is not None:
                        # Distance to the nearest vector of mini-genre k
                        dst_jk = tre_k.query( pnt_j )[0]
                        # Candidate only if closer than this point's mean distance to its own mini-genre
                        if ((dst_jk < dst_j) and (dst_jk < dMn_j)):
                            dMn_j = dst_jk
                            gnr_j = gID_k
            # If a new home was found, then move
            if gnr_j is not None:
                print( f"Moving track {j} of  {data['genres'][ gID_i ]['nameLocal']}  --to->  {data['genres'][ gnr_j ]['nameLocal']}" )
                # NOTE(review): assumes 'tracks'[j] and 'vectors'[j] describe the same track -- confirm
                trk_j = data['genres'][ gID_i ]['tracks'][j]
                data['genres'][ gID_i ]['tracks'].pop(j)
                pts_i.pop(j)
                dst_i.pop(j)
                # Only the track list changes here; the destination kd-tree is not rebuilt during this pass
                data['genres'][ gnr_j ]['tracks'].append( trk_j )
                data['genres'][ gnr_j ]['changed'] = True
                data['genres'][ gID_i ]['changed'] = True
            else:
                j += 1
                
    # For every mini-genre, Recalc vectors
    # NOTE(review): this runs for ALL mini-genres, the 'changed' flag is not consulted -- confirm intended
    for i, gID_i in enumerate( IDs ):
        genre_vector_ops( data['genres'][ gID_i ] )
        data['genres'][ gID_i ]['changed'] = False
            

    print( "\n########## Genre Report ##########\n" )
    print( f"Filter {len( data['genres'] )} to filter...\n" )
    lstDel = list()
    
    # Report mini-genres that are large enough, Collect the rest for deletion
    for gID, genre in data['genres'].items():
        if genre['len'] >= _MIN_GNR_MMBR:
            print( f"{genre['nameLocal']}, {genre['len']}" )
        else:
            lstDel.append( gID )

    # Deletion happens after the scan so the dict is not resized while being iterated
    print( "Deleting ...", end = " " )
    for gID in lstDel:
        print( gID, end = ", " )
        del data['genres'][ gID ]
        
    print( f"\nDeleted {len( lstDel )} mini-genres, {len( data['genres'] )} remain" )
    print( "\n########## Report Complete ##########\n" )
            

In [19]:
# Only when `_REGEN_DBASE` is set: merge micro-genres, rehome outliers, then save the database to `outPath`
if _REGEN_DBASE:
    merge_micro_genres_in_db( )
    move_mini_genre_outliers_to_better_homes( )
    save_music_database( data, outPath )

# Search Version 02, Graded by Mini-Genre Proximity

## Sub-Searches

In [20]:

def get_tracks_from_new_releases( N, div = 12, pause_s = 0.25 ):
    """ Get `N` tracks from newly-released albums """
    global data
    
    totTrks = list()
    Nalbums = int( ceil( N / div ) )
    albOfst =  0
    count   =  0
    Niter   =  0
    itrLim  = 20

    while count < N: 

        Niter += 1

        if Niter > itrLim:
            break
        
        if data is not None:
            if _NU_REL_Q_KEY in data['queries']:
                albOfst = data['queries'][ _NU_REL_Q_KEY ]
                data['queries'][ _NU_REL_Q_KEY ] += Nalbums
            else:
                data['queries'][ _NU_REL_Q_KEY ] = Nalbums
        
        response = sc.spot.new_releases( limit = Nalbums, offset = albOfst )
        nuAlbums = [item['id'] for item in response['albums']['items']]
        sleep( pause_s )
        
        for albumID in nuAlbums:
            res    = sc.spot.album_tracks( albumID, limit = 50, offset = 0 )
            tracks = res['items']
            totTrks.extend( tracks )
            count += len( tracks )
            sleep( pause_s )

    return totTrks


def get_recommended_tracks_from_db( N_tracks, pause_s = 0.25 ):
    """ Fetch up to `N_tracks` Spotify recommendations seeded from randomly chosen mini-genres

    Each request is seeded from one random mini-genre that has tracks: two artist IDs, two track IDs, and 
    one genre name drawn with `roll_outcome` from the mini-genre's 'nameDist'.

    Args:
        N_tracks: Number of recommended tracks wanted.
        pause_s: Seconds to sleep after every API request.

    Returns:
        List of track dicts taken from the 'tracks' of each recommendations response. The list is shorter than 
        `N_tracks` (possibly empty) when no mini-genre has tracks or the API keeps returning nothing.
    """
    global data
    maxStall = 5 # Give up after this many consecutive responses without a single track
    nStall   = 0
    count    = 0
    rtnLst   = list()
    # Mini-genres without tracks cannot supply seeds
    gIDs     = [gID for gID, gnre in data['genres'].items() if len( gnre['tracks'] )]

    if not gIDs:
        return rtnLst

    while count < N_tracks:
        recArt = list()
        recTrk = list()
        recGnr = list()
        gnre   = data['genres'][ choice( gIDs ) ]
        # Five seeds in total: i = 0,3 -> artist;  i = 2,4 -> track;  i = 1 -> genre
        for i in range( 5 ):
            trk_i = choice( gnre['tracks'] )
            art_i = choice( trk_i['track']['album']['artists'] )
            if i % 3 == 0:
                recArt.append( art_i['id'] )
            elif i % 2 == 0:    
                recTrk.append( trk_i['track']['id'] )
            else:
                gnr_i = roll_outcome( gnre['nameDist'] )
                # A None outcome is not a usable seed, so it is left out of the request
                if gnr_i is not None:
                    recGnr.append( gnr_i )
                
        nFetch = min( sc._RESPONSE_LIMIT, N_tracks - count )
        if nFetch <= 0:
            break

        response = sc.spot.recommendations(
            seed_artists = recArt, 
            seed_genres  = recGnr, 
            seed_tracks  = recTrk, 
            limit        = nFetch
        )
        sleep( pause_s )
        tracks = response['tracks']
        count += len( tracks )
        rtnLst.extend( tracks )

        # Without this guard, a run of empty responses would keep the loop (and the API calls) going forever
        if len( tracks ):
            nStall = 0
        else:
            nStall += 1
            if nStall >= maxStall:
                break
    return rtnLst
        

def get_tracks_from_related_artists( N_tracks, pause_s = 0.25 ):
    """ Attempt to get fresh tracks from Spotify given artists currently in the collection 

    Each pass picks a random album artist from a random non-empty playlist, asks Spotify for related artists, 
    takes the top tracks of every related artist not queried before, then requests recommendations seeded 
    with the related artists in groups of five.

    Args:
        N_tracks: Number of tracks wanted. Top tracks are not trimmed, so more than this can be returned.
        pause_s: Seconds to sleep after every API request.

    Returns:
        List of track dicts from the top-tracks and recommendations responses. The list is shorter than 
        `N_tracks` (possibly empty) when all playlists are empty or several passes in a row add nothing.

    Side effects:
        Every related artist whose top tracks were fetched is marked in `data['queries']` under 
        `_ART_Q_PREFIX + artistID`.
    """
    global data
    skipN    = 5 # Artists per recommendations request
    maxStall = 5 # Give up after this many consecutive passes that add no track
    nStall   = 0
    count    = 0
    rtnLst   = list()
    # Empty playlists cannot supply a seed artist
    plTrks   = [pl['tracks'] for pl in data['playlists'].values() if len( pl['tracks'] )]

    if not plTrks:
        return rtnLst

    while count < N_tracks:
        cntBgn = count
        seed   = choice( choice( plTrks ) )
        artist = choice( seed['track']['album']['artists'] )['id']
        res    = sc.spot.artist_related_artists( artist )
        artLst = [item['id'] for item in res['artists']]
        Nart   = len( artLst )
        sleep( pause_s )

        # For each artist, Get top tracks (once per artist, remembered in the query log)
        for art_i in artLst:
            qArtist = _ART_Q_PREFIX + str( art_i )
            if qArtist not in data['queries']:
                res = sc.spot.artist_top_tracks( art_i )
                sleep( pause_s )
                count += len( res['tracks'] )
                rtnLst.extend( res['tracks'] )
                data['queries'][ qArtist ] = 1

        # For each group of artists, Get recommendations
        for bgn in range( 0, Nart, skipN ):
            nFetch = min( sc._RESPONSE_LIMIT, max( N_tracks-count, 0 ) )
            if nFetch <= 0:
                break
            response = sc.spot.recommendations(
                seed_artists = artLst[ bgn:bgn+skipN ], 
                limit        = nFetch
            )
            sleep( pause_s )
            tracks = response['tracks']
            count += len( tracks )
            rtnLst.extend( tracks )

        # Without this guard, passes that add nothing (all artists cached, no recommendations) would repeat forever
        if count > cntBgn:
            nStall = 0
        else:
            nStall += 1
            if nStall >= maxStall:
                break

    return rtnLst


def get_tracks_from_featured_playlists( N_tracks, pause_s = 0.25 ):
    """ Attempt to get fresh tracks from Spotify featured playlists, one at a time 

    Args:
        N_tracks: Number of tracks wanted. Whole playlists are taken, so more than this can be returned.
        pause_s: Seconds to sleep after every featured-playlists request.

    Returns:
        List of whatever `fetch_entire_playlist_with_audio_features` returns for each playlist, concatenated. 
        The list is shorter than `N_tracks` (possibly empty) when Spotify has no more featured playlists.

    Side effects:
        `data['queries'][ _FEAT_PL_Q_KY ]` holds the offset of the next unseen featured playlist. It advances 
        once per playlist actually returned.
    """
    global data
    step    = 1
    count   = 0
    totTrks = list()
    while count < N_tracks:
        ofst     = data['queries'].get( _FEAT_PL_Q_KY, 0 )
        response = sc.spot.featured_playlists( limit = step, offset = ofst )
        sleep( pause_s )
        items    = response['playlists']['items']
        # An empty page means the featured list is exhausted; without this exit the loop would never end
        if not len( items ):
            break
        data['queries'][ _FEAT_PL_Q_KY ] = ofst + step
        tracks = fetch_entire_playlist_with_audio_features( items[0]['id'] )
        count += len( tracks )
        totTrks.extend( tracks )
    return totTrks
    

## Search and Filter

In [21]:
from ShuffleSoGood import get_track_vector, _MIN_LEN_S

def add_audio_features_to_track_list( tracks, pause_s = 0.25 ):
    """ Merge Spotify audio features into every item of `tracks` and attach its vector

    Features are requested in batches of `sc._RESPONSE_LIMIT` IDs, with a `pause_s` second sleep after each 
    batch. An item whose features come back as None is left untouched (no 'vector' key is added).

    Returns the same `tracks` list that was passed in; its dicts are updated in place.
    """
    total = len( tracks )
    if not total:
        return tracks

    ## Get Features, one batch of IDs at a time ##
    features = list()
    start    = 0
    while start < total:
        stop   = min( start + sc._RESPONSE_LIMIT, total )
        idBtch = [item['id'] for item in tracks[ start:stop ]]
        features.extend( sc.spot.audio_features( idBtch ) )
        sleep( pause_s )
        start = stop

    ## Add Features and Vectors ##
    for idx, track in enumerate( tracks ):
        feature = features[ idx ]
        if feature is None:
            continue
        track.update( feature )
        track['vector'] = get_track_vector( track )

    return tracks


def filter_short_and_explicit_tracks( qTracks ):
    """ Remove short and explicit (guaranteed vocal) songs and Return filtered tracks 

    A track is dropped when its 'duration_ms' converts to fewer than `_MIN_LEN_S` seconds, or when its 
    'explicit' flag is truthy (a missing flag counts as not explicit).

    Args:
        qTracks: List of track dicts; each must carry 'duration_ms'. The list itself is not modified.

    Returns:
        New list holding the surviving tracks in their original order.
    """
    # Single pass instead of repeated `pop`; only the keys the test needs are read, so items without 'id' pass
    return [
        track for track in qTracks
        if not ( (track['duration_ms']/1000.0 < _MIN_LEN_S) or track.get( 'explicit', False ) )
    ]


def filter_collected_and_reviewed_tracks( qTracks ):
    """ Drop tracks that are already collected, already reviewed, or repeated within `qTracks`

    Membership is looked up by 'id' in `data['collectID']` and `data['reviewID']`. Of several items sharing 
    an ID only the first is kept. `qTracks` is not modified; a new list is returned in the original order.
    """
    global data
    seenIDs = set()
    keepers = list()
    for track in qTracks:
        trackID = track['id']
        # All three tests are evaluated for every track
        reasons = (
            trackID in data['collectID'],
            trackID in data['reviewID' ],
            trackID in seenIDs,
        )
        if any( reasons ):
            continue
        seenIDs.add( trackID )
        keepers.append( track )
    return keepers


def guided_multi_search_version_02( N_tracks ):
    """ Run every sub-search, filter the hits, and Return the `N_tracks` hits closest to any mini-genre

    Steps:
        1. Ask each of the five sub-searches for `N_tracks` tracks and pool the results.
        2. Add audio features and vectors, then drop short, explicit, collected, reviewed, and repeated tracks.
        3. Drop tracks that have no 'vector', since they cannot be graded.
        4. For every remaining track, find the smallest kd-tree distance over all mini-genres that have a 
           kd-tree and store it as 'minGenreDist' with the owning genre ID as 'mini-genre_ID'. With no usable 
           mini-genre these stay at `_ONE_BILLION` and `_NULL_GENRE`.
        5. Sort ascending by 'minGenreDist'.

    Args:
        N_tracks: Number of tracks requested from each sub-search and the maximum number returned.

    Returns:
        List of at most `N_tracks` track dicts, nearest first.
    """
    global data
    print( "\n########## Guided Multi-Search, Version_02 ##########\n" )
    
    ### Search && Gather ###
    print( f"Search for {N_tracks} new releases ..." )
    tracks = get_tracks_from_new_releases( N_tracks, div = 12,  pause_s = 0.25 )
    
    print( f"Search for {N_tracks} recommendations ..." )
    tracks.extend( get_recommended_tracks_from_db( N_tracks, pause_s = 0.25 ) )
    
    print( f"Search for {N_tracks} tracks from artists similar to collection ..." )
    tracks.extend( get_tracks_from_related_artists( N_tracks, pause_s = 0.25 ) )
    
    print( f"Search for {N_tracks} tracks in Spotify featured playlists ..." )
    tracks.extend( get_tracks_from_featured_playlists( N_tracks, pause_s = 0.25 ) )
    
    print( f"Search for {N_tracks} tracks using Version 01 ..." )
    tracks.extend( basic_new_music_search_01( N_tracks, Mper = 5, pause_s = 0.125 ) )
    
    print( f"Multi-Search COMPLETE: Retreived {len(tracks)} tracks!\n" )

    ### Filter ###
    print( f"About to filter {len(tracks)} tracks ..." )
    tracks = add_audio_features_to_track_list( tracks, pause_s = 0.25 )
    tracks = filter_short_and_explicit_tracks( tracks )
    tracks = filter_collected_and_reviewed_tracks( tracks )

    # Tracks whose audio features were unavailable never received a 'vector' and cannot be graded
    graded = [trk_i for trk_i in tracks if trk_i.get( 'vector' ) is not None]
    nSkip  = len( tracks ) - len( graded )
    if nSkip:
        print( f"Skipping {nSkip} tracks without a vector ..." )
    tracks = graded

    # A mini-genre can exist without a kd-tree; those cannot answer distance queries
    trees = [
        (gnID_j, mGenre_j['kdTree']) 
        for gnID_j, mGenre_j in data['genres'].items() 
        if mGenre_j.get( 'kdTree' ) is not None
    ]
    
    ### Rank by Ascending Distance to Nearest Mini-Genre ###
    print( f"About to rank {len(tracks)} tracks ..." )
    for trk_i in tracks:
        dMin_i = _ONE_BILLION
        gnre_i = _NULL_GENRE
        for gnID_j, tree_j in trees:
            dist_j = tree_j.query( trk_i['vector'] )[0]
            if dist_j < dMin_i:
                dMin_i = dist_j
                gnre_i = gnID_j
        trk_i['minGenreDist' ] = dMin_i
        trk_i['mini-genre_ID'] = gnre_i
    tracks.sort( key = lambda item: item['minGenreDist'] )
    print( "\n########## Multi-Search 02: COMPLETE ##########\n" )

    ### Return Top Hits ###
    return tracks[ :N_tracks ]


def refill_playlist_with_new_tracks_02( plID, Ntot = _N_BACKFILL, pause_s = 0.50 ):
    """ Top off the playlist with new tracks 

    Args:
        plID: ID of the playlist to fill.
        Ntot: Desired playlist length; nothing is added when the playlist already holds this many tracks.
        pause_s: Seconds to sleep after every add-tracks request.

    Side effects:
        Adds up to `Ntot - current length` tracks found by `guided_multi_search_version_02` to the playlist, 
        in batches of `sc._RESPONSE_LIMIT`, then saves the global `data` to `outPath`. Returns nothing.
    """
    global data, outPath
    plLen = get_playlist_length( plID )
    if Ntot <= plLen:
        print( "No room for new tracks!" )
        return

    nRem = Ntot - plLen
    print( f"About to add {nRem} tracks ..." )
    addID = [item['id'] for item in guided_multi_search_version_02( nRem )]
    Nadd  = len( addID )
    bgn   = 0
    while bgn < Nadd:
        end = min( bgn + sc._RESPONSE_LIMIT, Nadd )
        sc.spot.user_playlist_add_tracks( sc.CLIENT_ID, plID, addID[ bgn:end ] )
        # Space out consecutive write requests; `pause_s` was previously accepted but never applied
        sleep( pause_s )
        bgn = end
    save_music_database( data, outPath ) # Save queries we made
    print( "Playlist refill complete!" )
        

# Genre Cleanup

In [22]:
from random import shuffle


def set_genre_membership( ):
    """ Give every genre a 'trackIDs' set and add the ID of each of its current tracks to it

    An existing 'trackIDs' set is kept and only extended; IDs already in it are never removed here.
    """
    global data
    for genre in data['genres'].values():
        members = genre.setdefault( 'trackIDs', set([]) )
        members.update( track['id'] for track in genre['tracks'] )
    

def get_homeless_tracks( ):
    """ Return, in shuffled order, every playlist track whose ID is in no mini-genre's 'trackIDs' set

    Membership sets are refreshed with `set_genre_membership` first. A track listed in several playlists 
    appears once per listing. The number of tracks found is printed.
    """
    global data
    set_genre_membership( )
    strays = list()
    for playlist in data['playlists'].values():
        for track in playlist['tracks']:
            trackID = track['id']
            housed  = any( trackID in genre['trackIDs'] for genre in data['genres'].values() )
            if not housed:
                strays.append( track )
    shuffle( strays )
    print( f"Found {len(strays)} unaffiliated tracks!" )
    return strays


def genretize_remainder( chunkSize = _N_RM_CHUNK ):
    """ Try to generate affliations for free tracks 

    The tracks returned by `get_homeless_tracks` are cut into consecutive chunks of at most `chunkSize`. Each 
    chunk is passed to `generate_genres_from_track_list`, every resulting genre is tagged with 
    'origins' = ['FreeTracks',], and the result is merged into `data['genres']`.

    Args:
        chunkSize: Maximum number of tracks per genre-generation run; must be at least 1.

    Raises:
        ValueError: When there are free tracks but `chunkSize` is below 1 (this used to loop forever).
    """
    global data
    freeTrks = get_homeless_tracks( )
    NfreTrks = len( freeTrks )

    if not NfreTrks:
        return
    if chunkSize < 1:
        raise ValueError( f"chunkSize must be at least 1, got {chunkSize}" )

    for bgn in range( 0, NfreTrks, chunkSize ):
        tracks_i = freeTrks[ bgn:bgn+chunkSize ]
        bgn_i    = now()
        gnres_i  = generate_genres_from_track_list( tracks_i )
        for gnre_j in gnres_i.values():
            gnre_j['origins'] = ['FreeTracks',]
        data['genres'].update( gnres_i )
        dur_i = now() - bgn_i
        print( f"\nGenre generation from {len(tracks_i)} tracks took {dur_i/60.0} minutes!\n" )
        sc.check_API_token() # 2024-08-21: This process can take a while on my home machine


def recruit_remainder( ):
    """ Attempt to move free tracks to already-generated mini-genres 

    For every track returned by `get_homeless_tracks`, each mini-genre that has a kd-tree is asked for its 
    vectors within `_DBS_EPSILON` of the track's vector. The track joins the mini-genre with the highest 
    membership score, or stays free when no mini-genre has at least `_DBS_MIN_MMBR` vectors in range.

    Side effects:
        Appends recruited tracks to `data['genres'][...]['tracks']`, reruns `genre_vector_ops` on every 
        mini-genre that gained a track, refreshes membership sets, and prints progress and totals.
    """
    global data
    freeTrks = get_homeless_tracks( )
    totCount = 0

    def membership_score( qPnt, kNNpts ):
        """ Return a score that expresses cluster membership, Higher is better, -1 means not a member """
        if (len(kNNpts) < _DBS_MIN_MMBR):
            return -1
        score = 0
        for p_i in kNNpts:
            d_i    = np.linalg.norm( np.subtract( qPnt, p_i ) )
            score += np.exp( -d_i ) # The closer it is, the larger this number    
        return score

    for i, track_i in enumerate( freeTrks ):
        if (i%100==0):
            print( '.', end='', flush=1 )
        hiScore = -1
        hiGenre = None
        
        for gnreID_j, genre_j in data['genres'].items():
            tree_j = genre_j.get( 'kdTree' )
            # A mini-genre can exist without a kd-tree; it has no neighborhood to test against
            if tree_j is None:
                continue
            nn_j = tree_j.query_ball_point( track_i['vector'], _DBS_EPSILON )
            nn_j = [genre_j['vectors'][ index ] for index in nn_j]
            sc_j = membership_score( track_i['vector'], nn_j )
            # Strictly greater than -1, so a "not a member" score can never win
            if sc_j > hiScore:
                hiScore = sc_j
                hiGenre = gnreID_j
                
        if (hiGenre is not None):
            data['genres'][ hiGenre ]['tracks' ].append( track_i )
            data['genres'][ hiGenre ]['changed'] = True
            totCount += 1
    print()

    for gnre in data['genres'].values():         
        if gnre.get( 'changed', False ):
            print( f"Recalculating  {gnre['nameLocal']}  ..." )
            genre_vector_ops( gnre )
            gnre['changed'] = False

    set_genre_membership()
    print( f"Found homes for {totCount} free tracks!" )
    get_homeless_tracks() # Called for its printed count of tracks that are still free


def populate_releases( ):
    """ Make sure releases are populated 

    For every playlist in `data['playlists']`: vectors are assigned to its tracks with 
    `assign_vectors_to_tracks`, then every artist credited on every track is catalogued in `data['artists']` 
    with that artist's own name, a running appearance count, and the release dates of the albums seen.

    NOTE: Counts and release lists accumulate on every call, so calling this twice on the same data 
          doubles them.
    """
    global data
    for plName_i, plyLst_i in data['playlists'].items():
        print( plName_i, '-', plyLst_i['ID'], '...' )
        tracks_i = plyLst_i['tracks']
        assign_vectors_to_tracks( tracks_i )

        # Catalog artists from collection tracks (Req'd for (Sub-)Search 01)
        for track_j in tracks_i:
            for artist_k in track_j['track']['artists']:
                artistID_k = artist_k['id']
                release_j  = track_j['track']['album']['release_date']
                if artistID_k not in data['artists']:
                    data['artists'][ artistID_k ] = { 
                        # Each artist is stored under its own name, not the name of the track's first artist
                        'name'    : artist_k['name'], 
                        'count'   : 1, 
                        'releases': [release_j,], 
                    }
                else:
                    data['artists'][ artistID_k ]['count'   ] += 1
                    data['artists'][ artistID_k ]['releases'].append( release_j )

In [23]:
# Only when `_REGEN_DBASE` is set: build genres from unaffiliated tracks, merge, rehome outliers, 
# recruit whatever is still free into existing mini-genres, then save the database to `outPath`
if _REGEN_DBASE:
    genretize_remainder( chunkSize = _N_RM_CHUNK )
    merge_micro_genres_in_db( )
    move_mini_genre_outliers_to_better_homes( )
    recruit_remainder(  )
    save_music_database( data, outPath ) # Save queries we made

In [24]:
# Only when `_SEARCH_BKFL` is set: check the API token, then catalog the artists found in the playlists
if _SEARCH_BKFL:
    sc.check_API_token()
    populate_releases()

In [25]:
# Only when `_SEARCH_BKFL` is set: check the API token, then top off playlist `_FILL_PL_ID` to `_N_BACKFILL` tracks
if _SEARCH_BKFL:
    sc.check_API_token()
    refill_playlist_with_new_tracks_02( _FILL_PL_ID, Ntot = _N_BACKFILL )

In [26]:
# Only when `_SEARCH_BKFL` is set: save the database to `outPath` once more
if _SEARCH_BKFL:
    save_music_database( data, outPath ) # Save queries we made