# Generate recommendations for your Spotify Playlists

### Connecting to your Spotify account
* Follow instruction [here](https://towardsdatascience.com/extracting-song-data-from-the-spotify-api-using-python-b1e79388d50) for getting your own Spotify Developer API credentials
* To connect to your Spotify account, see [Client Credentials Flow](https://spotipy.readthedocs.io/en/2.19.0/#client-credentials-flow) for authentication

**Note:** when calling the `spotipy` API, if you receive the error message below, re-running the cell should fix

`ConnectionError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))`

### Spotipy ref
* [spotipy docs](https://spotipy.readthedocs.io/en/2.19.0/#welcome-to-spotipy), 
* [github examples](https://github.com/plamere/spotipy/tree/master/examples),  
* [source code](https://github.com/plamere/spotipy/blob/master/spotipy/client.py#L20)

### REQUIRED:

* In your repo, create `spotipy_secret_creds.py`,  
* assign file to `.gitignore`
* define the variables below,

```
SPOTIPY_CLIENT_ID='YOUR_CLIENT_ID'
SPOTIPY_CLIENT_SECRET='YOUR_CLIENT_SECRET'
SPOTIFY_USERNAME='YOUR_USERNAME'
```

## TODOs

* reformat notebook (e.g., existing endpoints used in notebook defined up front?)

In [1]:
# !pip install spotipy --user

In [1]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"LOCATION: {LOCATION}")

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
LOCATION: us-central1


### pip & package

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth
import re
from tqdm import tqdm

import pandas as pd
import json
from io import BytesIO
from pprint import pprint
import os

from google.cloud import storage

pd.set_option('display.max_columns', 100)

### Setup Clients

In [3]:
import spotipy_secret_creds as creds

os.environ['GOOGLE_CLOUD_PROJECT'] = PROJECT_ID
os.environ['SPOTIPY_CLIENT_ID'] = creds.SPOTIPY_CLIENT_ID
os.environ['SPOTIPY_CLIENT_SECRET'] = creds.SPOTIPY_CLIENT_SECRET
os.environ['SPOTIFY_USERNAME'] = creds.SPOTIFY_USERNAME

SPOTIPY_CLIENT_ID=creds.SPOTIPY_CLIENT_ID
SPOTIPY_CLIENT_SECRET=creds.SPOTIPY_CLIENT_SECRET
SPOTIFY_USERNAME=creds.SPOTIFY_USERNAME

MAX_SEED_LENGTH = 5

In [4]:
# Authenticate
client_credentials_manager = SpotifyClientCredentials(
    client_id=creds.SPOTIPY_CLIENT_ID, 
    client_secret=creds.SPOTIPY_CLIENT_SECRET
)

sp = spotipy.Spotify(
    client_credentials_manager = client_credentials_manager,

)

### helper functions

TODO - put in utils file?

In [5]:
def get_track_features(
    track_uri, 
    count, 
    playlist_uri, 
    n_songs_pl, 
    num_artists_pl, 
    num_albums_pl,
    track_durations
):
    
    # Authenticate
    client_credentials_manager = SpotifyClientCredentials(
        client_id=creds.SPOTIPY_CLIENT_ID, 
        client_secret=creds.SPOTIPY_CLIENT_SECRET
    )

    sp = spotipy.Spotify(
        client_credentials_manager = client_credentials_manager,

    )
    
    feature_dict = {}
    
    feature_dict['num_pl_songs_new'] = float(n_songs_pl)
    feature_dict['num_pl_artists_new'] = float(num_artists_pl)
    feature_dict['num_pl_albums_new'] = float(num_albums_pl)
    feature_dict['duration_ms_songs_pl'] = float(track_durations)
    
    playlist_features = sp.playlist(playlist_uri)
    feature_dict['pl_name_src'] = playlist_features['name']
    # feature_dict['description_pl'] = playlist_features['description']
    feature_dict['pl_collaborative_src'] = str(playlist_features['collaborative']).lower()
    
    track_meta = sp.track(track_uri)
    # capture track metadata
    # feature_dict['track_pos'] = count
    feature_dict['track_uri_pl'] = track_uri
    feature_dict['track_name_pl'] = track_meta['name']
    feature_dict['duration_ms_songs_pl'] = float(track_meta['duration_ms'])
    feature_dict['track_pop_pl'] = float(track_meta['popularity'])
    feature_dict['album_name_pl'] = track_meta['album']['name']
    feature_dict['album_uri_pl'] = track_meta['album']['uri']
    # feature_dict['album_release_date'] = track_meta['album']['release_date']
    feature_dict['artist_name_pl'] = track_meta['album']['artists'][0]['name']
    feature_dict['artist_uri_pl'] = track_meta['album']['artists'][0]['uri']

    artist_meta = sp.artist(feature_dict['artist_uri_pl'])
    # capture artist metadata
    feature_dict['artists_followers_pl'] = float(artist_meta['followers']['total'])
    feature_dict['artist_pop_pl'] = float(artist_meta['popularity'])
    # artist_genres = artist_meta['genres']
    
    if artist_meta['genres']:
        feature_dict['artist_genres_pl'] = " ".join([re.sub(' ','_',i) for i in artist_meta['genres']])
    else:
        feature_dict['artist_genres_pl'] = "unknown"
    
    track_features = sp.audio_features(track_uri)[0]
    # capture track audio features
    # feature_dict['duration_ms_songs_pl'] = float(track_features['duration_ms'])
    feature_dict['track_acousticness_pl'] = track_features['acousticness']
    feature_dict['track_danceability_pl'] = track_features['danceability']
    feature_dict['track_energy_pl'] = track_features['energy']
    feature_dict['track_instrumentalness_pl'] = track_features['instrumentalness']
    feature_dict['track_key_pl'] = track_features['key']
    feature_dict['track_liveness_pl'] = track_features['liveness']
    feature_dict['track_loudness_pl']= track_features['loudness']
    feature_dict['track_mode_pl'] = track_features['mode']
    feature_dict['track_speechiness_pl'] = track_features['speechiness']
    feature_dict['track_tempo_pl'] = track_features['tempo']
    feature_dict['time_signature_pl'] = track_features['time_signature']
    feature_dict['track_valence_pl'] = track_features['valence']
    
    # TODO: print artist names for comparison later
    
    return feature_dict

def get_playlist_queries(playlist_uri):
    
    track_uris = [x["track"]["uri"] for x in sp.playlist_tracks(playlist_uri)["items"]]
    album_uris = [x["track"]['album']['uri'] for x in sp.playlist_tracks(playlist_uri)["items"]]
    artist_uris = [x["track"]['artists'][0]['uri'] for x in sp.playlist_tracks(playlist_uri)["items"]]
    track_durations = [x["track"]["duration_ms"] for x in sp.playlist_tracks(playlist_uri)["items"]]
    
    n_songs_pl = len(track_uris)
    num_albums_pl = len(set(album_uris))
    num_artists_pl = len(set(artist_uris))
    track_durations = sum(track_durations)
    
    # n_songs_pl = len(track_uris)
    playlist_featutre_list = []
    
    for count, track_uri in enumerate(track_uris):
        # results = get_track_features(track_uri, count, playlist_uri, n_songs_pl)
        results = get_track_features(track_uri, count, playlist_uri, n_songs_pl, num_artists_pl, num_albums_pl,track_durations)
        playlist_featutre_list.append(results)
    
    return playlist_featutre_list

# Getting your Spotify playlists

Keep in mind:
* it's possible your playlists have tracks that are not present in the Million Playlists Dataset
* That's OK - we want the model to generalize to unseen data!
* Let's see what the model associates them with...

Note: to retrieve your playlists, make sure they are *added to your profile*

### get user playlists

Option 1: get playlists via `spotipy` API...

In [6]:
play_lists = []
playlists = sp.user_playlists(user=f'{SPOTIFY_USERNAME}', limit=10)

for pl in playlists['items']:
    uri = pl['uri']
    play_lists.append(uri)
    print(f"uri: {uri},  playlist name: {pl['name']}")

uri: spotify:playlist:5fV3fQ2sXEE8O1dbhmeIdo,  playlist name: sleeping with the phish
uri: spotify:playlist:3HeHZi8VGEm6ZNHZ2FVRr6,  playlist name: biebs weeknd
uri: spotify:playlist:3GX5FLE0IxHNZtLye0ETgb,  playlist name: Muscle Shoals
uri: spotify:playlist:0XPJ39OCBhOw5OZa7udYYP,  playlist name: Disco
uri: spotify:playlist:6imD2IJOyw3MEKdZ4XZqZ4,  playlist name: space is the place
uri: spotify:playlist:1E1EwxJyzjt6SYyfnp9mE8,  playlist name: all panic
uri: spotify:playlist:1pGfqRD9CzyO9lOn9Fp09V,  playlist name: live panic - small


Option 2: using the link provided when "sharing" a playlist...

In [7]:
# link from "share" feature
playlist_link = 'https://open.spotify.com/playlist/3GX5FLE0IxHNZtLye0ETgb?si=f99fa67315f14bbe'

# get the uri part
playlist_URI = playlist_link.split("/")[-1].split("?")[0]
print(f"playlist_URI: {playlist_URI}")

# get the tracks from that playlist
track_uris = [x["track"]["uri"] for x in sp.playlist_tracks(playlist_URI)["items"]]
print(f"Track in playlist: {track_uris[0]}")

playlist_URI: 3GX5FLE0IxHNZtLye0ETgb
Track in playlist: spotify:track:7hqesNgWCx8NZTHl4MXkPF


### Loop over multiple playlists

In [8]:
# from sp_utils import get_playlist_queries

featureLIST = [] 

for uri in play_lists:
    results = get_playlist_queries(uri)
    featureLIST.append(results)

len(featureLIST)

7

In [9]:
len(featureLIST[0])

12

In [212]:
# first playlist
# featureLIST[0]

In [10]:
# first track of first playlist
featureLIST[0][0]

{'num_pl_songs_new': 12.0,
 'num_pl_artists_new': 1.0,
 'num_pl_albums_new': 3.0,
 'duration_ms_songs_pl': 289973.0,
 'pl_name_src': 'sleeping with the phish',
 'pl_collaborative_src': 'false',
 'track_uri_pl': 'spotify:track:6jcO51f5vHa9cJfCnfX1Rf',
 'track_name_pl': 'Soul Shakedown Party',
 'track_pop_pl': 22.0,
 'album_name_pl': 'Amsterdam',
 'album_uri_pl': 'spotify:album:709gu2Yj2tfqmNMIEDfOPg',
 'artist_name_pl': 'Phish',
 'artist_uri_pl': 'spotify:artist:5wbIWUzTPuTxTyG6ouQKqz',
 'artists_followers_pl': 469167.0,
 'artist_pop_pl': 55.0,
 'artist_genres_pl': 'blues_rock jam_band',
 'track_acousticness_pl': 0.261,
 'track_danceability_pl': 0.65,
 'track_energy_pl': 0.756,
 'track_instrumentalness_pl': 2e-05,
 'track_key_pl': 5,
 'track_liveness_pl': 0.466,
 'track_loudness_pl': -7.531,
 'track_mode_pl': 1,
 'track_speechiness_pl': 0.0829,
 'track_tempo_pl': 135.869,
 'time_signature_pl': 4,
 'track_valence_pl': 0.79}

### create dataframe of all playlists, tracks, and metadata defined in `get_playlist_queries()`

Inspect last `N` songs of playlist...

In [11]:
from itertools import chain

test_df = pd.DataFrame(list(chain.from_iterable(featureLIST)))

print(test_df.shape)
test_df.head()

(116, 28)


Unnamed: 0,num_pl_songs_new,num_pl_artists_new,num_pl_albums_new,duration_ms_songs_pl,pl_name_src,pl_collaborative_src,track_uri_pl,track_name_pl,track_pop_pl,album_name_pl,album_uri_pl,artist_name_pl,artist_uri_pl,artists_followers_pl,artist_pop_pl,artist_genres_pl,track_acousticness_pl,track_danceability_pl,track_energy_pl,track_instrumentalness_pl,track_key_pl,track_liveness_pl,track_loudness_pl,track_mode_pl,track_speechiness_pl,track_tempo_pl,time_signature_pl,track_valence_pl
0,12.0,1.0,3.0,289973.0,sleeping with the phish,False,spotify:track:6jcO51f5vHa9cJfCnfX1Rf,Soul Shakedown Party,22.0,Amsterdam,spotify:album:709gu2Yj2tfqmNMIEDfOPg,Phish,spotify:artist:5wbIWUzTPuTxTyG6ouQKqz,469167.0,55.0,blues_rock jam_band,0.261,0.65,0.756,2e-05,5,0.466,-7.531,1,0.0829,135.869,4,0.79
1,12.0,1.0,3.0,812453.0,sleeping with the phish,False,spotify:track:2jT3iJ9w0WD51E2VRDPzOD,Divided Sky,20.0,Amsterdam,spotify:album:709gu2Yj2tfqmNMIEDfOPg,Phish,spotify:artist:5wbIWUzTPuTxTyG6ouQKqz,469167.0,55.0,blues_rock jam_band,0.259,0.24,0.844,0.0582,2,0.956,-6.961,1,0.0505,94.237,4,0.343
2,12.0,1.0,3.0,783173.0,sleeping with the phish,False,spotify:track:1nZBf3KEHNO1NwVMnCPRWd,Bathtub Gin,17.0,Amsterdam,spotify:album:709gu2Yj2tfqmNMIEDfOPg,Phish,spotify:artist:5wbIWUzTPuTxTyG6ouQKqz,469167.0,55.0,blues_rock jam_band,0.21,0.434,0.834,0.0524,0,0.369,-5.999,1,0.0423,118.021,4,0.822
3,12.0,1.0,3.0,1098160.0,sleeping with the phish,False,spotify:track:3GgcI3Efpom6Se3xRho66s,Down With Disease,19.0,Amsterdam,spotify:album:709gu2Yj2tfqmNMIEDfOPg,Phish,spotify:artist:5wbIWUzTPuTxTyG6ouQKqz,469167.0,55.0,blues_rock jam_band,0.32,0.496,0.868,0.109,2,0.355,-6.165,1,0.0515,142.017,4,0.66
4,12.0,1.0,3.0,763373.0,sleeping with the phish,False,spotify:track:5s5DIVU9rB2npB9cV5JThl,Limb By Limb,13.0,Amsterdam,spotify:album:709gu2Yj2tfqmNMIEDfOPg,Phish,spotify:artist:5wbIWUzTPuTxTyG6ouQKqz,469167.0,55.0,blues_rock jam_band,0.338,0.383,0.761,0.141,10,0.431,-7.391,1,0.039,105.575,3,0.412


### albums, artists, and tracks per playlist

In [12]:
unique_albums = test_df.groupby('pl_name_src')['album_uri_pl'].nunique()
unique_artists = test_df.groupby('pl_name_src')['artist_uri_pl'].nunique()
n_songs_pl = test_df.groupby('pl_name_src')['track_uri_pl'].count()

print(f"unique_albums {unique_albums}\n")
print(f"unique_artists {unique_artists}\n")
print(f"n_songs_pl {n_songs_pl}")

unique_albums pl_name_src
Disco                       8
Muscle Shoals              11
all panic                   8
biebs weeknd               14
live panic - small          1
sleeping with the phish     3
space is the place          9
Name: album_uri_pl, dtype: int64

unique_artists pl_name_src
Disco                       7
Muscle Shoals               9
all panic                   1
biebs weeknd               11
live panic - small          1
sleeping with the phish     1
space is the place          5
Name: artist_uri_pl, dtype: int64

n_songs_pl pl_name_src
Disco                       8
Muscle Shoals              16
all panic                  40
biebs weeknd               16
live panic - small         11
sleeping with the phish    12
space is the place         13
Name: track_uri_pl, dtype: int64


In [13]:
def get_test_instance(list_dict_test):
    '''
    create single test instances given a 
    list of dictionaries representing playlist tracks
    '''
    
    # model serving signature with candidate tower fields
    TEST_PL_QUERY = {
        # 'album_name_can': '',
        'album_name_pl': [],
        'artist_uri_pl': [],
        'album_uri_pl':[],
        # 'album_uri_can': '',
        # 'artist_followers_can': 0, 
        # 'artist_genres_can': "", 
        'artist_genres_pl': [], 
        # 'artist_name_can': '', 
        'artist_name_pl': [], 
        # 'artist_pop_can': 0, 
        'artist_pop_pl': [], 
        # 'artist_uri_can': '', 
        'artists_followers_pl': [], 
        'pl_collaborative_src': '', 
        # 'description_pl': '', 
        # 'duration_ms_seed_pl': 0, 
        # 'duration_ms_songs_pl': [ ], 
        'duration_ms_songs_pl':[],
        'pl_duration_ms_new': 0.0,
        'num_pl_songs_new': 0.0, 
        'pl_name_src': '', 
        'num_pl_albums_new': 0.0, 
        'num_pl_artists_new': 0.0, 
        # 'track_name_can': '', 
        'track_name_pl': [ ], 
        # 'track_pop_can': 0, 
        'track_pop_pl': [ ], 
        # 'track_uri_can': '', 
        'track_uri_pl': [ ],
        # 'pid': 1,
        'track_acousticness_pl':[],
        'track_danceability_pl':[],
        'track_energy_pl':[],
        'track_instrumentalness_pl':[],
        'track_key_pl':[],
        'track_liveness_pl':[],
        'track_loudness_pl':[],
        'track_mode_pl':[],
        'track_speechiness_pl':[],
        'track_tempo_pl':[],
        'time_signature_pl':[],
        'track_valence_pl':[],
        # candidates
        'album_name_can': '',
        'album_uri_can': '',
        'artist_followers_can': 0.0,
        'artist_genres_can': '',
        'artist_name_can': '',
        'artist_pop_can': 0.0,
        'artist_uri_can': '',
        'duration_ms_can': 0.0,
        'time_signature_can': '',
        'track_acousticness_can': 0.0,
        'track_danceability_can': 0.0,
        'track_energy_can': 0.0,
        'track_instrumentalness_can': 0.0,
        'track_key_can': '',
        'track_liveness_can': 0.0,
        'track_loudness_can': 0.0,
        'track_mode_can': '',
        'track_name_can': '',
        'track_pop_can': 0.0,
        'track_speechiness_can': 0.0,
        'track_tempo_can': 0.0,
        'track_uri_can': '',
        'track_valence_can': 0.0,
    }

    counter = 0
    for track in list_dict_test:
        if counter == 0:
            TEST_PL_QUERY['pl_name_src'] = track['pl_name_src']
            TEST_PL_QUERY['num_pl_songs_new'] = track['num_pl_songs_new'] 
            TEST_PL_QUERY['num_pl_albums_new'] = track['num_pl_albums_new']
            TEST_PL_QUERY['num_pl_artists_new'] = track['num_pl_artists_new']
            # TEST_PL_QUERY['description_pl'] = track['description_pl']
            TEST_PL_QUERY['pl_collaborative_src'] = str(track['pl_collaborative_src'])
        else:
            # do these
            TEST_PL_QUERY['track_pop_pl'].append(track['track_pop_pl'])
            TEST_PL_QUERY['track_uri_pl'].append(track['track_uri_pl'])
            TEST_PL_QUERY['track_name_pl'].append(track['track_name_pl'])
            TEST_PL_QUERY['album_name_pl'].append(track['album_name_pl'])
            TEST_PL_QUERY['album_uri_pl'].append(track['album_uri_pl'])
            TEST_PL_QUERY['artist_name_pl'].append(track['artist_name_pl'])
            TEST_PL_QUERY['artist_uri_pl'].append(track['artist_uri_pl'])
            TEST_PL_QUERY['artist_genres_pl'].append(track['artist_genres_pl'])
            TEST_PL_QUERY['artist_pop_pl'].append(track['artist_pop_pl'])
            TEST_PL_QUERY['duration_ms_songs_pl'].append(track['duration_ms_songs_pl'])
            TEST_PL_QUERY['artists_followers_pl'].append(track['artists_followers_pl'])
            # audio feats
            TEST_PL_QUERY['track_acousticness_pl'].append(track['track_acousticness_pl'])
            TEST_PL_QUERY['track_danceability_pl'].append(track['track_danceability_pl'])
            TEST_PL_QUERY['track_energy_pl'].append(track['track_energy_pl'])
            TEST_PL_QUERY['track_instrumentalness_pl'].append(track['track_instrumentalness_pl'])
            TEST_PL_QUERY['track_key_pl'].append(str(track['track_key_pl']))
            TEST_PL_QUERY['track_liveness_pl'].append(track['track_liveness_pl'])
            TEST_PL_QUERY['track_loudness_pl'].append(track['track_loudness_pl'])
            TEST_PL_QUERY['track_mode_pl'].append(str(track['track_mode_pl']))
            TEST_PL_QUERY['track_speechiness_pl'].append(track['track_speechiness_pl'])
            TEST_PL_QUERY['track_tempo_pl'].append(track['track_tempo_pl'])
            TEST_PL_QUERY['time_signature_pl'].append(str(track['time_signature_pl']))
            TEST_PL_QUERY['track_valence_pl'].append(track['track_valence_pl'])
            

        counter=+1
        
    return TEST_PL_QUERY

In [14]:
# list of tracks
sample_tracks = featureLIST[0][-MAX_SEED_LENGTH-1:]

# get metadata for each track
TEST_QUERY = get_test_instance(sample_tracks)
pprint(TEST_QUERY)

{'album_name_can': '',
 'album_name_pl': ['Phish: 12/30/2015 Madison Square Garden, New York, NY',
                   'LivePhish 04/03/98',
                   'LivePhish 04/03/98',
                   'LivePhish 04/03/98',
                   'LivePhish 04/03/98'],
 'album_uri_can': '',
 'album_uri_pl': ['spotify:album:4fwquTwetrNGSfHxC3JVC0',
                  'spotify:album:251YMVId8YBkTapKyYgExP',
                  'spotify:album:251YMVId8YBkTapKyYgExP',
                  'spotify:album:251YMVId8YBkTapKyYgExP',
                  'spotify:album:251YMVId8YBkTapKyYgExP'],
 'artist_followers_can': 0.0,
 'artist_genres_can': '',
 'artist_genres_pl': ['blues_rock jam_band',
                      'blues_rock jam_band',
                      'blues_rock jam_band',
                      'blues_rock jam_band',
                      'blues_rock jam_band'],
 'artist_name_can': '',
 'artist_name_pl': ['Phish', 'Phish', 'Phish', 'Phish', 'Phish'],
 'artist_pop_can': 0.0,
 'artist_pop_pl': [55.0, 55

# Query Matching Engine

### TODO:
* parametrize this section
* structure section and notebook for readers

In [15]:
from google.cloud import aiplatform as vertex_ai

# Vertex SDK 
vertex_ai.init(project=PROJECT_ID, location=LOCATION)

import time

### Index Endpoint

In [22]:
# INDEX_ENDPOINT_URI = "projects/934903580331/locations/us-central1/indexEndpoints/5901413157808635904"
# INDEX_ENDPOINT_URI = "projects/934903580331/locations/us-central1/indexEndpoints/1633126610968248320" # jw 100 epoch

INDEX_ENDPOINT_URI = 'projects/934903580331/locations/us-central1/indexEndpoints/381618495768494080' # ann 50e

# INDEX_ENDPOINT_URI = 'projects/934903580331/locations/us-central1/indexEndpoints/1829068379130953728' # last-15 ANN

In [23]:
ME_index_endpoint = vertex_ai.MatchingEngineIndexEndpoint(INDEX_ENDPOINT_URI)
ME_index_endpoint

<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7ff3c104bb10> 
resource name: projects/934903580331/locations/us-central1/indexEndpoints/381618495768494080

In [24]:
DEPLOYED_INDEX_ID = ME_index_endpoint.deployed_indexes[0].id
print(f"DEPLOYED_INDEX_ID: {DEPLOYED_INDEX_ID}")

ME_index_endpoint.deployed_indexes

DEPLOYED_INDEX_ID: deployed_tfrs_32dim_50e_v8


[id: "deployed_tfrs_32dim_50e_v8"
index: "projects/934903580331/locations/us-central1/indexes/5123345953436205056"
create_time {
  seconds: 1673552056
  nanos: 94509000
}
private_endpoints {
  match_grpc_address: "10.41.2.5"
}
index_sync_time {
  seconds: 1674674313
  nanos: 230862000
}
automatic_resources {
  min_replica_count: 2
  max_replica_count: 2
}
deployment_group: "default"
]

### Model Endpoint

In [25]:
# ENDPOINT_URI = 'projects/934903580331/locations/us-central1/endpoints/185144563977945088'
# ENDPOINT_URI = "projects/934903580331/locations/us-central1/endpoints/8515115024753098752" # jw 100 epoch

ENDPOINT_URI = 'projects/934903580331/locations/us-central1/endpoints/4002948002778972160' # 50e
# ENDPOINT_URI = 'projects/934903580331/locations/us-central1/endpoints/6383522618309345280' # last-15

In [26]:
model_endpoint = vertex_ai.Endpoint(ENDPOINT_URI)
model_endpoint

<google.cloud.aiplatform.models.Endpoint object at 0x7ff3c104bf10> 
resource name: projects/934903580331/locations/us-central1/endpoints/4002948002778972160

In [27]:
print(model_endpoint.gca_resource.deployed_models[0])

id: "520565979193802752"
model: "projects/934903580331/locations/us-central1/models/6791535990214230016"
display_name: "deployed_qmodel_tfrs_32dim_50e_v7"
create_time {
  seconds: 1673456208
  nanos: 327780000
}
dedicated_resources {
  machine_spec {
    machine_type: "n1-standard-4"
  }
  min_replica_count: 1
  max_replica_count: 2
}
model_version_id: "1"



### Retrieve nearest neighbors in deployed index

**TODO** add Feature Store to this step

In [28]:
def candidate_retrieval(retrieval_index, query_instance, deployed_index_id, num_neighbs=10):
    '''
    TODO: some args
    '''
    # here
    start = time.process_time()
    playlist_emb = model_endpoint.predict([query_instance])
    print(f"Generate embeddings in {round((time.process_time() - start),2)} seconds\n")
    
    
    start = time.process_time()
    candidate_tracks = retrieval_index.match(
        deployed_index_id=f'{deployed_index_id}',
        queries=playlist_emb.predictions,
        num_neighbors=10
    )
    
    print(f"Retrieved nearest neighbors in {round((time.process_time() - start),2)} seconds\n")
    playlist_name = query_instance['pl_name_src']
    # playlist_description = query_instance['description_pl']
    
    return candidate_tracks, playlist_name # playlist_description

def interpret_results(candidate_tracks, playlist_name): # playlist_description
    '''
    TODO:
    '''
    # here
    results = []
    
    print(f"playlist: {playlist_name}")
    # print(f"description: {playlist_description}\n")
    
    print(f"Retrieved Candidates:\n")
    for i, neighbors in enumerate(candidate_tracks[0]):
        
        track_dict = {}
        
        track_index_id = str(neighbors.id)
        
        # print(track_index_id)
        # results.append(track_index_id)
        # if len(track_index_id) == 36:
        
        track_meta = sp.track(track_index_id)
        track_name = track_meta['name']
        artist_name = track_meta['artists'][0]['name']
        art_uri = track_meta['artists'][0]['uri']
        
        art_genres = sp.artist(art_uri)['genres']

        # populate dict
        track_dict['track_uri'] = track_index_id
        track_dict['track_name'] = track_name
        track_dict['artist_name'] = artist_name
        track_dict['track_genres'] = art_genres
        track_dict['track_preview_url'] = track_meta['preview_url']
        track_dict['track_spotify_url'] = track_meta['external_urls']['spotify']
        track_dict['neighbor_distance'] = neighbors.distance
        track_dict['playlist_name'] = playlist_name

        print(f"{i+1}) {track_name} - by {artist_name}; genres: {art_genres}; {track_index_id}")
        
        results.append(track_dict)

    # print(results)

    return results


In [29]:
all_recs = []

for playlist in featureLIST:
    
    seed_tracks = playlist[-MAX_SEED_LENGTH-1:]
    
    query = get_test_instance(seed_tracks)
    
    candidates, name = candidate_retrieval( # pl_description
        retrieval_index=ME_index_endpoint,
        query_instance=query, 
        deployed_index_id=DEPLOYED_INDEX_ID
    )
    
    results = interpret_results(candidates, name) # pl_description
    all_recs.append(results)
    print("--------")

Generate embeddings in 0.02 seconds

Retrieved nearest neighbors in 0.0 seconds

playlist: sleeping with the phish
Retrieved Candidates:

1) Motorhead - by Hawkwind; genres: ['art rock', 'experimental', 'hard rock', 'progressive rock', 'proto-metal', 'protopunk', 'psychedelic rock', 'space rock', 'symphonic rock']; spotify:track:1PqB9tIfRdLNJJBPln3xRh
2) Open My Eyes - by The Move; genres: ['beatlesque', 'british blues', 'bubblegum pop', 'classic garage rock', 'classic uk pop', 'folk rock', 'freakbeat', 'merseybeat', 'protopunk', 'psychedelic rock', 'pub rock']; spotify:track:38GhS6VdKqmOI7ayfwSnD8
3) Potatoe Chips - by Slim Gaillard; genres: ['jazz blues', 'rhythm and blues', 'stride', 'swing']; spotify:track:0U3LrnGhbIqz6rO8muM0gy
4) La linterna - by Banda 20 De Julio De Repelon; genres: ['cumbia', 'porro']; spotify:track:4AJOOAOmIhIeJWe4JppOaQ
5) That's It for the Other One - Live in San Francisco, February 27, 1969 - by Grateful Dead; genres: ['classic rock', 'cosmic american', 'co

In [30]:
len(all_recs)

7

In [31]:
all_recs[0][4]

{'track_uri': 'spotify:track:44FKqeyePqfAcWfJKJkpGy',
 'track_name': "That's It for the Other One - Live in San Francisco, February 27, 1969",
 'artist_name': 'Grateful Dead',
 'track_genres': ['classic rock',
  'cosmic american',
  'country rock',
  'double drumming',
  'jam band',
  'psychedelic rock',
  'rock'],
 'track_preview_url': 'https://p.scdn.co/mp3-preview/7275129ab1aa03c50b3b1a7ee4152d095a40381b?cid=2dce494e64a74be980138668f4402b97',
 'track_spotify_url': 'https://open.spotify.com/track/44FKqeyePqfAcWfJKJkpGy',
 'neighbor_distance': 131.3176727294922,
 'playlist_name': 'sleeping with the phish'}

## Compare Brute Force retrieval

In [284]:
# create brute force index

bf_INDEX_ENDPOINT_URI = 'projects/934903580331/locations/us-central1/indexEndpoints/6417567896351801344' # bf 50e

ME_BF_index_endpoint = vertex_ai.MatchingEngineIndexEndpoint(bf_INDEX_ENDPOINT_URI)
ME_BF_index_endpoint

DEPLOYED_BF_INDEX_ID = ME_BF_index_endpoint.deployed_indexes[0].id
print(f"DEPLOYED_INDEX_ID: {DEPLOYED_BF_INDEX_ID}")

ME_BF_index_endpoint.deployed_indexes

DEPLOYED_INDEX_ID: deployed_tfrs_32dim_50e_v8_bf


[id: "deployed_tfrs_32dim_50e_v8_bf"
index: "projects/934903580331/locations/us-central1/indexes/4605994946242019328"
create_time {
  seconds: 1673552276
  nanos: 416921000
}
private_endpoints {
  match_grpc_address: "10.41.2.5"
}
index_sync_time {
  seconds: 1673558195
  nanos: 18382000
}
automatic_resources {
  min_replica_count: 2
  max_replica_count: 2
}
deployment_group: "default"
]

In [290]:
candidates, name = candidate_retrieval( # pl_description
    retrieval_index=ME_BF_index_endpoint,
    query_instance=TEST_QUERY, 
    deployed_index_id=DEPLOYED_BF_INDEX_ID
)

results = interpret_results(candidates, name)
results

Generate embeddings in 0.0 seconds

Retrieved nearest neighbors in 0.0 seconds

playlist: sleeping with the phish
Retrieved Candidates:

1) Sabotage - by Maria Isa; genres: []; spotify:track:0Nj0MFkl0s3LGaf0h6d03Q
2) Motorhead - by Hawkwind; genres: ['art rock', 'experimental', 'hard rock', 'progressive rock', 'proto-metal', 'protopunk', 'psychedelic rock', 'space rock', 'symphonic rock']; spotify:track:1PqB9tIfRdLNJJBPln3xRh
3) In The Desert - Axel Boman Remix - by Dreems; genres: ['float house']; spotify:track:7sm2FKb3fju37zuGbBdo1D
4) Open My Eyes - by The Move; genres: ['beatlesque', 'british blues', 'bubblegum pop', 'classic garage rock', 'classic uk pop', 'folk rock', 'freakbeat', 'merseybeat', 'protopunk', 'psychedelic rock', 'pub rock']; spotify:track:38GhS6VdKqmOI7ayfwSnD8
5) Swingin' down the lane - by Jerry Wallace; genres: ['deep adult standards']; spotify:track:1WQxjJWQTUQdugmysoANRw
6) Potatoe Chips - by Slim Gaillard; genres: ['jazz blues', 'stride', 'swing']; spotify:tr

[{'track_uri': 'spotify:track:0Nj0MFkl0s3LGaf0h6d03Q',
  'track_name': 'Sabotage',
  'artist_name': 'Maria Isa',
  'track_genres': [],
  'track_preview_url': 'https://p.scdn.co/mp3-preview/91d7730d8ca03f9dad620715cd4f50b124ea5c4f?cid=2dce494e64a74be980138668f4402b97',
  'track_spotify_url': 'https://open.spotify.com/track/0Nj0MFkl0s3LGaf0h6d03Q',
  'neighbor_distance': 133.32156372070312,
  'playlist_name': 'sleeping with the phish'},
 {'track_uri': 'spotify:track:1PqB9tIfRdLNJJBPln3xRh',
  'track_name': 'Motorhead',
  'artist_name': 'Hawkwind',
  'track_genres': ['art rock',
   'experimental',
   'hard rock',
   'progressive rock',
   'proto-metal',
   'protopunk',
   'psychedelic rock',
   'space rock',
   'symphonic rock'],
  'track_preview_url': None,
  'track_spotify_url': 'https://open.spotify.com/track/1PqB9tIfRdLNJJBPln3xRh',
  'neighbor_distance': 133.00466918945312,
  'playlist_name': 'sleeping with the phish'},
 {'track_uri': 'spotify:track:7sm2FKb3fju37zuGbBdo1D',
  'track_

## evaluating recall

In [291]:
# TEST_QUERY

playlist_emb = model_endpoint.predict([TEST_QUERY])

candidate_tracks_bf = ME_BF_index_endpoint.match(
    deployed_index_id=DEPLOYED_BF_INDEX_ID,
    queries=playlist_emb.predictions,
    num_neighbors=10
)

candidate_tracks_ann = ME_index_endpoint.match(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=playlist_emb.predictions,
    num_neighbors=10
)

In [292]:
# Calculate recall by determining how many neighbors were correctly retrieved as compared to the brute-force option.
recalled_neighbors = 0
for tree_ah_neighbors, brute_force_neighbors in zip(
    candidate_tracks_ann, candidate_tracks_bf
):
    tree_ah_neighbor_ids = [neighbor.id for neighbor in tree_ah_neighbors]
    brute_force_neighbor_ids = [neighbor.id for neighbor in brute_force_neighbors]

    recalled_neighbors += len(
        set(tree_ah_neighbor_ids).intersection(brute_force_neighbor_ids)
    )

recall = recalled_neighbors / len(
    [neighbor for neighbors in candidate_tracks_bf for neighbor in neighbors]
)

print("Recall: {}".format(recall))

Recall: 0.5


In [294]:
%%timeit
model_endpoint.predict([TEST_QUERY])

34.1 ms ± 7.18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Plotting 

* TODO

In [241]:
# !pip install bokeh

import numpy as np
import seaborn as sns
# from bokeh.charts import Histogram, Scatter, Donut, show
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
from bokeh.models import HoverTool, ColumnDataSource
from bokeh import palettes

In [297]:
my_playlist_name = all_recs[0][0]['playlist_name']
my_playlist_name

'sleeping with the phish'

In [263]:
# output_notebook()

# tr_pop = test_df[test_df['pl_name_src'] == my_playlist_name]['track_pop_pl'].values

# hist, edges = np.histogram(tr_pop, density=True, bins=50)

# p = figure()
# p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="white")
# show(p)

# Archive - save

In [None]:
# TEST_INSTANCE = {
#  'album_name_can': '',
#  'album_name_pl': ['Im An Island Boy',
#                    'Might Not',
#                    '4REAL 4REAL',
#                    'SremmLife',
#                    'Peace Is The Mission (Extended)'],
#  'album_uri_can': '',
#  'album_uri_pl': ['spotify:album:3jcIJzp76JJpcB2NYGahOk',
#                   'spotify:album:3P1ZVNT9bnVfTjYqKVr4Oa',
#                   'spotify:album:5zuM1EG06X1J7VsIF1omRm',
#                   'spotify:album:6eDx949ONWDCN0O22wFZf7',
#                   'spotify:album:4pCLlUxlKj3pNVdBtFyhrU'],
#  'artist_followers_can': 0.0,
#  'artist_genres_can': '',
#  'artist_genres_pl': ['unknown',
#                       'canadian_hip_hop canadian_trap pop_rap rap trap',
#                       'cali_rap gangster_rap hip_hop pop_rap rap '
#                       'southern_hip_hop trap',
#                       'hip_hop melodic_rap mississippi_hip_hop pop_rap rap '
#                       'southern_hip_hop trap',
#                       'dance_pop edm electro_house moombahton pop pop_dance '
#                       'pop_rap tropical_house'],
#  'artist_name_can': '',
#  'artist_name_pl': ['Flyysoulja', 'Belly', 'YG', 'Rae Sremmurd', 'Major Lazer'],
#  'artist_pop_can': 0.0,
#  'artist_pop_pl': [34.0, 63.0, 76.0, 74.0, 75.0],
#  'artist_uri_can': '',
#  'artist_uri_pl': ['spotify:artist:5SOeefBn30MJhB0bMVtLU1',
#                    'spotify:artist:0FOWNUFHPnMy0vOw1siGqi',
#                    'spotify:artist:0A0FS04o6zMoto8OKPsDwY',
#                    'spotify:artist:7iZtZyCzp3LItcw1wtPI3D',
#                    'spotify:artist:738wLrAtLtCtFOLvQBXOXp'],
#  'artists_followers_pl': [15861.0, 232071.0, 3005943.0, 7025645.0, 6327219.0],
#  'duration_ms_can': 0.0,
#  'duration_ms_songs_pl': [147120.0, 224306.0, 299226.0, 206306.0, 176561.0],
#  'num_pl_albums_new': 14.0,
#  'num_pl_artists_new': 11.0,
#  'num_pl_songs_new': 16.0,
#  'pl_collaborative_src': 'false',
#  'pl_duration_ms_new': 0.0,
#  'pl_name_src': 'biebs weeknd',
#  'time_signature_can': '',
#  'time_signature_pl': ['4', '4', '4', '4', '4'],
#  'track_acousticness_can': 0.0,
#  'track_acousticness_pl': [0.0545, 0.296, 0.325, 0.291, 0.00346],
#  'track_danceability_can': 0.0,
#  'track_danceability_pl': [0.809, 0.792, 0.88, 0.732, 0.723],
#  'track_energy_can': 0.0,
#  'track_energy_pl': [0.641, 0.573, 0.67, 0.69, 0.809],
#  'track_instrumentalness_can': 0.0,
#  'track_instrumentalness_pl': [3.49e-05, 0, 4.17e-06, 0, 0.00123],
#  'track_key_can': '',
#  'track_key_pl': ['8', '1', '2', '0', '7'],
#  'track_liveness_can': 0.0,
#  'track_liveness_pl': [0.083, 0.0778, 0.0662, 0.116, 0.565],
#  'track_loudness_can': 0.0,
#  'track_loudness_pl': [-7.063, -4.714, -5.177, -4.992, -3.081],
#  'track_mode_can': '',
#  'track_mode_pl': ['0', '1', '1', '0', '0'],
#  'track_name_can': '',
#  'track_name_pl': ['Im An Island Boy',
#                    'Might Not',
#                    'Go Loko (feat. Tyga, Jon Z)',
#                    'This Could Be Us',
#                    'Lean On'],
#  'track_pop_can': 0.0,
#  'track_pop_pl': [48.0, 36.0, 66.0, 79.0, 74.0],
#  'track_speechiness_can': 0.0,
#  'track_speechiness_pl': [0.0755, 0.0861, 0.0468, 0.134, 0.0625],
#  'track_tempo_can': 0.0,
#  'track_tempo_pl': [99.934, 134.023, 101.008, 143.072, 98.007],
#  'track_uri_can': '',
#  'track_uri_pl': ['spotify:track:6JNUBfWGs8kNl7k47Hvpgd',
#                   'spotify:track:5hSrO8SL1d8x5uuf8tztX7',
#                   'spotify:track:1kK6DwzyXJSp58u5HYWwuD',
#                   'spotify:track:4jTiyLlOJVJj3mCr7yfPQD',
#                   'spotify:track:1Lim1Py7xBgbAkAys3AGAG'],
#  'track_valence_can': 0.0,
#  'track_valence_pl': [0.759, 0.278, 0.131, 0.758, 0.274]}