# Testing dict -> query embedding functions in Merlin
This will be used to create a custom container for the query tower

In [20]:
import json 

# One sample prediction request, serialized to JSON text. Keys ending in `_can`
# and keys ending in `_pl` are both present; key order is kept as written because
# json.dumps preserves insertion order.
# NOTE(review): 'Beyonc\xc3\xa9' renders as "BeyoncÃ©" (see the DataFrame output
# further down) — kept byte-for-byte here; confirm against the training data
# before "fixing" it.
TEST_INSTANCE = json.dumps(
    {
        'collaborative': 'false',
        'album_name_pl': [
            "There's Really A Wolf",
            'Late Nights: The Album',
            'American Teen',
            'Crazy In Love',
            'Pony',
        ],
        'album_uri_can': 'spotify:album:5l83t3mbVgCrIe1VU9uJZR',
        'artist_followers_can': 4339757.0,
        'artist_genres_can': "'hawaiian hip hop', 'rap'",
        'artist_genres_pl': [
            "'hawaiian hip hop', 'rap'",
            "'chicago rap', 'dance pop', 'pop', 'pop rap', 'r&b', 'southern hip hop', 'trap', 'urban contemporary'",
            "'pop', 'pop r&b'",
            "'dance pop', 'pop', 'r&b'",
            "'chill r&b', 'pop', 'pop r&b', 'r&b', 'urban contemporary'",
        ],
        'artist_name_can': 'Russ',
        'artist_name_pl': [
            'Russ',
            'Jeremih',
            'Khalid',
            'Beyonc\xc3\xa9',
            'William Singe',
        ],
        'artist_pop_can': 82.0,
        'artist_pop_pl': [82.0, 80.0, 90.0, 87.0, 65.0],
        'artist_uri_can': 'spotify:artist:1z7b1Pr1rSlvWRzsW3HOrS',
        'artists_followers_pl': [4339757.0, 5611842.0, 15046756.0, 30713126.0, 603837.0],
        'description_pl': '',
        'duration_ms_can': 237322.0,
        'duration_ms_songs_pl': [237506.0, 217200.0, 219080.0, 226400.0, 121739.0],
        'n_songs_pl': 8.0,
        'name': 'Lit Tunes ',
        'num_albums_pl': 8.0,
        'num_artists_pl': 8.0,
        'track_name_can': 'We Just Havent Met Yet',
        'track_name_pl': [
            'Losin Control',
            'Paradise',
            'Location',
            'Crazy In Love - Remix',
            'Pony',
        ],
        'track_pop_can': 57.0,
        'track_pop_pl': [79.0, 58.0, 83.0, 71.0, 57.0],
        'duration_ms_seed_pl': 51023.1,
        'pid': 1,
        'track_uri_can': 'spotify:track:0VzDv4wiuZsLsNOmfaUy2W',
        'track_uri_pl': [
            'spotify:track:4cxMGhkinTocPSVVKWIw0d',
            'spotify:track:1wNEBPo3nsbGCZRryI832I',
            'spotify:track:152lZdxL1OR0ZMW6KquMif',
            'spotify:track:2f4IuijXLxYOeBncS60GUD',
            'spotify:track:4Lj8paMFwyKTGfILLELVxt',
        ],
    }
)

In [21]:
# Display the serialized JSON string that is later passed to preprocess().
TEST_INSTANCE

'{"collaborative": "false", "album_name_pl": ["There\'s Really A Wolf", "Late Nights: The Album", "American Teen", "Crazy In Love", "Pony"], "album_uri_can": "spotify:album:5l83t3mbVgCrIe1VU9uJZR", "artist_followers_can": 4339757.0, "artist_genres_can": "\'hawaiian hip hop\', \'rap\'", "artist_genres_pl": ["\'hawaiian hip hop\', \'rap\'", "\'chicago rap\', \'dance pop\', \'pop\', \'pop rap\', \'r&b\', \'southern hip hop\', \'trap\', \'urban contemporary\'", "\'pop\', \'pop r&b\'", "\'dance pop\', \'pop\', \'r&b\'", "\'chill r&b\', \'pop\', \'pop r&b\', \'r&b\', \'urban contemporary\'"], "artist_name_can": "Russ", "artist_name_pl": ["Russ", "Jeremih", "Khalid", "Beyonc\\u00c3\\u00a9", "William Singe"], "artist_pop_can": 82.0, "artist_pop_pl": [82.0, 80.0, 90.0, 87.0, 65.0], "artist_uri_can": "spotify:artist:1z7b1Pr1rSlvWRzsW3HOrS", "artists_followers_pl": [4339757.0, 5611842.0, 15046756.0, 30713126.0, 603837.0], "description_pl": "", "duration_ms_can": 237322.0, "duration_ms_songs_pl": 

### Create the prediction scripts - we will use these in our Custom Prediction Routine in the 01-z notebook

In [22]:
import nvtabular as nvt
import dask.dataframe as dd
import pandas as pd
import os
# GCS bucket that the artifact URI used later in the notebook is built from.
BUCKET = 'gs://spotify-beam-v3'
import merlin.models.tf as mm
from nvtabular.loader.tf_utils import configure_tensorflow
import json
# NOTE(review): called before the explicit `import tensorflow` below; whether this
# ordering is required is not visible here — confirm before regrouping the imports.
configure_tensorflow()

import tensorflow as tf

def load(artifacts_uri):
    """Load the saved query model and the NVTabular workflow.

    Args:
        artifacts_uri (str):
            Required. The value of the environment variable AIP_STORAGE_URI.

    Returns:
        A ``(model, workflow)`` tuple: the Keras model loaded from
        ``<artifacts_uri>/query_model_merlin`` and the ``nvt.Workflow`` loaded
        from ``<artifacts_uri>/workflow/2t-spotify-workflow``.
    """
    model_path = os.path.join(artifacts_uri, "query_model_merlin")
    workflow_path = os.path.join(artifacts_uri, "workflow/2t-spotify-workflow")

    query_model = tf.keras.models.load_model(model_path)
    fitted_workflow = nvt.Workflow.load(workflow_path)

    # The workflow is returned as loaded: an earlier attempt to strip the
    # candidate (`*_can`) inputs with workflow.remove_inputs([...]) is disabled.
    return query_model, fitted_workflow

def preprocess(prediction_input, model, workflow):
    """Turn a JSON prediction request into a workflow-transformed dataset.

    Args:
        prediction_input (str):
            Required. JSON text holding either one instance (an object mapping
            feature name -> value) or a non-empty list of such objects.
        model:
            Unused by this function; retained so existing callers that pass it
            keep working.
        workflow:
            Object exposing ``transform(dataset)``; the loaded ``nvt.Workflow``
            in this notebook.
    Returns:
        The dataset returned by ``workflow.transform``, one row per instance.
    Raises:
        ValueError: If ``prediction_input`` decodes to an empty list.
    """
    payload = json.loads(prediction_input)

    # Normalise to a list so single-instance and batch requests share one path.
    # (Previously the list branch built a frame but never transformed or
    # returned it, and it re-added the first instance.)
    records = payload if isinstance(payload, list) else [payload]
    if not records:
        raise ValueError("prediction_input must contain at least one instance")

    # Each record becomes a one-row frame, built exactly as the single-instance
    # path always did, so list-valued features stay as one cell per row.
    # The frames are combined with a single concat (DataFrame.append was
    # removed in pandas 2.0).
    frames = [
        pd.DataFrame.from_dict(record, orient='index').T
        for record in records
    ]
    pandas_instance = pd.concat(frames, ignore_index=True)

    # Columns dropped from the dataset schema before the workflow runs.
    # NOTE(review): the list also names two `_pl` columns ('artist_pop_pl',
    # 'artists_followers_pl'); kept as in the original — confirm this is intended.
    excluded_columns = [
        'track_pop_can', 'track_uri_can',
        'duration_ms_can', 'track_name_can',
        'artist_name_can', 'album_name_can',
        'album_uri_can', 'artist_followers_can',
        'artist_genres_can',
        'artist_pop_can', 'artist_pop_pl', 'artist_uri_can',
        'artists_followers_pl',
    ]

    transformed_inputs = nvt.Dataset(pandas_instance)
    transformed_inputs.schema = transformed_inputs.schema.without(excluded_columns)

    # Transform once; the original called transform twice and discarded the first result.
    return workflow.transform(transformed_inputs)

def predict(instances, model):
    """Run the model on the preprocessed instances.

    Args:
        instances (Any):
            Required. The instance(s) used for performing prediction; must
            expose ``num_rows`` and be accepted by ``mm.Loader``.
        model:
            Callable applied to element 0 of the first batch.
    Returns:
        Prediction results, i.e. whatever ``model`` returns for that batch.
    """
    # batch_size is set to the dataset's row count; only the first batch is consumed.
    batch_loader = mm.Loader(
        instances,
        batch_size=instances.num_rows,
        shuffle=False,
    )
    first_batch = next(iter(batch_loader))
    features = first_batch[0]
    return model(features)

In [17]:
### Debug: rebuild the one-row frame by hand, the same way preprocess() does
dictinput = json.loads(TEST_INSTANCE)
pandas_instance = pd.DataFrame.from_dict(dictinput, orient='index').transpose()
pandas_instance


Unnamed: 0,collaborative,album_name_pl,artist_genres_pl,artist_name_pl,description_pl,n_songs_pl,name,num_albums_pl,num_artists_pl,track_name_pl,duration_ms_seed_pl,pid,track_uri_pl
0,False,"[There's Really A Wolf, Late Nights: The Album...","['hawaiian hip hop', 'rap', 'chicago rap', 'da...","[Russ, Jeremih, Khalid, BeyoncÃ©, William Singe]",,8.0,Lit Tunes,8.0,8.0,"[Losin Control, Paradise, Location, Crazy In L...",51023.1,1,"[spotify:track:4cxMGhkinTocPSVVKWIw0d, spotify..."


In [18]:
# Wrap the hand-built frame in an NVTabular dataset and display its schema.
nvt_dataset = nvt.Dataset(pandas_instance)
nvt_dataset.schema

Unnamed: 0,name,tags,dtype,is_list,is_ragged
0,collaborative,(),object,False,False
1,album_name_pl,(),object,True,True
2,artist_genres_pl,(),object,True,True
3,artist_name_pl,(),object,True,True
4,description_pl,(),object,False,False
5,n_songs_pl,(),float64,False,False
6,name,(),object,False,False
7,num_albums_pl,(),float64,False,False
8,num_artists_pl,(),float64,False,False
9,track_name_pl,(),object,True,True


In [19]:
# NOTE(review): the recorded output of this cell is `KeyError: 'artist_followers_can'`.
# `workflow` is only assigned in a later cell (`model, workflow = load(...)`), so this
# cell relies on state from an out-of-order run and will not survive Restart & Run All.
transformed_merlin_dataset = workflow.transform(nvt_dataset)

KeyError: 'artist_followers_can'

In [23]:
# Re-assigned here with the same value as in the setup cell.
BUCKET = 'gs://spotify-beam-v3'
ARTIFACT_URI = f'{BUCKET}/merlin-processed'

# Load the model and workflow from the artifact location, then run
# preprocessing on the sample instance (the cell output is the returned dataset).
model, workflow = load(ARTIFACT_URI)
preprocess(TEST_INSTANCE, model, workflow)





<merlin.io.dataset.Dataset at 0x7f1a30fe6280>

In [24]:
# End-to-end check: JSON instance -> preprocess() -> predict(); the recorded
# output is a float32 tensor of shape (1, 128).
predict(preprocess(TEST_INSTANCE, model, workflow), model)

2022-10-25 19:15:16.913333: I tensorflow/stream_executor/cuda/cuda_blas.cc:1804] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


<tf.Tensor: shape=(1, 128), dtype=float32, numpy=
array([[0.15263352, 0.        , 0.        , 0.        , 0.02353548,
        0.        , 0.        , 0.        , 0.00205901, 0.        ,
        0.        , 0.        , 0.07392506, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.00084471, 0.        ,
        0.01920758, 0.        , 0.05852018, 0.07206493, 0.        ,
        0.        , 0.        , 0.        , 0.0238607 , 0.01459057,
        0.        , 0.        , 0.06495419, 0.03169129, 0.        ,
        0.        , 0.02613383, 0.0674267 , 0.        , 0.01994274,
        0.        , 0.        , 0.04045154, 0.        , 0.        ,
        0.10970411, 0.        , 0.06672961, 0.03771982, 0.05934781,
        0.09782815, 0.        , 0.        , 0.0313582 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.00516027, 0.00352811, 0.        , 0.06148433, 0.        