# Two-Tower for Neural Deep Retrieval

> model baseline for spotify modeling experiments

In [1]:
# set variables
SEED = 41781897
PROJECT_ID = 'hybrid-vertex'
BQ_LOCATION='us-central1'

### Pip

In [2]:
!pip freeze | grep tensorflow

tensorflow @ file:///opt/conda/conda-bld/dlenv-tf-2-9-cpu_1656643241567/work/tensorflow-2.9.0rc2-cp37-cp37m-linux_x86_64.whl
tensorflow-cloud==0.1.16
tensorflow-datasets==4.4.0
tensorflow-estimator==2.9.0
tensorflow-hub==0.12.0
tensorflow-io==0.23.1
tensorflow-io-gcs-filesystem==0.26.0
tensorflow-metadata==1.9.0
tensorflow-probability==0.14.1
tensorflow-recommenders==0.6.0
tensorflow-serving-api==2.9.0
tensorflow-transform==1.9.0


### Import Package

In [5]:
import warnings
warnings.filterwarnings("ignore") #do this b/c there's an info-level bug that can safely be ignored

# from tensorflow.python.framework import ops
# from tensorflow.python.framework import dtypes
# from tensorflow_io.bigquery import BigQueryClient
# from tensorflow_io.bigquery import BigQueryReadSession

import json
import tensorflow as tf
import tensorflow_recommenders as tfrs
import datetime
from tensorflow.python.lib.io import file_io
from tensorflow.train import BytesList, Feature, FeatureList, Int64List, FloatList
from tensorflow.train import SequenceExample, FeatureLists

import os
import numpy as np
import pickle as pkl

from pprint import pprint

In [4]:
import tensorflow as tf
print(tf. __version__)

2.9.0-rc2


## TF-Record reader

In [191]:
# for _ in parsed_test_ds.batch(1).take(1):
    # print(_)
    
# for raw_record in raw_test_ds.take(5):
#     example = tf.train.Example()
#     example.ParseFromString(raw_record.numpy())
#     print(example)

In [152]:
from google.cloud import storage

client = storage.Client()

# all training data
train_files = []
BUCKET_NAME_TRAIN = 'spotify-tfrecords-blog'
OBJ_PATH_TRAIN = 'tfrecords_v1/train'
# SAMPLE_FILE_TRAIN = 'gs://spotify-tfrecords-blog/tfrecords_v1/train/output-00000-of-00796.tfrecord'

for blob in client.list_blobs(f'{BUCKET_NAME_TRAIN}', prefix=f'{OBJ_PATH_TRAIN}/', delimiter='/'):
    train_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))

# all candidate data
candidate_files = []
BUCKET_NAME_CANDIDATE = 'spotify-tfrecords-blog'
OBJ_PATH_CANDIDATE = 'tfrecords_v1/candidate_records/train'
# SAMPLE_FILE_CANDIDATE = 'gs://spotify-tfrecords-blog/tfrecords_v1/candidate_records/train/output-00000-of-00004.tfrecord'

for blob in client.list_blobs(f'{BUCKET_NAME_CANDIDATE}', prefix=f'{OBJ_PATH_CANDIDATE}/', delimiter='/'):
    candidate_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))

### parsing function - train data

In [179]:
# =====================
# Training Data TF Reocrds
# =====================

train_context_features = {
    # playlist - context features
    # 'pid': tf.io.FixedLenFeature(dtype=tf.int64, shape=(1)),
    'name': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'collaborative': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'duration_ms_seed_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=(1)),
    'n_songs_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=(1)),
    'num_artists_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=(1)),
    'num_albums_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=(1)),
    'description_pl': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    # seed track - context features
    'pos_seed_track': tf.io.FixedLenFeature(dtype=tf.int64, shape=(1)),
    'track_name_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'artist_name_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'album_name_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'track_uri_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'artist_uri_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'album_uri_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'duration_seed_track': tf.io.FixedLenFeature(dtype=tf.float32, shape=(1)),
    'track_pop_seed_track': tf.io.FixedLenFeature(dtype=tf.float32, shape=(1)),
    'artist_pop_seed_track': tf.io.FixedLenFeature(dtype=tf.float32, shape=(1)),
    'artist_genres_seed_track': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'artist_followers_seed_track': tf.io.FixedLenFeature(dtype=tf.float32, shape=(1)),
    # candidate - context features
    'track_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'artist_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'album_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'track_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'artist_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'album_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'duration_ms_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=(1)),
    'track_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=(1)),
    'artist_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=(1)),
    'artist_genres_can': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'artist_followers_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=(1)),
}

# VarLenFeature
train_sequence_features = {
    'track_name_pl': tf.io.RaggedFeature(tf.string),
    'artist_name_pl': tf.io.RaggedFeature(tf.string),
    'album_name_pl': tf.io.RaggedFeature(tf.string),
    'track_uri_pl': tf.io.RaggedFeature(tf.string),
    'duration_ms_songs_pl': tf.io.RaggedFeature(tf.float32),
    'artist_pop_pl': tf.io.RaggedFeature(tf.float32),
    'artists_followers_pl': tf.io.RaggedFeature(tf.float32),
    'track_pop_pl': tf.io.RaggedFeature(tf.float32),
    'artist_genres_pl': tf.io.RaggedFeature(tf.string),
}

# parse_sequence_example | parse_single_sequence_example
def parse_train_tfrecord_fn(example):
    example = tf.io.parse_single_sequence_example( 
        example, 
        sequence_features=train_sequence_features, 
        context_features=train_context_features
    )
    return example

# TF data input pipeline params
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

# parse training data - SAMPLE_FILE_TRAIN | train_files
raw_train_ds = tf.data.TFRecordDataset(SAMPLE_FILE_TRAIN)

parsed_train_ds = raw_train_ds.map(
    parse_train_tfrecord_fn, 
    num_parallel_calls=tf.data.AUTOTUNE
).with_options(options)

# Ragged 
def return_tensors2(context, sequence):
        a = sequence['track_name_pl'].to_tensor(default_value='', shape=[None, 375])
        context2 = context.copy()
        context2['track_name_pl'] = a
        return context2
    
parsed_train_ds_2 = parsed_train_ds.map(
    return_tensors2, 
    num_parallel_calls=tf.data.AUTOTUNE
)

In [181]:
for _ in parsed_train_ds_2.batch(2).take(1):
    print(_)

{'album_name_can': <tf.Tensor: shape=(2, 1), dtype=string, numpy=
array([[b'Glee: The Music, The Complete Season Three'],
       [b'Bobby Tarantino']], dtype=object)>, 'album_name_seed_track': <tf.Tensor: shape=(2, 1), dtype=string, numpy=
array([[b'Glee: The Music, The Complete Season Four'],
       [b'Bobby Tarantino']], dtype=object)>, 'album_uri_can': <tf.Tensor: shape=(2, 1), dtype=string, numpy=
array([[b'spotify:album:082BH67sSIDefLxUp8GgNm'],
       [b'spotify:album:0WMr3ulx5Mzi1B3C8LsHVA']], dtype=object)>, 'album_uri_seed_track': <tf.Tensor: shape=(2, 1), dtype=string, numpy=
array([[b'spotify:album:4fZXOp8No89WLudGy6brXd'],
       [b'spotify:album:0WMr3ulx5Mzi1B3C8LsHVA']], dtype=object)>, 'artist_followers_can': <tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[1452611.],
       [5661726.]], dtype=float32)>, 'artist_followers_seed_track': <tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[1452611.],
       [5661726.]], dtype=float32)>, 'artist_genres_can': <tf.Te

In [170]:
for raw_record in raw_train_ds.take(1):
    # example = tf.train.Example()
    # example.ParseFromString(raw_record.numpy())
    print(raw_record)

tf.Tensor(b'\n\xa8[\n\x10\n\x04name\x12\x08\n\x06\n\x04love\n\x1a\n\rcollaborative\x12\t\n\x07\n\x05false\n\x0e\n\x03pid\x12\x07\x1a\x05\n\x03\x80\x9a\x0c\n\x18\n\x0bmodified_at\x12\t\x1a\x07\n\x05\x80\xa4\xb4\xc8\x05\n\x13\n\nrow_number\x12\x05\x1a\x03\n\x01\x01\n \n\x14duration_ms_playlist\x12\x08\x1a\x06\n\x04\xfa\x93\x86\x0c\n\x1b\n\npid_pos_id\x12\r\n\x0b\n\t199936-51\n\x10\n\x07pos_can\x12\x05\x1a\x03\n\x013\n \n\x0fartist_name_can\x12\r\n\x0b\n\tGlee Cast\n9\n\rtrack_uri_can\x12(\n&\n$spotify:track:6KhJeYLg1AimCQjH6ii1Al\n9\n\ralbum_uri_can\x12(\n&\n$spotify:album:082BH67sSIDefLxUp8GgNm\n?\n\x0etrack_name_can\x12-\n+\n)You Get What You Give (Glee Cast Version)\n;\n\x0eartist_uri_can\x12)\n\'\n%spotify:artist:0SCbttzoZTnLFebDYmAWCm\n\x1b\n\x0fduration_ms_can\x12\x08\x12\x06\n\x04\xa0f\x8bH\n@\n\x0ealbum_name_can\x12.\n,\n*Glee: The Music, The Complete Season Three\n\x17\n\x0epos_seed_track\x12\x05\x1a\x03\n\x012\n\'\n\x16artist_name_seed_track\x12\r\n\x0b\n\tGlee Cast\nB\n\x15art

In [94]:
# # function for sequence features
# def return_rag_pl_tensors(context, sequence):
#         a = sequence['artist_name_pl'].to_tensor(default_value='', shape=[None, 375])
#         b = sequence['track_uri_pl'].to_tensor(default_value='', shape=[None, 375])
#         c = sequence['track_name_pl'].to_tensor(default_value='', shape=[None, 375])
#         d = sequence['duration_ms_songs_pl'].to_tensor(default_value=0.0, shape=[None, 375])
#         e = sequence['album_name_pl'].to_tensor(default_value='', shape=[None, 375])
#         f = sequence['artist_pop_pl'].to_tensor(default_value=0.0, shape=[None, 375])
#         g = sequence['artists_followers_pl'].to_tensor(default_value=0.0, shape=[None, 375])
#         h = sequence['track_pop_pl'].to_tensor(default_value=0, shape=[None, 375])
#         i = sequence['artist_genres_pl'].to_tensor(default_value='', shape=[None, 375])
#         context2 = context.copy()
#         context2['artist_name_pl'] = a
#         context2['track_uri_pl'] = b
#         context2['track_name_pl'] = c
#         context2['duration_ms_songs_pl'] = d
#         context2['album_name_pl'] = e
#         context2['artist_pop_pl'] = f
#         context2['artists_followers_pl'] = g
#         context2['track_pop_pl'] = h
#         context2['artist_genres_pl'] = i
#         return context2

# # playlists features (parse 2)
# parsed_train_ds_2 = parsed_train_ds.map(
#     return_rag_pl_tensors,
#     num_parallel_calls=tf.data.AUTOTUNE
# ).with_options(options)

In [158]:
# for _ in parsed_train_ds_2.batch(3).take(1):
#     print(_)

### parsing function - candidate data

In [155]:
# =====================
# Candidate TF Records
# =====================

candidate_features = {
    'track_name': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'artist_name': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'album_name': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'track_uri': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'artist_uri': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'album_uri': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'duration_ms': tf.io.FixedLenFeature(dtype=tf.int64, shape=(1)),
    'track_pop': tf.io.FixedLenFeature(dtype=tf.int64, shape=(1)),
    'artist_pop': tf.io.FixedLenFeature(dtype=tf.float32, shape=(1)),
    'artist_genres': tf.io.FixedLenFeature(dtype=tf.string, shape=(1)),
    'artist_followers': tf.io.FixedLenFeature(dtype=tf.float32, shape=(1)),
}

def parse_candidate_tfrecord_fn(example):
    example = tf.io.parse_single_example(
        example, 
        features=candidate_features, 
    )
    return example

# TF data input pipeline params
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

# parse candidate data
raw_candidate_ds = tf.data.TFRecordDataset(candidate_files)

parsed_candidate_ds = raw_candidate_ds.map(
    parse_candidate_tfrecord_fn, 
    num_parallel_calls=tf.data.AUTOTUNE
).with_options(options)

In [157]:
for _ in parsed_candidate_ds.batch(3).take(1):
    print(_)

{'album_name': <tf.Tensor: shape=(3, 1), dtype=string, numpy=
array([[b'Fragments Found'],
       [b'Subir Al Cielo'],
       [b'Los Joao']], dtype=object)>, 'album_uri': <tf.Tensor: shape=(3, 1), dtype=string, numpy=
array([[b'spotify:album:39LWVcWN0HG7SF27UiuW9b'],
       [b'spotify:album:6xOXb0I1NGCbdGdvrP9B5Q'],
       [b'spotify:album:5042nLFNM8Trr39JNYtWXE']], dtype=object)>, 'artist_followers': <tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[ 90784.],
       [255041.],
       [ 16065.]], dtype=float32)>, 'artist_genres': <tf.Tensor: shape=(3, 1), dtype=string, numpy=
array([[b"'compositional ambient', 'neo-classical'"],
       [b"'latin', 'latin pop', 'puerto rican pop'"],
       [b"'cumbia del sureste', 'cumbia sonidera'"]], dtype=object)>, 'artist_name': <tf.Tensor: shape=(3, 1), dtype=string, numpy=
array([[b'Fabrizio Paterlini'],
       [b'MDO'],
       [b'Los Joao']], dtype=object)>, 'artist_pop': <tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[69.],
       

## Import Vocab & Stats

In [35]:
import pickle as pkl
from google.cloud import storage

BUCKET_NAME = 'spotify-v1'
FILE_PATH = 'vocabs/v1_string_vocabs'
FILE_NAME = 'string_vocabs_v1_20220705-202905.txt'
DESTINATION_FILE = 'downloaded_vocabs.txt'

client = storage.Client()

with open(f'{DESTINATION_FILE}', 'wb') as file_obj:
    client.download_blob_to_file(
        f'gs://{BUCKET_NAME}/{FILE_PATH}/{FILE_NAME}', file_obj)

    
with open(f'{DESTINATION_FILE}', 'rb') as pickle_file:
    vocab_dict_load = pkl.load(pickle_file)

In [93]:
# vocab_dict_load['unique_pids']
# vocab_dict_load

### TODO: Add stats to vocab_file

In [88]:
avg_duration_ms_seed_pl = 13000151.68
var_duration_ms_seed_pl = 133092900971233.58
vocab_dict_load['avg_duration_ms_seed_pl']=avg_duration_ms_seed_pl
vocab_dict_load['var_duration_ms_seed_pl']=var_duration_ms_seed_pl

avg_n_songs_pl = 55.21
var_n_songs_pl = 2317.54
vocab_dict_load['avg_n_songs_pl']=avg_n_songs_pl
vocab_dict_load['var_n_songs_pl']=var_n_songs_pl

avg_n_artists_pl = 30.56
var_n_artists_pl = 769.26
vocab_dict_load['avg_n_artists_pl']=avg_n_artists_pl
vocab_dict_load['var_n_artists_pl']=var_n_artists_pl

avg_n_albums_pl = 40.25
var_n_albums_pl = 1305.54
vocab_dict_load['avg_n_albums_pl']=avg_n_albums_pl
vocab_dict_load['var_n_albums_pl']=var_n_albums_pl

avg_artist_pop = 16.08
var_artist_pop = 300.64
vocab_dict_load['avg_artist_pop']=avg_artist_pop
vocab_dict_load['var_artist_pop']=var_artist_pop

avg_duration_ms_songs_pl = 234823.14
var_duration_ms_songs_pl = 5558806228.41
vocab_dict_load['avg_duration_ms_songs_pl']=avg_duration_ms_songs_pl
vocab_dict_load['var_duration_ms_songs_pl']=var_duration_ms_songs_pl

avg_artist_followers = 43337.77
var_artist_followers = 377777790193.57
vocab_dict_load['avg_artist_followers']=avg_artist_followers
vocab_dict_load['var_artist_followers']=var_artist_followers

avg_track_pop = 10.85
var_track_pop = 202.18
vocab_dict_load['avg_track_pop']=avg_track_pop
vocab_dict_load['var_track_pop']=var_track_pop

# Two-Tower

In [151]:
parsed_train_ds

<_OptionsDataset element_spec=({'album_name_can': TensorSpec(shape=(1,), dtype=tf.string, name=None), 'album_name_seed_track': TensorSpec(shape=(1,), dtype=tf.string, name=None), 'album_uri_can': TensorSpec(shape=(1,), dtype=tf.string, name=None), 'album_uri_seed_track': TensorSpec(shape=(1,), dtype=tf.string, name=None), 'artist_followers_can': TensorSpec(shape=(1,), dtype=tf.float32, name=None), 'artist_followers_seed_track': TensorSpec(shape=(1,), dtype=tf.float32, name=None), 'artist_genres_can': TensorSpec(shape=(1,), dtype=tf.string, name=None), 'artist_genres_seed_track': TensorSpec(shape=(1,), dtype=tf.string, name=None), 'artist_name_can': TensorSpec(shape=(1,), dtype=tf.string, name=None), 'artist_name_seed_track': TensorSpec(shape=(1,), dtype=tf.string, name=None), 'artist_pop_can': TensorSpec(shape=(1,), dtype=tf.float32, name=None), 'artist_pop_seed_track': TensorSpec(shape=(1,), dtype=tf.float32, name=None), 'artist_uri_can': TensorSpec(shape=(1,), dtype=tf.string, name=N

In [150]:
# Test # TODO: remove

for x in parsed_train_ds.take(5).as_numpy_iterator():
    pprint(x['track_pop_pl'])

AttributeError: 'RaggedTensor' object has no attribute '_numpy'

In [None]:
test_instance = {
    'name': np.asarray([b'Best Christmas']),
    'collaborative': np.asarray([b'false']),
    # 'pid': np.asarray([173671]),
    'description_pl': np.asarray([b'test description']),
    'duration_ms_seed_pl': np.asarray([5458995.]),
    'n_songs_pl': np.asarray([58.]),
    'num_artists_pl': np.asarray([19.]),
    'num_albums_pl': np.asarray([27.]),
    'artist_name_pl': np.asarray([[b'Juan Luis Guerra 4.40', b'Prince Royce', b'Luis Vargas']]),
    'track_uri_pl': np.asarray([[b'spotify:track:1g0IBPZTRP7VYkctJ4Qafg',b'spotify:track:43wUzbYxEFoXugYkgTzMWp']]),
    'track_name_pl': np.asarray([[b'Lover Come Back', b'White Lightning', b'Shake Me Down']]),
    'duration_ms_songs_pl': np.asarray([[245888., 195709., 283906., 271475., 300373., 275173., 236145.,]]),
    'album_name_pl': np.asarray([[b'Silsulim', b'Sara Shara', b'Muzika Vesheket', b'Ba La Lirkod']]),
    'artist_pop_pl': np.asarray([[81., 81., 70., 66., 66., 66., 46., 87.]]),
    'artists_followers_pl': np.asarray([[3.556710e+05, 8.200000e+02, 1.510000e+02, 1.098080e+05,]]),
    'artist_genres_pl': np.asarray([[b"'israeli pop', 'jewish pop'", b"'israeli pop', 'jewish pop'",]]),
    'track_pop_pl': np.asarray([[70, 77, 50, 44, 30, 28, 15, 26, 15, 18, 46, 38,]])
}

## Query Tower

In [None]:
EMBEDDING_DIM = 32
PROJECTION_DIM = 5
SEED = 1234
USE_CROSS_LAYER=True
DROPOUT='False'
DROPOUT_RATE='0.33'

class QueryModel(tf.keras.Model):
    def __init__(self, layer_sizes, vocab_dict):
        super().__init__()

        # ========================================
        # non-sequence playlist features
        # ========================================
        
        # Feature: playlist name
        self.pl_name_text_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.TextVectorization(
                    # max_tokens=len(vocab_dict["name"]), # not needed if passing vocab
                    vocabulary=vocab_dict['name'], 
                    name="pl_name_txt_vectorizer", 
                    ngrams=2
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict["name"]),
                    output_dim=EMBEDDING_DIM,
                    mask_zero=False,
                    name="pl_name_emb_layer",
                ),
                tf.keras.layers.GlobalAveragePooling1D(name="pl_name_pooling"),
            ], name="pl_name_emb_model"
        )
        
        # Feature: collaborative
        collaborative_vocab = np.array([b'false', b'true'])
        
        self.pl_collaborative_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=collaborative_vocab, 
                    mask_token=None, 
                    name="pl_collaborative_lookup", 
                    output_mode='int'
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(collaborative_vocab),
                    output_dim=EMBEDDING_DIM,
                    mask_zero=False,
                    name="pl_collaborative_emb_layer",
                ),
            ], name="pl_collaborative_emb_model"
        )
        
        # # Feature: pid
        # self.pl_pid_embedding = tf.keras.Sequential(
        #     [
        #         tf.keras.layers.IntegerLookup(
        #             vocabulary=vocab_dict['unique_pids'], 
        #             mask_token=None, 
        #             name="pl_pid_lookup", 
        #             # output_mode='int'
        #         ),
        #         tf.keras.layers.Embedding(
        #             input_dim=len(vocab_dict['unique_pids']),
        #             output_dim=EMBEDDING_DIM,
        #             mask_zero=False,
        #             name="pl_pid_emb_layer",
        #         ),
        #     ], name="pl_pid_emb_model"
        # )
        
        # Feature: description_pl
        self.pl_description_text_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.TextVectorization(
                    # max_tokens=len(vocab_dict["description_pl"]), # not needed if passing vocab
                    vocabulary=vocab_dict['description_pl'], 
                    name="description_pl_vectorizer", 
                    ngrams=2,
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict["description_pl"]),
                    output_dim=EMBEDDING_DIM,
                    mask_zero=False,
                    name="description_pl_emb_layer",
                ),
                tf.keras.layers.GlobalAveragePooling1D(name="description_pl_pooling"),
            ], name="pl_description_emb_model"
        )
        
        # Feature: duration_ms_seed_pl                      
        # TODO: Noramlize or Descritize?
        duration_ms_seed_pl_buckets = np.linspace(
            vocab_dict['min_duration_ms_seed_pl'], 
            vocab_dict['max_duration_ms_seed_pl'], 
            num=1000
        )
        self.duration_ms_seed_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Discretization(duration_ms_seed_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(duration_ms_seed_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM, 
                    name="duration_ms_seed_pl_emb_layer",
                )
            ], name="duration_ms_seed_pl_emb_model"
        )
        # self.duration_ms_seed_pl_normalization = tf.keras.layers.Normalization(
        #     mean=vocab_dict['avg_duration_ms_seed_pl'],
        #     variance=vocab_dict['var_duration_ms_seed_pl'],
        #     axis=None
        # )
        
        # Feature: n_songs_pl
        # TODO: Noramlize or Descritize?
        n_songs_pl_buckets = np.linspace(
            vocab_dict['min_n_songs_pl'], 
            vocab_dict['max_n_songs_pl'], 
            num=100
        )
        self.n_songs_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Discretization(n_songs_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(n_songs_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM, 
                    name="n_songs_pl_emb_layer",
                )
            ], name="n_songs_pl_emb_model"
        )
        # self.n_songs_pl_normalization = tf.keras.layers.Normalization(
        #     mean=vocab_dict['avg_n_songs_pl'],
        #     variance=vocab_dict['var_n_songs_pl'],
        #     axis=None
        # )
        
        # Feature: num_artists_pl
        # TODO: Noramlize or Descritize?
        n_artists_pl_buckets = np.linspace(
            vocab_dict['min_n_artists_pl'], 
            vocab_dict['max_n_artists_pl'], 
            num=100
        )
        self.n_artists_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Discretization(n_artists_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(n_artists_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM, 
                    name="n_artists_pl_emb_layer",
                )
            ], name="n_artists_pl_emb_model"
        )
        # self.n_artists_pl_normalization = tf.keras.layers.Normalization(
        #     mean=vocab_dict['avg_n_artists_pl'],
        #     variance=vocab_dict['var_n_artists_pl'],
        #     axis=None
        # )
        
        # Feature: num_albums_pl
        n_albums_pl_buckets = np.linspace(
            vocab_dict['min_n_albums_pl'], 
            vocab_dict['max_n_albums_pl'],
            num=100
        )
        self.n_albums_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Discretization(n_albums_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(n_albums_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM, 
                    name="n_albums_pl_emb_layer",
                )
            ], name="n_albums_pl_emb_model"
        )
        # self.n_albums_pl_normalization = tf.keras.layers.Normalization(
        #     mean=vocab_dict['avg_n_albums_pl'],
        #     variance=vocab_dict['var_n_albums_pl'],
        #     axis=None
        # )
        
        # ========================================
        # sequence playlist features
        # ========================================
        
        # Feature: artist_name_pl
        self.artist_name_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=vocab_dict['artist_name_pl'], mask_token=''),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict['artist_name_pl']) + 1, 
                    output_dim=EMBEDDING_DIM,
                    name="artist_name_pl_emb_layer",
                ),
                tf.keras.layers.GRU(EMBEDDING_DIM),
            ], name="artist_name_pl_emb_model"
        )
        
        # Feature: track_uri_pl
        # 2.2M unique
        self.track_uri_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Hashing(num_bins=200_000),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict['track_uri_pl']) + 1, 
                    output_dim=EMBEDDING_DIM,
                    name="track_uri_pl_emb_layer",
                ),
                tf.keras.layers.GRU(EMBEDDING_DIM),
            ], name="track_uri_pl_emb_model"
        )
        
        # Feature: track_name_pl
        self.track_name_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=vocab_dict['track_name_pl'], 
                    name="track_name_pl_lookup",
                    output_mode='int',
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict['track_name_pl']), 
                    output_dim=EMBEDDING_DIM,
                    name="track_name_pl_emb_layer",
                ),
                tf.keras.layers.GRU(EMBEDDING_DIM),
            ], name="track_name_pl_emb_model"
        )
        
        # Feature: duration_ms_songs_pl
        duration_ms_songs_pl_buckets = np.linspace(
            vocab_dict['min_duration_ms_songs_pl'], 
            vocab_dict['max_duration_ms_songs_pl'], 
            num=100
        )
        self.duration_ms_songs_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Discretization(duration_ms_songs_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(duration_ms_songs_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM,
                    name="duration_ms_songs_pl_emb_layer",
                ),
                tf.keras.layers.GRU(EMBEDDING_DIM),
            ], name="duration_ms_songs_pl_emb_model"
        )
        
        # Feature: album_name_pl
        self.album_name_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=vocab_dict['album_name_pl'], mask_token=None),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict['album_name_pl']), 
                    output_dim=EMBEDDING_DIM,
                    name="album_name_pl_emb_layer",
                ),
                tf.keras.layers.GRU(EMBEDDING_DIM),
            ], name="album_name_pl_emb_model"
        )

        # Feature: artist_pop_pl
        artist_pop_pl_buckets = np.linspace(
            vocab_dict['min_artist_pop'], 
            vocab_dict['max_artist_pop'], 
            num=10
        )
        self.artist_pop_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Discretization(artist_pop_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(artist_pop_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM,
                    name="artist_pop_pl_emb_layer",
                ),
                tf.keras.layers.GRU(EMBEDDING_DIM),
            ], name="artist_pop_pl_emb_model"
        )
        
        # Feature: artists_followers_pl
        artists_followers_pl_buckets = np.linspace(
            vocab_dict['min_artist_followers'], 
            vocab_dict['max_artist_followers'], 
            num=10
        )
        self.artists_followers_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Discretization(artists_followers_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(artists_followers_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM,
                    name="artists_followers_pl_emb_layer",
                ),
                tf.keras.layers.GRU(EMBEDDING_DIM),
            ], name="artists_followers_pl_emb_model"
        )
        
        # Feature: track_pop_pl
        track_pop_pl_buckets = np.linspace(
            vocab_dict['min_track_pop'], 
            vocab_dict['max_track_pop'], 
            num=10
        )
        self.track_pop_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Discretization(track_pop_pl_buckets.tolist()),
                tf.keras.layers.Embedding(
                    input_dim=len(track_pop_pl_buckets) + 1, 
                    output_dim=EMBEDDING_DIM,
                    name="track_pop_pl_emb_layer",
                ),
                tf.keras.layers.GRU(EMBEDDING_DIM),
            ], name="track_pop_pl_emb_model"
        )
        
        # Feature: artist_genres_pl
        self.artist_genres_pl_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=vocab_dict['artist_genres_pl'], mask_token=None),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict['artist_genres_pl']), 
                    output_dim=EMBEDDING_DIM,
                    name="artist_genres_pl_emb_layer",
                ),
                tf.keras.layers.GRU(EMBEDDING_DIM),
            ], name="artist_genres_pl_emb_model"
        )

        # ========================================
        # dense and cross layers
        # ========================================

        # Cross Layers
        if USE_CROSS_LAYER:
            self._cross_layer = tfrs.layers.dcn.Cross(
                projection_dim=PROJECTION_DIM,
                kernel_initializer="glorot_uniform", 
                name="pl_cross_layer"
            )
        else:
            self._cross_layer = None
            
        # Dense Layers
        self.dense_layers = tf.keras.Sequential(name="pl_dense_layers")
        initializer = tf.keras.initializers.GlorotUniform(seed=SEED)
        
        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(
                tf.keras.layers.Dense(
                    layer_size, 
                    activation="relu", 
                    kernel_initializer=initializer,
                )
            )
            if DROPOUT:
                self.dense_layers.add(tf.keras.layers.Dropout(DROPOUT_RATE))
                
        # No activation for the last layer
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(
                tf.keras.layers.Dense(
                    layer_size, 
                    kernel_initializer=initializer
                )
            )
        ### ADDING L2 NORM AT THE END
        self.dense_layers.add(
            tf.keras.layers.Lambda(
                lambda x: tf.nn.l2_normalize(
                    x, 1, epsilon=1e-12, name="normalize_dense"
                )
            )
        )
    # ========================================
    # call
    # ========================================
    def call(self, data):
        '''
        The call method defines what happens when
        the model is called
        '''
        
        all_embs = tf.concat(
            [
                self.pl_name_text_embedding(data['name']),
                self.pl_collaborative_embedding(data['collaborative']),
                # self.pl_pid_embedding(data["pid"]),
                self.pl_description_text_embedding(data['description_pl']),
                self.duration_ms_seed_pl_embedding(data["duration_ms_seed_pl"]),
                # tf.reshape(self.duration_ms_seed_pl_normalization(data["duration_ms_seed_pl"]), (-1, 1))      # Normalize or Discretize?
                self.n_songs_pl_embedding(data["n_songs_pl"]),
                # tf.reshape(self.n_songs_pl_normalization(data["n_songs_pl"]), (-1, 1))                        # Normalize or Discretize?
                self.n_artists_pl_embedding(data['num_artists_pl']),
                # tf.reshape(self.n_artists_pl_normalization(data["num_artists_pl"]), (-1, 1))                  # Normalize or Discretize?
                self.n_albums_pl_embedding(data["num_albums_pl"]),
                # tf.reshape(self.n_albums_pl_normalization(data["num_albums_pl"]), (-1, 1))                    # Normalize or Discretize?
                
                # sequence features
                # data["pos_pl"],
                self.artist_name_pl_embedding(data["artist_name_pl"]),
                self.track_uri_pl_embedding(data["track_uri_pl"]),
                self.track_name_pl_embedding(data["track_name_pl"]),
                self.duration_ms_songs_pl_embedding(data["duration_ms_songs_pl"]),
                self.album_name_pl_embedding(data["album_name_pl"]),
                self.artist_pop_pl_embedding(data["artist_pop_pl"]),
                self.artists_followers_pl_embedding(data["artists_followers_pl"]),
                self.track_pop_pl_embedding(data["track_pop_pl"]),
                self.artist_genres_pl_embedding(data["artist_genres_pl"]),
            ], axis=1)
        
        # Build Cross Network
        if self._cross_layer is not None:
            cross_embs = self._cross_layer(all_embs)
            return self.dense_layers(cross_embs)
        else:
            return self.dense_layers(all_embs)

In [None]:
layer_sizes=[64,32]

test_query_model = QueryModel(layer_sizes,vocab_dict_load)

pl_result = test_query_model(test_instance)

print(f"Shape of pl_result: {pl_result.shape}")
pl_result

## Candidate Tower

In [144]:
for x in parsed_candidate_ds.take(1).as_numpy_iterator():
    pprint(x['artist_genres'])

array([b"'compositional ambient', 'neo-classical'"], dtype=object)


In [145]:
can_test_instance = {
    'artist_name': np.asarray([b'Fabrizio Paterlini']),
    'track_name': np.asarray([b"Controvento, senz'olio"]),
    'album_name' : np.asarray([b'Fragments Found']),
    'track_uri' : np.asarray([b'spotify:track:78KGo61bPbtgSqFnIuhQbZ']),
    'artist_uri' : np.asarray([b'spotify:artist:0jrFMgW018F1XVnLtCXOKi']),
    'album_uri' : np.asarray([b'spotify:album:39LWVcWN0HG7SF27UiuW9b']),
    'duration_ms' : np.asarray([222747]),
    'track_pop' : np.asarray([28]),
    'artist_pop' : np.asarray([69.0]),
    'artist_followers' : np.asarray([90784.0]),
    'artist_genres' : np.asarray([b"'compositional ambient', 'neo-classical'"]),
}
pprint(can_test_instance)

{'album_name': array([b'Fragments Found'], dtype='|S15'),
 'album_uri': array([b'spotify:album:39LWVcWN0HG7SF27UiuW9b'], dtype='|S36'),
 'artist_followers': array([90784.]),
 'artist_genres': array([b"'compositional ambient', 'neo-classical'"], dtype='|S40'),
 'artist_name': array([b'Fabrizio Paterlini'], dtype='|S18'),
 'artist_pop': array([69.]),
 'artist_uri': array([b'spotify:artist:0jrFMgW018F1XVnLtCXOKi'], dtype='|S37'),
 'duration_ms': array([222747]),
 'track_name': array([b"Controvento, senz'olio"], dtype='|S22'),
 'track_pop': array([28]),
 'track_uri': array([b'spotify:track:78KGo61bPbtgSqFnIuhQbZ'], dtype='|S36')}


In [146]:
EMBEDDING_DIM = 32
PROJECTION_DIM = 5
SEED = 1234
USE_CROSS_LAYER=True
DROPOUT='False'
DROPOUT_RATE='0.33'

class CandidateModel(tf.keras.Model):
    def __init__(self, layer_sizes, vocab_dict):
        super().__init__()
        
        # ========================================
        # Candidate features
        # ========================================
        
        # Feature: artist_name_can
        self.artist_name_text_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.TextVectorization(
                    vocabulary=vocab_dict["artist_name_can"],
                    name="artist_name_txt_vectorizer",
                    ngrams=2,
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict["artist_name_can"]),
                    output_dim=EMBEDDING_DIM,
                    mask_zero=False,
                    name="artist_name_emb_layer",
                ),
                tf.keras.layers.GlobalAveragePooling1D(name="artist_name_pooling"),
            ], name="artist_name_emb_model"
        )

        # Feature: track_name_can
        self.track_name_text_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.TextVectorization(
                    vocabulary=vocab_dict["track_name_can"],
                    name="track_name_txt_vectorizer",
                    ngrams=2,
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict["track_name_can"]),
                    output_dim=EMBEDDING_DIM,
                    mask_zero=False,
                    name="track_name_emb_layer",
                ),
                tf.keras.layers.GlobalAveragePooling1D(name="track_name_pooling"),
            ], name="track_name_emb_model"
        )
        
        # Feature: album_name_can
        self.album_name_text_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.TextVectorization(
                    vocabulary=vocab_dict["album_name_can"],
                    name="album_name_txt_vectorizer",
                    ngrams=2,
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict["album_name_can"]),
                    output_dim=EMBEDDING_DIM,
                    mask_zero=False,
                    name="album_name_emb_layer",
                ),
                tf.keras.layers.GlobalAveragePooling1D(name="album_name_pooling"),
            ], name="album_name_emb_model"
        )
        
        # Feature: artist_uri_can
        self.artist_uri_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Hashing(num_bins=200_000),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict['artist_uri_can']), 
                    output_dim=EMBEDDING_DIM,
                    name="artist_uri_emb_layer",
                ),
            ], name="artist_uri_emb_model"
        )
        
        # Feature: track_uri_can
        self.track_uri_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Hashing(num_bins=200_000),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict['track_uri_can']), 
                    output_dim=EMBEDDING_DIM,
                    name="track_uri_emb_layer",
                ),
            ], name="track_uri_emb_model"
        )
        
        # Feature: album_uri_can
        self.album_uri_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.Hashing(num_bins=200_000),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict['album_uri_can']), 
                    output_dim=EMBEDDING_DIM,
                    name="album_uri_emb_layer",
                ),
            ], name="album_uri_emb_model"
        )
        
        # Feature: duration_ms_can
        self.duration_ms_normalized = tf.keras.layers.Normalization(
            mean=vocab_dict['avg_duration_ms_songs_pl'],
            variance=vocab_dict['var_duration_ms_songs_pl'],
            axis=None
        )
        
        # Feature: track_pop_can
        self.track_pop_normalized = tf.keras.layers.Normalization(
            mean=vocab_dict['avg_track_pop'],
            variance=vocab_dict['var_track_pop'],
            axis=None
        )
        
        # Feature: artist_pop_can
        self.artist_pop_normalized = tf.keras.layers.Normalization(
            mean=vocab_dict['avg_artist_pop'],
            variance=vocab_dict['var_artist_pop'],
            axis=None
        )
        
        # Feature: artist_followers_can
        self.artist_followers_normalized = tf.keras.layers.Normalization(
            mean=vocab_dict['avg_artist_followers'],
            variance=vocab_dict['var_artist_followers'],
            axis=None
        )
        
        # Feature: artist_genres_can
        self.artist_genres_text_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.TextVectorization(
                    vocabulary=vocab_dict["artist_genres_can"],
                    name="artist_genres_txt_vectorizer",
                    ngrams=2,
                ),
                tf.keras.layers.Embedding(
                    input_dim=len(vocab_dict["artist_genres_can"]),
                    output_dim=EMBEDDING_DIM,
                    mask_zero=False,
                    name="artist_genres_emb_layer",
                ),
                tf.keras.layers.GlobalAveragePooling1D(name="artist_genres_pooling"),
            ], name="artist_genres_emb_model"
        )
        # ========================================
        # Dense & Cross Layers
        # ========================================
        
        # Cross Layers
        
        # Dense Layer
        self.dense_layers = tf.keras.Sequential(name="candidate_dense_layers")
        initializer = tf.keras.initializers.GlorotUniform(seed=SEED)
        
        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(
                tf.keras.layers.Dense(
                    layer_size, 
                    activation="relu", 
                    kernel_initializer=initializer,
                )
            )
            if DROPOUT:
                self.dense_layers.add(tf.keras.layers.Dropout(DROPOUT_RATE))
                
        # No activation for the last layer
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(
                tf.keras.layers.Dense(
                    layer_size, 
                    kernel_initializer=initializer
                )
            )
            
    # ========================================
    # Call Function
    # ========================================
            
    def call(self, data):
        
        all_embs = tf.concat(
            [
                self.artist_name_text_embedding(data['artist_name']),
                self.track_name_text_embedding(data['track_name']),
                self.album_name_text_embedding(data['album_name']),
                self.artist_uri_embedding(data['artist_uri']),
                self.track_uri_embedding(data['track_uri']),
                self.album_uri_embedding(data['album_uri']),
                tf.reshape(self.duration_ms_normalized(data["duration_ms"]), (-1, 1)),
                tf.reshape(self.track_pop_normalized(data["track_pop"]), (-1, 1)),
                tf.reshape(self.artist_pop_normalized(data["artist_pop"]), (-1, 1)),
                tf.reshape(self.artist_followers_normalized(data["artist_followers"]), (-1, 1)),
                self.artist_genres_text_embedding(data['artist_genres']),
            ], axis=1
        )
        
        return self.dense_layers(all_embs)

In [147]:
layer_sizes=[64,32]

test_can_track_model = CandidateModel(layer_sizes, vocab_dict_load)

can_result = test_can_track_model(can_test_instance)

print(f"Shape of can_result: {can_result.shape}")
can_result

Shape of can_result: (1, 32)


<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[ 0.09066525, -0.0329502 , -0.15642835,  0.14253873, -0.22031482,
        -0.20585833,  0.00237853,  0.09689148, -0.23827414,  0.07929395,
         0.1594544 , -0.3280748 , -0.05949891,  0.323694  ,  0.0431367 ,
         0.51964384,  0.00217604, -0.19773087,  0.66890687, -0.25335404,
         0.13731247, -0.22903056,  0.00915089,  0.12287275,  0.15709822,
        -0.08462201,  0.33281654, -0.3906095 ,  0.21985911, -0.26035348,
        -0.21582597,  0.17136298]], dtype=float32)>

In [148]:
test_can_track_model.summary(expand_nested=True)

Model: "candidate_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 artist_name_emb_model (Sequ  (None, 32)               9206720   
 ential)                                                         
|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|
| artist_name_txt_vectorizer   (None, None)           0         |
| (TextVectorization)                                           |
|                                                               |
| artist_name_emb_layer (Embe  (None, None, 32)       9206720   |
| dding)                                                        |
|                                                               |
| artist_name_pooling (Global  (None, 32)             0         |
| AveragePooling1D)                                             |
¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
 track_name_emb_model (Seque  (None, 32)           

In [142]:
for raw_record in raw_train_ds.take(5):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)

features {
  feature {
    key: "album_name_can"
    value {
      bytes_list {
        value: "Glee: The Music, The Complete Season Three"
      }
    }
  }
  feature {
    key: "album_name_pl"
    value {
      bytes_list {
        value: "Glee: The Music, The Complete Season One"
        value: "Glee: The Music, The Complete Season Three"
        value: "Glee: The Music, The Complete Season Three"
        value: "Glee: The Music, The Complete Season Three"
        value: "Glee: The Music, The Complete Season Two"
        value: "Glee: The Music, The Complete Season Four"
        value: "Glee: The Music, The Complete Season Two"
        value: "Glee: The Music, The Complete Season Two"
        value: "Glee: The Music, The Complete Season Two"
        value: "Glee: The Music presents Glease"
        value: "Glee: The Music, The Complete Season Four"
        value: "Glee: The Music, The Complete Season Four"
        value: "Glee: The Music, The Complete Season Four"
        value: "Gle