## SPOTIFY: Create the `tensorflow_io` interface for the playlist and track-audio tables in BigQuery
Best practices from Google are in this blog post

In [1]:
# Notebook-wide settings, grouped by concern so they can be tuned in one place.

# -- BigQuery source --
PROJECT_ID = 'jtotten-project'
# Columns left out when the table DDL is parsed into field/type lists below.
DROP_FIELDS = ['pid', 'track_uri', 'artist_uri', 'album_uri']

# -- input pipeline --
BATCH_SIZE = 256  # default batch size of bq_to_tfdata
SEED = 41781897

# -- model / training settings (presumably consumed by later cells; TODO confirm) --
EMBEDDING_DIM = 64
MAX_TOKENS = 100_000
ARCH = [128, 64]
DROPOUT = False
DROPOUT_RATE = 0.2
NUM_EPOCHS = 1

#### Quick counts on training data



#### Quick counts on the training records for playlists

In [2]:
%%bigquery
-- Row count of the playlists table
SELECT COUNT(1)
FROM jtotten-project.spotify_mpd.playlists

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 1573.85query/s]
Downloading: 100%|██████████| 1/1 [00:00<00:00,  1.29rows/s]


Unnamed: 0,f0_
0,66346428


#### Same with track audio

#### Quick counts (this time track audio) on the training records

In [3]:
%%bigquery
-- Row count of the track_audio table
SELECT COUNT(1)
FROM jtotten-project.spotify_mpd.track_audio

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 1474.79query/s]
Downloading: 100%|██████████| 1/1 [00:00<00:00,  1.36rows/s]


Unnamed: 0,f0_
0,2261490


### Define the tf.data pipeline function that reads from BigQuery

[Great blog post here on it](https://towardsdatascience.com/how-to-read-bigquery-data-from-tensorflow-2-0-efficiently-9234b69165c8)

In [4]:
import tensorflow as tf
from tensorflow.python.framework import dtypes
from tensorflow_io.bigquery import BigQueryClient
from tensorflow_io.bigquery import BigQueryReadSession


def bq_to_tfdata(client, row_restriction, table_id, col_names, col_types, dataset, batch_size=BATCH_SIZE):
    """Stream a BigQuery table into a shuffled, batched ``tf.data`` pipeline.

    The table is read from the project named by the module-level
    ``PROJECT_ID`` using two requested read streams.

    Args:
        client: Object exposing ``read_session(...)``; the notebook passes a
            ``tensorflow_io.bigquery.BigQueryClient``.
        row_restriction: Row filter forwarded to ``read_session``. ``None``
            is sent as ``""`` (the library's default, i.e. no filter).
        table_id: Name of the table to read.
        col_names: Names of the columns to select.
        col_types: TensorFlow dtypes, one per entry of ``col_names``.
        dataset: BigQuery dataset id that contains ``table_id``.
        batch_size: Rows per emitted batch. The shuffle buffer holds
            ``batch_size * 10`` rows.

    Returns:
        The dataset produced by ``parallel_read_rows()`` after shuffling,
        batching and prefetching.
    """
    read_session = client.read_session(
        "projects/" + PROJECT_ID,  # parent resource path
        PROJECT_ID, table_id, dataset,
        col_names, col_types,
        requested_streams=2,
        row_restriction="" if row_restriction is None else row_restriction,
    )
    # A separate name keeps the `dataset` argument (the BigQuery dataset id)
    # from being shadowed by the tf.data object.
    rows = read_session.parallel_read_rows()
    # Prefetch goes last so finished batches are prepared while the consumer
    # is still working on the previous one.
    return rows.shuffle(batch_size * 10).batch(batch_size).prefetch(tf.data.AUTOTUNE)

## Get the song metadata

To get a pipeline working we need each table's metadata along with the table information. The following helper functions convert that metadata into the types `tf` expects.


For each table id, programmatically get
* Column names
* Column types

In [9]:
%%bigquery schema
-- Table-level metadata (including the DDL text) for the two source tables
SELECT *
FROM jtotten-project.spotify_mpd.INFORMATION_SCHEMA.TABLES
WHERE table_name IN ('track_audio', 'playlists');

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 836.85query/s]                          
Downloading: 100%|██████████| 2/2 [00:00<00:00,  2.63rows/s]


In [10]:
schema # one row per table; column names and types are parsed out of the `ddl` column below

Unnamed: 0,table_catalog,table_schema,table_name,table_type,is_insertable_into,is_typed,creation_time,base_table_catalog,base_table_schema,base_table_name,snapshot_time_ms,ddl
0,jtotten-project,spotify_mpd,track_audio,BASE TABLE,YES,NO,2022-04-06 17:46:25.801000+00:00,,,,NaT,CREATE TABLE `jtotten-project.spotify_mpd.trac...
1,jtotten-project,spotify_mpd,playlists,BASE TABLE,YES,NO,2022-04-06 17:03:52.068000+00:00,,,,NaT,CREATE TABLE `jtotten-project.spotify_mpd.play...


## Helper functions to pull metadata from ddl statements

In [46]:
# Function to convert string type representation to tf data types

def conv_dtype_to_tf(dtype_str):
    """Translate a BigQuery column type name into a TensorFlow dtype.

    'FLOAT64' maps to float64 and 'INT64' to int64; every other value
    falls back to string.
    """
    numeric_types = (
        ('FLOAT64', dtypes.float64),
        ('INT64', dtypes.int64),
    )
    for bq_name, tf_dtype in numeric_types:
        if dtype_str == bq_name:
            return tf_dtype
    return dtypes.string
        
def get_metadata_from_ddl(ddl, drop_field=None):
    """Extract column names and TensorFlow dtypes from a CREATE TABLE DDL.

    Only indented lines of the DDL are treated as column definitions; the
    first token of such a line is the column name and the second its type.

    Args:
        ddl: Object whose ``.values[0]`` is the DDL text (the notebook passes
            a one-row selection of ``schema.ddl``).
        drop_field: Optional collection of column names to leave out.
            ``None`` (the default) keeps every column.

    Returns:
        ``(fields, types)``: two parallel lists holding the kept column
        names and ``conv_dtype_to_tf`` applied to each column's type token.
    """
    # Treat "no drop list" as "drop nothing" instead of failing on `in None`.
    excluded = () if drop_field is None else drop_field
    fields = []
    types = []
    ddl_text = ddl.values[0]
    for line in ddl_text.splitlines():
        # Only indented lines are considered column definitions.
        if not line[:1].isspace():
            continue
        # Drop commas, then split on any run of whitespace so parsing does
        # not depend on the exact indentation width.
        tokens = line.replace(',', '').split()
        if len(tokens) < 2:
            # Blank or malformed indented line: nothing to record.
            continue
        column_name, column_type = tokens[0], tokens[1]
        if column_name in excluded:
            continue
        fields.append(column_name)
        types.append(conv_dtype_to_tf(column_type))
    return fields, types


# Parse column names and tf dtypes from each table's DDL row, leaving out DROP_FIELDS.
track_audio_fields, track_audio_types = get_metadata_from_ddl(
    schema.loc[schema.table_name == 'track_audio', 'ddl'],
    DROP_FIELDS,
)
playlist_fields, playlist_types = get_metadata_from_ddl(
    schema.loc[schema.table_name == 'playlists', 'ddl'],
    DROP_FIELDS,
)

In [48]:
# Sanity check: show each playlist column next to the tf dtype it was mapped to
for field_name, tf_dtype in zip(playlist_fields, playlist_types):
    print(f"{field_name} : {tf_dtype}")

name : <dtype: 'string'>
collaborative : <dtype: 'string'>
modified_at : <dtype: 'int64'>
num_tracks : <dtype: 'int64'>
num_albums : <dtype: 'int64'>
num_followers : <dtype: 'int64'>
num_edits : <dtype: 'int64'>
num_artists : <dtype: 'int64'>
description : <dtype: 'string'>
pos : <dtype: 'string'>
artist_name : <dtype: 'string'>
track_name : <dtype: 'string'>
album_name : <dtype: 'string'>
duration_ms : <dtype: 'string'>


In [49]:
# Sanity check: show each track_audio column next to the tf dtype it was mapped to
for field_name, tf_dtype in zip(track_audio_fields, track_audio_types):
    print(f"{field_name} : {tf_dtype}")

# NOTE(review): same values as DROP_FIELDS and not referenced in the cells
# shown here — confirm whether this constant is still needed.
DROP_TRACK_AUDIO_FIELDS = ['pid', 'track_uri', 'artist_uri', 'album_uri']

artist_name : <dtype: 'string'>
track_name : <dtype: 'string'>
album_name : <dtype: 'string'>
name : <dtype: 'string'>
danceability : <dtype: 'float64'>
energy : <dtype: 'float64'>
key : <dtype: 'float64'>
loudness : <dtype: 'float64'>
mode : <dtype: 'float64'>
speechiness : <dtype: 'float64'>
acousticness : <dtype: 'float64'>
instrumentalness : <dtype: 'float64'>
liveness : <dtype: 'float64'>
valence : <dtype: 'float64'>
tempo : <dtype: 'float64'>
type : <dtype: 'string'>
id : <dtype: 'string'>
uri : <dtype: 'string'>
track_href : <dtype: 'string'>
analysis_url : <dtype: 'string'>
time_signature : <dtype: 'float64'>
artist_pop : <dtype: 'int64'>
track_pop : <dtype: 'string'>
genres : <dtype: 'string'>
duration_ms : <dtype: 'int64'>


### With the helper functions in place, create the tf.data pipelines from BigQuery

In [50]:
# Track pipeline, one row per batch so a single record can be inspected.
# NOTE(review): the column metadata was parsed from the `track_audio` DDL but
# this reads `track_train` — confirm the two tables share a schema.
track_train_pipeline = bq_to_tfdata(
    BigQueryClient(),
    row_restriction=None,
    table_id='track_train',
    col_names=track_audio_fields,
    col_types=track_audio_types,
    dataset='spotify_mpd',
    batch_size=1,
)

2022-04-15 19:42:09.859341: W tensorflow_io/core/kernels/audio_video_mp3_kernels.cc:271] libmp3lame.so.0 or lame functions are not available
2022-04-15 19:42:09.860020: I tensorflow_io/core/kernels/cpu_check.cc:128] Your CPU supports instructions that this TensorFlow IO binary was not compiled to use: AVX2 FMA
2022-04-15 19:42:09.995418: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-15 19:42:09.995904: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-15 19:42:10.006326: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-15 19:42:10.006779: 

In [51]:
### Validate we are getting records

# Pull a single element; it holds as many rows as the pipeline's batch size.
for batch in track_train_pipeline.take(1):
    print(batch)

2022-04-15 19:42:11.831366: E tensorflow/core/framework/dataset.cc:577] UNIMPLEMENTED: Cannot compute input sources for dataset of type IO>BigQueryDataset, because the dataset does not implement `InputDatasets`.
2022-04-15 19:42:11.831415: E tensorflow/core/framework/dataset.cc:581] UNIMPLEMENTED: Cannot merge options for dataset of type IO>BigQueryDataset, because the dataset does not implement `InputDatasets`.
2022-04-15 19:42:11.831888: E tensorflow/core/framework/dataset.cc:577] UNIMPLEMENTED: Cannot compute input sources for dataset of type IO>BigQueryDataset, because the dataset does not implement `InputDatasets`.
2022-04-15 19:42:11.831924: E tensorflow/core/framework/dataset.cc:581] UNIMPLEMENTED: Cannot merge options for dataset of type IO>BigQueryDataset, because the dataset does not implement `InputDatasets`.


OrderedDict([('acousticness', <tf.Tensor: shape=(1,), dtype=float64, numpy=array([0.58])>), ('album_name', <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Such Sweet Thunder'], dtype=object)>), ('analysis_url', <tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'https://api.spotify.com/v1/audio-analysis/1J8sWejYdYZDAhalagOAUe'],
      dtype=object)>), ('artist_name', <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Duke Ellington'], dtype=object)>), ('artist_pop', <tf.Tensor: shape=(1,), dtype=int64, numpy=array([0])>), ('danceability', <tf.Tensor: shape=(1,), dtype=float64, numpy=array([0.557])>), ('duration_ms', <tf.Tensor: shape=(1,), dtype=int64, numpy=array([247493])>), ('energy', <tf.Tensor: shape=(1,), dtype=float64, numpy=array([0.196])>), ('genres', <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'unknown'], dtype=object)>), ('id', <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'1J8sWejYdYZDAhalagOAUe'], dtype=object)>), ('instrumentalness', <tf.Tensor: sha

In [52]:
## Validate playlist data
playlist_train_pipeline = bq_to_tfdata(
    BigQueryClient(),
    row_restriction=None,
    table_id='playlists',
    col_names=playlist_fields,
    col_types=playlist_types,
    dataset='spotify_mpd',
    batch_size=1,
)

# Pull a single element; it holds as many rows as the pipeline's batch size.
for batch in playlist_train_pipeline.take(1):
    print(batch)

2022-04-15 19:42:14.939738: E tensorflow/core/framework/dataset.cc:577] UNIMPLEMENTED: Cannot compute input sources for dataset of type IO>BigQueryDataset, because the dataset does not implement `InputDatasets`.
2022-04-15 19:42:14.939786: E tensorflow/core/framework/dataset.cc:581] UNIMPLEMENTED: Cannot merge options for dataset of type IO>BigQueryDataset, because the dataset does not implement `InputDatasets`.
2022-04-15 19:42:14.940148: E tensorflow/core/framework/dataset.cc:577] UNIMPLEMENTED: Cannot compute input sources for dataset of type IO>BigQueryDataset, because the dataset does not implement `InputDatasets`.
2022-04-15 19:42:14.940186: E tensorflow/core/framework/dataset.cc:581] UNIMPLEMENTED: Cannot merge options for dataset of type IO>BigQueryDataset, because the dataset does not implement `InputDatasets`.


OrderedDict([('album_name', <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Kuntry'], dtype=object)>), ('artist_name', <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Jawga Boyz'], dtype=object)>), ('collaborative', <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'false'], dtype=object)>), ('description', <tf.Tensor: shape=(1,), dtype=string, numpy=array([b''], dtype=object)>), ('duration_ms', <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'229914'], dtype=object)>), ('modified_at', <tf.Tensor: shape=(1,), dtype=int64, numpy=array([1486857600])>), ('name', <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Country rap'], dtype=object)>), ('num_albums', <tf.Tensor: shape=(1,), dtype=int64, numpy=array([111])>), ('num_artists', <tf.Tensor: shape=(1,), dtype=int64, numpy=array([74])>), ('num_edits', <tf.Tensor: shape=(1,), dtype=int64, numpy=array([51])>), ('num_followers', <tf.Tensor: shape=(1,), dtype=int64, numpy=array([3])>), ('num_tracks', <tf.Tensor: shape=(1,),