## Beam conversion from BigQuery to TFRecords

In this notebook we use Apache Beam (run on Dataflow) to convert BigQuery tables to TFRecords.
The Beam applications can be found in `beam_candidates` (candidate generation) and `beam_training` (training data generation).

`pip install --upgrade 'apache-beam[gcp]'`

#### IMPORTANT - make sure you upgrade Apache Beam (with the GCP extras used by Dataflow) with the above command, then restart the notebook kernel

In [1]:
# One-time setup: uncomment to install/upgrade Apache Beam with the GCP extras,
# then restart the notebook kernel before running the remaining cells.
# !pip install --upgrade 'apache-beam[gcp]'

In [2]:
import tensorflow as tf

In [3]:
# One-time setup: uncomment to create the GCS bucket (location us-central1)
# that the cells below read from.
# ! gsutil mb -l us-central1 gs://spotify-beam-v3

In [3]:
# Shared configuration for the BigQuery -> TFRecords conversion jobs.
PROJECT_ID = "hybrid-vertex"
# Bucket name to use; an earlier value was 'spotify-tfrecords-blog'.
BUCKET_NAME = "spotify-beam-v3"
# Region in which the Dataflow jobs run.
REGION = "us-central1"
# Version tag handed to main-train.py as its last argument.
VERSION = "v7"

### Run the Dataflow app to convert from BQ to TFRecords

Candidate generation can be found in `beam_candidates`
Training and Validation generation can be found in `beam_training`

Usage:

  Candidate generation 
  
  `cd beam_candidates && python3 main.py`
   
  Training generation
  
  `cd beam_training && python3 main-train.py <BQ_table> <gcs data subfolder> <desired partition size MB> <BQ dataset size MB> <version tag>`
  
  
##### Be careful with quotas - running more than two jobs can run into quota issues with defaults

Training data generation takes about 1 hour with 10 workers

In [1]:
# Show the file layout of the candidate-generation Beam app.
!tree beam_candidates

[01;34mbeam_candidates[00m
├── README.md
├── __init__.py
├── [01;34m__pycache__[00m
│   └── __init__.cpython-37.pyc
├── [01;34mbq_to_tfr[00m
│   ├── __init__.py
│   ├── [01;34m__pycache__[00m
│   │   ├── __init__.cpython-37.pyc
│   │   └── candidate_pipeline.cpython-37.pyc
│   └── candidate_pipeline.py
├── main.py
├── requirements.txt
└── setup.py

3 directories, 10 files


In [2]:
# Show the file layout of the training-data Beam app.
!tree beam_training

[01;34mbeam_training[00m
├── README.MD
├── __init__.py
├── [01;34mbeam_training[00m
│   ├── __init__.py
│   ├── [01;34mcreate_tfrecords.egg-info[00m
│   │   ├── SOURCES.txt
│   │   └── requires.txt
│   ├── [01;34mcreate_tfrecords_training.egg-info[00m
│   │   ├── SOURCES.txt
│   │   └── requires.txt
│   ├── main-train.py
│   ├── main-valid.py
│   ├── setup.py
│   └── [01;34mtrain_pipeline[00m
│       ├── __init__.py
│       ├── test.py
│       ├── train_pipe.py
│       └── train_pipe_shape.py
├── main-train.py
├── setup.py
└── [01;34mtrain_pipeline[00m
    ├── __init__.py
    ├── [01;34m__pycache__[00m
    │   ├── __init__.cpython-37.pyc
    │   └── train_pipe.cpython-37.pyc
    ├── train_pipe.py
    └── train_pipe_shape.py

6 directories, 21 files


In [5]:
# Switch into the training app directory so main-train.py can be run from there.
# NOTE(review): this is a relative cd, and the tree listing shows a nested
# beam_training/beam_training folder with its own main-train.py. Re-running this
# cell would descend into that copy, so run it only once per kernel session.
%cd beam_training

/home/jupyter/spotify_mpd_two_tower/beam_training


In [None]:
# Launch the training-data job. Positional arguments, in order:
#   <BQ_table> <gcs data subfolder> <desired partition size MB> <BQ dataset size MB> <version tag>
# $VERSION is substituted by IPython from the Python variable VERSION, so the
# setup cell must have been run in this kernel session.
# NOTE(review): 88_940 reaches the script as the literal string "88_940" —
# confirm main-train.py parses the underscore form.
! python3 main-train.py train_flatten_last_5 train_last_5 100 88_940 $VERSION

In [None]:
# Launch the validation-data job with the same script. Positional arguments, in order:
#   <BQ_table> <gcs data subfolder> <desired partition size MB> <BQ dataset size MB> <version tag>
# $VERSION is substituted by IPython from the Python variable VERSION, so the
# setup cell must have been run in this kernel session.
! python3 main-train.py train_flatten_valid_last_5 valid_last_5 100 920 $VERSION

## Test the output

In [5]:
## testing output
# Sanity-check the candidate TFRecords: list the files in GCS, then parse them
# with the expected schema into `parsed_candidate_dataset`.
from google.cloud import storage

# Expected schema of one serialized candidate example: every feature is a
# scalar (shape=()), either a string or a float32.
candidate_features = {
    'track_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'album_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'track_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'album_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'duration_ms_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'track_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'artist_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'artist_genres_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_followers_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
}

client = storage.Client()

# Files are read from gs://spotify-beam-v3/v3/candidates/
BUCKET = 'spotify-beam-v3'
CANDIDATE_PREFIX = 'v3/candidates/'

# Build the gs:// URIs from the raw object names. `blob.public_url`
# percent-encodes the object name, so rewriting that URL into a gs:// path can
# produce a URI that does not match the real object.
# delimiter="/" restricts the listing to objects directly under the prefix.
candidate_files = [
    f"gs://{BUCKET}/{blob.name}"
    for blob in client.list_blobs(BUCKET, prefix=CANDIDATE_PREFIX, delimiter="/")
]
# Fail loudly here rather than silently building an empty dataset.
assert candidate_files, f"no objects found under gs://{BUCKET}/{CANDIDATE_PREFIX}"

candidate_dataset = tf.data.TFRecordDataset(candidate_files)

def parse_candidate_tfrecord_fn(example):
    """Parse one serialized example into a dict of candidate feature tensors.

    Args:
        example: scalar string tensor holding one serialized record.

    Returns:
        Dict mapping every key of `candidate_features` to its parsed tensor.
    """
    return tf.io.parse_single_example(example, features=candidate_features)

# TODO: THIS NEEDS TO BE FIXED SO THE UNIQUE PRODUCT DATASET HAS THE SAME FIELD NAMES (goes thru the same model)
parsed_candidate_dataset = candidate_dataset.map(parse_candidate_tfrecord_fn)

In [6]:
# Peek at a single batch of two parsed candidate records.
first_candidate_batch = parsed_candidate_dataset.batch(2).take(1)
for x in first_candidate_batch:
    print(x)

{'album_name_can': <tf.Tensor: shape=(2,), dtype=string, numpy=
array([b'The Sound of Everything Rmx',
       b'World Psychedelic Classics 4: Nobody Can Live Forever: The Existential Soul of Tim Maia'],
      dtype=object)>, 'album_uri_can': <tf.Tensor: shape=(2,), dtype=string, numpy=
array([b'spotify:album:4a8tMD6qq6GUuUwNae38VI',
       b'spotify:album:0NxPZv3nWPBMk1o51GfwEY'], dtype=object)>, 'artist_followers_can': <tf.Tensor: shape=(2,), dtype=float32, numpy=array([ 277649., 1363781.], dtype=float32)>, 'artist_genres_can': <tf.Tensor: shape=(2,), dtype=string, numpy=
array([b"'downtempo', 'electronica', 'funk', 'latin alternative', 'nu jazz', 'nu-cumbia', 'trip hop', 'world'",
       b"'brazilian boogie', 'brazilian soul', 'mpb', 'samba'"],
      dtype=object)>, 'artist_name_can': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'Quantic', b'Tim Maia'], dtype=object)>, 'artist_pop_can': <tf.Tensor: shape=(2,), dtype=float32, numpy=array([64., 64.], dtype=float32)>, 'artist_uri

In [11]:
# Number of playlist tracks stored per example; every `*_pl` sequence feature
# below has this fixed length.
MAX_PLAYLIST_LENGTH = 5

# Parsing schema for the training/validation TFRecords.
feats = {
    # Candidate track features (scalars).
    'track_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'album_name_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'track_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'album_uri_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'duration_ms_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'track_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'artist_pop_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'artist_genres_can': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'artist_followers_can': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    # Playlist-level features (scalars).
    'name': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'collaborative': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    'n_songs_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'num_artists_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'num_albums_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    'description_pl': tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    # Per-track playlist sequences, each of length MAX_PLAYLIST_LENGTH.
    # Shapes are written as 1-tuples throughout; `(MAX_PLAYLIST_LENGTH)`
    # without the trailing comma is a bare int, not a tuple.
    'track_name_pl': tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
    'artist_name_pl': tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
    'album_name_pl': tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
    'track_uri_pl': tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
    'duration_ms_songs_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
    'artist_pop_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
    'artists_followers_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
    'track_pop_pl': tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
    'artist_genres_pl': tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
}

In [12]:
## testing output
# Sanity-check the TFRecords written by the training job: list the files in
# GCS, then parse them with the `feats` schema into `valid_parsed`.
from google.cloud import storage

client = storage.Client()

# Files are read from gs://spotify-beam-v3/v6/train_last_5_v2/
# NOTE(review): the variables below are named `valid*`, but the prefix points at
# a *train* output folder — confirm which split this cell is meant to inspect.
train_dir = 'spotify-beam-v3'  # bucket name
train_dir_prefix = 'v6/train_last_5_v2/'

# Build the gs:// URIs from the raw object names. `blob.public_url`
# percent-encodes the object name, so rewriting that URL into a gs:// path can
# produce a URI that does not match the real object.
# delimiter="/" restricts the listing to objects directly under the prefix.
valid_files = [
    f"gs://{train_dir}/{blob.name}"
    for blob in client.list_blobs(train_dir, prefix=train_dir_prefix, delimiter="/")
]
# Fail loudly here rather than silently building an empty dataset.
assert valid_files, f"no objects found under gs://{train_dir}/{train_dir_prefix}"

valid = tf.data.TFRecordDataset(valid_files)

def parse_tfrecord(example):
    """Parse one serialized example into a dict of feature tensors.

    Args:
        example: scalar string tensor holding one serialized record.

    Returns:
        Dict mapping every key of `feats` to its parsed tensor.
    """
    return tf.io.parse_single_example(example, features=feats)

# TODO (same note as on the candidate dataset — confirm it applies here):
# THIS NEEDS TO BE FIXED SO THE UNIQUE PRODUCT DATASET HAS THE SAME FIELD NAMES (goes thru the same model)
valid_parsed = valid.map(parse_tfrecord)

In [13]:
# Peek at a single batch of two parsed records.
first_parsed_batch = valid_parsed.batch(2).take(1)
for x in first_parsed_batch:
    print(x)

{'album_name_can': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'Long Way Home', b'Another Level'], dtype=object)>, 'album_name_pl': <tf.Tensor: shape=(2, 5), dtype=string, numpy=
array([[b'Storyteller', b'Ripcord', b'The Album About Nothing',
        b'So Good', b'Cruel'],
       [b'Bad To The Bone', b'All Saints', b'The Rhythm Of The Night',
        b'Another Level', b'Another Level']], dtype=object)>, 'album_uri_can': <tf.Tensor: shape=(2,), dtype=string, numpy=
array([b'spotify:album:6yhHW85d9Z6D3uyvLZSZxI',
       b'spotify:album:6nUnNpoLKWpb9qxhYiT98S'], dtype=object)>, 'artist_followers_can': <tf.Tensor: shape=(2,), dtype=float32, numpy=array([216415., 641020.], dtype=float32)>, 'artist_genres_can': <tf.Tensor: shape=(2,), dtype=string, numpy=
array([b"'electropop', 'gauze pop'",
       b"'boy band', 'contemporary r&b', 'dance pop', 'hip hop', 'hip pop', 'neo soul', 'new jack swing', 'pop rap', 'quiet storm', 'r&b', 'rap', 'urban contemporary'"],
      dtype=object)>, 'ar