## Beam conversion from BigQuery to TFRecords

In this notebook we use Apache Beam to convert BigQuery tables to TFRecord files.
The Beam applications can be found in `beam_candidates` (candidate generation) and `beam_training` (training and validation data generation).

`pip install --upgrade 'apache-beam[gcp]'`

#### IMPORTANT - make sure you upgrade the Apache Beam SDK (used to run the Dataflow jobs) with the above command, then restart the notebook kernel

In [13]:
# !pip install --upgrade 'apache-beam[gcp]' --user

In [2]:
import os 

# Set the TensorFlow C++ log-level variable before `import tensorflow` so it is
# already in the environment when the library loads ('2' is meant to hide INFO
# and WARNING log lines).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 

import tensorflow as tf

2023-01-03 21:23:20.961872: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Capture the active gcloud project; `!cmd` assignment returns the command's output lines.
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
BUCKET_NAME = f"{PROJECT_ID}-tfrs-retrieval"
# Read the shared notebook configuration file stored in the bucket.
config = !gsutil cat gs://{BUCKET_NAME}/config/notebook_env.py
# `.n` is the captured output joined into one newline-separated string.
print(config.n)
# NOTE(review): exec() runs whatever Python source is stored in that bucket object
# inside this kernel — only use with a bucket whose contents you trust.
exec(config.n)

# setup
# PROJECT_ID = 'hybrid-vertex'
# BUCKET_NAME = 'spotify-beam-v3' # 'spotify-tfrecords-blog' # Set your Bucket name
# REGION = 'us-central1' # Set the region for Dataflow jobs

In [3]:
# TODO - parameterize these?

# BUCKET = 'spotify-data-regimes'
# CANDIDATE_PREFIX = 'jtv1-candidates'

# GCS bucket and object prefixes that hold the generated TFRecords.
# The prefix whose name contains "valid" is the validation split; the values
# below match the prefixes read by the "valid files" / "train files" test cells
# (previously the two prefixes were assigned the wrong way round).
train_dir = 'spotify-data-regimes'
train_dir_prefix = 'jtv8/train_flat_last_5_v8/'

valid_dir = 'spotify-data-regimes'
valid_dir_prefix = 'jtv8/train_flat_valid_last_5_v8/'

# Version tag handed to the Beam jobs via `$VERSION`.
VERSION="jtv10"

In [3]:
# List the contents of the working bucket (long format, including object versions).
! gsutil ls -al gs://$BUCKET_NAME

### Run the Dataflow app to convert from BQ to TFrecords

Candidate generation can be found in `beam_candidates`.
Training and validation data generation can be found in `beam_training`.

Usage:

  Candidate generation 
  
  `cd beam_candidates && python3 main.py`
   
  Training generation
  
  `cd beam_training && python3 main-train.py <BQ_table> <gcs data subfolder> <desired partition size MB> <BQ dataset size MB> <version tag>`
  
  
##### Be careful with quotas - running more than two jobs at the same time can exceed the default quotas

Training data generation takes about 1 hour with 10 workers

In [8]:
# !tree beam_candidates

In [5]:
# Show the directory layout of the training Beam application.
!tree beam_training

[01;34mbeam_training[00m
├── README.MD
├── __init__.py
├── [01;34mbeam_training[00m
├── [01;34mcreate_tfrecords_training.egg-info[00m
│   ├── PKG-INFO
│   ├── SOURCES.txt
│   ├── dependency_links.txt
│   ├── requires.txt
│   └── top_level.txt
├── main-train.py
├── setup.py
└── [01;34mtrain_pipeline[00m
    ├── __init__.py
    ├── [01;34m__pycache__[00m
    │   ├── __init__.cpython-37.pyc
    │   ├── train_pipe.cpython-37.pyc
    │   └── train_pipe_shape.cpython-37.pyc
    └── train_pipe_shape.py

4 directories, 14 files


In [4]:
import os 
# NOTE(review): hard-coded absolute path to the repository root; adjust it for your environment.
os.chdir('/home/jupyter/jw-repo/spotify_mpd_two_tower')
# Display the resulting working directory.
os.getcwd()

'/home/jupyter/jw-repo/spotify_mpd_two_tower'

In [7]:
# Switch into the training app directory (relative path) so `main-train.py` can be run from there.
%cd beam_training

/home/jupyter/jw-repo/spotify_mpd_two_tower/beam_training


### BQ table sizes (MB)
train_bq_mb = 102780
valid_bq_mb = 1040

In [8]:
import time

### Validation set

In [9]:
# Estimated number of TFRecord shards: data size (MB) floor-divided by the
# target shard size (MB).
# NOTE(review): the size variable is named `total_mb_train` even though this
# cell sits under the "Validation set" heading.
total_mb_train = 1_280
target_shard_size_mb = 200
NUM_TF_RECORDS = total_mb_train // target_shard_size_mb
NUM_TF_RECORDS

6

In [7]:
# Wall-clock timing around the validation Beam job.
start_time = time.time()

# Arguments follow the usage shown earlier in this notebook:
# <BQ_table> <gcs data subfolder> <desired partition size MB> <BQ dataset size MB> <version tag>
! python3 main-train.py valid_flat_last_5_v9 valid_v9 200 1_280 $VERSION

end_time = time.time()
# Elapsed time in whole minutes (fraction truncated).
runtime_mins = int((end_time - start_time) / 60)
print(f"total runtime_mins: {runtime_mins}")

### Train set

In [3]:
# Estimated number of TFRecord shards for the training set: data size (MB)
# floor-divided by the target shard size (MB).
total_mb_train = 126_410
target_shard_size_mb = 2000
NUM_TF_RECORDS = total_mb_train // target_shard_size_mb
NUM_TF_RECORDS

63

In [6]:
# Launch the training-set Beam job; arguments are
# <BQ_table> <gcs data subfolder> <desired partition size MB> <BQ dataset size MB> <version tag>.
! python3 main-train.py train_flat_last_5_v9 train_v9 2000 126_410 $VERSION

# Now export the candidates

**TODO:** clean-up section

In [8]:
# NOTE(review): relative path — this resolves only when the current directory is the
# repository root. If the earlier `%cd beam_training` cell was the last directory change,
# this would need to be `../beam_candidates`. Confirm the intended run order.
%cd beam_candidates

/home/jupyter/jw-repo/spotify_mpd_two_tower/beam_candidates


In [5]:
# Launch the candidate-generation Beam job from the current directory.
! python3 main.py

## Test the output

### Candidate files

In [11]:
## testing output
# Parsing spec for the candidate-track TFRecords. Every feature is a scalar
# (shape=()) FixedLenFeature; the pairs below give (feature name, dtype) in the
# order the spec is built.
candidate_features = {
    feature_name: tf.io.FixedLenFeature(dtype=feature_dtype, shape=())
    for feature_name, feature_dtype in (
        ("track_uri_can", tf.string),
        ("track_name_can", tf.string),
        ("artist_uri_can", tf.string),
        ("artist_name_can", tf.string),
        ("album_uri_can", tf.string),
        ("album_name_can", tf.string),
        ("duration_ms_can", tf.float32),
        ("track_pop_can", tf.float32),
        ("artist_pop_can", tf.float32),
        ("artist_genres_can", tf.string),
        ("artist_followers_can", tf.float32),
        # new
        ("track_pl_titles_can", tf.string),
        ("track_danceability_can", tf.float32),
        ("track_energy_can", tf.float32),
        ("track_key_can", tf.string),
        ("track_loudness_can", tf.float32),
        ("track_mode_can", tf.string),
        ("track_speechiness_can", tf.float32),
        ("track_acousticness_can", tf.float32),
        ("track_instrumentalness_can", tf.float32),
        ("track_liveness_can", tf.float32),
        ("track_valence_can", tf.float32),
        ("track_tempo_can", tf.float32),
        ("time_signature_can", tf.string),
    )
}

from google.cloud import storage

# GCS client. NOTE(review): the active code in this cell does not use it; only the
# commented-out bucket listing below does.
client = storage.Client()

# BUCKET = 'spotify-data-regimes'
# CANDIDATE_PREFIX = 'jtv1-candidates'

# candidate_files = []
# for blob in client.list_blobs(f"{BUCKET_NAME}", prefix=f'{CANDIDATE_PREFIX}', delimiter="/"):
#     candidate_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))

# Single candidate shard, addressed by a fixed object path inside BUCKET_NAME.
candidate_files = f'gs://{BUCKET_NAME}/jtv8/candidates/candidates-00000-of-00001.tfrecords'
    
# Dataset of serialized records read from that file.
candidate_dataset = tf.data.TFRecordDataset(candidate_files)

def parse_candidate_tfrecord_fn(example):
    """Decode one serialized candidate record.

    Args:
        example: a single serialized record, as yielded by `candidate_dataset`.

    Returns:
        The result of `tf.io.parse_single_example` using `candidate_features`
        as the feature spec (a mapping keyed by feature name).
    """
    return tf.io.parse_single_example(example, features=candidate_features)

# parsed_candidate_dataset = candidate_dataset.map(parse_candidate_tfrecord_fn, num_parallel_calls=-1)

# Apply the candidate parser to every serialized record in the dataset.
parsed_candidate_dataset = candidate_dataset.map(parse_candidate_tfrecord_fn) ### THIS NEEDS TO BE FIXED SO THE UNIQUE PRODUCT DATASET HAS THE SAME FIELD NAMES (goes thru the same model)

2022-12-09 16:38:22.028445: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-09 16:38:23.740482: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38238 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0


In [12]:
from pprint import pprint

# Sanity check: pretty-print the first batch of two parsed candidate records.
for x in parsed_candidate_dataset.batch(2).take(1):
    pprint(x)

{'album_name_can': <tf.Tensor: shape=(2,), dtype=string, numpy=
array([b'Memories of a Time to Come - Best Of', b'Beyond The Red Mirror'],
      dtype=object)>,
 'album_uri_can': <tf.Tensor: shape=(2,), dtype=string, numpy=
array([b'spotify:album:16dXyNDHXadRzHld2AAtfT',
       b'spotify:album:5bH9q5EMi147oTykVETEZn'], dtype=object)>,
 'artist_followers_can': <tf.Tensor: shape=(2,), dtype=float32, numpy=array([588753., 588753.], dtype=float32)>,
 'artist_genres_can': <tf.Tensor: shape=(2,), dtype=string, numpy=
array([b"'german metal', 'german power metal', 'hard rock', 'melodic metal', 'metal', 'neo classical metal', 'power metal', 'speed metal'",
       b"'german metal', 'german power metal', 'hard rock', 'melodic metal', 'metal', 'neo classical metal', 'power metal', 'speed metal'"],
      dtype=object)>,
 'artist_name_can': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'Blind Guardian', b'Blind Guardian'], dtype=object)>,
 'artist_pop_can': <tf.Tensor: shape=(2,), dtype=float

### valid files

In [13]:
# Length of every per-playlist sequence feature in the spec below.
MAX_PLAYLIST_LENGTH = 5

# Parsing spec for the train/validation TFRecords, assembled from two groups of
# (feature name, dtype) pairs: scalar features (shape=()) followed by
# fixed-length playlist sequence features (shape=(MAX_PLAYLIST_LENGTH,)).
feats = {
    **{
        feature_name: tf.io.FixedLenFeature(dtype=feature_dtype, shape=())
        for feature_name, feature_dtype in (
            # ===================================================
            # candidate track features
            # ===================================================
            ("track_uri_can", tf.string),
            ("track_name_can", tf.string),
            ("artist_uri_can", tf.string),
            ("artist_name_can", tf.string),
            ("album_uri_can", tf.string),
            ("album_name_can", tf.string),
            ("duration_ms_can", tf.float32),
            ("track_pop_can", tf.float32),
            ("artist_pop_can", tf.float32),
            ("artist_genres_can", tf.string),
            ("artist_followers_can", tf.float32),
            ("track_pl_titles_can", tf.string),
            ("track_danceability_can", tf.float32),
            ("track_energy_can", tf.float32),
            ("track_key_can", tf.string),
            ("track_loudness_can", tf.float32),
            ("track_mode_can", tf.string),
            ("track_speechiness_can", tf.float32),
            ("track_acousticness_can", tf.float32),
            ("track_instrumentalness_can", tf.float32),
            ("track_liveness_can", tf.float32),
            ("track_valence_can", tf.float32),
            ("track_tempo_can", tf.float32),
            ("time_signature_can", tf.string),
            # ===================================================
            # summary playlist features
            # ===================================================
            ("pl_name_src", tf.string),
            ("pl_collaborative_src", tf.string),
            # ("num_pl_followers_src", tf.float32),
            ("pl_duration_ms_new", tf.float32),
            ("num_pl_songs_new", tf.float32),  # num_pl_songs_new | n_songs_pl_new
            ("num_pl_artists_new", tf.float32),
            ("num_pl_albums_new", tf.float32),
            # ("avg_track_pop_pl_new", tf.float32),
            # ("avg_artist_pop_pl_new", tf.float32),
            # ("avg_art_followers_pl_new", tf.float32),
        )
    },
    **{
        feature_name: tf.io.FixedLenFeature(
            dtype=feature_dtype, shape=(MAX_PLAYLIST_LENGTH,)
        )
        for feature_name, feature_dtype in (
            # ===================================================
            # playlist sequence features (one value per playlist position)
            # ===================================================
            # bytes / string
            ("track_uri_pl", tf.string),
            ("track_name_pl", tf.string),
            ("artist_uri_pl", tf.string),
            ("artist_name_pl", tf.string),
            ("album_uri_pl", tf.string),
            ("album_name_pl", tf.string),
            ("artist_genres_pl", tf.string),
            ("tracks_playlist_titles_pl", tf.string),
            # Float List
            ("duration_ms_songs_pl", tf.float32),
            ("track_pop_pl", tf.float32),
            ("artist_pop_pl", tf.float32),
            ("artists_followers_pl", tf.float32),
            ("track_danceability_pl", tf.float32),
            ("track_energy_pl", tf.float32),
            ("track_key_pl", tf.string),
            ("track_loudness_pl", tf.float32),
            ("track_mode_pl", tf.string),
            ("track_speechiness_pl", tf.float32),
            ("track_acousticness_pl", tf.float32),
            ("track_instrumentalness_pl", tf.float32),
            ("track_liveness_pl", tf.float32),
            ("track_valence_pl", tf.float32),
            ("track_tempo_pl", tf.float32),
            ("time_signature_pl", tf.string),
        )
    },
}


In [14]:
## testing output
from google.cloud import storage

client = storage.Client()
# # gs://spotify-beam-v3/v3/candidates/*.tfrecords

# NOTE(review): despite the `train_` variable names, this prefix is the one this
# cell reads as the validation TFRecords.
train_dir = 'spotify-data-regimes'
train_dir_prefix = 'jtv8/train_flat_valid_last_5_v8/'

# List the objects directly under the prefix (delimiter="/" stops at "sub-folders")
# and build gs:// URIs from the raw object names. `blob.public_url` percent-encodes
# the object name, so rewriting it into a gs:// path yields a wrong path for any
# name containing characters that need URL escaping.
valid_files = [
    f"gs://{train_dir}/{blob.name}"
    for blob in client.list_blobs(train_dir, prefix=train_dir_prefix, delimiter="/")
]

# Dataset of serialized records read from all listed files.
valid = tf.data.TFRecordDataset(valid_files)

def parse_tfrecord(example):
    """Decode one serialized train/validation record.

    Args:
        example: a single serialized record, as yielded by a `TFRecordDataset`.

    Returns:
        The result of `tf.io.parse_single_example` using `feats` as the
        feature spec (a mapping keyed by feature name).
    """
    return tf.io.parse_single_example(example, features=feats)

# parsed_candidate_dataset = candidate_dataset.map(parse_candidate_tfrecord_fn, num_parallel_calls=-1)

# Apply the parser to every serialized validation record.
valid_parsed = valid.map(parse_tfrecord)

In [4]:
# Display the dataset object itself (no records are iterated here).
valid_parsed

In [3]:
# Sanity check: print the first parsed validation record as a batch of one.
for x in valid_parsed.batch(1).take(1):
    print(x)

### train files

In [17]:
## testing output
from google.cloud import storage

client = storage.Client()

# Bucket and prefix holding the training TFRecords.
train_dir = 'spotify-data-regimes'
train_dir_prefix = 'jtv8/train_flat_last_5_v8/'


# List the objects directly under the prefix (delimiter="/" stops at "sub-folders")
# and build gs:// URIs from the raw object names. `blob.public_url` percent-encodes
# the object name, so rewriting it into a gs:// path yields a wrong path for any
# name containing characters that need URL escaping.
train_files = [
    f"gs://{train_dir}/{blob.name}"
    for blob in client.list_blobs(train_dir, prefix=train_dir_prefix, delimiter="/")
]

# Dataset of serialized records read from all listed files.
train = tf.data.TFRecordDataset(train_files)

# NOTE(review): same name and body as the `parse_tfrecord` defined in the
# "valid files" cell; this redefinition shadows it with identical behavior.
def parse_tfrecord(example):
    """Decode one serialized train/validation record.

    Args:
        example: a single serialized record, as yielded by a `TFRecordDataset`.

    Returns:
        The result of `tf.io.parse_single_example` using `feats` as the
        feature spec (a mapping keyed by feature name).
    """
    return tf.io.parse_single_example(example, features=feats)

# parsed_candidate_dataset = candidate_dataset.map(parse_candidate_tfrecord_fn, num_parallel_calls=-1)

# Apply the parser to every serialized training record.
train_parsed = train.map(parse_tfrecord) ### THIS NEEDS TO BE FIXED SO THE UNIQUE PRODUCT DATASET HAS THE SAME FIELD NAMES (goes thru the same model)

In [2]:
# Display the dataset object itself (no records are iterated here).
train_parsed

In [12]:
# Sanity check: print the first batch of three parsed training records.
for x in train_parsed.batch(3).take(1):
    print(x)

{'album_name_can': <tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'Alex Goot & Friends, Vol. 3', b'Punk Goes Pop, Vol. 7',
       b'B.o.B Presents: The Adventures of Bobby Ray'], dtype=object)>, 'album_name_pl': <tf.Tensor: shape=(3, 6), dtype=string, numpy=
array([[b'Be Not Nobody', b'Let Go', b'Goodbye Lullaby',
        b'The Best Damn Thing', b'Alex Goot & Friends, Vol. 3',
        b'Two Lanes Of Freedom'],
       [b'Starboy', b'HUMBLE.', b'Shake the Lights When You in Trouble',
        b'Punk Goes Pop, Vol. 7', b'Obscure: La Deuxi\xc3\xa8me',
        b'Problem'],
       [b'B.o.B Presents: The Adventures of Bobby Ray',
        b'Full Moon Fever', b"Blowin' Your Mind!",
        b'Camp Rock 2: The Final Jam', b'News Of The World',
        b'The Bodyguard - Original Soundtrack Album']], dtype=object)>, 'album_uri_can': <tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'spotify:album:0GhppB5IFroTmdP5HxQKE0',
       b'spotify:album:0QnLGxrlc61tSudHIZr4Sz',
       b'spotify:album