# Build a baseline TFRS model

See `./two_tower_src/` for the supporting source code, including the model code.

In [1]:
# Google Cloud project ID and location settings.
PROJECT_ID = "hybrid-vertex"  # <--- TODO: CHANGE THIS
LOCATION = "us-central1"

In [2]:
# !pip install tensorflow-recommenders==0.6.0 --user

In [3]:
import os

# TensorFlow picks these settings up from the process environment, not from
# Python variables, so they must be exported through os.environ. This cell
# runs before TensorFlow is imported, so the values are in place by the time
# the GPU runtime is initialized.
# NOTE(review): 'cuda_malloc_async' needs a TensorFlow/CUDA build that
# supports it -- confirm against the target runtime.
TF_GPU_THREAD_MODE = 'gpu_private'
TF_GPU_ALLOCATOR = 'cuda_malloc_async'

os.environ['TF_GPU_THREAD_MODE'] = TF_GPU_THREAD_MODE
os.environ['TF_GPU_ALLOCATOR'] = TF_GPU_ALLOCATOR

In [None]:
import json
import tensorflow as tf
import tensorflow_recommenders as tfrs

from google.cloud import storage

import numpy as np
import pickle as pkl
from pprint import pprint
import os

# Export the GPU threading settings through the process environment.
os.environ.update({
    'TF_GPU_THREAD_MODE': 'gpu_private',
    'TF_GPU_THREAD_COUNT': '1',
})
    

from two_tower_src import two_tower as tt

In [None]:
import logging

# Turn on memory growth for every GPU that TensorFlow can see, then log how
# many were found. With no GPUs present nothing is configured or logged.
gpus = tf.config.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        logging.info(f'detected: {len(gpus)} GPUs')
except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    logging.info(e)

## Create datasets for local training and validation

### Playlist dataset

In [None]:
batch_size = 40000

train_dir = 'spotify-beam-v3'
train_dir_prefix = 'v6/train_last_5/'

valid_dir = 'spotify-beam-v3'
valid_dir_prefix = 'v6/valid_last_5/'

client = storage.Client()

options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO


def _list_gcs_uris(bucket_name, prefix):
    """Return ``gs://`` URIs for the objects listed under *prefix*.

    URIs are built from the raw ``blob.name`` rather than by rewriting
    ``blob.public_url``, because the public URL percent-encodes the object
    name and would no longer point at objects whose names contain special
    characters. ``delimiter='/'`` is passed through to the listing call so
    the result is limited to the prefix "directory" level.

    Args:
        bucket_name: Name of the GCS bucket to list.
        prefix: Object-name prefix to list under.

    Returns:
        A list of ``gs://bucket/object`` strings.

    Raises:
        ValueError: If the listing returns no objects.
    """
    uris = [
        f'gs://{bucket_name}/{blob.name}'
        for blob in client.list_blobs(bucket_name, prefix=prefix, delimiter='/')
    ]
    if not uris:
        # An empty file list would otherwise surface later as an obscure
        # dtype error when the pipeline is built.
        raise ValueError(f'No objects found under gs://{bucket_name}/{prefix}')
    return uris


def _build_dataset(files):
    """Build the batched, prefetched TFRecord pipeline over *files*.

    Files are read through a parallel, non-deterministic interleave, each
    record is parsed with ``tt.parse_tfrecord``, and the results are batched
    with the module-level ``batch_size``.

    Args:
        files: List of TFRecord file URIs.

    Returns:
        A ``tf.data.Dataset`` of parsed, batched records.
    """
    return (
        tf.data.Dataset.from_tensor_slices(files)
        .prefetch(tf.data.AUTOTUNE)
        .with_options(options)
        .interleave(
            lambda x: tf.data.TFRecordDataset(x),
            cycle_length=tf.data.AUTOTUNE,
            num_parallel_calls=tf.data.AUTOTUNE,
            deterministic=False,
        )
        .map(
            tt.parse_tfrecord,
            num_parallel_calls=tf.data.AUTOTUNE,
        )
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )


# Training and validation use the same pipeline; only the file lists differ.
train_files = _list_gcs_uris(train_dir, train_dir_prefix)
train_dataset = _build_dataset(train_files)

valid_files = _list_gcs_uris(valid_dir, valid_dir_prefix)
valid_dataset = _build_dataset(valid_files)

# Local Training

In [None]:
# Instantiate the two-tower model with the given layer sizes, then compile it
# with an Adagrad optimizer (learning rate 0.01).
layer_sizes = [256, 128]
model = tt.TheTwoTowers(layer_sizes)

model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.01),
)

In [None]:
## Quick look at the layers
# List each layer of the query tower with its position.
print("Playlist (query) Tower:")

for idx, layer in enumerate(model.query_tower.layers):
    print(f"{idx} {layer.name}")

In [None]:
# List each layer of the candidate tower with its position.
print("Track (candidate) Tower:")
for idx, layer in enumerate(model.candidate_tower.layers):
    print(f"{idx} {layer.name}")

### Local training for one epoch

In [None]:
import time

In [None]:
NUM_EPOCHS = 1

# Wall-clock timestamps are taken immediately before and after the fit call.
start_time = time.time()
layer_history = model.fit(
    train_dataset,
    epochs=NUM_EPOCHS,
    validation_data=valid_dataset,
    validation_freq=1,
    # steps_per_epoch=10,
    # callbacks=tensorboard_cb,
    # verbose=0
)
end_time = time.time()

print(f"Training for {NUM_EPOCHS} epoch")

# Report the last recorded value of the top-100 metric from the fit history.
top_100_history = layer_history.history["factorized_top_k/top_100_categorical_accuracy"]
accuracy = top_100_history[-1]
print(f"Top 100 categorical accuracy: {accuracy}")

In [None]:
# Echo the accuracy value as this cell's output.
accuracy