# Two-Tower Retrieval Model

### Key resources:
* Many pages [here] include great techniques to 

In [1]:
from google.cloud import storage

# Collect a gs:// URI for every object in the TFRecord bucket.
# The URI is assembled from the bucket and object names rather than by
# rewriting `blob.public_url`: the public URL percent-encodes the object name,
# so the rewritten string would not be a valid gs:// path for names that
# contain characters needing escaping.
client = storage.Client()
files = [
    f"gs://{blob.bucket.name}/{blob.name}"
    for blob in client.list_blobs('tfrs-tf-records')
]

# Quick sanity check of what was found.
print(files[:2])

['gs://tfrs-tf-records/file_00-12227.tfrec', 'gs://tfrs-tf-records/file_01-12228.tfrec']


In [2]:
import json
import tensorflow as tf
import tensorflow_recommenders as tfrs


def parse_tfrecord_fn(example):
    """Decode one serialized example into a dict of fixed-length tensors.

    Args:
        example: One serialized record, as yielded by the TFRecord dataset
            this function is mapped over.

    Returns:
        The dict produced by ``tf.io.parse_single_example``, keyed by the
        feature names declared below. Every feature has length 1 except
        ``visual``, which is a 2048-element float vector.
    """
    # The three feature specs used by this schema.
    one_string = tf.io.FixedLenFeature([1], tf.string)
    one_float = tf.io.FixedLenFeature([1], tf.float32)
    visual_vector = tf.io.FixedLenFeature([2048], tf.float32)

    schema = {
        # Query-side features.
        "query": one_string,
        "last_viewed": one_string,

        # Candidate (product) features.
        "IVM_s": one_string,
        "description": one_string,
        "price_td": one_float,
        "PriceRange_s": one_string,
        "productTypeCombo_ss": one_string,
        "visual": visual_vector,

        # Time-of-event features, stored as strings.
        "month": one_string,
        "hour": one_string,
    }
    return tf.io.parse_single_example(example, schema)


# Width of each learned embedding vector.
EMBEDDING_DIM = 32
# Number of rows allocated in each text embedding table.
MAX_TOKENS = 1_000_000


# Bucket that holds the TFRecord shards. Not referenced by the dataset below,
# which reads the explicit `files` list instead.
TF_RECORDS_DIR = 'gs://tfrs-tf-records'

# Set dev dataset CHANGE THIS LATER TO THE WHOLE DIR
raw_dataset = tf.data.TFRecordDataset(files)

# AUTOTUNE is the named constant for the previous bare -1: it lets tf.data
# pick the degree of parallelism used while parsing.
parsed_dataset = raw_dataset.map(
    parse_tfrecord_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)



In [3]:
class QueryModel(tf.keras.Model):
    """Query tower: encodes month, hour, query text and last-viewed text.

    Each of the four features is embedded into ``EMBEDDING_DIM`` values, the
    four embeddings are concatenated, and the result is passed through a stack
    of dense layers. The text vocabularies are learned from the module-level
    ``parsed_dataset`` while the model is being constructed.
    """

    def __init__(self, layer_sizes):
        """Create the feature encoders, adapt the vocabularies, build the MLP.

        Args:
            layer_sizes: Sequence of dense-layer widths. Every layer but the
                last uses ReLU; the last has no activation and is
                L2-regularized, and its width is the output embedding size.
        """
        super().__init__()

        month_vocab = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
        hour_vocab = month_vocab + ["13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "00"]

        # With mask_token=None the lookup has no padding id: index 0 is the
        # out-of-vocabulary bucket. The embeddings therefore must not use
        # mask_zero -- masking the only timestep of an unseen value would make
        # the average pooling divide by zero. The +1 is the row for that
        # out-of-vocabulary bucket.
        self.month_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=month_vocab, mask_token=None, name="month_lookup"),
            tf.keras.layers.Embedding(
                len(month_vocab) + 1, EMBEDDING_DIM, name="month_emb"),
            tf.keras.layers.GlobalAveragePooling1D()
        ])

        self.hour_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=hour_vocab, mask_token=None, name="hour_lookup"),
            tf.keras.layers.Embedding(
                len(hour_vocab) + 1, EMBEDDING_DIM, name="hour_emb"),
            tf.keras.layers.GlobalAveragePooling1D()
        ])

        # Cap each vocabulary at MAX_TOKENS so that no token id can exceed the
        # number of rows in the embedding tables below. Without the cap the
        # adapted unigram+bigram vocabulary is unbounded.
        self.query_vectorizor = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=MAX_TOKENS, ngrams=2, name="query_tv")

        self.last_viewed_vectorizor = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=MAX_TOKENS, ngrams=2, name="last_viewed_tv")

        # TextVectorization pads with id 0, so mask_zero is appropriate here.
        # NOTE(review): an input that yields no tokens at all is fully masked;
        # confirm the pooled output is acceptable for empty strings.
        self.query_embedding = tf.keras.Sequential([
            self.query_vectorizor,
            tf.keras.layers.Embedding(MAX_TOKENS, EMBEDDING_DIM, mask_zero=True, name="query_emb"),
            tf.keras.layers.GlobalAveragePooling1D()
        ])

        self.last_viewed_embedding = tf.keras.Sequential([
            self.last_viewed_vectorizor,
            tf.keras.layers.Embedding(MAX_TOKENS, EMBEDDING_DIM, mask_zero=True, name="last_v_emb"),
            tf.keras.layers.GlobalAveragePooling1D()
        ])

        # Learn the text vocabularies from the parsed training records.
        self.query_vectorizor.adapt(parsed_dataset.map(lambda x: x['query']))
        self.last_viewed_vectorizor.adapt(parsed_dataset.map(lambda x: x['last_viewed']))

        # Then construct the layers.
        self.dense_layers = tf.keras.Sequential()

        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))
        # No activation for the last layer. Slicing (rather than indexing)
        # keeps an empty `layer_sizes` from raising.
        for layer_size in layer_sizes[-1:]:
            # l2 reg was in research - helps performance
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, kernel_regularizer='l2'))

    def call(self, data):
        """Embed a batch of query features.

        Args:
            data: Mapping with the keys ``'month'``, ``'hour'``, ``'query'``
                and ``'last_viewed'``.

        Returns:
            The output of the dense stack applied to the four concatenated
            feature embeddings.
        """
        all_embs = tf.concat(
            [
                self.month_embedding(data['month']),
                self.hour_embedding(data['hour']),
                self.query_embedding(data['query']),
                self.last_viewed_embedding(data['last_viewed']),
            ], axis=1)
        return self.dense_layers(all_embs)

In [4]:
# test_qm = QueryModel([256, 128, 64, 32])

In [5]:
# #validate batch output

# # validate output
# #should roll out a EMB_DIM * 4 (for the four features in the query)
# batches = last_viewed_adapt_data.batch(128)
# qm_emb = batches.map(lambda x: test_qm(x))

# for line in qm_emb.take(1):
#     print(line)

In [6]:
class ProductModel(tf.keras.Model):
    """Candidate tower: encodes price, price range, text fields and visuals.

    The normalized price, four learned embeddings (price range, description,
    SKU, product hierarchy) and the raw ``visual`` vector are concatenated and
    passed through a stack of dense layers. Vocabularies and the price
    statistics are learned from the module-level ``parsed_dataset`` while the
    model is being constructed.
    """

    def __init__(self, layer_sizes):
        """Create the feature encoders, build the MLP, adapt the preprocessors.

        Args:
            layer_sizes: Sequence of dense-layer widths. Every layer but the
                last uses ReLU; the last has no activation and is
                L2-regularized, and its width is the output embedding size.
        """
        super().__init__()

        # continuous example - the layer below standardizes the raw price
        self.price_normalizer = tf.keras.layers.experimental.preprocessing.Normalization(name="price_norm")

        # categorical with vocabs
        pr_vocab = ['002_$100 - $299', '000_$0 - $49', '001_$50 - $99',
                    '003_$300 - $599', '005_$1000 - $3999', '004_$600 - $999',
                    '006_$4000+', '']

        # With mask_token=None index 0 is the out-of-vocabulary bucket, not
        # padding, so the embedding must not use mask_zero: masking the only
        # timestep of an unseen value would make the average pooling divide
        # by zero. The +1 is the row for that out-of-vocabulary bucket.
        self.price_range_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=pr_vocab, mask_token=None, name="price_range_lu"),
            tf.keras.layers.Embedding(
                len(pr_vocab) + 1, EMBEDDING_DIM, name="price_range_emb"),
            tf.keras.layers.GlobalAveragePooling1D(name="price_range_flatten")
        ])

        # The text vectorizers below learn their vocabularies via adapt().
        # Each is capped at MAX_TOKENS so that no token id can exceed the
        # number of rows in its embedding table.
        # NOTE(review): an input that yields no tokens at all is fully masked
        # by mask_zero; confirm the pooled output is acceptable for empty
        # strings.

        # categorical: description
        self.description_vectorizor = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=MAX_TOKENS, ngrams=2, name="description_vectorizor")

        self.description_embedding = tf.keras.Sequential([
            self.description_vectorizor,
            tf.keras.layers.Embedding(MAX_TOKENS, EMBEDDING_DIM, mask_zero=True, name="desc_emb"),
            tf.keras.layers.GlobalAveragePooling1D(name="desc_flatten")
        ])

        # categorical: sku
        self.sku_vectorizor = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=MAX_TOKENS, name="sku_vectorizor")

        self.sku_embedding = tf.keras.Sequential([
            self.sku_vectorizor,
            tf.keras.layers.Embedding(MAX_TOKENS, EMBEDDING_DIM, mask_zero=True, name="sku_emb"),
            tf.keras.layers.GlobalAveragePooling1D(name="sku_flat")
        ])

        # product hierarchy: a single string whose levels are separated by "|"

        # The layer's default standardization strips punctuation -- including
        # "|" -- before the split function runs, which would leave nothing to
        # split on. This pattern is the default punctuation set with "|"
        # removed, so the separator survives standardization.
        punctuation_except_pipe = r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{}~\']'

        def standardize_fn(string):
            """Lowercase and strip punctuation, keeping the "|" separator."""
            return tf.strings.regex_replace(
                tf.strings.lower(string), punctuation_except_pipe, "")

        def split_fn(string):
            """Split a hierarchy string into its "|"-separated levels."""
            return tf.strings.split(string, sep="|")

        self.prod_heir_vectorizor = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=MAX_TOKENS, standardize=standardize_fn, ngrams=2,
            split=split_fn, name="heir_vectorizor")

        self.heir_embedding = tf.keras.Sequential([
            self.prod_heir_vectorizor,
            tf.keras.layers.Embedding(MAX_TOKENS, EMBEDDING_DIM, mask_zero=True, name="heir_emb"),
            tf.keras.layers.GlobalAveragePooling1D(name="heir_flat")
        ])

        # Then construct the layers.
        self.dense_layers = tf.keras.Sequential()

        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))
        # No activation for the last layer. Slicing (rather than indexing)
        # keeps an empty `layer_sizes` from raising.
        for layer_size in layer_sizes[-1:]:
            # l2 reg was in research - helps performance
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, kernel_regularizer='l2'))

        # Learn the text vocabularies from the parsed training records.
        self.description_vectorizor.adapt(parsed_dataset.map(lambda x: x['description']))
        self.sku_vectorizor.adapt(parsed_dataset.map(lambda x: x['IVM_s']))
        self.prod_heir_vectorizor.adapt(parsed_dataset.map(lambda x: x['productTypeCombo_ss']))

        # Learn the price statistics used by the normalizer.
        self.price_normalizer.adapt(parsed_dataset.map(lambda x: x['price_td']))

    def call(self, data):
        """Embed a batch of product features.

        Args:
            data: Mapping with the keys ``'price_td'``, ``'PriceRange_s'``,
                ``'description'``, ``'IVM_s'``, ``'productTypeCombo_ss'`` and
                ``'visual'``.

        Returns:
            The output of the dense stack applied to the concatenation of the
            normalized price, the four embeddings and the ``visual`` vector.
        """
        all_embs = tf.concat(
            [
                self.price_normalizer(data['price_td']),
                self.price_range_embedding(data['PriceRange_s']),
                self.description_embedding(data['description']),
                self.sku_embedding(data['IVM_s']),
                self.heir_embedding(data['productTypeCombo_ss']),
                # The visual vector is passed through as-is, with no learned
                # encoder in front of the dense stack.
                data['visual'],
            ], axis=1)
        return self.dense_layers(all_embs)


In [7]:
# test_pm = ProductModel([256, 128, 64, 32])

In [8]:
# # validate output

# batches = catalog_adapt_data.batch(128)
# pm_emb = batches.map(lambda x: test_pm(x))

# for line in pm_emb.take(1):
#     print(line)

In [9]:
# Now combine the two towers and assign the retrieval task and its metrics.

class TheTwoTowers(tfrs.models.Model):
    """Two-tower retrieval model pairing ``QueryModel`` with ``ProductModel``."""

    def __init__(self, layer_sizes, candidates=None):
        """Build both towers and the retrieval task.

        Args:
            layer_sizes: Dense-layer widths, passed unchanged to both towers
                so that their output embeddings have the same size.
            candidates: Optional unbatched dataset of product feature dicts
                used as the candidate set for the top-K metrics. When omitted,
                the module-level ``catalog_adapt_data`` is used, as before.
        """
        super().__init__()
        self.query_model = QueryModel(layer_sizes)
        self.candidate_model = ProductModel(layer_sizes)

        if candidates is None:
            # NOTE(review): `catalog_adapt_data` is not defined anywhere in the
            # visible notebook; unless it is created elsewhere this lookup
            # raises NameError. Pass `candidates` explicitly to avoid relying
            # on the global.
            candidates = catalog_adapt_data

        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                # Candidate embeddings are produced in batches of 128.
                candidates=candidates.batch(128).map(self.candidate_model),
            ),
        )

    def compute_loss(self, data, training=False):
        """Return the retrieval task's result for one batch.

        Args:
            data: Batch of features; the same mapping is fed to both towers,
                each of which reads only its own keys.
            training: Accepted for interface compatibility; not currently
                forwarded to the task.

        Returns:
            The value returned by the retrieval task for the paired query and
            product embeddings.
        """
        query_embeddings = self.query_model(data)
        product_embeddings = self.candidate_model(data)

        # TOGGLE: pass `compute_metrics=not training` to turn off metrics
        # during training and save time.
        return self.task(query_embeddings, product_embeddings)

In [10]:
# test_tt = TheTwoTowers([256, 128, 64, 32])

In [11]:
# Seed TensorFlow's global RNG before any shuffling is set up.
tf.random.set_seed(42)

# Hard-coded record count (was: sum(1 for _ in file_io.FileIO(SMALL_DATASET, 'rb'))).
# CHANGE THIS TO LARGE DATASET WHEN READY
num_records = 4293302

# 95% of the records go to training, the remainder to testing.
train_records = round(num_records * 0.95)
test_records = num_records - train_records

# One fixed shuffle (no reshuffling between iterations), so that take/skip
# carve the same two partitions every time the dataset is iterated.
shuffled = parsed_dataset.shuffle(num_records, seed=42, reshuffle_each_iteration=False)
train = shuffled.take(train_records)
test = shuffled.skip(train_records).take(test_records)

# Training batches are reshuffled on each pass; only the test batches are
# actually cached.
cached_train = train.shuffle(train_records).batch(2048)
cached_test = test.batch(4096).cache()

In [None]:
# Instantiate the two-tower model; both towers share these dense-layer widths.
tower_layer_sizes = [256, 128, 64, 32]
model = TheTwoTowers(tower_layer_sizes)

# Adagrad with a learning rate of 0.1.
adagrad = tf.keras.optimizers.Adagrad(0.1)
model.compile(optimizer=adagrad)

In [None]:
# Print the layer/parameter summary of the compiled model.
# NOTE(review): Keras can only summarize a model that has been built; this
# cell runs before `fit`, so confirm the model is built at this point.
model.summary()

In [None]:
num_epochs = 5
top_100_key = "factorized_top_k/top_100_categorical_accuracy"

# Train silently; validation runs every 5th epoch, i.e. once with 5 epochs.
layer_history = model.fit(
    x=cached_train,
    epochs=num_epochs,
    validation_data=cached_test,
    validation_freq=5,
    verbose=0,
)

# Report the value logged for the final epoch.
accuracy = layer_history.history[top_100_key][-1]
print(f"Top-100 accuracy: {accuracy:.2f}.")

In [None]:
# The submodule is `pyplot`; the previous `pyplt` spelling raised ImportError.
from matplotlib import pyplot as plt

# Per-epoch values recorded by `fit` for the top-100 accuracy metric.
top_100_history = layer_history.history["factorized_top_k/top_100_categorical_accuracy"]

# `epochs` was previously undefined. Derive the x axis from the history length
# so that it always matches the number of recorded points (1-based).
epochs = range(1, len(top_100_history) + 1)

plt.plot(epochs, top_100_history, label="{} - Deep Architecture".format([256, 128, 64, 32]))
plt.title("Accuracy vs epoch")
plt.xlabel("epoch")
plt.ylabel("Top-100 accuracy")
plt.legend()