In [9]:
import os
import time

from absl import app
from absl import flags
from absl import logging

import numpy as np
import tensorflow

from google.cloud import aiplatform as vertex_ai
from google.cloud import bigquery

In [10]:
PREFIX = 'two-tower' 
PREFIX = 'css_retail'
DISPLAY_NAME = f'{PREFIX}-tensorboard'
PROJECT= 'babrams-recai-demo-final'
REGION='us-central1'

STAGING_BUCKET = """gs://{}_vertex_training""".format(PROJECT) #lowes-reccomendation-tensorboard-logs-us-central1 - this 
#TENSORBOARD = 'projects/258043323883/locations/us-central1/tensorboards/4236655796332527616' #note really can only get this after gcloud beta ai tensorboards create...
#VERTEX_SA = 'vertex-tb@lowes-reccomendation.iam.gserviceaccount.com'

# initialize vertex sdk
vertex_ai.init(
    project=PROJECT,
    location=REGION,
    staging_bucket=STAGING_BUCKET
)

client = bigquery.Client()

In [None]:
!pip install -r notebook_requirements.txt --user

In [11]:
import json
import tensorflow as tf
import tensorflow_recommenders as tfrs
import os
from absl import app
from absl import flags
from absl import logging
from google.cloud import storage

FLAGS = flags.FLAGS
flags.DEFINE_float("LR", 0.01, "Learning Rate")
flags.DEFINE_integer("EMBEDDING_DIM", 16, "Embedding dimension")
flags.DEFINE_integer("MAX_TOKENS", 16, "Max embeddings for query and last_n products")
flags.DEFINE_integer("NUM_EPOCHS", 30, "Number of epochs")
flags.DEFINE_string("MODEL_DIR", 'model-dirs-lowes', "GCS Bucket to store the model artifact")
flags.DEFINE_bool("DROPOUT", False, "Use Dropout - T/F bool type")
flags.DEFINE_float("DROPOUT_RATE", 0.4, "Dropout rate only works with DROPOUT=True")
#flags.DEFINE_integer("N_PRODUCTS", 20000, "number of products considered for embedding")
flags.DEFINE_integer("BATCH_SIZE", 1024, "batch size")
flags.DEFINE_string("ARCH", '[128,64]', "deep architecture, expressed as a list of ints in string format - will be parsed into list")
flags.DEFINE_integer("SEED", 41781897, "random seed")
#flags.DEFINE_string("TF_RECORDS_DIR", "gs://tfrs-central-a", "source data in tfrecord format gcs location")


DuplicateFlagError: The flag 'LR' is defined twice. First from /opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py, Second from /opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py.  Description from first occurrence: Learning Rate

In [15]:
product_catalog_sql = """
with inner_q as (
    select 
        cast(id as int) as productId,
        title,
        description,
        product_metadata.exact_price.original_price as price,
        array_to_string(cats.categories, ' ') as categories
    from `babrams-recai-demo-final.css_retail.recommendation_ai_data` as rad
    , unnest(category_hierarchies) as cats
) select 
    productId,
    title,
    description,
    price,
    array_to_string(array_agg(categories), ",") as categories
from inner_q 
group by productId, title, description, price
"""

product_catalog_df = client.query(product_catalog_sql).to_dataframe()
product_catalog_df.describe()
product_categoricals = ['productId', 'title', 'description', 'categories']
# product_catalog_df.dtypes

class ProductModel(tf.keras.Model):
    def __init__(self, layer_sizes, adapt_data):
        super().__init__()
        
        #preprocess stuff
        self.sku_count = np.unique(np.concatenate(list(adapt_data.map(lambda x: x["productId"]).batch(1000))))
        #categorical: sku
        self.sku_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
            name="sku_monotic"
        )
        self.title_vectorizor = tf.keras.layers.TextVectorization(
            max_tokens=self.sku_count
            , name="title_vectorizor"
        )
        self.description_vectorizor = tf.keras.layers.TextVectorization(
            max_tokens=self.sku_count
            , name="description_vectorizor"
        )
        self.category_vectorizor = tf.keras.layers.TextVectorization(
            max_tokens=FLAGS.N_PRODUCTS
            , name="category_vectorizor"
        )
        
        #adapt stuff
        self.category_vectorizor.adapt(adapt_data.map(lambda x: x['categories']))
        self.title_vectorizor.adapt(adapt_data.map(lambda x: x['title']))
        self.description_vectorizor.adapt(adapt_data.map(lambda x: x['description']))
        self.sku_lookup.adapt(adapt_data.map(lambda x: x['productId']))
        
        #embed stuff
        self.sku_embedding = tf.keras.Sequential([
            self.sku_lookup,
            tf.keras.layers.Embedding(self.sku_count+1, FLAGS.EMBEDDING_DIM, mask_zero=True, name = "sku_emb"),
            tf.keras.layers.GlobalAveragePooling1D(name="sku_flat")
        ], name="sku_embedding")    
        self.title_embedding = tf.keras.Sequential(
            [
                self.title_vectorizor,
                tf.keras.layers.Embedding(
                    self.sku_count+1, 
                    FLAGS.EMBEDDING_DIM, 
                    mask_zero=True, 
                    name="title_emb"),
                tf.keras.layers.GlobalAveragePooling1D(
                    name="title_flatten"
                )
            ], 
            name="title_embedding"
        )
        self.description_embedding = tf.keras.Sequential(
            [
                self.description_vectorizor,
                tf.keras.layers.Embedding(
                    self.sku_count+1, 
                    FLAGS.EMBEDDING_DIM, 
                    mask_zero=True, 
                    name = "desc_emb"),
                tf.keras.layers.GlobalAveragePooling1D(
                    name="desc_flatten"
                )
            ], 
            name="description_embedding"
        )
        self.category_embedding = tf.keras.Sequential(
            [
                self.category_vectorizor,
                tf.keras.layers.Embedding(
                    self.category_vectorizer.vocab_size(), 
                    FLAGS.EMBEDDING_DIM, 
                    mask_zero=True, 
                    name = "category_emb"),
                tf.keras.layers.GlobalAveragePooling1D(
                    name="category_flatten"
                )
            ], 
            name="category_embedding"
        )
        
        
        # Then construct the layers.
        self.dense_layers = tf.keras.Sequential(name="dense_layers_product")
        
        # Adding weight initialzier
        initializer = tf.keras.initializers.GlorotUniform(seed=FLAGS.SEED)
        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu", kernel_initializer=initializer))
            if FLAGS.DROPOUT:
                self.dense_layers.add(tf.keras.layers.Dropout(FLAGS.DROPOUT_RATE))
            # No activation for the last layer
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, kernel_initializer=initializer))
        ### ADDING L2 NORM AT THE END
        self.dense_layers.add(tf.keras.layers.Lambda(lambda x: tf.nn.l2_normalize(x, 1, epsilon=1e-12, name="normalize_dense")))
        
        
    def call(self, data):
        all_embs = tf.concat(
            [
                tf.reshape(data["price"], (-1, 1)),
                self.description_embedding(data['description']),
                self.sku_embedding(data['productId']),
                self.category_embedding(data['categories']),
                self.title_embedding(data['title'])
            ], axis=1)
        return self.dense_layers(all_embs)  #last plus for number continuous + 1 if you add other(s) 2048 for visual




In [14]:
  
customer_data_sql = """
select
    id as userId,
    age,
    gender,
    latitude,
    longitude,
    zip,
    traffic_source,
    TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), created_at, DAY) as customer_lifetime_days
from `babrams-recai-demo-final.css_retail.customers` as customers
"""
customer_data = client.query(customer_data_sql)
customer_data_df = customer_data.to_dataframe()
customer_data_df.describe()

class UserModel(tf.keras.Model):
    def __init__(self, layer_sizes, adapt_data):
        super().__init__()
        
        #preprocess stuff
        self.user_lookup = tf.keras.layers.experimental.preprocessing.StringLookup()
        self.max_age = adapt_data.map(lambda x: x['age']).reduce(tensorflow.cast(0, tensorflow.int64), tensorflow.maximum).numpy().max()
        self.min_age = adapt_data.map(lambda x: x['age']).reduce(np.int64(1e9), tf.minimum).numpy().min()
        self.age_buckets = np.linspace(min_age, max_age, num=20)
        self.max_lifetime = adapt_data.map(lambda x: x['customer_lifetime_days']).reduce(tensorflow.cast(0, tensorflow.int64), tensorflow.maximum).numpy().max()
        self.min_lifetime = adapt_data.map(lambda x: x['customer_lifetime_days']).reduce(np.int64(1e9), tf.minimum).numpy().min()
        self.lifetime_buckets = np.linspace(min_lifetime, max_lifetime, num=100)
        self.traffic_source_lookup = tf.keras.layers.experimental.preprocessing.StringLookup()
        self.zip_lookup = tf.keras.layers.experimental.preprocessing.StringLookup()
        self.gender_lookup = tf.keras.layers.experimental.preprocessing.StringLookup()
        
        #adapt stuff
        self.user_lookup.adapt(adapt_data.map(lambda x: x['productId']))
        self.zip_lookup.adapt(adapt_data.map(lambda x: x['zip']))
        self.gender_lookup.adapt(adapt_data.map(lambda x: x['gender']))
         
        
        #embed stuff
        self.user_embedding = tf.keras.Sequential([
            self.user_lookup,
            tf.keras.layers.Embedding(FLAGS.N_PRODUCTS+1, FLAGS.EMBEDDING_DIM, mask_zero=True, name = "user_emb"),
            tf.keras.layers.GlobalAveragePooling1D(name="user_flat")
        ], name="user_embedding")
        self.age_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.Discretization(age_buckets.tolist()),  
            tf.keras.layers.Embedding(len(age_buckets) + 1, 32)
        ], name="age_embedding")
        self.lifetime_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.Discretization(lifetime_buckets.tolist(), name = "lifetime_disc"),  
            tf.keras.layers.Embedding(len(lifetime_buckets) + 1, 32)
        ], name="customer_lifetime_embedding")
        self.traffic_source_embedding = tf.keras.Sequential([
            self.traffic_source_lookup,
            tf.keras.layers.Embedding(self.traffic_source_lookup.vocab_size()+1, FLAGS.EMBEDDING_DIM, mask_zero=True, name = "traffic_source_emb"),
            tf.keras.layers.GlobalAveragePooling1D(name="traffic_source_flat")
        ], name="traffic_source_embedding")
        self.zip_embedding = tf.keras.Sequential([
            self.zip_lookup,
            tf.keras.layers.Embedding(self.zip_lookup.vocab_size()+1, FLAGS.EMBEDDING_DIM, mask_zero=True, name = "zip_emb"),
            tf.keras.layers.GlobalAveragePooling1D(name="zip_flat")
        ], name="zip_embedding")
        self.gender_embedding = tf.keras.Sequential([
            self.genderlookup,
            tf.keras.layers.Embedding(self.gender_lookup.vocab_size()+1, FLAGS.EMBEDDING_DIM, mask_zero=True, name = "gender_emb"),
            tf.keras.layers.GlobalAveragePooling1D(name="gender_flat")
        ], name="gender_embedding")
        
        # Then construct the layers.
        self.dense_layers = tf.keras.Sequential(name="dense_layers_product")
        
        # Adding weight initialzier
        initializer = tf.keras.initializers.GlorotUniform(seed=FLAGS.SEED)
        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu", kernel_initializer=initializer))
            if FLAGS.DROPOUT:
                self.dense_layers.add(tf.keras.layers.Dropout(FLAGS.DROPOUT_RATE))
            # No activation for the last layer
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, kernel_initializer=initializer))
        ### ADDING L2 NORM AT THE END
        self.dense_layers.add(tf.keras.layers.Lambda(lambda x: tf.nn.l2_normalize(x, 1, epsilon=1e-12, name="normalize_dense")))
        

    def call(self, data):
        all_embs = tf.concat(
            [
                self.user_embedding(data['userId']),
                self.age_embedding(data['age']),
                self.lifetime_embedding(data['customer_lifetime_days']),
                self.traffic_source_embedding(data['traffic_source']),
                self.zip_embedding(data['zip']),
                self.gender_embedding(data['gender'])
            ], axis=1)
        return self.dense_layers(all_embs)  #last plus for number continuous + 1 if you add other(s) 2048 for visual



In [6]:
purchase_data_sql = """
select
    cast(userInfo.userId as int) as userId, 
    unix_millis(safe_cast(eventTime as timestamp)) as eventTime,
    productEventDetail.cartId,
    productEventDetail.purchaseTransaction.revenue,
    products.id as productId,
    products.quantity,
    products.displayPrice as price
from `babrams-recai-demo-final.css_retail.purchase_complete` as purchase
, UNNEST(productEventDetail.productDetails) products
"""

purchase_data_df = client.query(purchase_data_sql).to_dataframe()
purchase_data_df.describe()
purchase_categoricals = ['userId', 'cartId', 'productId']
# purchase_data_df.dtypes

In [None]:
class EventModel(tf.keras.Model):
    def __init__(self, layer_sizes, adapt_data):
        super().__init__()
        
        #month_vocab = tf.constant(["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"], name="month_vocab")
        #hour_vocab = tf.constant(["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
        #    "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "00"], name="hour_vocab")

        #self.month_embedding = tf.keras.Sequential([
        #    tf.keras.layers.StringLookup(
        #        vocabulary=month_vocab, mask_token=None, name="month_lookup", output_mode='count')
        #], name="month")
        
        #self.hour_embedding = tf.keras.Sequential([
        #    tf.keras.layers.StringLookup(
        #        vocabulary=hour_vocab, mask_token=None, name="hour_lookup", output_mode='count')
        #], name="hour")
    
        #self.query_vectorizor = tf.keras.layers.TextVectorization(
        #    max_tokens=FLAGS.MAX_TOKENS, name="query_tv", ngrams=2)
        
        #self.last_viewed_vectorizor = tf.keras.layers.TextVectorization(
        #    max_tokens=FLAGS.MAX_TOKENS, name="last_viewed_tv", ngrams=2)
        
        #self.query_embedding = tf.keras.Sequential([
        #    self.query_vectorizor,
        #    tf.keras.layers.Embedding(FLAGS.MAX_TOKENS+1, FLAGS.EMBEDDING_DIM , mask_zero=True, name="query_emb"),
        #    tf.keras.layers.GlobalAveragePooling1D()
        #], name="query_embedding_model")
        
        #self.last_viewed_embedding = tf.keras.Sequential([
        #    self.last_viewed_vectorizor,
        #    tf.keras.layers.Embedding(FLAGS.MAX_TOKENS+1, FLAGS.EMBEDDING_DIM , mask_zero=True, name="last_v_emb"),
        #    tf.keras.layers.GlobalAveragePooling1D()
        #], name="last_viewed_embedding")
        
        ### adapt stuff
        #self.query_vectorizor.adapt(adapt_data.map(lambda x: x['query']))
        #self.last_viewed_vectorizor.adapt(adapt_data.map(lambda x: x['last_viewed'])) 
        
        # Then construct the layers.
        self.dense_layers = tf.keras.Sequential(name="dense_layers_query")
        
        initializer = tf.keras.initializers.GlorotUniform(seed=FLAGS.SEED)
        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu", kernel_initializer=initializer))
            if FLAGS.DROPOUT:
                self.dense_layers.add(tf.keras.layers.Dropout(FLAGS.DROPOUT_RATE))
        # No activation for the last layer
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, kernel_initializer=initializer))
        ### ADDING L2 NORM AT THE END
        self.dense_layers.add(tf.keras.layers.Lambda(lambda x: tf.nn.l2_normalize(x, 1, epsilon=1e-12, name="normalize_dense")))


    def call(self, data):    
        all_embs = tf.concat(
                [
                    #self.month_embedding(data['month']),
                    #self.hour_embedding(data['hour']),
                    #self.query_embedding(data['query']),
                    #self.last_viewed_embedding(data['last_viewed'])
                ], axis=1)
        return self.dense_layers(all_embs)

In [7]:
#customers_tf = tensorflow.convert_to_tensor(customer_data_df)
#purchase_tf = tensorflow.convert_to_tensor(purchase_data_df)



In [None]:
class TheTwoTowers(tfrs.models.Model):
    def __init__(self, layer_sizes, query_adapt_data, cat_adapt_data):
        super().__init__()
        self.query_model = QueryModel(layer_sizes, query_adapt_data)
        self.candidate_model = ProductModel(layer_sizes, cat_adapt_data)
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=cat_adapt_data.batch(128).cache().map(self.candidate_model),
            )
        )

    def compute_loss(self, data, training=False):
        query_embeddings = self.query_model(data)
        product_embeddings = self.candidate_model(data)

        return self.task(
            query_embeddings, product_embeddings, compute_metrics=not training)#### turn off metrics to save time on training
