In [32]:
import os
import time

from absl import app
from absl import flags
from absl import logging

import tensorflow

from google.cloud import aiplatform as vertex_ai
from google.cloud import bigquery

In [100]:
# --- Notebook configuration -------------------------------------------------
# PREFIX was previously assigned 'two-tower' and then immediately overwritten;
# only the effective value is kept.
PREFIX = 'css_retail'
DISPLAY_NAME = f'{PREFIX}-tensorboard'
PROJECT = 'babrams-recai-demo-final'
REGION = 'us-central1'

# Bucket handed to the Vertex AI SDK as its staging location.
# NOTE(review): the original trailing note mentioned
# 'lowes-reccomendation-tensorboard-logs-us-central1' - presumably an earlier
# bucket name; confirm or drop.
STAGING_BUCKET = f'gs://{PROJECT}_vertex_training'
#TENSORBOARD = 'projects/258043323883/locations/us-central1/tensorboards/4236655796332527616' #note really can only get this after gcloud beta ai tensorboards create...
#VERTEX_SA = 'vertex-tb@lowes-reccomendation.iam.gserviceaccount.com'

# initialize vertex sdk
vertex_ai.init(
    project=PROJECT,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

# Pin the BigQuery client to the same project as the Vertex SDK instead of
# relying on whatever default project the ambient credentials resolve to.
client = bigquery.Client(project=PROJECT)

In [102]:
import json
import tensorflow as tf
import tensorflow_recommenders as tfrs
import os
from absl import app
from absl import flags
from absl import logging
from google.cloud import storage

FLAGS = flags.FLAGS

# Notebook cells get re-run. Without allow_override, absl raises
# DuplicateFlagError the second time the same flag name is defined from the
# same module, which would make this cell fail on every re-execution.
_RERUNNABLE = {"allow_override": True}

# Hyper-parameters and I/O locations read elsewhere in the notebook via FLAGS.<NAME>.
flags.DEFINE_float("LR", 0.01, "Learning Rate", **_RERUNNABLE)
flags.DEFINE_integer("EMBEDDING_DIM", 16, "Embedding dimension", **_RERUNNABLE)
flags.DEFINE_integer("MAX_TOKENS", 16, "Max embeddings for query and last_n products", **_RERUNNABLE)
flags.DEFINE_integer("NUM_EPOCHS", 30, "Number of epochs", **_RERUNNABLE)
flags.DEFINE_string("MODEL_DIR", 'model-dirs-lowes', "GCS Bucket to store the model artifact", **_RERUNNABLE)
flags.DEFINE_bool("DROPOUT", False, "Use Dropout - T/F bool type", **_RERUNNABLE)
flags.DEFINE_float("DROPOUT_RATE", 0.4, "Dropout rate only works with DROPOUT=True", **_RERUNNABLE)
flags.DEFINE_integer("N_PRODUCTS", 20000, "number of products considered for embedding", **_RERUNNABLE)
flags.DEFINE_integer("BATCH_SIZE", 1024, "batch size", **_RERUNNABLE)
flags.DEFINE_string("ARCH", '[128,64]', "deep architecture, expressed as a list of ints in string format - will be parsed into list", **_RERUNNABLE)
flags.DEFINE_integer("SEED", 41781897, "random seed", **_RERUNNABLE)
flags.DEFINE_string("TF_RECORDS_DIR", "gs://tfrs-central-a", "source data in tfrecord format gcs location", **_RERUNNABLE)

# absl refuses to return flag values (UnparsedFlagAccessError) until the flags
# have been parsed, and nothing in this notebook calls app.run(). Parse an argv
# that contains only a program name so every flag takes its default; the
# kernel's real sys.argv is not used because it carries Jupyter-specific
# options that absl does not recognise.
if not FLAGS.is_parsed():
    FLAGS(["notebook"])


ModuleNotFoundError: No module named 'tensorflow_recommenders'

In [86]:
# One row per product: the inner query flattens each category hierarchy into a
# '|'-joined path, the outer query collapses all paths of a product into one
# ','-joined string.
product_catalog_sql = """
with inner_q as (
    select 
        cast(id as int) as productId,
        title,
        description,
        product_metadata.exact_price.original_price as price,
        array_to_string(cats.categories, '|') as categories
    from `babrams-recai-demo-final.css_retail.recommendation_ai_data` as rad
    , unnest(category_hierarchies) as cats
) select 
    productId,
    title,
    description,
    price,
    array_to_string(array_agg(categories), ",") as categories
from inner_q 
group by productId, title, description, price
"""

product_catalog_df = client.query(product_catalog_sql).to_dataframe()

# Columns of the catalog frame that are treated as categorical features.
product_categoricals = ['productId', 'title', 'description', 'categories']
# product_catalog_df.dtypes

# Jupyter only renders a cell's final expression; with describe() in the middle
# of the cell its result was computed and silently thrown away.
product_catalog_df.describe()


Unnamed: 0,productId,price
count,29120.0,29120.0
mean,14560.5,28.481775
std,8406.364256,30.624681
min,1.0,0.0083
25%,7280.75,11.27565
50%,14560.5,19.6751
75%,21840.25,34.44
max,29120.0,557.151


In [91]:
# Integer codes for the gender labels this notebook recognises.
genders = {'Male': 0, 'Female': 1}


def gender_map(gender):
    """Return the integer code for `gender`, or -1 when it is not a key of `genders`."""
    return genders.get(gender, -1)
    
# Integer codes for the traffic-source labels this notebook recognises.
traffic_sources = {'Organic': 0, 'Email': 1, 'Search': 2, 'Facebook': 3, 'Display': 4}


def traffic_source_map(traffic_source):
    """Return the integer code for `traffic_source`, or -1 when it is not a key of `traffic_sources`."""
    return traffic_sources.get(traffic_source, -1)
    
# One row per customer; customer_lifetime_days is measured against the time
# the query runs (CURRENT_TIMESTAMP), so it changes between executions.
customer_data_sql = """
select
    id as userId,
    age,
    gender,
    latitude,
    longitude,
    traffic_source,
    TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), created_at, DAY) as customer_lifetime_days
from `babrams-recai-demo-final.css_retail.customers` as customers
"""
customer_data = client.query(customer_data_sql)
customer_data_df = customer_data.to_dataframe()

# Columns of the customer frame that are treated as categorical features.
customer_categoricals = ['userId', 'gender', 'traffic_source']

# Jupyter only renders a cell's final expression; with describe() in the middle
# of the cell its result was computed and silently thrown away.
customer_data_df.describe()

#customer_tf = 
#customer_data_df['gender'] = customer_data_df['gender'].apply(gender_map)
#customer_data_df['traffic_source'] = customer_data_df['traffic_source'].apply(traffic_source_map)
#customer_data_df.describe()
#customer_data_df.dtypes

In [92]:
# One row per purchased product line: each purchase event is expanded over its
# productDetails array; eventTime is converted to Unix milliseconds.
purchase_data_sql = """
select
    cast(userInfo.userId as int) as userId, 
    unix_millis(safe_cast(eventTime as timestamp)) as eventTime,
    productEventDetail.cartId,
    productEventDetail.purchaseTransaction.revenue,
    products.id as productId,
    products.quantity,
    products.displayPrice as price
from `babrams-recai-demo-final.css_retail.purchase_complete` as purchase
, UNNEST(productEventDetail.productDetails) products
"""

purchase_data_df = client.query(purchase_data_sql).to_dataframe()

# Columns of the purchase frame that are treated as categorical features.
purchase_categoricals = ['userId', 'cartId', 'productId']
# purchase_data_df.dtypes

# Jupyter only renders a cell's final expression; with describe() in the middle
# of the cell its result was computed and silently thrown away.
purchase_data_df.describe()

In [89]:
#customers_tf = tensorflow.convert_to_tensor(customer_data_df)
#purchase_tf = tensorflow.convert_to_tensor(purchase_data_df)

class ProductModel(tf.keras.Model):
    """Product-side encoder: embeds product features and projects them through a dense stack.

    The forward pass concatenates a numeric price column, a pooled description
    embedding, a pooled SKU embedding and a pre-computed `visual` feature, then
    feeds the result through `dense_layers`, whose final step L2-normalises the
    output along axis 1.

    Sizes and switches are read from absl `FLAGS` (N_PRODUCTS, EMBEDDING_DIM,
    SEED, DROPOUT, DROPOUT_RATE), so the flags must be parsed before this class
    is instantiated.
    """

    def __init__(self, layer_sizes, adapt_data):
        """Build the embedding and dense layers and adapt the text vocabularies.

        Args:
            layer_sizes: sequence of ints giving the dense layer widths. Every
                layer except the last uses ReLU (optionally followed by
                Dropout); the last one has no activation.
            adapt_data: object exposing `.map(fn)` whose elements can be
                indexed with 'description' and 'IVM_s' (presumably a
                `tf.data.Dataset` of feature dicts - confirm against caller).
                Used only to adapt the two TextVectorization layers.
        """
        super().__init__()

        #categorical with vocabs
        # NOTE: the price-range lookup below is disabled; call() therefore must
        # not reference self.price_range_embedding.
        #pr_vocab = tf.constant(['002_$100 - $299', '000_$0 - $49', '001_$50 - $99', 
        #            '003_$300 - $599', '005_$1000 - $3999', '004_$600 - $999', 
        #            '006_$4000+', ''])

        #self.price_range_embedding = tf.keras.Sequential([
        #    tf.keras.layers.StringLookup(
        #          vocabulary=pr_vocab, mask_token=None, name="price_range_lu", output_mode='count')
        #], name="price_range_embedding")

        # categorical: description - below are all embeddings with unk vocabs - will be adapted
        self.description_vectorizor = tf.keras.layers.TextVectorization(
            max_tokens=FLAGS.N_PRODUCTS
            , name = "description_vectorizor"
        )

        # Tokenise -> embed (index 0 masked) -> average over the token axis.
        self.description_embedding = tf.keras.Sequential(
            [
                self.description_vectorizor,
                tf.keras.layers.Embedding(
                    FLAGS.N_PRODUCTS+1, 
                    FLAGS.EMBEDDING_DIM, 
                    mask_zero=True, 
                    name = "desc_emb"),
                tf.keras.layers.GlobalAveragePooling1D(
                    name="desc_flatten"
                )
            ], 
            name="description_embedding"
        )

        #categorical: sku
        self.sku_vectorizor = tf.keras.layers.TextVectorization(
            max_tokens=FLAGS.N_PRODUCTS, name = "sku_vectorizor")

        self.sku_embedding = tf.keras.Sequential([
            self.sku_vectorizor,
            tf.keras.layers.Embedding(FLAGS.N_PRODUCTS+1, FLAGS.EMBEDDING_DIM, mask_zero=True, name = "sku_emb"),
            tf.keras.layers.GlobalAveragePooling1D(name="sku_flat")
        ], name="sku_embedding")

        ## prod hierarcy (disabled)
        # def split_fn(string):
        #     return tf.strings.split(string, sep="|")

        # self.prod_hier_vectorizor = tf.keras.layers.TextVectorization(
        #     max_tokens=N_HIER, split=split_fn, name = "hier_vectorizor")
        # #54724 - count unique
        # self.hier_embedding = tf.keras.Sequential([
        #     self.prod_hier_vectorizor,
        #     tf.keras.layers.Embedding(N_HIER+1, EMBEDDING_DIM, mask_zero=True, name = "hier_emb"),
        #     tf.keras.layers.GlobalAveragePooling1D(name="hier_flat")
        # ], name="prod_hier")

        # Then construct the layers.
        self.dense_layers = tf.keras.Sequential(name="dense_layers_product")

        # Adding weight initializer (seeded from FLAGS.SEED)
        initializer = tf.keras.initializers.GlorotUniform(seed=FLAGS.SEED)
        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu", kernel_initializer=initializer))
            if FLAGS.DROPOUT:
                self.dense_layers.add(tf.keras.layers.Dropout(FLAGS.DROPOUT_RATE))
        # No activation for the last layer
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, kernel_initializer=initializer))
        ### ADDING L2 NORM AT THE END
        self.dense_layers.add(tf.keras.layers.Lambda(lambda x: tf.nn.l2_normalize(x, 1, epsilon=1e-12, name="normalize_dense")))

        #adapt stuff - learn the vocabularies from the supplied data
        # self.prod_hier_vectorizor.adapt(adapt_data.map(lambda x: x['productTypeCombo_ss']))
        self.description_vectorizor.adapt(adapt_data.map(lambda x: x['description']))
        self.sku_vectorizor.adapt(adapt_data.map(lambda x: x['IVM_s']))

    def call(self, data):
        """Encode a batch of products.

        Args:
            data: mapping that provides the keys 'price_td', 'description',
                'IVM_s' and 'visual'. 'price_td' is reshaped to a single
                column; 'visual' is concatenated as-is (the original note
                mentions a width of 2048 - TODO confirm).

        Returns:
            The output of `dense_layers` applied to the concatenated features;
            its last step L2-normalises along axis 1.
        """
        all_embs = tf.concat(
            [
                tf.reshape(data["price_td"], (-1, 1)),
                # Disabled to match __init__, where price_range_embedding is
                # commented out; referencing it here raised AttributeError on
                # every forward pass.
                # self.price_range_embedding(data['PriceRange_s']),
                self.description_embedding(data['description']),
                self.sku_embedding(data['IVM_s']),
                # self.hier_embedding(data['productTypeCombo_ss']),
                data['visual']
            ], axis=1)
        return self.dense_layers(all_embs)  #last plus for number continuous + 1 if you add other(s) 2048 for visual



<tf.Tensor: shape=(323423, 7), dtype=float64, numpy=
array([[2.99000000e+04, 1.61845640e+12,            nan, ...,
        1.36060000e+04, 1.00000000e+00,            nan],
       [7.83180000e+04, 1.61041771e+12,            nan, ...,
        1.36060000e+04, 1.00000000e+00,            nan],
       [3.06130000e+04, 1.61086903e+12,            nan, ...,
        1.36060000e+04, 1.00000000e+00,            nan],
       ...,
       [1.72260000e+04, 1.64057820e+12,            nan, ...,
        3.69000000e+02, 1.00000000e+00,            nan],
       [1.72260000e+04, 1.64057820e+12,            nan, ...,
        1.87000000e+02, 1.00000000e+00,            nan],
       [1.72260000e+04, 1.64057820e+12,            nan, ...,
        3.69000000e+02, 1.00000000e+00,            nan]])>