# Model Development

## Data

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F

In [0]:
def get_sampled_features(labeled_features, sample_size=.25):
    sample = labeled_features.select('customer_id').distinct().sample(sample_size)
    labeled_features.join(sample, 'customer_id')
    
    return labeled_features

In [0]:
labeled_features = spark.read.parquet('gs://h-and-m-tx/features/labels-bq/')

## ALS Model

In [0]:
def als_model(df, n=12):
    indexer = StringIndexer(inputCol="customer_id", outputCol="customer_id_enc") 
    df = indexer.fit(df).transform(df)
    
    # Cast all fields to integers

    df = df.withColumn('customer_id_enc', col('customer_id_enc').cast('int')) \
        .withColumn('article_id', col('article_id').cast('int')) \
        .withColumn('narticles', col('narticles').cast('int'))
    
    (training, test) = df.randomSplit([0.8, 0.2])
    
    als = ALS(maxIter=5, 
          implicitPrefs=True, # Set implicit preferences to True since these are purchases
          userCol='customer_id_enc', 
          itemCol='article_id', 
          ratingCol='narticles', 
          coldStartStrategy='drop')
    model = als.fit(training)
    predictions = model.transform(test)

    reco_at_n = model.recommendForAllUsers(n)
    
    return reco_at_n
    
    


In [0]:
from pyspark.sql.functions import count

# Get only one year data and summarize/count by number of articles purchased per customer
# where(col('t_dat') < '2019-01-01') \

als_features = (
    labeled_features
    .groupBy('customer_id_labeled', 'article_id_labeled')
    .agg(count('*').alias("narticles"))
    #.where(col('narticles') >= 5)
)

print(f'length als_features {als_features.count()}')

length als_features 6802387


In [0]:
# Label encode or transform string customer_id to integers

# from pyspark.ml.feature import StringIndexer

# indexer = StringIndexer(inputCol="customer_id", outputCol="customer_id_enc") 
# tx_all = indexer.fit(tx_all).transform(tx_all)

In [0]:
# Cast all fields to integers

als_features = (
    als_features
    .withColumn('customer_id_labeled', col('customer_id_labeled').cast('int'))
    .withColumn('article_id_labeled', col('customer_id_labeled').cast('int'))
    .withColumn('narticles', col('narticles').cast('int'))
)
# tx_all.printSchema()

In [0]:
(training, test) = als_features.randomSplit([0.8, 0.2])

In [0]:
from pyspark.ml.recommendation import ALS


als = ALS(maxIter=5, 
          implicitPrefs=True, # Set implicit preferences to True since these are purchases
          userCol='customer_id_labeled', 
          itemCol='article_id_labeled', 
          ratingCol='narticles', 
          coldStartStrategy='drop')
model = als.fit(training)

In [0]:
predictions = model.transform(test)
#predictions.show()

### Measure ALS Model

In [0]:
def get_top_n_articles(customer_id_enc, n):
    return tx_all.where(col('customer_id_enc') == customer_id_enc).select('*').limit(n)

In [0]:
users = test.select('customer_id_labeled').distinct()
#reco_at_n = model.recommendForAllUsers(12)
reco_at_n = model.recommendForUserSubset(users, 12)
reco_at_n.createOrReplaceTempView("reco_at_n")
print(f'reco_at_n count: {reco_at_n.count()}')

reco_at_n count: 208452


In [0]:
from pyspark.sql import functions as F

actuals = test.groupBy('customer_id_labeled').agg(F.collect_list('article_id_labeled').alias('actuals'))
print(f'actuals count: {actuals.count()}')

actuals count: 244175


In [0]:
predicted = spark.sql("""
select customer_id_labeled, recommendations.article_id_labeled as predicted
from reco_at_n
""")

print(f'number of predicted: {predicted.count()}')

number of predicted: 208452


In [0]:
actuals.printSchema()
predicted.printSchema()

root
 |-- customer_id_labeled: integer (nullable = true)
 |-- actuals: array (nullable = false)
 |    |-- element: integer (containsNull = false)

root
 |-- customer_id_labeled: integer (nullable = false)
 |-- predicted: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [0]:
predicted_actuals = actuals.join(predicted, actuals.customer_id_labeled ==  predicted.customer_id_labeled, "inner")
#print(f'predicted_actuals count: {predicted_actuals.count()}')

predicted_list = predicted_actuals \
    .select('predicted').rdd.flatMap(lambda x: x).collect()

actuals_list = predicted_actuals \
    .select('actuals').rdd.flatMap(lambda x: x).collect()

In [0]:
display(predicted_actuals.limit(10))

customer_id_labeled,actuals,customer_id_labeled.1,predicted
22,"List(22, 22)",22,"List(202136, 599338, 206526, 1294648, 401006, 9413, 984315, 585922, 320523, 846206, 762945, 589362)"
26,List(26),26,"List(1142105, 541654, 666150, 645169, 591034, 692990, 890773, 1011218, 675747, 750608, 401134, 918826)"
185,"List(185, 185)",185,"List(341231, 195393, 207533, 870189, 112384, 1169022, 655283, 405328, 1275105, 486879, 108895, 427993)"
209,"List(209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209)",209,"List(894215, 1312230, 1037656, 297193, 849452, 900269, 927361, 333062, 912525, 1221515, 70815, 116161)"
210,List(210),210,"List(479183, 737350, 114523, 853737, 854182, 755133, 603615, 936743, 867941, 1221515, 983866, 557547)"
224,List(224),224,"List(1149740, 1099813, 1245394, 78373, 912550, 215153, 7014, 1251751, 561418, 976183, 764358, 339239)"
291,"List(291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291, 291)",291,"List(750608, 332385, 199529, 8794, 488577, 931206, 714143, 387267, 669453, 1127123, 554300, 319937)"
325,List(325),325,"List(760857, 1008948, 635538, 207533, 525628, 1254083, 1035550, 514690, 341231, 1011711, 641271, 383983)"
329,"List(329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329)",329,"List(329, 885175, 158770, 682067, 1039678, 368550, 894215, 138706, 576306, 611897, 36184, 312549)"
353,List(353),353,"List(63494, 1000015, 1147141, 341501, 1064341, 314365, 1207359, 605886, 1262832, 269630, 335662, 750608)"


In [0]:
from pyspark.mllib.evaluation import RankingMetrics

def calculate_metrics(actuals, predict, show=False):
    labels_list = list(zip(predict, actuals))
    labels = spark.sparkContext.parallelize(labels_list)
    
    metrics = RankingMetrics(labels)
    
    mapat12 = round(metrics.meanAveragePrecisionAt(12),6)
    mapat5 = round(metrics.meanAveragePrecisionAt(5),6)
    
    if show == True:
        print(f'Precision at 12 {mapat12}')
        print(f'Precision at 5 {mapat5}')
    
    return mapat12, mapat5

In [0]:
calculate_metrics(actuals_list, predicted_list, show=True)

Precision at 12 9e-06
Precision at 5 5e-06
Out[22]: (9e-06, 5e-06)

## Null Model (top 12 most popular articles)

In [0]:
popular = labeled_features.groupBy('article_id_labeled').agg(count("article_id_labeled").alias("narticles")).sort('narticles', ascending=False).limit(12)
 
#popular.select('article_id_labeled').rdd.flatMap(lambda x: x).collect()

In [0]:
# print(len(actuals_list))
# labels_list = (popular_predict_list, actuals_list)

popular_predict_list = [popular.select('article_id_labeled').rdd.flatMap(lambda x: x).collect()] * len(actuals_list)

calculate_metrics(actuals_list, popular_predict_list, show=True)

# labels_list = list(zip(popular_predict_list, actuals_list))

# labels = spark.sparkContext.parallelize(labels_list)
# metrics = RankingMetrics(labels)
# print(f'Precision at 12 {metrics.meanAveragePrecisionAt(12)}')
# print(f'Precision at 5 {metrics.meanAveragePrecisionAt(5)}')

Precision at 12 0.003401
Precision at 5 0.003721
Out[19]: (0.003401, 0.003721)

### Revenue instead of count of articles

In [0]:
transactions \
    .groupBy('customer_id', 'article_id') \
    .agg(sum('price').alias("revenue")) \
    .show()

In [0]:
transactions_full = transactions.join(articles, transactions.article_id ==  articles.article_id, "inner")
transactions_full.cache()

print(f'number of transactions {transactions_full.count()}')

In [0]:
from pyspark.ml.feature import OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

features = ['product_code', 
          'product_type_no',
          'graphical_appearance_no',
          'colour_group_code',
          'perceived_colour_value_id',
          'perceived_colour_master_id',
          'department_no',
          'garment_group_no']

for feature in features:
    transactions_full = transactions_full.withColumn(feature, col(feature).cast('int'))
    
print(transactions_full.printSchema())

#Convert qualificationIndex and genderIndex
onehot_encoder = OneHotEncoder(inputCols=['product_code', 
                                          'product_type_no',
                                          'graphical_appearance_no',
                                          'colour_group_code',
                                          'perceived_colour_value_id',
                                          'perceived_colour_master_id',
                                          'department_no',
                                          'garment_group_no'], 
                               outputCols=['product_code_vec', 
                                          'product_type_no_vec',
                                          'graphical_appearance_no_vec',
                                          'colour_group_code_vec',
                                          'perceived_colour_value_id_vec',
                                          'perceived_colour_master_id_vec',
                                          'department_no_vec',
                                          'garment_group_no_vec'], )

#Merge multiple columns into a vector column
vector_assembler = VectorAssembler(inputCols=['product_code_vec', 
                                          'product_type_no_vec',
                                          'graphical_appearance_no_vec',
                                          'colour_group_code_vec',
                                          'perceived_colour_value_id_vec',
                                          'perceived_colour_master_id_vec',
                                          'department_no_vec',
                                          'garment_group_no_vec'], 
                                   outputCol='features')
#Create pipeline and pass it to stages
pipeline = Pipeline(stages=[
           onehot_encoder,
           vector_assembler
])
#fit and transform
transactions_feats = pipeline.fit(transactions_full).transform(transactions_full)
transactions_feats.show()

# |-- article_id: string (nullable = true)
#  |-- product_code: string (nullable = true)
#  |-- prod_name: string (nullable = true)
#  |-- product_type_no: string (nullable = true)
#  |-- product_type_name: string (nullable = true)
#  |-- product_group_name: string (nullable = true)
#  |-- graphical_appearance_no: string (nullable = true)
#  |-- graphical_appearance_name: string (nullable = true)
#  |-- colour_group_code: string (nullable = true)
#  |-- colour_group_name: string (nullable = true)
#  |-- perceived_colour_value_id: string (nullable = true)
#  |-- perceived_colour_value_name: string (nullable = true)
#  |-- perceived_colour_master_id: string (nullable = true)
#  |-- perceived_colour_master_name: string (nullable = true)
#  |-- department_no: string (nullable = true)
#  |-- department_name: string (nullable = true)
#  |-- index_code: string (nullable = true)
#  |-- index_name: string (nullable = true)
#  |-- index_group_no: string (nullable = true)
#  |-- index_group_name: string (nullable = true)
#  |-- section_no: string (nullable = true)
#  |-- section_name: string (nullable = true)
#  |-- garment_group_no: string (nullable = true)
#  |-- garment_group_name: string (nullable = true)
#  |-- detail_desc: string (nullable = true)

## Deep Learning

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [0]:
# identify categorical feature columns
# identify label column
label_col = 'label'
cat_features = ('product_group_name','graphical_appearance_name','customer_id','article_id')
cat_features_labeled = [x + "_labeled" for x in cat_features]

In [0]:
labeled_features = spark.read.parquet('gs://h-and-m-tx/features/labels-bq/')
labeled_features = labeled_features.drop(*cat_features)
labeled_features.cache()

Out[3]: DataFrame[label: bigint, id: bigint, product_shr_distinct_customers_last_30_days: double, product_shr_distinct_orders_last_30_days: double, product_shr_distinct_customers_last_90_days: double, product_shr_distinct_orders_last_90_days: double, product_shr_distinct_customers_last_180_days: double, product_shr_distinct_orders_last_180_days: double, product_shr_distinct_customers_last_360_days: double, product_shr_distinct_orders_last_360_days: double, product_shr_distinct_customers_last_720_days: double, product_shr_distinct_orders_last_720_days: double, product_shr_distinct_customers_last_1080_days: double, product_shr_distinct_orders_last_1080_days: double, user_cnt_distinct_order_number_last_30_days: bigint, user_cnt_distinct_order_number_last_90_days: bigint, user_cnt_distinct_order_number_last_180_days: bigint, user_cnt_distinct_order_number_last_360_days: bigint, user_cnt_distinct_order_number_last_720_days: bigint, user_cnt_distinct_order_number_last_1080_days: bigint, user

In [0]:
# fraction to hold for training
train_fraction = 0.6
 
# sample data, stratifying on labels, for training
train = (
  labeled_features
    .sampleBy(label_col, fractions={0: train_fraction, 1: train_fraction})
  )
 
# split remaining data into validation & testing datasets (with same stratification)
valid = (
  labeled_features
    .join(train, on='id', how='leftanti') # not in()
    .sampleBy(label_col, fractions={0:0.5, 1:0.5})
  )
 
test = (
  labeled_features
    .join(train, on='id', how='leftanti') # not in()
    .join(valid, on='id', how='leftanti') # not in()
  )

#print(f'train: {train.count()} valid: {valid.count()} test: {test.count()}')

In [0]:
train.write.parquet('gs://h-and-m-tx/features/train', mode='overwrite')
valid.write.parquet('gs://h-and-m-tx/features/valid', mode='overwrite')
test.write.parquet('gs://h-and-m-tx/features/test', mode='overwrite')

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-112983542186438>[0m in [0;36m<cell line: 1>[0;34m()[0m
[0;32m----> 1[0;31m [0mtrain[0m[0;34m.[0m[0mwrite[0m[0;34m.[0m[0mparquet[0m[0;34m([0m[0;34m'gs://h-and-m-tx/features/train'[0m[0;34m,[0m [0mmode[0m[0;34m=[0m[0;34m'overwrite'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      2[0m [0mvalid[0m[0;34m.[0m[0mwrite[0m[0;34m.[0m[0mparquet[0m[0;34m([0m[0;34m'gs://h-and-m-tx/features/valid'[0m[0;34m,[0m [0mmode[0m[0;34m=[0m[0;34m'overwrite'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m [0mtest[0m[0;34m.[0m[0mwrite[0m[0;34m.[0m[0mparquet[0m[0;34m([0m[0;34m'gs://h-and-m-tx/features/test'[0m[0;34m,[0m [0mmode[0m[0;34m=[0m[0;34m'overwrite'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;31mNameError[0m: name 'train'

## Train Model

In [0]:
train = spark.read.parquet('gs://h-and-m-tx/features/train')
valid = spark.read.parquet('gs://h-and-m-tx/features/valid')
test = spark.read.parquet('gs://h-and-m-tx/features/test')

In [0]:
import pyspark.sql.functions as F
# identify categorical feature columns
cat_features = cat_features_labeled
# capture keys for each of the categorical feature columns
cat_keys={}
for col in cat_features:
    cat_keys[col] = (
        labeled_features
        .selectExpr('{0} as key'.format(col))
        .distinct()
        .orderBy('key')
        .groupBy()
          .agg(F.collect_list('key').alias('keys'))
        .collect()[0]['keys']
    )

# all other columns (except id) are continous features
num_features = labeled_features.drop(*(['id',label_col]+cat_features)).columns

In [0]:
!pip install tensorflow
!pip install petastorm
!pip install hyperopt
!pip install mlflow

Collecting tensorflow
  Downloading tensorflow-2.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
[?25l[K     |                                | 10 kB 18.2 MB/s eta 0:00:33[K     |                                | 20 kB 5.4 MB/s eta 0:01:50[K     |                                | 30 kB 7.6 MB/s eta 0:01:18[K     |                                | 40 kB 3.5 MB/s eta 0:02:47[K     |                                | 51 kB 3.8 MB/s eta 0:02:35[K     |                                | 61 kB 4.5 MB/s eta 0:02:12[K     |                                | 71 kB 3.7 MB/s eta 0:02:40[K     |                                | 81 kB 3.5 MB/s eta 0:02:51[K     |                                | 92 kB 3.8 MB/s eta 0:02:34[K     |                                | 102 kB 4.0 MB/s eta 0:02:27[K     |                                | 112 kB 4.0 MB/s eta 0:02:27[K     |                                | 122 kB 4.0 MB/s eta 0:02:27[K     |                

In [0]:
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants
 
from petastorm.spark import SparkDatasetConverter, make_spark_converter
from petastorm import TransformSpec
 
from hyperopt import hp, fmin, tpe, SparkTrials, STATUS_OK, space_eval
 
import mlflow
from mlflow.tracking import MlflowClient
 
import platform
 
import numpy as np
import pandas as pd
 
import datetime
import os
import requests

  original_result = python_builtin_import(name, globals, locals, fromlist, level)
  from pyarrow import LocalFileSystem




In [0]:
# configure temp cache for petastorm files
spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, 'file:///dbfs/tmp/instacart_wide_deep/pstorm_cache') # the file:// prefix is required by petastorm
 
# persist dataframe data to petastorm cache location
train_pstorm = make_spark_converter(train.repartition(4))  
valid_pstorm = make_spark_converter(valid.repartition(4)) 
test_pstorm = make_spark_converter(test.repartition(4)) 

  self._filesystem = pyarrow.localfs


In [0]:
def get_data_specs(epochs=1, batch_size=128):
    epochs = int(epochs)
    batch_size = int(batch_size)
    # define functions to transform data into req'ed format
    def get_input_fn(dataset_context_manager):
    
        # re-structure a row as ({features}, label)
        def _to_tuple(row): 
            features = {}
            for col in cat_features + num_features:
                features[col] = getattr(row, col)
            return features, getattr(row, label_col)
    
        def fn(): # called by estimator to perform row structure conversion
            return dataset_context_manager.__enter__().map(_to_tuple)
      
        return fn
 
    # access petastorm cache as tensorflow dataset
    train_ds = train_pstorm.make_tf_dataset(batch_size=batch_size)
    valid_ds = valid_pstorm.make_tf_dataset()
  
    # define spec to return transformed data for model training & evaluation
    train_spec = tf.estimator.TrainSpec(
        input_fn=get_input_fn(train_ds), 
        max_steps=int( (train_pstorm.dataset_size * epochs) / batch_size )
    )
    eval_spec = tf.estimator.EvalSpec(
        input_fn=get_input_fn(valid_ds)
    )
  
    return train_spec, eval_spec

In [0]:
# Verify Specs

# retrieve specs
specs = get_data_specs()
 
# retrieve first batch from first (training) spec
next(
  iter(
    specs[0].input_fn().take(1)
    )
  )

Out[11]: ({'product_group_name_labeled': <tf.Tensor: shape=(128,), dtype=int64, numpy=
  array([ 7,  1,  7,  7,  6,  6,  7, 17,  8,  6, 17,  8,  8, 14,  8, 17,  8,
          7,  8, 16,  8, 17,  7, 16, 16,  8,  6,  8,  7,  8, 12,  8, 13,  7,
          8,  8, 17,  8,  8,  7,  8,  8,  7,  8,  7, 16,  8,  8,  6,  6,  7,
          8, 17,  8,  8,  8,  8,  8,  8,  8, 16,  7, 16,  7,  8,  8,  8,  7,
         17,  8,  1,  7,  7,  8,  6,  7,  7,  8,  8,  8,  8,  7, 16,  8,  8,
          8,  8, 16,  1, 19, 17,  7,  7,  8,  8,  8,  8,  8,  8,  8,  7,  8,
          1,  8,  8,  8,  8,  8, 13, 17,  7,  8,  7, 17, 16,  6, 14, 17,  7,
          7,  7,  8,  6,  7,  8,  1,  6,  7])>,
  'graphical_appearance_name_labeled': <tf.Tensor: shape=(128,), dtype=int64, numpy=
  array([ 8,  1, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,  1,  2, 26, 26,
          8, 16,  1, 26,  1,  6, 22, 27, 26, 26, 26, 16, 26, 27, 26, 18,  8,
         16, 26,  7, 26, 26, 22, 26,  1, 27, 11,  1,  1, 26, 27,  1,  5,  8,
         2

## Define the Model

In [0]:
def get_wide_features():
    wide_columns = []

    wide_columns += [
    tf.feature_column.categorical_column_with_hash_bucket(
       key='customer_id_labeled', 
       hash_bucket_size=1000,
       dtype=tf.dtypes.int64 # create one bucket for each value from 0 to max
       )]
 
    wide_columns += [
    tf.feature_column.categorical_column_with_hash_bucket(
       key='article_id_labeled', 
       hash_bucket_size=100,
       dtype=tf.dtypes.int64 # create one bucket for each value from 0 to max
       )]
 
    # user-product cross-column (set column spec to ensure presented as int64)
    wide_columns += [
    tf.feature_column.crossed_column(
      [ tf.feature_column.categorical_column_with_identity(key='customer_id_labeled', num_buckets=np.max(cat_keys['customer_id_labeled'])+1),
        tf.feature_column.categorical_column_with_identity(key='customer_id_labeled', num_buckets=np.max(cat_keys['customer_id_labeled'])+1)
        ], 
      hash_bucket_size=1000
      )] 
 
    return wide_columns

In [0]:
def get_deep_features():
  
    deep_columns = []

    # categorical features
    for col in cat_features:

        # don't use user ID or product ID
        if col not in ['customer_id_labeled','article_id_labeled']:

            # base column definition
            col_def = tf.feature_column.categorical_column_with_identity(
                key=col, 
                num_buckets=np.max(cat_keys[col])+1 # create one bucket for each value from 0 to max
            )

            # define embedding on base column def
            deep_columns += [tf.feature_column.embedding_column(
                              col_def, 
                              dimension=int(np.max(cat_keys[col])**0.25)
                              )] 

    # continous features
    for col in num_features:
        deep_columns += [tf.feature_column.numeric_column(col)]  

    return deep_columns

In [0]:
def get_model(hidden_layers, hidden_layer_nodes_initial_count, hidden_layer_nodes_count_decline_rate, dropout_rate):  
  
    # determine hidden_units structure
    hidden_units = [None] * int(hidden_layers)
    for i in range(int(hidden_layers)):
        # decrement the nodes by the decline rate
        hidden_units[i] = int(hidden_layer_nodes_initial_count * (hidden_layer_nodes_count_decline_rate**i))

    # get features
    wide_features = get_wide_features()
    deep_features = get_deep_features()

    # define model
    estimator = tf.estimator.DNNLinearCombinedClassifier(
        linear_feature_columns=wide_features,
        #linear_optimizer=tf.keras.optimizers.Ftrl,
        linear_optimizer=tf.keras.optimizers.legacy.Ftrl,
        dnn_feature_columns=deep_features,
        dnn_hidden_units=hidden_units,
        dnn_dropout=dropout_rate,
        #dnn_optimizer=tf.keras.optimizers.Adagrad
        dnn_optimizer=tf.keras.optimizers.legacy.Adagrad
    )

    return estimator

## Tune the Model

In [0]:
# Adapted from: https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/Recommendation/WideAndDeep/utils/metrics.py
def map_custom_metric(features, labels, predictions):
  
    user_ids = tf.reshape(features['customer_id_labeled'], [-1])
    predictions = predictions['probabilities'][:, 1]

    # sort user IDs 
    sorted_ids = tf.argsort(user_ids)

    # resort values to align with sorted user IDs
    user_ids = tf.gather(user_ids, indices=sorted_ids)
    predictions = tf.gather(predictions, indices=sorted_ids)
    labels = tf.gather(labels, indices=sorted_ids)

    # get unique user IDs in dataset
    _, user_ids_idx, user_ids_items_count = tf.unique_with_counts(
        user_ids, 
        out_idx=tf.int64
    )

    # remove any user duplicates
    pad_length = 30 - tf.reduce_max(user_ids_items_count)
    pad_fn = lambda x: tf.pad(x, [(0, 0), (0, pad_length)])
    preds = tf.RaggedTensor.from_value_rowids(
        predictions, user_ids_idx).to_tensor()
    labels = tf.RaggedTensor.from_value_rowids(
        labels, user_ids_idx).to_tensor()
    labels = tf.argmax(labels, axis=1)

    # calculate average precision at k
    return {
      'map@k': tf.compat.v1.metrics.average_precision_at_k(
          predictions=pad_fn(preds),
          labels=labels,
          k=12,
          name="streaming_map")
        }

In [0]:
def train_and_evaluate_model(hparams):
  
    # retrieve the basic model
    model = get_model(
        hparams['hidden_layers'], 
        hparams['hidden_layer_nodes_initial_count'], 
        hparams['hidden_layer_nodes_count_decline_rate'], 
        hparams['dropout_rate']
    )

    # add map@k metric
    model = tf.estimator.add_metrics(model, map_custom_metric)

    # retrieve data specs
    train_spec, eval_spec = get_data_specs( int(hparams['epochs']), int(hparams['batch_size']))

    # train and evaluate
    results = tf.estimator.train_and_evaluate(model, train_spec, eval_spec)

    # return loss metric
    return results

In [0]:
hparams = {
  'hidden_layers':2,
  'hidden_layer_nodes_initial_count':100,
  'hidden_layer_nodes_count_decline_rate':0.5,
  'dropout_rate':0.25,
  'epochs':1,
  'batch_size':128
  }
 
results_train = train_and_evaluate_model(hparams)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
Instructions for updating:
Use output_signature instead
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [0]:
results_train

In [0]:
model = get_model(
    hparams['hidden_layers'], 
    hparams['hidden_layer_nodes_initial_count'], 
    hparams['hidden_layer_nodes_count_decline_rate'], 
    hparams['dropout_rate']
    )
model = tf.estimator.add_metrics(model, map_custom_metric)
 
train_spec, eval_spec = get_data_specs(hparams['epochs'], hparams['batch_size']) 
 
results = tf.estimator.train_and_evaluate(model, train_spec, eval_spec)

# Borrowed from get_data_specs() (defined above)
# ---------------------------------------------------------
# define functions to transform data into req'ed format
def get_input_fn(dataset_context_manager):
    def _to_tuple(row): # re-structure a row as ({features}, label)
        features = {}
        for col in cat_features + num_features:
            features[col] = getattr(row, col)
        return features, getattr(row, label_col)
 
    def fn(): # called by estimator to perform row structure conversion
        return dataset_context_manager.__enter__().map(_to_tuple)
 
    return fn
# ---------------------------------------------------------
 
# define batch size and number of steps
batch_size = 128
steps = int(test_pstorm.dataset_size/batch_size)
 
# retrieve test data
test_ds = test_pstorm.make_tf_dataset(batch_size=batch_size)
 
# evaulate against test data
results = model.evaluate(get_input_fn(test_ds), steps=steps)



In [0]:
results

Out[20]: {'accuracy': 1.0,
 'accuracy_baseline': 1.0,
 'auc': 0.0,
 'auc_precision_recall': 0.99999994,
 'average_loss': 9.1953734e-05,
 'label/mean': 1.0,
 'loss': 9.1953734e-05,
 'map@k': 0.9999996224215165,
 'precision': 1.0,
 'prediction/mean': 0.9999272,
 'recall': 1.0,
 'global_step': 33301}

In [0]:
predictions = model.predict(get_input_fn(test_ds))

In [0]:
for pred in predictions:
    print(pred)

In [0]:
# results

# prediction = model.predict(specs[0].input_fn().take(1))
# print(prediction)

# get features
wide_features = get_wide_features()
deep_features = get_deep_features()
 
# use features to generate an input specification
feature_spec = tf.feature_column.make_parse_example_spec(
    wide_features + deep_features
    )
 
# make function to apply specification to incoming data
fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
    feature_spec
    )
 
# export the model
saved_model_path = model.export_saved_model(
    export_dir_base='gs://h-and-m-tx/model/model-1',
    serving_input_receiver_fn=fn
    ).decode("utf-8")

In [0]:
results