# Data Science Festival x ASOS
## Build and Deploy a Recommender System in 3 Hours.

# Imports

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os

# Import training data

In [2]:
# Training and validation purchases (parquet files hosted in the ASOS dsf2020 repo).
train = pd.read_parquet(
    "https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_train_with_alphanumeric_dummy_ids.parquet"
)
valid = pd.read_parquet(
    "https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_valid_with_alphanumeric_dummy_ids.parquet"
)

# Header-less CSVs, flattened to 1-D arrays: user ids as strings, product ids as ints.
dummy_users = (
    pd.read_csv("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_dummy_users_with_alphanumeric_dummy_ids.csv", header=None)
    .to_numpy()
    .ravel()
    .astype(str)
)
products = (
    pd.read_csv("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_productIds.csv", header=None)
    .to_numpy()
    .ravel()
    .astype(int)
)

# Define a Recommender Model

The embedding layer stores one vector per user and one per product. Each vector starts out as a list of small random numbers and is adjusted during training.

In [3]:
# Toy embedding table: 5 rows (indices 0-4), each an 8-value vector.
embedl = tf.keras.layers.Embedding(input_dim=5, output_dim=8)

In [4]:
# Look up the vector stored at index 2, i.e. row 2 of the layer's weight matrix.
embedl(2)

<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([-0.02031223, -0.00652564, -0.00809907, -0.00073647,  0.010148  ,
        0.03576307,  0.03502304,  0.01398713], dtype=float32)>

In [5]:
# Show the whole weight matrix: 5 rows of 8 values, one row per index.
embedl.get_weights()

[array([[ 0.0433723 ,  0.0340793 ,  0.0307896 , -0.02576753, -0.03491094,
          0.00023193,  0.02533541, -0.00245831],
        [-0.03951852, -0.00593704,  0.0107357 ,  0.01175468, -0.01882541,
          0.04822339,  0.04872878,  0.01896887],
        [-0.02031223, -0.00652564, -0.00809907, -0.00073647,  0.010148  ,
          0.03576307,  0.03502304,  0.01398713],
        [-0.00018657, -0.00194967,  0.00296764,  0.01016546,  0.04235235,
         -0.04009102,  0.01080426, -0.01414054],
        [ 0.02586014,  0.02664882, -0.02230787, -0.04147078,  0.04201274,
         -0.03548731, -0.00069854,  0.02628198]], dtype=float32)]

Scores can be found using the dot product.

In [6]:
# Peek at the array of dummy user ids loaded earlier (strings).
dummy_users

array(['pmfkU4BNZhmtLgJQwJ7x', 'UDRRwOlzlWVbu7H8YCCi',
       'QHGAef0TI6dhn0wTogvW', ..., 'lcORJ5hemOZc1iGo9z7k',
       '5CqDquDAszqJp27P7AL8', 'SSPNYxJMfuKhoe1dg24m'], dtype='<U20')

In [7]:
# One 6-value embedding row for every known user and every known product.
dummy_user_embedding = tf.keras.layers.Embedding(input_dim=len(dummy_users), output_dim=6)
product_embedding = tf.keras.layers.Embedding(input_dim=len(products), output_dim=6)

In [8]:
# Vector for the user at table position 1 (a row number, not a user id string).
dummy_user_embedding(1)

<tf.Tensor: shape=(6,), dtype=float32, numpy=
array([-0.03877491, -0.01857798,  0.03060961, -0.03985148,  0.03403682,
        0.04765603], dtype=float32)>

In [9]:
# Vector for the product at table position 99 (a row number, not a product id).
product_embedding(99)

<tf.Tensor: shape=(6,), dtype=float32, numpy=
array([-0.04018484,  0.01880142,  0.01420439, -0.01192355,  0.04829713,
       -0.00769155], dtype=float32)>

We can look up the embeddings of several products in one call, which is what we need in order to score them together and create a ranking.

In [10]:
# NOTE: these four values are positions in the embedding table, not product ids.
example_product = tf.constant([1,77,104,2026])
# A single call returns one 6-value embedding row per position.
product_embedding(example_product)

<tf.Tensor: shape=(4, 6), dtype=float32, numpy=
array([[ 0.04389915, -0.02098562,  0.0093928 , -0.01212398, -0.04824851,
         0.03075099],
       [-0.00081811, -0.04762458,  0.00214016, -0.04533006,  0.04554219,
        -0.00096775],
       [ 0.00892078, -0.0465144 ,  0.0340861 ,  0.00752052,  0.00444318,
         0.00236888],
       [-0.03004528,  0.03948194,  0.03390433,  0.04078564,  0.04174389,
        -0.02734616]], dtype=float32)>

And we can score a user against several products in one operation; doing this for many users at once is what we will need if we are to train quickly.

In [11]:
# Dot the vector of the user at position 1 with each example product's vector:
# contracting axis 0 of the user vector with axis 1 of the product matrix
# yields one score per product.
tf.tensordot(
    dummy_user_embedding(1),
    product_embedding(example_product),
    axes=[[0], [1]],
)

<tf.Tensor: shape=(4,), dtype=float32, numpy=
array([-7.1840239e-04,  4.2924620e-03,  1.5260228e-03, -3.8441623e-05],
      dtype=float32)>

But we need to map product ids to embedding ids.

In [12]:
# Hash table translating a raw product id into its row number in the embedding
# matrix; an id that is not in `products` maps to -1.
product_table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(products, dtype=tf.int32),
        values=range(len(products)),
    ),
    default_value=-1,
)

Let's put those two things together

In [15]:
class SimpleRecommender(tf.keras.Model):
    """Dot-product recommender over learned user and product embeddings.

    Raw ids (string user ids, int32 product ids) are translated into embedding
    row numbers through static hash tables; an id missing from a table maps
    to -1.
    """

    def __init__(self, dummy_users, products, length_of_embedding):
        """Create the id lookup tables, the two embedding layers and the dot layer.

        Args:
            dummy_users: sequence of user ids; stored as a tf.string tensor.
            products: sequence of product ids; stored as a tf.int32 tensor.
            length_of_embedding: size of every user / product embedding vector.
        """
        super().__init__()
        n_users = len(dummy_users)
        n_products = len(products)

        self.products = tf.constant(products, dtype=tf.int32)
        self.dummy_users = tf.constant(dummy_users, dtype=tf.string)

        # id -> row number; -1 is returned for ids that were never registered.
        user_initializer = tf.lookup.KeyValueTensorInitializer(self.dummy_users, range(n_users))
        self.dummy_user_table = tf.lookup.StaticHashTable(user_initializer, -1)
        product_initializer = tf.lookup.KeyValueTensorInitializer(self.products, range(n_products))
        self.product_table = tf.lookup.StaticHashTable(product_initializer, -1)

        self.user_embedding = tf.keras.layers.Embedding(n_users, length_of_embedding)
        self.product_embedding = tf.keras.layers.Embedding(n_products, length_of_embedding)

        self.dot = tf.keras.layers.Dot(axes=-1)

    def call(self, inputs):
        """Score candidate products for each user.

        Args:
            inputs: indexable pair; ``inputs[0]`` holds user ids and
                ``inputs[1]`` holds product ids. In this notebook user ids of
                shape (2, 1) with product ids of shape (2, 3) give scores of
                shape (2, 3).

        Returns:
            The dot products of the user and product embeddings along the last
            axis, with axis 1 squeezed out.
        """
        user_rows = self.dummy_user_table.lookup(inputs[0])
        product_rows = self.product_table.lookup(inputs[1])
        user_vectors = self.user_embedding(user_rows)
        product_vectors = self.product_embedding(product_rows)

        scores = self.dot([user_vectors, product_vectors])
        return tf.squeeze(scores, axis=1)

    @tf.function
    def call_item_item(self, product):
        """Return the 100 products whose embeddings score highest against ``product``.

        Args:
            product: product id tensor (the notebook passes a scalar tf.int32).

        Returns:
            ``(top_ids, top_scores)``: the product ids taken from
            ``self.products`` and their dot-product scores, as selected by
            ``tf.math.top_k`` with ``k=100``.
        """
        query_row = self.product_table.lookup(product)
        query_vector = tf.expand_dims(self.product_embedding(query_row), 0)

        # Reads the layer's weight matrix directly, so this only works once the
        # embedding layer has been built.
        catalogue = tf.expand_dims(self.product_embedding.embeddings, 0)
        flat_scores = tf.reshape(self.dot([query_vector, catalogue]), [-1])

        best = tf.math.top_k(flat_scores, k=100)
        return tf.gather(self.products, best.indices), best.values

In [16]:
srl = SimpleRecommender(dummy_users, products, 15)

# Two users (shape (2, 1)), each scored against three product ids (shape (2, 3)).
srl([
    tf.constant([['pmfkU4BNZhmtLgJQwJ7x'], ['UDRRwOlzlWVbu7H8YCCi']]),
    tf.constant([[8650774, 9306139, 9961521],
                 [12058614, 12058615, 11927550]]),
])

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[ 4.9216463e-03, -3.6392447e-03, -5.0791702e-04],
       [ 1.4764969e-05, -1.6119204e-03, -7.9826411e-04]], dtype=float32)>

# Creating a dataset

First create a tf.data.Dataset from the user purchase pairs.

In [17]:
# Double brackets keep each column 2-D, so every dataset element has shape (1,).
dummy_user_tensor = tf.constant(train[["dummyUserId"]].values, dtype=tf.string)
product_tensor = tf.constant(train[["productId"]].values, dtype=tf.int32)

dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))

# Show just the first (user, product) pair.
for x, y in dataset.take(1):
    print(x)
    print(y)

tf.Tensor([b'PIXcm7Ru5KmntCy0yA1K'], shape=(1,), dtype=string)
tf.Tensor([10524048], shape=(1,), dtype=int32)


In [18]:
# Draw 7 random positions in [0, len(products)): these index into `products`,
# they are not product ids themselves.
random_negatives = tf.random.uniform(
    shape=(7,),
    minval=0,
    maxval=len(products),
    dtype=tf.int32,
)
random_negatives

<tf.Tensor: shape=(7,), dtype=int32, numpy=array([25148, 16185, 29442,  5660, 28729, 26461, 13512], dtype=int32)>

In [19]:
# Translate the sampled positions into the product ids stored at those positions.
tf.gather(products, random_negatives)

<tf.Tensor: shape=(7,), dtype=int64, numpy=
array([12039919, 11475282, 11140017, 10379216, 10875377,  6802634,
       10414804])>

In [20]:
# `random_negatives` holds positions in `products`, so translate them into
# product ids before filtering; comparing ids against positions is meaningless.
negative_ids = set(tf.gather(products, random_negatives).numpy().tolist())
possible_products = [x for x in products if x not in negative_ids]

# Build the tensor from the filtered pool (previously it was built from the
# purchase column of `train`, ignoring `possible_products` entirely).
possible_products_tensor = tf.constant(possible_products, dtype=tf.int32)

For each purchase let's sample a number of products that the user did not purchase. Then the model can score each of the products and we will know we are doing a good job if the product with the highest score is the product that the user actually purchased.

We can do this using dataset.map

In [21]:
# One-hot vector of length 11 with the 1 at position 0.
tf.one_hot(0, depth = 11)

<tf.Tensor: shape=(11,), dtype=float32, numpy=array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>

In [22]:
class Mapper():
    """Callable for ``Dataset.map`` that turns one purchase into a ranking example.

    For each (user, product) pair it samples ``num_negative_products`` random
    products and returns ``((user, candidates), y)``, where ``candidates`` is
    the purchased product followed by the sampled ones and ``y`` is a one-hot
    label marking position 0 (the purchased product).
    """

    def __init__(self, possible_products, num_negative_products):
        """Store the sampling pool and precompute the constant label.

        Args:
            possible_products: product ids that negatives are sampled from.
            num_negative_products: number of negatives drawn per purchase.
        """
        self.num_possible_products = len(possible_products)
        self.possible_products_tensor = tf.constant(possible_products, dtype=tf.int32)

        self.num_negative_products = num_negative_products
        # The purchased product is always the first candidate, so the label is
        # the same for every example.
        self.y = tf.one_hot(0, num_negative_products + 1)

    def __call__(self, user, product):
        """Build the candidate list for a single purchase.

        Args:
            user: tensor holding the user id.
            product: 1-D tensor holding the purchased product id.

        Returns:
            ``((user, candidates), y)`` as described on the class.
        """
        # Sample exactly `num_negative_products` positions so that the candidate
        # list has the same length as the label `self.y`.
        # NOTE: sampling is uniform over the pool, so a negative can occasionally
        # coincide with the purchased product.
        random_negatives = tf.random.uniform(
            (self.num_negative_products,),
            minval=0,
            maxval=self.num_possible_products,
            dtype=tf.int32,
        )
        negatives = tf.gather(self.possible_products_tensor, random_negatives)
        candidates = tf.concat([product, negatives], axis=0)
        return (user, candidates), self.y

Let's bring the steps together to define a function which creates a dataset 

In [23]:
def get_dataset(df, products, num_negative_products, batch_size=1024):
    """Build a batched dataset of ranking examples from a purchases DataFrame.

    Args:
        df: DataFrame with "dummyUserId" and "productId" columns.
        products: product ids handed to ``Mapper`` as the pool for negatives.
        num_negative_products: number of negatives ``Mapper`` draws per purchase.
        batch_size: number of examples per batch (defaults to 1024).

    Returns:
        A ``tf.data.Dataset`` of the mapped examples, batched by ``batch_size``.
    """
    # Double brackets keep each column 2-D, so every element has shape (1,).
    dummy_user_tensor = tf.constant(df[["dummyUserId"]].values, dtype=tf.string)
    product_tensor = tf.constant(df[["productId"]].values, dtype=tf.int32)

    dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))
    dataset = dataset.map(Mapper(products, num_negative_products))
    return dataset.batch(batch_size)

# Train a model

We need to compile a model, set the loss and create an evaluation metric. Then we need to train the model.

Let's do a manual check on whether the model is any good.

In [None]:
model = SimpleRecommender(dummy_users, products, 15)

# The model outputs raw scores, hence `from_logits=True`.
# NOTE(review): a learning rate of 100 is unusually large for SGD; confirm it is intentional.
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.SGD(learning_rate=100.),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

# 100 sampled negatives per purchase for both training and validation.
train_dataset = get_dataset(train, products, 100)
valid_dataset = get_dataset(valid, products, 100)
model.fit(train_dataset, validation_data=valid_dataset, epochs=5)

In [None]:
# Product id used for the manual item-to-item sanity check in the next cell.
test_product = 11698965

In [None]:
# (top_ids, top_scores) for the products most similar to `test_product`.
recommendations = model.call_item_item(tf.constant(test_product, dtype=tf.int32))
print("Recs for item {}: {}".format(test_product, recommendations))

# Save the model

In [None]:
# Export directory for the SavedModel.
# NOTE(review): the trailing "1" is presumably a model version number — confirm.
model_path = "models/recommender/1"

In [None]:
# Input spec for the exported function: a single scalar (shape ()) int32 tensor.
# NOTE(review): `inpute_signature` looks like a typo for `input_signature`;
# renaming it also requires updating the cell that reads it.
inpute_signature = tf.TensorSpec(shape=(), dtype=tf.int32)

In [None]:
# Trace `call_item_item` for the scalar int32 spec and expose it under a named
# signature. (`r1` was never defined in this notebook; the trained model is `model`.)
signatures = {'call_item_item': model.call_item_item.get_concrete_function(inpute_signature)}

In [None]:
# Export the trained model with its named signature before loading it back;
# reusing `model_path` avoids a mismatch between the save and load directories.
tf.saved_model.save(model, model_path, signatures=signatures)
imported_model = tf.saved_model.load(model_path)
list(imported_model.signatures.keys())

In [None]:
# The signature was traced with a scalar (shape ()) int32 spec, so pass a scalar
# product id rather than a length-1 vector.
imported_model.signatures['call_item_item'](tf.constant(14844847))

In [None]:
# Export without explicit signatures, then reload the object.
os.makedirs("dummy/0", exist_ok=True)  # exist_ok keeps the cell re-runnable
tf.saved_model.save(model, 'dummy/0')
imported = tf.saved_model.load("dummy/0")
# Calling `imported(...)` directly would need the (user, products) pair that
# `call` expects; the traced `call_item_item` function takes a single scalar
# product id instead.
imported.call_item_item(tf.constant(14844847))

In [None]:
# Export with a single function as `signatures`: it is stored under the default
# key 'serving_default'.
os.makedirs("dummy/1", exist_ok=True)  # exist_ok keeps the cell re-runnable
tf.saved_model.save(model, 'dummy/1',
                    signatures=model.call_item_item.get_concrete_function(tf.TensorSpec(shape=(), dtype=tf.int32)))
# Reload from the directory just written; otherwise `imported_model` would still
# refer to the earlier export and list that export's signature keys.
imported_model = tf.saved_model.load('dummy/1')
list(imported_model.signatures.keys())

In [None]:
# The default signature was traced with a scalar (shape ()) int32 spec, so pass
# a scalar product id rather than a length-1 vector.
imported_model.signatures['serving_default'](tf.constant(14844847))

Zipping the saved model will make it easier to download.

In [None]:
from zipfile import ZipFile
import os

# Bundle every file found under "models" into one archive; each entry keeps the
# relative path produced by the directory walk.
with ZipFile('recommender.zip', 'w') as archive:
    for directory, _subdirs, names in os.walk("models"):
        for name in names:
            archive.write(os.path.join(directory, name))