In [1]:
import os
import time
import pprint

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

from typing import Dict, Text

plt.style.use('seaborn-whitegrid')

2022-04-24 01:19:36.541562: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-24 01:19:36.541583: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Load Dataset

In [20]:
article_df = pd.read_csv('dataset/articles.csv')
customer_df = pd.read_csv('dataset/customers.csv')
trans_df = pd.read_csv('dataset/transactions_train.csv')

In [21]:
article_df.fillna(value='', inplace=True)

In [22]:
trans_df['t_dat'] = pd.to_datetime(trans_df['t_dat']).values.astype(int) / 10**9

In [5]:
trans_df.customer_id.values[0]

'000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318'

In [6]:
trans_df.rename(columns={"t_dat":"timestamp"}, inplace=True)

In [7]:
article_map = dict(article_df[['article_id', 'prod_name']].values)

In [8]:
trans_df['prod_name'] = trans_df.article_id.map(article_map)

In [9]:
articles = tf.data.Dataset.from_tensor_slices(dict(article_df)).map(lambda x: {
    'article_id' : tf.strings.as_string(x['article_id']),
    'prod_name' : x['prod_name'],
})

trans = tf.data.Dataset.from_tensor_slices(dict(trans_df[:1000000])).map(lambda x: {
    'customer_id' : x['customer_id'],
    'article_id' : tf.strings.as_string(x['article_id']),
    'prod_name' : x['prod_name'],
    'timestamp' : x['timestamp'],
})


2022-04-24 01:20:12.563060: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-04-24 01:20:12.563082: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-24 01:20:12.563098: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (iftikar): /proc/driver/nvidia/version does not exist
2022-04-24 01:20:12.564205: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Features Preprocessing

In [10]:
timestamps = np.concatenate(list(trans.map(lambda x: x["timestamp"]).batch(100)))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000,
)

In [11]:
### USE THIS TO AVOID STRINGLOOKUP
article_ids = articles.map(lambda x: x['article_id']).batch(1_000)
unique_article_ids = np.unique(np.concatenate(list(article_ids)))

customer_ids = trans.map(lambda x: x['customer_id']).batch(1_000)
unique_customer_ids = np.unique(np.concatenate(list(customer_ids)))

## Two-tower Model

In [12]:
class CustomerModel(tf.keras.Model):
  def __init__(self, use_timestamps):
    super().__init__()

    self._use_timestamps = use_timestamps
    
    embedding_dim = 32
    
    self.user_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_customer_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_customer_ids) + 1, embedding_dim),
    ])

    if use_timestamps:
      self.timestamp_embedding = tf.keras.Sequential([
          tf.keras.layers.Discretization(timestamp_buckets.tolist()),
          tf.keras.layers.Embedding(len(timestamp_buckets) + 1, embedding_dim),
      ])
      self.normalized_timestamp = tf.keras.layers.Normalization(
          axis=None
      )

      self.normalized_timestamp.adapt(timestamps)

  def call(self, inputs):
    print(input)
    if not self._use_timestamps:
      return self.user_embedding(inputs["customer_id"])

    return tf.concat([
        self.user_embedding(inputs["customer_id"]),
        self.timestamp_embedding(inputs["timestamp"]),
        tf.reshape(self.normalized_timestamp(inputs["timestamp"]), (-1, 1)),
    ], axis=1)

In [13]:
class ArticleModel(tf.keras.Model):

  def __init__(self):
    super().__init__()

    max_tokens = 10_000
    
    embedding_dim = 32

    self.article_id_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_article_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_article_ids) + 1, embedding_dim)
    ])

    self.article_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens)

    self.article_text_embedding = tf.keras.Sequential([
      self.article_vectorizer,
      tf.keras.layers.Embedding(max_tokens, embedding_dim, mask_zero=True),
      tf.keras.layers.GlobalAveragePooling1D(),
    ])

    self.article_vectorizer.adapt(articles.map(lambda x: x["prod_name"]))

  def call(self, inputs):
    return tf.concat([
        self.article_id_embedding(inputs["article_id"]),
        self.article_text_embedding(inputs["prod_name"]),
    ], axis=1)

In [14]:
class HNMModel(tfrs.models.Model):

  def __init__(self, use_timestamps):
    super().__init__()
    self.query_model = tf.keras.Sequential([
      CustomerModel(use_timestamps),
      tf.keras.layers.Dense(32)
    ])
    self.candidate_model = tf.keras.Sequential([
      ArticleModel(),
      tf.keras.layers.Dense(32)
    ])

    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=articles.batch(128).map(self.candidate_model),
        ),
    )

  def compute_loss(self, features, training=False):
    query_embedding = self.query_model({
        "customer_id": features["customer_id"],
        "timestamp": features["timestamp"],
    })
    candidate_embedding = self.candidate_model({
        "article_id": features["article_id"],
        "prod_name": features["prod_name"],
    })

    return self.task(query_embedding, candidate_embedding)

## Model Training

In [25]:
EVAL_FREQ = 1
EPOCHS = 2

In [26]:
shuffled = trans.shuffle(100_000, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

In [27]:
checkpoint_path = "checkpoints/test_1/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
    period=1)



In [28]:
model = HNMModel(use_timestamps=False)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

wo_timestamp_hist = model.fit(
    cached_train,
    validation_data=cached_test,
    validation_freq=EVAL_FREQ,
    epochs=EPOCHS,
    callbacks=[cp_callback])

Consider rewriting this model with the Functional API.
Epoch 1/2
Consider rewriting this model with the Functional API.
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x7f0a4996f970>>
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x7f0a4996f970>>
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x7f0a4996f970>>
Consider rewriting this model with the Functional API.

Epoch 00001: saving model to checkpoints/test_1/cp-0001.ckpt
Epoch 2/2
 8/40 [=====>........................] - ETA: 1:31 - factorized_top_k/top_1_categorical_accuracy: 0.0214 - factorized_top_k/top_5_categorical_accuracy: 0.0563 - factorized_top_k/top_10_categorical_accuracy: 0.0778 - factorized_top_k/top_50_categori

KeyboardInterrupt: 

In [None]:
model = HNMModel(use_timestamps=True)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

w_timestamp_hist = model.fit(
    cached_train,
    validation_data=cached_test,
    validation_freq=EVAL_FREQ,
    epochs=EPOCHS)

In [None]:
num_validation_runs = len(wo_timestamp_hist.history["val_factorized_top_k/top_10_categorical_accuracy"])
epochs = [(x + 1)* EVAL_FREQ for x in range(num_validation_runs)]

plt.plot(epochs, wo_timestamp_hist.history["val_factorized_top_k/top_10_categorical_accuracy"], label="w/o timesteps")
plt.plot(epochs, w_timestamp_hist.history["val_factorized_top_k/top_10_categorical_accuracy"], label="w/ timesteps")
plt.title("Accuracy vs epoch")
plt.xlabel("epoch")
plt.ylabel("Top-100 accuracy");
plt.legend()

## Get Recommendation

In [None]:
articles.batch(100).map(lambda x: model.candidate_model({
    'article_id': x['article_id'],
    'prod_name' : x['prod_name'],
}))

In [None]:
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)

identifiers = articles.batch(100).map(lambda x: x['article_id'])

candidates = articles.batch(100).map(lambda x: model.candidate_model({
        'article_id': x['article_id'],
        'prod_name' : x['prod_name'],
    }))

index.index_from_dataset(tf.data.Dataset.zip((identifiers, candidates)))

test_query = dict(trans_df[['customer_id',
                           'timestamp',
                        ]].iloc[0].map(lambda x: tf.expand_dims(x, axis=0)))

_, titles = index(test_query, k=12)
print(f"Top 12 recommendations for user 40: {titles}")

In [None]:
article_map[684080001]