In [33]:
import os

import time
import pprint

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

from typing import Dict, Text

plt.style.use('seaborn-whitegrid')

## Load Dataset

In [2]:
article_df = pd.read_csv('dataset/articles.csv')
customer_df = pd.read_csv('dataset/customers.csv')
trans_df = pd.read_csv('dataset/transactions_train.csv')

In [3]:
article_df.fillna(value='', inplace=True)

In [6]:
trans_df['t_dat'] = pd.to_datetime(trans_df['t_dat']).values.astype(int) / 10**9

In [7]:
trans_df.customer_id.values[0]

'000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318'

In [8]:
trans_df.rename(columns={"t_dat":"timestamp"}, inplace=True)

In [9]:
article_map = dict(article_df[['article_id', 'prod_name']].values)

In [10]:
trans_df['prod_name'] = trans_df.article_id.map(article_map)

In [11]:
articles = tf.data.Dataset.from_tensor_slices(dict(article_df)).map(lambda x: tf.strings.as_string(x['article_id']))

trans = tf.data.Dataset.from_tensor_slices(dict(trans_df[:1000000])).map(lambda x: {
    'customer_id' : x['customer_id'],
    'article_id' : tf.strings.as_string(x['article_id']),
    'prod_name' : x['prod_name'],
    'timestamp' : x['timestamp'],
})

### Preprocessing

In [12]:
timestamps = np.concatenate(list(trans.map(lambda x: x["timestamp"]).batch(100)))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000,
)

In [13]:
# ### USE THIS TO USE STRINGLOOKUP
# customer_vocab = tf.keras.layers.StringLookup(mask_token=None)
# customer_vocab.adapt(trans.map(lambda x: x["customer_id"]))

# article_vocab = tf.keras.layers.StringLookup(mask_token=None)
# article_vocab.adapt(articles)

In [14]:
### USE THIS TO AVOID STRINGLOOKUP
article_ids = articles.batch(1_000)
unique_article_ids = np.unique(np.concatenate(list(article_ids)))

customer_ids = trans.map(lambda x: x['customer_id']).batch(1_000)
unique_customer_ids = np.unique(np.concatenate(list(customer_ids)))

In [15]:
d = articles.batch(1_000)
len(list(d.as_numpy_iterator()))

106

## Two-tower Model

In [16]:
# ### USE THIS TO USE STRINGLOOKUP
# embedding_dim = 32

# customer_model = tf.keras.Sequential([
#     customer_vocab,
#     tf.keras.layers.Embedding(customer_vocab.vocabulary_size(), embedding_dim)
# ])
# article_model = tf.keras.Sequential([
#     article_vocab,
#     tf.keras.layers.Embedding(article_vocab.vocabulary_size(), embedding_dim)
# ])

# task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
#     articles.batch(128).map(article_model)
#   )
# )

In [17]:
# ### USE THIS TO AVOID STRINGLOOKUP
# embedding_dim = 32

# customer_model = tf.keras.Sequential([
#     tf.keras.layers.StringLookup(vocabulary=unique_customer_ids, mask_token=None),
#     tf.keras.layers.Embedding(len(unique_customer_ids)+1, embedding_dim)
# ])
# article_model = tf.keras.Sequential([
#     tf.keras.layers.StringLookup(vocabulary=unique_article_ids, mask_token=None),
#     tf.keras.layers.Embedding(len(unique_article_ids)+1, embedding_dim)
# ])

# task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
#     articles.batch(128).map(article_model)
#   )
# )

In [18]:
class CustomerModel(tf.keras.Model):
  def __init__(self, use_timestamps):
    super().__init__()

    self._use_timestamps = use_timestamps
    
    embedding_dim = 32
    
    self.user_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_customer_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_customer_ids) + 1, embedding_dim),
    ])

    if use_timestamps:
      self.timestamp_embedding = tf.keras.Sequential([
          tf.keras.layers.Discretization(timestamp_buckets.tolist()),
          tf.keras.layers.Embedding(len(timestamp_buckets) + 1, embedding_dim),
      ])
      self.normalized_timestamp = tf.keras.layers.Normalization(
          axis=None
      )

      self.normalized_timestamp.adapt(timestamps)

  def call(self, inputs):
    print(input)
    if not self._use_timestamps:
      return self.user_embedding(inputs["customer_id"])

    return tf.concat([
        self.user_embedding(inputs["customer_id"]),
        self.timestamp_embedding(inputs["timestamp"]),
        tf.reshape(self.normalized_timestamp(inputs["timestamp"]), (-1, 1)),
    ], axis=1)

In [19]:
class ArticleModel(tf.keras.Model):

  def __init__(self):
    super().__init__()

    max_tokens = 10_000
    
    embedding_dim = 32

    self.article_id_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_article_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_article_ids) + 1, embedding_dim)
    ])

#     self.article_vectorizer = tf.keras.layers.TextVectorization(
#         max_tokens=max_tokens)

#     self.article_text_embedding = tf.keras.Sequential([
#       self.article_vectorizer,
#       tf.keras.layers.Embedding(max_tokens, embedding_dim, mask_zero=True),
#       tf.keras.layers.GlobalAveragePooling1D(),
#     ])

#     self.article_vectorizer.adapt(articles.map(lambda x: x["prod_name"]))

  def call(self, inputs):
    return tf.concat([
        self.article_id_embedding(inputs),
#         self.article_text_embedding(inputs["prod_name"]),
    ], axis=1)

In [20]:
class HNMModel(tfrs.models.Model):

  def __init__(self, use_timestamps):
    super().__init__()
    self.query_model = tf.keras.Sequential([
      CustomerModel(use_timestamps),
      tf.keras.layers.Dense(32)
    ])
    self.candidate_model = tf.keras.Sequential([
      ArticleModel(),
      tf.keras.layers.Dense(32)
    ])

    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=articles.batch(128).map(self.candidate_model),
        ),
    )

  def compute_loss(self, features, training=False):
    query_embedding = self.query_model({
        "customer_id": features["customer_id"],
        "timestamp": features["timestamp"],
    })
    candidate_embedding = self.candidate_model({
        "article_id": features["article_id"],
#         "prod_name": features["prod_name"],
    })

    return self.task(query_embedding, candidate_embedding)

## Model Training

In [21]:
shuffled = trans.shuffle(100_000, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

In [22]:
list(cached_train.as_numpy_iterator())[0]

{'customer_id': array([b'023f824c14a72c9e34021bffbd5da1097f7f02cc931a1af78ba138d8e7dd8ef2',
        b'8f4e9d2a9bbe405873a55d1d4c31178b5684c781d448bcf90ec075c8dca02fe4',
        b'caa3ed432d26d8c4b1c29c43e67c232e7de4dbe93cb658c9ea2cbaf3131f3f31',
        ...,
        b'0af8b2efbdac8376e2c24db5783ce64dfc5f24ecc26612d2b4d52410f7b19165',
        b'4e6d744bf1dac1bbe4832725cdb37124304164276af58fe8f6b501c51b00c006',
        b'fd81dbfa6009110f8455616347ea56518e6b0161057346a47203f56f3e506744'],
       dtype=object),
 'article_id': array([b'676539002', b'562245059', b'657815001', ..., b'703193001',
        b'500435003', b'671505001'], dtype=object),
 'prod_name': array([b'Jewels', b'Luna skinny RW', b'GEMINI trousers', ...,
        b'EDC GABRIELLE DRESSS', b'Cool Bree bandana', b'EDC ROMAN BLOUSE'],
       dtype=object),
 'timestamp': array([-1.e-09,  0.e+00,  0.e+00, ..., -1.e-09, -1.e-09, -1.e-09])}

In [23]:
ds = tf.data.Dataset.range(100)
ds = ds.shuffle(10)

tr = ds.take(8)
test = ds.skip(8).take(2)

c_tr = tr.shuffle(10)

In [24]:
list(c_tr.as_numpy_iterator())

[10, 6, 7, 11, 12, 9, 13, 4]

In [43]:
checkpoint_path = "checkpoints/test_1/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1,
                                                period=1)



In [44]:
model = HNMModel(use_timestamps=False)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

model_def = model.fit(
    cached_train,
    validation_data=cached_test,
    validation_freq=5,
    epochs=100,
    callbacks=[cp_callback]
)

# train_accuracy = model.evaluate(
#     cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
# test_accuracy = model.evaluate(
#     cached_test, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]

# print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
# print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

Epoch 1/100
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x000002614642F9D0>>
<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x000002614642F9D0>>
Epoch 1: saving model to checkpoints/test_1\cp-0001.ckpt
Epoch 2/100
Epoch 2: saving model to checkpoints/test_1\cp-0002.ckpt
Epoch 3/100

KeyboardInterrupt: 

In [None]:
model = HNMModel(use_timestamps=True)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

model_ts = model.fit(
    cached_train,
    validation_data=cached_test,
    validation_freq=5,
    epochs=100)

# train_accuracy = model.evaluate(
#     cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
# test_accuracy = model.evaluate(
#     cached_test, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]

# print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
# print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

In [None]:
# model = RecModel(customer_model, article_model, task)
# model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# model.fit(trans.batch(4096), epochs=3)

In [None]:
num_validation_runs = len(model_def.history["val_factorized_top_k/top_10_categorical_accuracy"])
epochs = [(x + 1)* 5 for x in range(num_validation_runs)]

plt.plot(epochs, model_def.history["val_factorized_top_k/top_10_categorical_accuracy"], label="w/o timesteps")
plt.plot(epochs, model_ts.history["val_factorized_top_k/top_10_categorical_accuracy"], label="w/ timesteps")
plt.title("Accuracy vs epoch")
plt.xlabel("epoch")
plt.ylabel("Top-100 accuracy");
plt.legend()

## Get Recommendation

In [None]:
articles.batch(100)

In [None]:
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)

# print(articles.batch(100).map(lambda title: (title, model.candidate_model(title))))

# index.index_from_dataset(
#     articles.batch(100).map(lambda title: (title, model.candidate_model(title))))

index.index_from_dataset(
    articles.batch(100).map(lambda x: model.candidate_model({
        'article_id': x['article_id'],
        'prod_name' : x['prod_name'],
    })))

# index.index(articles.batch(100).map(model.candidate_model),
#            articles.batch(100).map(lambda x: x['article_id']))

# model.candidate_model('')

test_query = dict(trans_df[['customer_id',
                           'timestamp',
                        ]].iloc[10].map(lambda x: tf.expand_dims(x, axis=0)))

_, titles = index(test_query, k=3)
print(f"Top 12 recommendations for user 40: {titles[0, :20]}")

In [None]:
article_map[501323011]

In [None]:
trans_df.iloc[0].customer_id

In [None]:
trans_df[['customer_id','timestamp']].iloc[0]