In [1]:
%load_ext autoreload
%autoreload 2

In [67]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras import layers

tf.enable_eager_execution()
pd.set_option('float_format', '{:f}'.format)
tf.__version__

'1.13.1'

In [3]:
%%time
df_meta = pd.read_csv('item_metadata.csv', index_col='item_id')
df_train = pd.read_csv('train.csv')

Wall time: 36.7 s


In [4]:
%%time
max_meta_len = df_meta['properties'].apply(lambda prop: len(prop.split('|'))).max()
max_meta_len

Wall time: 1.36 s


In [13]:
class FastLabelEncoder:
    def __init__(self):
        self.token_to_id = dict()
        self.id_to_token = dict()
        self.unk_id = 1
        self.pad_id = 0
        self.current_id = 2

    def __len__(self):
        return len(self.token_to_id)

    def _pad(self, seq, pad):
        if pad < 0: return seq
        if len(seq) > pad: return seq[:pad]
        return seq + [self.pad_id] * (pad - len(seq))

    def fit(self, X):
        for row in X:
            for token in row:
                if token not in self.token_to_id:
                    self.token_to_id[token] = self.current_id
                    self.id_to_token[self.current_id] = token
                    self.current_id += 1

        return self

    def transform(self, X, pad=-1):
        return np.array([self._pad([
              self.token_to_id[token] if token in self.token_to_id else self.unk_id
              for token in row
        ], pad) for row in X])
        
    def fit_transform(self, X, pad=-1):
        return self.fit(X).transform(X, pad)

In [36]:
class Input:
    col_shapes = {
        'type': tf.TensorShape([None]),
        'value': tf.TensorShape([None, max_meta_len]),
        'impr': tf.TensorShape([None]),
        'label': tf.TensorShape([])
    }
    out_shapes = tuple(v for k, v in col_shapes.items())

    def __init__(self):
        self.action_types_encoder = FastLabelEncoder()
        self.action_vals_encoder = FastLabelEncoder()
        self.impressions_encoder = FastLabelEncoder()
        self.labels_encoder = FastLabelEncoder()

    def generator(self):
        last_session_id = None
        action_types = []
        action_vals = []
        MISSING_REF_VAL = '?'

        for _, row in df_train.iterrows():
            if row['session_id'] != last_session_id:
                action_types = []
                action_vals = []

            action_val = row['reference']

            if row['action_type'] != 'clickout item':
                action_types.append(row['action_type'])

                if action_val.isdigit():
                    int_id = int(action_val)
                    if int_id in df_meta.index:
                        action_vals.append(df_meta.loc[int_id]['properties'].split('|'))
                    else:
                        action_vals.append([int_id])
                else:
                    action_vals.append([action_val])
            else:
                yield self.action_types_encoder.fit_transform([action_types])[0], \
                      self.action_vals_encoder.fit_transform(action_vals, pad=max_meta_len), \
                      self.impressions_encoder.fit_transform([row['impressions'].split('|')])[0], \
                      self.labels_encoder.fit_transform([[int(action_val)]])[0][0]

            last_session_id = row['session_id']

    def as_dataset(self, batch_size):
        dataset = tf.data.Dataset.from_generator(
            self.generator,
            (tf.int32,) * 4,
            self.out_shapes
        )

        dataset = dataset.padded_batch(batch_size, padded_shapes=self.out_shapes)

        return dataset

In [8]:
next(Input().generator())

(array([2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]),
 array([[ 2,  0,  0, ...,  0,  0,  0],
        [ 3,  4,  5, ...,  0,  0,  0],
        [ 3,  4,  5, ...,  0,  0,  0],
        ...,
        [28, 29, 30, ...,  0,  0,  0],
        [28, 29, 30, ...,  0,  0,  0],
        [28, 29, 30, ...,  0,  0,  0]]),
 array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26]),
 2)

In [9]:
%%time
i = Input().generator()
next(i)
for _ in range(5000):
    next(i)

Wall time: 12.5 s


too slow 

https://stackoverflow.com/questions/24870953/does-pandas-iterrows-have-performance-issues

In [10]:
inp = Input()
dataset = inp.as_dataset(batch_size=3)

Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, use
    tf.py_function, which takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    


In [11]:
dataset.make_one_shot_iterator().get_next()

Instructions for updating:
Colocations handled automatically by placer.


(<tf.Tensor: id=47, shape=(3, 34), dtype=int32, numpy=
 array([[2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4]])>,
 <tf.Tensor: id=48, shape=(3, 34, 112), dtype=int32, numpy=
 array([[[ 2,  0,  0, ...,  0,  0,  0],
         [ 3,  4,  5, ...,  0,  0,  0],
         [ 3,  4,  5, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ...,  0,  0,  0],
         [ 0,  0,  0, ...,  0,  0,  0],
         [ 0,  0,  0, ...,  0,  0,  0]],
 
        [[ 2,  0,  0, ...,  0,  0,  0],
         [ 3,  4,  5, ...,  0,  0,  0],
         [ 3,  4,  5, ...,  0,  0,  0],
         ...,
         [ 0,  0,  0, ...,  0,  0,  0],
         [ 0,  0,  0, ...,  0,  0,  0],
         [ 0,  0,  0, ...,  0,  0,  0]],
 
     

---

In [81]:
action_type_emb_size = 20
action_val_emb_size = 20
output_emb_size = 20

input_action_type = layers.Input(shape=Input.col_shapes['type'])
input_action_val = layers.Input(shape=Input.col_shapes['value'])
label = layers.Input(shape=Input.col_shapes['label'])

embedding_action_type = layers.Embedding(input_dim=len(inp.action_types_encoder),
                                         output_dim=action_type_emb_size,
                                         mask_zero=True)(input_action_type)

embedding_action_val = layers.Embedding(input_dim=len(inp.action_vals_encoder),
                                         output_dim=action_val_emb_size,
                                         mask_zero=True)(input_action_val)

embedding_output = layers.Embedding(input_dim=len(inp.labels_encoder),
                                    output_dim=output_emb_size,
                                    mask_zero=True)(label)

dimension_of_seq_features = 2
embedding_action_val_reduced = tf.reduce_mean(embedding_action_val, dimension_of_seq_features)

embedding_input = layers.Concatenate(axis=2)([embedding_action_type, embedding_action_val_reduced])

In [63]:
# (batch, sequence, embedding of items in sequence)
input_embedding, embedding_output

(<tf.Tensor 'concat_8:0' shape=(?, ?, 40) dtype=float32>,
 <tf.Tensor 'embedding_lookup_27/Identity_2:0' shape=(?, 20) dtype=float32>)

In [51]:
import SASRec.modules as sas_module

In [78]:
num_blocks = 2
hidden_units = 40
num_heads = 1
dropout_rate = 0.5
is_training = True

seq = input_embedding
mask = tf.to_float(tf.not_equal(input_embedding, 0))

for i in range(num_blocks):
    with tf.variable_scope('num_blocks_%d' % i):
        # self-attention
        seq = sas_module.multihead_attention(
            queries=normalize(seq),
            keys=seq,
            num_units=hidden_units,
            num_heads=num_heads,
            dropout_rate=dropout_rate,
            is_training=is_training,
            causality=True,
            scope='self_attention'
        )

        # Feed forward
        seq = sas_module.feedforward(
            normalize(seq),
            num_units=[hidden_units, hidden_units],
            dropout_rate=dropout_rate,
            is_training=is_training
        )

        seq *= mask

seq = sas_module.normalize(seq)

AttributeError: 'Tensor' object has no attribute '_copy'