In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras import layers

# tf.enable_eager_execution()
pd.set_option('float_format', '{:f}'.format)
tf.__version__

'1.13.1'

In [4]:
%%time
df_meta = pd.read_csv('item_metadata.csv', index_col='item_id')
df_train = pd.read_csv('train.csv')

CPU times: user 27.7 s, sys: 2.5 s, total: 30.2 s
Wall time: 30.3 s


In [5]:
%%time
max_meta_len = df_meta['properties'].apply(lambda prop: len(prop.split('|'))).max()
print(max_meta_len)

112
CPU times: user 1.17 s, sys: 7.59 ms, total: 1.18 s
Wall time: 1.18 s


In [6]:
grouped = df_train.groupby(['session_id'])
grouped['user_id'].count().describe()

count   910683.000000
mean        17.495651
std         48.181687
min          1.000000
25%          2.000000
50%          4.000000
75%         13.000000
max       3522.000000
Name: user_id, dtype: float64

In [34]:
max_seq_len = 100

In [42]:
class FastLabelEncoder:
    def __init__(self):
        self.token_to_id = dict()
        self.id_to_token = dict()
        self.unk_id = 1
        self.pad_id = 0
        self.current_id = 2

    def __len__(self):
        return len(self.token_to_id)

    def _pad(self, seq, pad):
        if pad < 0: return seq
        if len(seq) > pad: return seq[:pad]
        return seq + [self.pad_id] * (pad - len(seq))

    def fit(self, X):
        for row in X:
            for token in row:
                if token not in self.token_to_id:
                    self.token_to_id[token] = self.current_id
                    self.id_to_token[self.current_id] = token
                    self.current_id += 1

        return self

    def transform(self, X, pad=-1):
        return np.array([self._pad([
              self.token_to_id[token] if token in self.token_to_id else self.unk_id
              for token in row
        ], pad) for row in X])
        
    def fit_transform(self, X, pad=-1):
        return self.fit(X).transform(X, pad)

In [43]:
class Input:
    pad_shapes = {
        'type': tf.TensorShape([max_seq_len]),
        'value': tf.TensorShape([max_seq_len, max_meta_len]),
        'neg': tf.TensorShape([25]),
        'pos': tf.TensorShape([])
    }
    out_shapes = tuple(v for k, v in pad_shapes.items())

    def __init__(self):
        self.action_types_encoder = FastLabelEncoder()
        self.action_vals_encoder = FastLabelEncoder()
        self.labels_encoder = FastLabelEncoder()

    def generator(self):
        last_session_id = None
        action_types = []
        action_vals = []
        MISSING_REF_VAL = '?'

        for _, row in df_train.iterrows():
            if row['session_id'] != last_session_id:
                action_types = []
                action_vals = []

            action_val = row['reference']

            if row['action_type'] != 'clickout item':
                action_types.append(row['action_type'])

                if action_val.isdigit():
                    int_id = int(action_val)
                    if int_id in df_meta.index:
                        action_vals.append(df_meta.loc[int_id]['properties'].split('|'))
                    else:
                        action_vals.append([int_id])
                else:
                    action_vals.append([action_val])
            else:
                neg = [int(i) for i in row['impressions'].split('|') if i != action_val]
                yield self.action_types_encoder.fit_transform([action_types])[0], \
                      self.action_vals_encoder.fit_transform(action_vals, pad=max_meta_len), \
                      self.labels_encoder.fit_transform([neg])[0], \
                      self.labels_encoder.fit_transform([[int(action_val)]])[0][0]

            last_session_id = row['session_id']

    def as_dataset(self, batch_size):
        dataset = tf.data.Dataset.from_generator(
            self.generator,
            (tf.int32,) * 4,
            (
                tf.TensorShape([None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None]),
                tf.TensorShape([])
            )
        )

        dataset = dataset.padded_batch(batch_size, padded_shapes=self.out_shapes)

        return dataset

In [29]:
next(Input().generator())

(array([2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]),
 array([[ 2,  0,  0, ...,  0,  0,  0],
        [ 3,  4,  5, ...,  0,  0,  0],
        [ 3,  4,  5, ...,  0,  0,  0],
        ...,
        [28, 29, 30, ...,  0,  0,  0],
        [28, 29, 30, ...,  0,  0,  0],
        [28, 29, 30, ...,  0,  0,  0]]),
 array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25]),
 26)

In [10]:
%%time
i = Input().generator()
next(i)
for _ in range(5000):
    next(i)

Wall time: 13.1 s


too slow 

https://stackoverflow.com/questions/24870953/does-pandas-iterrows-have-performance-issues

In [44]:
inp = Input()
dataset = inp.as_dataset(batch_size=3)

In [45]:
dataset.make_one_shot_iterator().get_next()

(<tf.Tensor 'IteratorGetNext_4:0' shape=(?, 100) dtype=int32>,
 <tf.Tensor 'IteratorGetNext_4:1' shape=(?, 100, 112) dtype=int32>,
 <tf.Tensor 'IteratorGetNext_4:2' shape=(?, 25) dtype=int32>,
 <tf.Tensor 'IteratorGetNext_4:3' shape=(?,) dtype=int32>)

---

In [46]:
import SASRec.modules as sas_module

In [50]:
def mk_model():
    action_type_emb_size = 20
    action_val_emb_size = 20
    output_emb_size = 20
    
    type_input = layers.Input(shape=Input.pad_shapes['type'])
    value_input = layers.Input(shape=Input.pad_shapes['value'])
    pos_input = layers.Input(shape=Input.pad_shapes['pos'])
    neg_input = layers.Input(shape=Input.pad_shapes['neg'])

    action_type = layers.Embedding(
        input_dim=len(inp.action_types_encoder),
        output_dim=action_type_emb_size,
        mask_zero=True
    )(type_input)

    action_val = layers.Embedding(
        input_dim=len(inp.action_vals_encoder),
        output_dim=action_val_emb_size,
        mask_zero=True
    )(value_input)

    pos = layers.Embedding(
        input_dim=len(inp.labels_encoder),
        output_dim=output_emb_size,
        mask_zero=True
    )(pos_input)

    neg = layers.Embedding(
        input_dim=len(inp.labels_encoder),
        output_dim=output_emb_size,
        mask_zero=True
    )(neg_input)

    dimension_of_seq_features = 2
    action_val_reduced = tf.reduce_mean(action_val, dimension_of_seq_features)

    seq = (action_type + action_val_reduced) / 2

    # (batch, sequence, embedding of items in sequence)
    seq, pos, neg
    # (<tf.Tensor 'Reshape_3:0' shape=(?, 1000) dtype=float32>,
    #  <tf.Tensor 'embedding_lookup_22/Identity_2:0' shape=(?, 20) dtype=float32>,
    #  <tf.Tensor 'Mean_6:0' shape=(?, 20) dtype=float32>)

    num_blocks = 2
    hidden_units = output_emb_size
    num_heads = 2
    dropout_rate = 0.5
    is_training = True

    mask = tf.to_float(tf.not_equal(seq, 0))

    for i in range(num_blocks):
        with tf.variable_scope('num_blocks_%d' % i):
            # self-attention
            seq = sas_module.multihead_attention(
                queries=sas_module.normalize(seq),
                keys=seq,
                num_units=hidden_units,
                num_heads=num_heads,
                dropout_rate=dropout_rate,
                is_training=is_training,
                causality=True,
                scope='self_attention'
            )

            # Feed forward
            seq = sas_module.feedforward(
                sas_module.normalize(seq),
                num_units=[hidden_units, hidden_units],
                dropout_rate=dropout_rate,
                is_training=is_training
            )

            seq *= mask

    seq = sas_module.normalize(seq)
    
    seq = tf.reshape(seq, [tf.shape(seq)[0], max_seq_len * output_emb_size])
    neg_emb = tf.reduce_mean(neg, 1)
    pos_emb = pos

    seq_logits = tf.keras.layers.Dense(pos.shape[1].value)(seq)
    pos_logits = tf.reduce_sum(pos_emb * seq_logits, -1)
    neg_logits = tf.reduce_sum(neg_emb * seq_logits, -1)
    
    loss = tf.reduce_sum(
        - tf.log(tf.sigmoid(pos_logits) + 1e-24)
        - tf.log(1 - tf.sigmoid(neg_logits) + 1e-24)
    )
    
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    loss += sum(reg_losses)
    
    tf.summary.scalar('loss', loss)
    
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta2=0.98).minimize(loss)
    
    return type_input, value_input, pos_input, neg_input, loss, optimizer

In [53]:
with tf.Graph().as_default():
    with tf.Session() as sess:
        type_input, value_input, pos_input, neg_input, loss, optimizer = mk_model()
        dataset = inp.as_dataset(batch_size=1)
        t, v, n, p = dataset.make_one_shot_iterator().get_next()
        t, v, n, p = t.eval(session=sess), v.eval(session=sess), n.eval(session=sess), p.eval(session=sess)

        sess.run(tf.global_variables_initializer())
        _, _ = sess.run(
            [loss, optimizer],
            {type_input: t, value_input: v, pos_input: p, neg_input: n}
        )

In [None]:
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True, to_file='model.png')
from IPython.display import Image
Image(retina=True, filename='model.png')