In [1]:
%load_ext autoreload
%autoreload 2

In [175]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras import layers
from tqdm.auto import tqdm

# tf.enable_eager_execution()
tqdm.pandas()
pd.set_option('float_format', '{:f}'.format)
tf.__version__

'1.13.1'

In [3]:
%%time
df_meta = pd.read_csv('item_metadata.csv', index_col='item_id')
df_train = pd.read_csv('train.csv')

Wall time: 36.5 s


In [4]:
%%time
max_meta_len = df_meta['properties'].apply(lambda prop: len(prop.split('|'))).max()
print(max_meta_len)

112
Wall time: 1.38 s


In [5]:
grouped = df_train.groupby(['session_id'])
grouped['user_id'].count().describe()

count   910683.000000
mean        17.495651
std         48.181687
min          1.000000
25%          2.000000
50%          4.000000
75%         13.000000
max       3522.000000
Name: user_id, dtype: float64

In [6]:
max_seq_len = 100

In [279]:
class FastLabelEncoder:
    def __init__(self):
        self.token_to_id = dict()
        self.id_to_token = dict()
        self.unk_id = 1
        self.pad_id = 0
        self.current_id = 2

    def __len__(self):
        return len(self.token_to_id) + 2

    def _pad(self, seq, pad):
        if pad < 0: return seq
        if len(seq) > pad: return seq[:pad]
        return seq + [self.pad_id] * (pad - len(seq))

    def fit(self, X):
        for row in X:
            for token in row:
                if token not in self.token_to_id:
                    self.token_to_id[token] = self.current_id
                    self.id_to_token[self.current_id] = token
                    self.current_id += 1

        return self

    def transform(self, X, pad=-1):
        return np.array([self._pad([
              self.token_to_id[token] if token in self.token_to_id else self.unk_id
              for token in row
        ], pad) for row in X])
        
    def fit_transform(self, X, pad=-1):
        return self.fit(X).transform(X, pad)

In [74]:
%%time
ref_vals = df_train[df_train['action_type'].isin([
    'search for poi',
    'filter selection',
    'change of sort order',
    'search for destination'
])]['reference'].unique()

print('ref_vals', len(ref_vals))

ref_vals 36403
Wall time: 683 ms


In [93]:
%%time
meta_vals = np.array(list({i for items in df_meta['properties'].apply(lambda x: x.split('|')) for i in items}))

print('meta_vals', len(meta_vals))

meta_vals 157
Wall time: 5.02 s


In [130]:
%%time
action_val_dict = np.array(list(meta_vals) + list(ref_vals))

print('action_val_dict', len(action_val_dict))

action_val_dict 36560
Wall time: 62.5 ms


In [101]:
action_type_dict = df_train['action_type'].unique()

print('action_type_dict', len(action_type_dict))

action_type_dict 10


In [122]:
%%time
from collections import Counter

impr_counter = Counter()

def add_to_counter(l):
    for i in l:
        impr_counter[i] += 1

df_train[df_train['action_type'] == 'clickout item']['impressions'].progress_apply(
    lambda i: i.split('|')
).progress_apply(add_to_counter)

print('impr_counter', len(impr_counter))

HBox(children=(IntProgress(value=0, max=1586586), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1586586), HTML(value='')))

impr_counter 853540
Wall time: 27 s


In [127]:
impr_dict = [v[0] for v in impr_counter.most_common(100_000)]

print('impr_dict', len(impr_dict))

impr_dict 100000


In [167]:
impr_seq_len = len(df_train[df_train['action_type'] == 'clickout item']['impressions'].iloc[0].split('|'))
impr_seq_len

25

In [267]:
class Input:
    pad_shapes = {
        'type': tf.TensorShape([max_seq_len]),
        'value': tf.TensorShape([max_seq_len, max_meta_len]),
        'neg': tf.TensorShape([impr_seq_len]),
        'pos': tf.TensorShape([])
    }
    out_shapes = tuple(v for k, v in pad_shapes.items())

    def __init__(self):
        self.action_types_encoder = FastLabelEncoder().fit([action_type_dict])
        self.action_vals_encoder = FastLabelEncoder().fit([action_val_dict])
        self.labels_encoder = FastLabelEncoder().fit([impr_dict])
        

    def generator(self):
        last_session_id = None
        action_types = []
        action_vals = []
        MISSING_REF_VAL = '?'

        
        for _, row in df_train.iterrows(): # tqdm(df_train.iterrows(), total=len(df_train)):
            if row['session_id'] != last_session_id:
                action_types = []
                action_vals = []

            action_val = row['reference']

            if row['action_type'] != 'clickout item':
                action_types.append(row['action_type'])

                if action_val.isdigit():
                    int_id = int(action_val)
                    if int_id in df_meta.index:
                        action_vals.append(df_meta.loc[int_id]['properties'].split('|'))
                    else:
                        action_vals.append([action_val])
                else:
                    action_vals.append([action_val])
                    

                if len(action_types) > max_seq_len:
                    action_types.pop(0)
                    action_vals.pop(0)
            elif len(action_types) > 0:
                neg = [i for i in row['impressions'].split('|') if i != action_val]
                output = self.action_types_encoder.transform([action_types])[0], \
                         self.action_vals_encoder.transform(action_vals, pad=max_meta_len), \
                         self.labels_encoder.transform([neg])[0], \
                         self.labels_encoder.transform([[action_val]])[0][0]
                
                yield output

            last_session_id = row['session_id']

    def as_dataset(self, batch_size):
        dataset = tf.data.Dataset.from_generator(
            self.generator,
            (tf.int32,) * 4,
            (
                tf.TensorShape([None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None]),
                tf.TensorShape([])
            )
        )

        dataset = dataset.padded_batch(batch_size, padded_shapes=self.out_shapes).repeat()

        return dataset

In [245]:
# next(Input().generator())

In [246]:
# %%time
# i = Input().generator()
# next(i)
# for _ in range(5000):
#     next(i)

too slow 

https://stackoverflow.com/questions/24870953/does-pandas-iterrows-have-performance-issues

In [247]:
%%time
Input().as_dataset(batch_size=3).make_one_shot_iterator().get_next()

Wall time: 114 ms


(<tf.Tensor 'IteratorGetNext_11:0' shape=(?, 100) dtype=int32>,
 <tf.Tensor 'IteratorGetNext_11:1' shape=(?, 100, 112) dtype=int32>,
 <tf.Tensor 'IteratorGetNext_11:2' shape=(?, 25) dtype=int32>,
 <tf.Tensor 'IteratorGetNext_11:3' shape=(?,) dtype=int32>)

---

In [248]:
import SASRec.modules as sas_module

In [280]:
inp = Input()
dataset = inp.as_dataset(batch_size=3)

def mk_model():
    action_type_emb_size = 20
    action_val_emb_size = 20
    output_emb_size = 20
    
    type_input = layers.Input(shape=Input.pad_shapes['type'])
    value_input = layers.Input(shape=Input.pad_shapes['value'])
    pos_input = layers.Input(shape=Input.pad_shapes['pos'])
    neg_input = layers.Input(shape=Input.pad_shapes['neg'])

    action_type = layers.Embedding(
        input_dim=len(inp.action_types_encoder),
        output_dim=action_type_emb_size,
        mask_zero=True
    )(type_input)

    action_val = layers.Embedding(
        input_dim=len(inp.action_vals_encoder),
        output_dim=action_val_emb_size,
        mask_zero=True
    )(value_input)

    pos = layers.Embedding(
        input_dim=len(inp.labels_encoder),
        output_dim=output_emb_size,
        mask_zero=True
    )(pos_input)

    neg = layers.Embedding(
        input_dim=len(inp.labels_encoder),
        output_dim=output_emb_size,
        mask_zero=True
    )(neg_input)

    dimension_of_seq_features = 2
    action_val_reduced = tf.reduce_mean(action_val, dimension_of_seq_features)

    seq = (action_type + action_val_reduced) / 2

    # (batch, sequence, embedding)
    # seq, pos, neg
    # (<tf.Tensor 'Reshape_3:0' shape=(?, 1000) dtype=float32>,
    #  <tf.Tensor 'embedding_lookup_22/Identity_2:0' shape=(?, 20) dtype=float32>,
    #  <tf.Tensor 'Mean_6:0' shape=(?, 20) dtype=float32>)

    num_blocks = 2
    hidden_units = output_emb_size
    num_heads = 2
    dropout_rate = 0.5
    is_training = True

    mask = tf.to_float(tf.not_equal(seq, 0))

    for i in range(num_blocks):
        with tf.variable_scope('num_blocks_%d' % i):
            # self-attention
            seq = sas_module.multihead_attention(
                queries=sas_module.normalize(seq),
                keys=seq,
                num_units=hidden_units,
                num_heads=num_heads,
                dropout_rate=dropout_rate,
                is_training=is_training,
                causality=True,
                scope='self_attention'
            )

            # Feed forward
            seq = sas_module.feedforward(
                sas_module.normalize(seq),
                num_units=[hidden_units, hidden_units],
                dropout_rate=dropout_rate,
                is_training=is_training
            )

            seq *= mask

    seq = sas_module.normalize(seq)
    
    seq = tf.reshape(seq, [tf.shape(seq)[0], max_seq_len * output_emb_size])
    neg_emb = tf.reduce_mean(neg, 1)
    pos_emb = pos

    seq_logits = tf.keras.layers.Dense(pos.shape[1].value)(seq)
    pos_logits = tf.reduce_sum(pos_emb * seq_logits, -1)
    neg_logits = tf.reduce_sum(neg_emb * seq_logits, -1)
    
    loss = tf.reduce_sum(
        - tf.log(tf.sigmoid(pos_logits) + 1e-24)
        - tf.log(1 - tf.sigmoid(neg_logits) + 1e-24)
    )
    
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    loss += sum(reg_losses)
    
    tf.summary.scalar('loss', loss)
    
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta2=0.98).minimize(loss)
    
    return type_input, value_input, pos_input, neg_input, loss, optimizer

In [None]:
with tf.Graph().as_default():
    with tf.Session() as sess:
        type_input, value_input, pos_input, neg_input, loss, optimizer = mk_model()
        iterator = inp.as_dataset(batch_size=512).make_one_shot_iterator()
        t, v, n, p = iterator.get_next()

        for i in tqdm(range(1000)):
            t_val = t.eval(session=sess)
            v_val = v.eval(session=sess)
            n_val = n.eval(session=sess)
            p_val = p.eval(session=sess)

            sess.run(tf.global_variables_initializer())
            cost, _ = sess.run(
                [loss, optimizer],
                {type_input: t_val, value_input: v_val, pos_input: p_val, neg_input: n_val}
            )
            
            if i % 10 == 0:
                print('#', i, cost)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

# 0 704.35315
# 10 712.4947
# 20 716.7414
# 30 701.52844
# 40 709.5875
# 50 713.99927


In [None]:
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True, to_file='model.png')
from IPython.display import Image
Image(retina=True, filename='model.png')