In [1]:
import tensorflow as tf
import tensorflow.keras.layers as layers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange
from itertools import islice
from collections import OrderedDict

from utils import get_stats, PackNumericFeatures, get_normalize_function

tf.__version__, tf.keras.__version__

('2.2.0', '2.3.0-tf')

In [2]:
train = tf.data.experimental.make_csv_dataset("data/train_small.tsv",
                                           field_delim="\t",
                                           batch_size=1_000,
#                                            compression_type="GZIP",
                                           label_name="Click",
                                           num_epochs=1)

test = tf.data.experimental.make_csv_dataset("data/test.tsv",
                                           field_delim="\t",
                                           batch_size=10_000,
                                           label_name="Click",
                                           num_epochs=1)


NUMERIC_FEATURES = ["Depth", "Position", "Gender", "Age", "UserID", "AdvertiserId"]
TOKENS_FEATURES = ["AdKeyword_tokens", "AdTitle_tokens", "AdDescription_tokens", "Query_tokens"]

def tokenize(features, labels):
    # TODO: improve
    tokenized_features = []
    feature_dict = OrderedDict()
    for column in TOKENS_FEATURES:
        tokens_list_list = tf.strings.split(features[column], sep="|")
        print(tokens_list_list)
        feature_dict[column] = tokens_list_list
    return feature_dict, labels
    
            
train_transformed = train\
    .map(PackNumericFeatures(NUMERIC_FEATURES))\
    .map(tokenize)

test_transformed = test\
    .map(PackNumericFeatures(NUMERIC_FEATURES))\
    .map(tokenize)


numeric_column = tf.feature_column.numeric_column(
    "numeric", shape=(len(NUMERIC_FEATURES), ), normalizer_fn=get_normalize_function(NUMERIC_FEATURES)
)
numeric_columns = [numeric_column]

next(iter(train_transformed))[0]["AdKeyword_tokens"].shape

tf.RaggedTensor(values=Tensor("StringSplit/StringSplitV2:1", shape=(None,), dtype=string), row_splits=Tensor("StringSplit/RaggedFromValueRowIds/concat:0", shape=(None,), dtype=int64))
tf.RaggedTensor(values=Tensor("StringSplit_1/StringSplitV2:1", shape=(None,), dtype=string), row_splits=Tensor("StringSplit_1/RaggedFromValueRowIds/concat:0", shape=(None,), dtype=int64))
tf.RaggedTensor(values=Tensor("StringSplit_2/StringSplitV2:1", shape=(None,), dtype=string), row_splits=Tensor("StringSplit_2/RaggedFromValueRowIds/concat:0", shape=(None,), dtype=int64))
tf.RaggedTensor(values=Tensor("StringSplit_3/StringSplitV2:1", shape=(None,), dtype=string), row_splits=Tensor("StringSplit_3/RaggedFromValueRowIds/concat:0", shape=(None,), dtype=int64))
tf.RaggedTensor(values=Tensor("StringSplit/StringSplitV2:1", shape=(None,), dtype=string), row_splits=Tensor("StringSplit/RaggedFromValueRowIds/concat:0", shape=(None,), dtype=int64))
tf.RaggedTensor(values=Tensor("StringSplit_1/StringSplitV2:1", shape

TensorShape([1000, None])

---
#### Token columns
AdKeyword_tokens, AdTitle_tokens, AdDescription_tokens, Query_tokens

In [3]:
ad_keyword_tokens = tf.feature_column.categorical_column_with_vocabulary_file("AdKeyword_tokens", "data/vocab_adkeyword.txt", vocabulary_size=2_000, default_value=0)
ad_title_tokens = tf.feature_column.categorical_column_with_vocabulary_file("AdTitle_tokens", "data/vocab_adtitle.txt", vocabulary_size=2_000, default_value=0)
ad_description_tokens = tf.feature_column.categorical_column_with_vocabulary_file("AdDescription_tokens", "data/vocab_addescription.txt", vocabulary_size=2_000, default_value=0)
query_tokens = tf.feature_column.categorical_column_with_vocabulary_file("Query_tokens", "data/vocab_query.txt", vocabulary_size=2_000, default_value=0)

In [13]:
ad_keyword_column = tf.feature_column.embedding_column(ad_keyword_tokens, dimension=10, combiner='mean', trainable=True)
ad_title_column = tf.feature_column.embedding_column(ad_title_tokens, dimension=10, combiner='mean', trainable=True)
ad_description_column = tf.feature_column.embedding_column(ad_description_tokens, dimension=10, combiner='mean', trainable=True)
query_column = tf.feature_column.embedding_column(query_tokens, dimension=10, combiner='mean', trainable=True)

In [5]:
numeric_columns = [numeric_column]
embedding_columns = [ad_keyword_column, ad_title_column, ad_description_column, query_column]
feature_columns = [numeric_column, *embedding_columns]

In [8]:
numeric_layer = tf.keras.layers.DenseFeatures(feature_columns)
model = tf.keras.Sequential([
    numeric_layer,
    tf.keras.layers.Dense(1024, activation="relu"),
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")]
)

In [9]:
def schedule(epoch, lr):
    print(epoch)
    if epoch == 0:
        return 0.1
    elif epoch == 1:
        return 0.01
    else:
        return 0.001
scheduler = tf.keras.callbacks.LearningRateScheduler(schedule, verbose=1)

history = model.fit(train_transformed, validation_data=test_transformed, epochs=10, callbacks=[scheduler])

0

Epoch 00001: LearningRateScheduler reducing learning rate to 0.1.
Epoch 1/10


ValueError: in user code:

    /home/przemyslaw/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /home/przemyslaw/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/przemyslaw/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/przemyslaw/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/przemyslaw/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:531 train_step  **
        y_pred = self(x, training=True)
    /home/przemyslaw/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py:927 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    /home/przemyslaw/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/sequential.py:291 call
        outputs = layer(inputs, **kwargs)
    /home/przemyslaw/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py:891 __call__
        'input to an uniform tensor.' % (self.name, inputs))

    ValueError: Layer dense_features_2 does not support RaggedTensors as input. Inputs received: OrderedDict([('AdKeyword_tokens', tf.RaggedTensor(values=Tensor("RaggedFromVariant_1/RaggedTensorFromVariant:1", shape=(None,), dtype=string), row_splits=Tensor("RaggedFromVariant_1/RaggedTensorFromVariant:0", shape=(None,), dtype=int64))), ('AdTitle_tokens', tf.RaggedTensor(values=Tensor("RaggedFromVariant_2/RaggedTensorFromVariant:1", shape=(None,), dtype=string), row_splits=Tensor("RaggedFromVariant_2/RaggedTensorFromVariant:0", shape=(None,), dtype=int64))), ('AdDescription_tokens', tf.RaggedTensor(values=Tensor("RaggedFromVariant/RaggedTensorFromVariant:1", shape=(None,), dtype=string), row_splits=Tensor("RaggedFromVariant/RaggedTensorFromVariant:0", shape=(None,), dtype=int64))), ('Query_tokens', tf.RaggedTensor(values=Tensor("RaggedFromVariant_3/RaggedTensorFromVariant:1", shape=(None,), dtype=string), row_splits=Tensor("RaggedFromVariant_3/RaggedTensorFromVariant:0", shape=(None,), dtype=int64)))]). You can try converting your input to an uniform tensor.
