In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange
from itertools import islice
import functools

tf.__version__

'2.2.0'

In [2]:
# Both splits are parsed with identical options, so bind those once and vary
# only the file path. For gzipped inputs, add compression_type="GZIP".
# NOTE(review): column dtypes are inferred from the data; the saved sample
# batch in this notebook shows DisplayURL as float32, which cannot hold
# ~1e19 identifiers exactly -- consider passing explicit column_defaults.
read_click_tsv = functools.partial(
    tf.data.experimental.make_csv_dataset,
    field_delim="\t",
    batch_size=10_000,
    label_name="Click",
    num_epochs=1,
)

train = read_click_tsv("data/train.tsv")
test = read_click_tsv("data/test.tsv")

In [3]:
# Pull one batch from each pipeline to confirm that both files parse.
# Only the last expression of a cell is echoed, so the train batch is fetched
# and discarded; the value displayed is the first test batch.
next(iter(train))
next(iter(test))

(OrderedDict([('DisplayURL',
               <tf.Tensor: shape=(10000,), dtype=float32, numpy=
               array([1.2057879e+19, 7.9039147e+18, 1.7682627e+19, ..., 1.5145480e+19,
                      1.1363724e+19, 1.7299639e+18], dtype=float32)>),
              ('AdId',
               <tf.Tensor: shape=(10000,), dtype=int32, numpy=
               array([20163506, 21162251, 21484741, ..., 21954124, 21096632,  9584481],
                     dtype=int32)>),
              ('AdvertiserId',
               <tf.Tensor: shape=(10000,), dtype=int32, numpy=array([27961,  1325, 37279, ..., 23807, 30405, 23637], dtype=int32)>),
              ('Depth',
               <tf.Tensor: shape=(10000,), dtype=int32, numpy=array([1, 1, 2, ..., 2, 1, 2], dtype=int32)>),
              ('Position',
               <tf.Tensor: shape=(10000,), dtype=int32, numpy=array([1, 1, 2, ..., 1, 1, 1], dtype=int32)>),
              ('UserID',
               <tf.Tensor: shape=(10000,), dtype=int32, numpy=
               a

In [4]:
# Per-feature summary statistics: the CSV has one row per feature, named by
# its "column" field. Re-orient the table so that each feature is a column
# and each statistic (e.g. "mean", "sd") is a row: stats.loc["mean", feature].
#
# The feature names are moved into the index *before* transposing. Transposing
# them together with the numbers would mix strings into every column and
# force the whole table to dtype=object.
raw_stats = pd.read_csv("data/stats.csv")
stats = raw_stats.set_index("column").T

In [5]:
def get_stats(column):
    """Return the ``(mean, sd)`` pair recorded for one feature.

    Reads the notebook-level ``stats`` table, which is laid out with one row
    per statistic and one column per feature.

    Args:
        column: Feature name, i.e. a column label of ``stats``.

    Returns:
        Tuple ``(mean, sd)`` of scalars.

    Raises:
        KeyError: If ``column`` (or the "mean"/"sd" row) is missing from
            ``stats``.
    """
    # Statistics are row labels and features are column labels, so both
    # values are plain label-based scalar lookups.
    return stats.loc["mean", column], stats.loc["sd", column]

In [6]:
class PackNumericFeatures:
    """Dataset-map callable that merges named columns into one tensor.

    Each column listed in ``names`` is removed from the feature mapping, cast
    to float32, and stacked along a new last axis. The stacked tensor is
    stored back under the key ``'numeric'``.
    """

    def __init__(self, names):
        # Feature keys to pack, in the order they appear on the last axis.
        self.names = names

    def __call__(self, features, labels):
        """Pack the configured columns and return ``(features, labels)``.

        ``features`` is modified in place: the packed keys are popped and
        ``'numeric'`` is added. ``labels`` is passed through untouched.
        """
        packed = []
        for key in self.names:
            packed.append(tf.cast(features.pop(key), tf.float32))
        features['numeric'] = tf.stack(packed, axis=-1)
        return features, labels

In [7]:
# Columns that get folded into the single dense "numeric" feature.
NUMERIC_FEATURES = ["Depth", "Position", "Gender", "Age", "UserID", "AdvertiserId"]

# The packer only stores the list of names, so one instance serves both splits.
pack_numeric = PackNumericFeatures(NUMERIC_FEATURES)
packed_train_data = train.map(pack_numeric)
packed_test_data = test.map(pack_numeric)

In [8]:
def normalize(data):
    """Standardise the packed numeric features: ``(data - mean) / sd``.

    The per-feature mean and sd are read from the notebook-level ``stats``
    table in ``NUMERIC_FEATURES`` order and broadcast against the last axis
    of ``data``.

    Args:
        data: Array or tensor whose last axis has one entry per name in
            ``NUMERIC_FEATURES``.

    Returns:
        The standardised values, same shape as ``data``.
    """
    # Convert to plain float32 vectors up front. The rows of `stats` may be
    # object-dtype pandas Series, and arithmetic against those depends on
    # implicit conversion (and yields pandas objects for plain arrays).
    # NOTE(review): a feature whose sd is 0 would divide by zero here.
    mean = stats.loc["mean", NUMERIC_FEATURES].to_numpy(dtype=np.float32)
    std = stats.loc["sd", NUMERIC_FEATURES].to_numpy(dtype=np.float32)
    return (data - mean) / std

In [9]:
# One dense feature column over the packed "numeric" tensor, with one slot per
# entry of NUMERIC_FEATURES; values pass through `normalize` when read.
numeric_column = tf.feature_column.numeric_column(
    key="numeric",
    shape=(len(NUMERIC_FEATURES),),
    normalizer_fn=normalize,
)
numeric_columns = [numeric_column]

# Model

In [15]:
# Initial learning rate handed to the Adam optimizers in this notebook; it
# equals the value `schedule` returns for epoch 0.
lr = 0.1

def schedule(epoch, lr):
    """Step-decay learning-rate schedule for ``LearningRateScheduler``.

    Args:
        epoch: Zero-based epoch index.
        lr: Current learning rate. Part of the callback's signature but
            unused: the returned rate depends on the epoch only.

    Returns:
        0.1 for epoch 0, 0.01 for epoch 1, and 0.001 for every other epoch.
    """
    # No print() here: the callback is created with verbose=1 and already
    # logs the rate it applies at the start of each epoch.
    rate_by_epoch = {0: 0.1, 1: 0.01}
    return rate_by_epoch.get(epoch, 0.001)
# Keras callback that asks `schedule` for the learning rate each epoch and,
# with verbose=1, logs the value it applies.
scheduler = tf.keras.callbacks.LearningRateScheduler(
    schedule=schedule,
    verbose=1,
)

In [16]:
# Input layer that materialises the feature columns as one dense tensor.
numeric_layer = tf.keras.layers.DenseFeatures(numeric_columns)

# Sanity check on a single batch: expect (batch_size, len(NUMERIC_FEATURES)).
sample_features = next(iter(packed_train_data))[0]
numeric_layer(sample_features).shape

TensorShape([10000, 6])

In [19]:
# Three-hidden-layer MLP over the normalised numeric features, ending in a
# single sigmoid unit for the binary "Click" label.
model = tf.keras.Sequential([
    numeric_layer,
    tf.keras.layers.Dense(2048, activation="relu"),
    tf.keras.layers.Dense(512, activation="relu"),
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    loss=tf.keras.losses.BinaryCrossentropy(),
    # NOTE(review): the saved training log shows accuracy ~0.95 next to
    # AUC ~0.51, so accuracy alone looks uninformative here -- watch AUC.
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
)

In [None]:
# Train the deep model; `scheduler` sets the learning rate at each epoch.
history = model.fit(
    packed_train_data,
    validation_data=packed_test_data,
    epochs=4,
    callbacks=[scheduler],
)

0

Epoch 00001: LearningRateScheduler reducing learning rate to 0.1.
Epoch 1/4
   1309/Unknown - 807s 617ms/step - loss: 0.5794 - accuracy: 0.9501 - auc: 0.5062

In [68]:
# Shallow baseline: one 1024-unit hidden layer over the same numeric input
# layer, ending in a single sigmoid unit for the binary "Click" label.
nn = tf.keras.Sequential([
    numeric_layer,
    tf.keras.layers.Dense(1024, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

nn.compile(
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
)

In [69]:
# Train the shallow baseline; `scheduler` sets the learning rate at each epoch.
history = nn.fit(
    packed_train_data,
    validation_data=packed_test_data,
    epochs=3,
    callbacks=[scheduler],
)

0

Epoch 00001: LearningRateScheduler reducing learning rate to 0.1.
Epoch 1/10
1

Epoch 00002: LearningRateScheduler reducing learning rate to 0.01.
Epoch 2/10
2

Epoch 00003: LearningRateScheduler reducing learning rate to 0.001.
Epoch 3/10

KeyboardInterrupt: 