## Define Model
 - x1: item categorical feature
- x2: user categorical feature
- x3: item numeric features
- x4: user numeric features

In [1]:
import tensorflow as tf
import tensorflow.keras as keras

In [2]:
from tensorboard.plugins.hparams import api as hp

In [3]:
# HP_LAYERS = hp.HParam("layers", hp.IntInterval(2, 3))
# HP_LAYER_SIZE = hp.HParam("layer_size", hp.Discrete([64, 128, 256]))
# HP_LEARN_RATE = hp.HParam("learn_rate", hp.Discrete([0.001, 0.003, 0.01]))

HP_LAYERS = hp.HParam("layers", hp.IntInterval(2, 3))
HP_LAYER_SIZE = hp.HParam("layer_size", hp.Discrete([64, 128, 256]))
HP_LEARN_RATE = hp.HParam("learn_rate", hp.Discrete([0.001, 0.003, 0.01]))

HPARAMS = [HP_LAYERS, HP_LAYER_SIZE, HP_LEARN_RATE]

METRICS = [
    hp.Metric(
        "batch_loss",
        group="train",
        display_name="loss (train)",
    ),
    hp.Metric(
        "loss",
        group="validation",
        display_name="loss (val)",
    ),
]

In [4]:
def build_model(x1_shape, x2_shape, x3_shape, x4_shape, hparams):
    x1_input = keras.layers.Input(shape=(x1_shape,))
    x2_input = keras.layers.Input(shape=(x2_shape,))
    x3_input = keras.layers.Input(shape=(x3_shape,))
    x4_input = keras.layers.Input(shape=(x4_shape,))

    # compact embedding for x1 and x2
    compact_x1 = keras.layers.Dense(10)(x1_input) # 降维到10维
    compact_x2 = keras.layers.Dense(10)(x2_input) # 降维到10维

    # concat all
    merge = keras.layers.concatenate([compact_x1, compact_x2, x3_input, x4_input])

    # hidden layers (n-th)
    h_input = merge
    for _ in range(hparams[HP_LAYERS]):
        h = keras.layers.Dense(hparams[HP_LAYER_SIZE], activation="relu")(h_input)
        h_input = h

    # output (yes / no => sigmoid)
    output = keras.layers.Dense(1, activation="sigmoid")(h_input)
    model = keras.models.Model(
        inputs=[x1_input, x2_input, x3_input, x4_input], outputs=output
    )

    # optimizer
    opt = keras.optimizers.Adam(
        learning_rate=hparams[HP_LEARN_RATE]
    )

    # compile
    model.compile(
        loss="binary_crossentropy",
        optimizer=opt,
        metrics=["accuracy"]
    )

    return model

In [5]:
model = build_model(
    43,
    43,
    4,
    4,
    {HP_LAYERS: HP_LAYERS.domain.min_value, HP_LAYER_SIZE: 64, HP_LEARN_RATE: 0.001},
)

TypeError: build_model() missing 1 required positional argument: 'hparams'

In [None]:
keras.utils.plot_model(model, show_shapes=True, show_layer_names=True)

## Feature extraction


In [6]:
import numpy as np
import pandas as pd

In [7]:
def get_item_numeric_features(df):
    numeric_feature_names = [
        "all_rating_min_max",
        "members_min_max",
        "aired_from_min_max",
        "aired_to_min_max",
    ]

    num_df = df[numeric_feature_names]
    return num_df.to_numpy()


def get_user_numeric_features(df):
    numeric_feature_names = [
        "user_rating_ave_min_max",
        "user_rating_std_min_max",
        "user_aired_from_ave_min_max",
        "user_aired_to_ave_min_max",
    ]

    num_df = df[numeric_feature_names]
    return num_df.to_numpy()


def get_multihot_feature(df, feat_name):
    feat_df = df[[feat_name]]
    feat_vecs = feat_df.to_numpy()
    feat_vec = np.apply_along_axis(lambda v: v[0], 1, feat_vecs)
    return feat_vec


def get_label(df):
    label_df = df[["label"]]
    return label_df.to_numpy()


def get_all_features(df):
    return (
        get_multihot_feature(df, "genres_multihot"),
        get_multihot_feature(df, "user_liked_genres_multihot"),
        get_item_numeric_features(df),
        get_user_numeric_features(df),
    )

In [8]:
import os

In [10]:
def data_files():
    filenames = []
    for root, dirs, files in os.walk("../anime-data/dnn_feat_eng"):
        for file in files:
            if file.endswith(".parquet"):
                filenames.append(os.path.join(root, file))

    return filenames

In [11]:
filenames = data_files()
filenames

['../anime-data/dnn_feat_eng/part-00008-d84f2fb3-ad0e-403f-8f8c-278b9701b758-c000.snappy.parquet',
 '../anime-data/dnn_feat_eng/part-00003-d84f2fb3-ad0e-403f-8f8c-278b9701b758-c000.snappy.parquet',
 '../anime-data/dnn_feat_eng/part-00004-d84f2fb3-ad0e-403f-8f8c-278b9701b758-c000.snappy.parquet',
 '../anime-data/dnn_feat_eng/part-00012-d84f2fb3-ad0e-403f-8f8c-278b9701b758-c000.snappy.parquet',
 '../anime-data/dnn_feat_eng/part-00005-d84f2fb3-ad0e-403f-8f8c-278b9701b758-c000.snappy.parquet',
 '../anime-data/dnn_feat_eng/part-00009-d84f2fb3-ad0e-403f-8f8c-278b9701b758-c000.snappy.parquet',
 '../anime-data/dnn_feat_eng/part-00002-d84f2fb3-ad0e-403f-8f8c-278b9701b758-c000.snappy.parquet',
 '../anime-data/dnn_feat_eng/part-00010-d84f2fb3-ad0e-403f-8f8c-278b9701b758-c000.snappy.parquet',
 '../anime-data/dnn_feat_eng/part-00007-d84f2fb3-ad0e-403f-8f8c-278b9701b758-c000.snappy.parquet',
 '../anime-data/dnn_feat_eng/part-00000-d84f2fb3-ad0e-403f-8f8c-278b9701b758-c000.snappy.parquet',
 '../anime

## Load Data and Train


In [12]:
test_x1s = []
test_x2s = []
test_x3s = []
test_x4s = []
test_ys = []

In [13]:
def run_model(model_id, hparams):
    # build model
    model = build_model(43, 43, 4, 4, hparams)
    print(f"model id: {model_id}:")
    print({h.name: hparams[h] for h in hparams})

    # config hparam logs
    log_filename = f"{model_id}"
    for h in hparams:
        log_filename += f"_{h.name}-{hparams[h]}"

    log_dir = os.path.join("hparams", log_filename)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir, update_freq=10, profile_batch=0
    )
    hparams_callback = hp.KerasCallback(log_dir, hparams)

    # train model
    for filename in filenames[:1]:
        df = pd.read_parquet(filename)

        # shuffle and split train and test
        train_df = df
        #         train_df = df.sample(frac=0.8, random_state=666)
        #         test_df = df.drop(train_df.index)

        # get features
        train_x1, train_x2, train_x3, train_x4 = get_all_features(train_df)
        #         val_x1, val_x2, val_x3, val_x4 = get_all_features(test_df)

        # get label
        train_y = get_label(train_df)
        #         val_y = get_label(test_df)

        print("training on new dataset")

        model.fit(
            [train_x1, train_x2, train_x3, train_x4],
            train_y,
            validation_split=0.2,
            batch_size=16,
            epochs=4, # reduce from 10 to 4, b/c of overfitting, the loss is increasing after #4 epoch
            callbacks=[tensorboard_callback, hparams_callback],
        )

In [14]:
def test_params():
    with tf.summary.create_file_writer("hparams").as_default():
        hp.hparams_config(hparams=HPARAMS, metrics=METRICS)

    model_id = 0
    for layers in range(HP_LAYERS.domain.min_value, HP_LAYERS.domain.max_value + 1):
        for size in HP_LAYER_SIZE.domain.values:
            for rate in HP_LEARN_RATE.domain.values:
                hparams = {HP_LAYERS: layers, HP_LAYER_SIZE: size, HP_LEARN_RATE: rate}

                run_model(model_id, hparams)
                model_id += 1

In [15]:
test_params()

model id: 0:
{'layers': 2, 'layer_size': 64, 'learn_rate': 0.001}
training on new dataset
Epoch 1/4
[1m26012/26012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 736us/step - accuracy: 0.7157 - loss: 0.5517 - val_accuracy: 0.7454 - val_loss: 0.5181
Epoch 2/4
[1m26012/26012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 710us/step - accuracy: 0.7423 - loss: 0.5183 - val_accuracy: 0.7429 - val_loss: 0.5208
Epoch 3/4
[1m26012/26012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 738us/step - accuracy: 0.7439 - loss: 0.5154 - val_accuracy: 0.7490 - val_loss: 0.5108
Epoch 4/4
[1m26012/26012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 709us/step - accuracy: 0.7452 - loss: 0.5141 - val_accuracy: 0.7428 - val_loss: 0.5169
model id: 1:
{'layers': 2, 'layer_size': 64, 'learn_rate': 0.003}
training on new dataset
Epoch 1/4
[1m26012/26012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 744us/step - accuracy: 0.7178 - loss: 0.5500 - val_accuracy: 0.7428

KeyboardInterrupt: 

In [None]:
%load_ext tensorboard
%tensorboard --logdir hparams

## Test

In [None]:
test_x1 = np.vstack(test_x1s)
test_x2 = np.vstack(test_x2s)
test_x3 = np.vstack(test_x3s)
test_x4 = np.vstack(test_x4s)
test_y = np.vstack(test_ys)

In [None]:
test_loss, test_accuracy = model.evaluate([test_x1, test_x2, test_x3, test_x4], test_y)

print("\n\nTest Loss {}, Test Accuracy {}".format(test_loss, test_accuracy))

In [None]:
n = 1
g = 0

for i in range(n):
    expect = test_y[i][0]
    x1 = test_x1[i]
    x2 = test_x2[i]
    x3 = test_x3[i]
    x4 = test_x4[i]
    xs = [np.array([x1]), np.array([x2]), np.array([x3]), np.array([x4])]
    print("xs")
    print(xs)

    predict = model.predict(xs)
    predict = predict[0][0]
    if predict > 0.5:
        predict = 1
    else:
        predict = 0

    if predict == expect:
        g += 1.0

#     print(f'Expect {expect}, predict {predict}')


print()
print("accuracy:")
print(g / n)

## Save Model


In [None]:
model.save("mlp_model")