# Define Features

In [11]:
import numpy as np
import pandas as pd



In [12]:
def get_item_numeric_features(df):
    """Extract the item-side numeric feature columns as a NumPy array.

    Args:
        df: DataFrame containing the four item ``*_min_max`` columns
            listed below.

    Returns:
        Array with one row per DataFrame row and one column per feature,
        in the order the column names are listed.
    """
    item_columns = [
        'all_rating_min_max',
        'members_min_max',
        'aired_from_min_max',
        'aired_to_min_max',
    ]
    return df[item_columns].to_numpy()

def get_user_numeric_features(df):
    """Extract the user-side numeric feature columns as a NumPy array.

    Args:
        df: DataFrame containing the four ``user_*_min_max`` columns
            listed below.

    Returns:
        Array with one row per DataFrame row and one column per feature,
        in the order the column names are listed.
    """
    user_columns = [
        'user_rating_ave_min_max',
        'user_rating_std_min_max',
        'user_aired_from_ave_min_max',
        'user_aired_to_ave_min_max',
    ]
    selected = df[user_columns]
    return selected.to_numpy()

In [13]:
def get_multihot_feature(df, feat_name):
    """Expand a column of per-row vectors into a 2-D array.

    Args:
        df: DataFrame holding the column.
        feat_name: name of the column whose cells are each a vector.

    Returns:
        Array with one row per DataFrame row, built by unwrapping the
        single cell of every row.
    """
    def _unwrap(row):
        # Each row of the single-column matrix holds exactly one cell.
        return row[0]

    single_column = df[[feat_name]].to_numpy()
    return np.apply_along_axis(_unwrap, 1, single_column)

In [14]:
def get_label(df):
    """Return the ``label`` column as a column-vector array (one row per sample)."""
    return df[['label']].to_numpy()

In [15]:
def get_all_features(df):
    """Build the four model inputs from one DataFrame.

    Returns:
        A 4-tuple, in this order: item genres multi-hot array, user
        liked-genres multi-hot array, item numeric features, user numeric
        features.
    """
    item_genres = get_multihot_feature(df, 'genres_multihot')
    user_genres = get_multihot_feature(df, 'user_liked_genres_multihot')
    item_numeric = get_item_numeric_features(df)
    user_numeric = get_user_numeric_features(df)
    return item_genres, user_genres, item_numeric, user_numeric

# MLP

- x1: item categorical feature (multi-hot genres)
- x2: user categorical feature (multi-hot liked genres)
- x3: item numeric features
- x4: user numeric features

In [10]:
import tensorflow as tf
import tensorflow.keras as keras

In [1]:
from tensorboard.plugins.hparams import api as hp

In [43]:
# Hyperparameter search space for the MLP sweep.
HP_LAYERS = hp.HParam("layers", hp.IntInterval(2, 3))  # number of hidden layers
HP_LAYER_SIZE = hp.HParam("layer_size", hp.Discrete([64, 128, 256]))  # units per hidden layer
HP_LEARN_RATE = hp.HParam("learn_rate", hp.Discrete([0.001, 0.003, 0.01]))  # Adam learning rate

HPARAMS = [
    HP_LAYERS,
    HP_LAYER_SIZE,
    HP_LEARN_RATE
]

# Metrics shown in the HParams dashboard. The tag must match the scalar tag
# written by the TF2 Keras TensorBoard callback, which logs per-epoch values
# as "epoch_<metric>" under the `train` and `validation` sub-directories of
# each run; a bare "loss" tag is never written, so the dashboard would show
# no metric values.
METRICS = [
    hp.Metric(
        "epoch_loss",
        group="train",
        display_name="loss (train)",
    ),
    hp.Metric(
        "epoch_loss",
        group="validation",
        display_name="loss (val)",
    ),
]

In [30]:
def build_model(x1_shape, x2_shape, x3_shape, x4_shape, hparams):
    """Build and compile the four-input MLP binary classifier.

    Args:
        x1_shape: width of the x1 input vector.
        x2_shape: width of the x2 input vector.
        x3_shape: width of the x3 input vector.
        x4_shape: width of the x4 input vector.
        hparams: mapping indexed by HP_LAYERS (number of hidden layers),
            HP_LAYER_SIZE (units per hidden layer) and HP_LEARN_RATE
            (Adam learning rate).

    Returns:
        A compiled Keras model taking ``[x1, x2, x3, x4]`` and producing a
        single sigmoid output, trained with binary cross-entropy and
        reporting accuracy.
    """
    inputs = [
        keras.layers.Input(shape=(width,))
        for width in (x1_shape, x2_shape, x3_shape, x4_shape)
    ]
    x1_in, x2_in, x3_in, x4_in = inputs

    # Project x1 and x2 down to 10 dimensions each (linear Dense, no activation).
    x1_dense = keras.layers.Dense(10)(x1_in)
    x2_dense = keras.layers.Dense(10)(x2_in)

    # Join the two projections with the x3 / x4 inputs, which pass through as-is.
    net = keras.layers.concatenate([x1_dense, x2_dense, x3_in, x4_in])

    # Stack of identical ReLU hidden layers.
    for _ in range(hparams[HP_LAYERS]):
        net = keras.layers.Dense(hparams[HP_LAYER_SIZE], activation='relu')(net)

    # Single sigmoid unit for the binary target.
    prediction = keras.layers.Dense(1, activation='sigmoid')(net)

    classifier = keras.models.Model(inputs=inputs, outputs=prediction)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=hparams[HP_LEARN_RATE]),
        metrics=['accuracy'],
    )
    return classifier

# Load dataset

In [20]:
import os

In [21]:
def data_files(data_dir='../../data/anime/dnn_feat_eng'):
    """List every ``.parquet`` file under ``data_dir``, recursively.

    Args:
        data_dir: root directory to search. Defaults to the notebook's
            feature-engineering output directory.

    Returns:
        Sorted list of file paths (each joined onto its walk root). Empty
        when the directory does not exist or holds no parquet files.
    """
    filenames = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(data_dir)
        for file in files
        if file.endswith('.parquet')
    ]
    # os.walk yields entries in arbitrary order; sort so that callers taking a
    # slice of the list (e.g. the first file) get the same files on every run.
    filenames.sort()
    return filenames

In [22]:
# Collect the parquet file paths once at notebook level.
# NOTE(review): the variable is spelled `filesnames` and no visible cell reads
# it; run_model() calls data_files() itself.
filesnames = data_files()

In [44]:
# Entry point for hyperparameter tuning.
# Enumerates every hyperparameter combination and trains a model for each.
def test_params():
    """Run the full grid sweep over layers x layer size x learning rate.

    Writes the HParams experiment configuration into the ``hparams`` log
    directory, then calls ``run_model`` once per combination with a
    sequential model id starting at 0.
    """
    with tf.summary.create_file_writer('hparams').as_default():
        hp.hparams_config(hparams=HPARAMS, metrics=METRICS)

    # IntInterval bounds are inclusive, hence the +1.
    layer_counts = range(HP_LAYERS.domain.min_value, HP_LAYERS.domain.max_value + 1)

    # Layers vary slowest and learning rate fastest.
    grid = [
        {
            HP_LAYERS: n_layers,
            HP_LAYER_SIZE: layer_size,
            HP_LEARN_RATE: learn_rate,
        }
        for n_layers in layer_counts
        for layer_size in HP_LAYER_SIZE.domain.values
        for learn_rate in HP_LEARN_RATE.domain.values
    ]

    for model_id, hparams in enumerate(grid):
        run_model(model_id, hparams)

# Build and train one model for a given hyperparameter combination.
def run_model(model_id, hparams):
    """Train a single model and log the run for the HParams dashboard.

    Args:
        model_id: integer id of the run; used as the prefix of the log
            directory name and printed before training.
        hparams: dict mapping each hp.HParam to its value for this run.

    The model is built with fixed input widths (43, 43, 4, 4) and trained
    for 4 epochs with batch size 16 on the first file returned by
    ``data_files()``, holding out 20% of it for validation. Logs go to
    ``hparams/<model_id>_<name>-<value>...``.
    """
    # build model
    model = build_model(43, 43, 4, 4, hparams)
    named_values = {param.name: value for param, value in hparams.items()}
    print(f"\nmodel id: {model_id}:")
    print(named_values)

    # One log directory per run, named after the id and every hparam value.
    run_name = f"{model_id}" + "".join(
        f"_{name}-{value}" for name, value in named_values.items()
    )
    log_dir = os.path.join("hparams", run_name)

    callbacks = [
        tf.keras.callbacks.TensorBoard(
            log_dir=log_dir,
            update_freq=10,
            profile_batch=0,  # profiling disabled
        ),
        hp.KerasCallback(log_dir, hparams),
    ]

    # train model; only the first data file is used
    for path in data_files()[:1]:
        frame = pd.read_parquet(path)

        x1, x2, x3, x4 = get_all_features(frame)
        labels = get_label(frame)

        model.fit(
            [x1, x2, x3, x4],
            labels,
            validation_split=0.2,
            batch_size=16,
            epochs=4,
            callbacks=callbacks,
        )

In [45]:
# Delete the existing `hparams` log directory, then run the full sweep,
# which writes its logs back into `hparams`.
%rm -rf hparams
test_params()


model id: 0:
{'layers': 2, 'layer_size': 64, 'learn_rate': 0.001}
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

model id: 1:
{'layers': 2, 'layer_size': 64, 'learn_rate': 0.003}
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

model id: 2:
{'layers': 2, 'layer_size': 64, 'learn_rate': 0.01}
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

model id: 3:
{'layers': 2, 'layer_size': 128, 'learn_rate': 0.001}
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

model id: 4:
{'layers': 2, 'layer_size': 128, 'learn_rate': 0.003}
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

model id: 5:
{'layers': 2, 'layer_size': 128, 'learn_rate': 0.01}
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

model id: 6:
{'layers': 2, 'layer_size': 256, 'learn_rate': 0.001}
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

model id: 7:
{'layers': 2, 'layer_size': 256, 'learn_rate': 0.003}
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

model id: 8:
{'layers': 2, 'layer_size': 256, 'learn_rate': 0.01}
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

model id: 9:
{'layers': 3, 'layer


model id: 13:
{'layers': 3, 'layer_size': 128, 'learn_rate': 0.003}
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

model id: 14:
{'layers': 3, 'layer_size': 128, 'learn_rate': 0.01}
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

model id: 15:
{'layers': 3, 'layer_size': 256, 'learn_rate': 0.001}
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

model id: 16:
{'layers': 3, 'layer_size': 256, 'learn_rate': 0.003}
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

model id: 17:
{'layers': 3, 'layer_size': 256, 'learn_rate': 0.01}
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [47]:
# Load the TensorBoard notebook extension and open it on the `hparams` log directory.
%load_ext tensorboard
%tensorboard --logdir hparams

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 69402), started 2 days, 23:08:51 ago. (Use '!kill 69402' to kill it.)

# Test

In [None]:
# Stack each collection of test arrays row-wise into a single array per input.
# NOTE(review): test_x1s, test_x2s, test_x3s, test_x4s and test_ys are not
# defined in any visible cell; they must be built before this cell can run.
test_x1 = np.vstack(test_x1s)
test_x2 = np.vstack(test_x2s)
test_x3 = np.vstack(test_x3s)
test_x4 = np.vstack(test_x4s)
test_y = np.vstack(test_ys)

In [None]:
# Evaluate on the stacked test arrays and print the two returned values.
# NOTE(review): no visible cell defines a notebook-level `model` (run_model
# keeps its model local and returns nothing), so this cell cannot run as-is.
# Unpacking into (loss, accuracy) presumably assumes a model compiled the way
# build_model does it (metrics=['accuracy']); confirm.
test_loss, test_accuracy = model.evaluate([test_x1, test_x2, test_x3, test_x4], test_y)
print('test loss', test_loss)
print('test accuracy', test_accuracy)

In [None]:
# Save the model under the path `mlp_model`.
# NOTE(review): relies on a notebook-level `model` that no visible cell defines.
model.save('mlp_model')