<a href="https://colab.research.google.com/github/lengochai97/thesis/blob/master/notebooks/models/Offline_Part.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare environment

In [0]:
%%capture

import google.colab.drive

# Mount Google Drive under /content/gdrive; DATA_PATH used by the rest of the
# notebook lives below this mount point. force_remount=True is passed so the
# cell can be re-run on an already-mounted runtime (presumably it remounts
# instead of failing; confirm against the Colab docs).
google.colab.drive.mount('/content/gdrive', force_remount=True)

In [0]:
import functools
import glob
import os

import numpy as np
import pandas as pd
import tensorflow as tf

# Prepare dataset

In [0]:
# Root folder of the dataset on the mounted Drive. Training TFRecords are
# globbed from <DATA_PATH>/tfrecords/train and model weights are written to
# <DATA_PATH>/model further down in this notebook.
DATA_PATH = '/content/gdrive/My Drive/dataset/adressa/one_week'

In [0]:
# Schema passed to tf.io.parse_single_example by every parse_* function below.
# All entries are fixed-length: scalar int64 ids/labels, a scalar float32
# user activeness, and float32 vectors of the listed sizes. The "*Next" keys
# have the same sizes as their non-"Next" counterparts.
features = {
    'eventId': tf.io.FixedLenFeature([], tf.int64),
    'clickLabel': tf.io.FixedLenFeature([], tf.int64),
    'userActiveness': tf.io.FixedLenFeature([], tf.float32),
    'categoryVector': tf.io.FixedLenFeature([30], tf.float32),
    'newsClickCountVector': tf.io.FixedLenFeature([4], tf.float32),
    'contextVector': tf.io.FixedLenFeature([32], tf.float32),
    'userHistoryVector': tf.io.FixedLenFeature([30], tf.float32),
    'userProfileVector': tf.io.FixedLenFeature([120], tf.float32),
    'userClickCountVector': tf.io.FixedLenFeature([4], tf.float32),
    'userHistoryVectorNext': tf.io.FixedLenFeature([30], tf.float32),
    'userProfileVectorNext': tf.io.FixedLenFeature([120], tf.float32),
    'userClickCountVectorNext': tf.io.FixedLenFeature([4], tf.float32),
}


def parse_example(serialized):
  """Decode one serialized record into the full dictionary of derived tensors.

  Args:
    serialized: A single serialized example, parsed with the module-level
      `features` schema.

  Returns:
    A dict with the event id, click label and user activeness taken as-is,
    the context vector taken as-is, and derived vectors:
      * `news_features` / `user_features` / `user_features_next`: a base
        vector concatenated with log(click_counts + 1).
      * `user_news_features` / `user_news_features_next`: element-wise
        product of the category vector and a user history vector.
  """
  parsed = tf.io.parse_single_example(serialized, features)

  def with_log_counts(vector_key, count_key):
    # Base vector followed by log(count + 1) of the matching click counts.
    return tf.concat(
        [parsed[vector_key], tf.math.log(parsed[count_key] + 1.)], 0)

  def category_times(history_key):
    # Product over the stacked pair == element-wise category * history.
    return tf.math.reduce_prod(
        [parsed['categoryVector'], parsed[history_key]], axis=0)

  return {
      'event_id': parsed['eventId'],
      'click_label': parsed['clickLabel'],
      'user_activeness': parsed['userActiveness'],
      'news_features': with_log_counts(
          'categoryVector', 'newsClickCountVector'),
      'user_features': with_log_counts(
          'userProfileVector', 'userClickCountVector'),
      'user_features_next': with_log_counts(
          'userProfileVectorNext', 'userClickCountVectorNext'),
      'user_news_features': category_times('userHistoryVector'),
      'user_news_features_next': category_times('userHistoryVectorNext'),
      'context_features': parsed['contextVector'],
  }

In [0]:
def parse_inputs_targets(serialized):
  """Decode one serialized record into an `(inputs, targets)` training pair.

  Args:
    serialized: A single serialized example, parsed with the module-level
      `features` schema.

  Returns:
    A tuple `(inputs, targets)` where `inputs` is a dict keyed by
    `news_features`, `user_features`, `user_news_features` and
    `context_features`, and `targets` is the raw `clickLabel` value.
  """
  record = tf.io.parse_single_example(serialized, features)

  # Category vector followed by log(count + 1) of the news click counts.
  news_features = tf.concat(
      [record['categoryVector'],
       tf.math.log(record['newsClickCountVector'] + 1.)],
      0)

  # User profile vector followed by log(count + 1) of the user click counts.
  user_features = tf.concat(
      [record['userProfileVector'],
       tf.math.log(record['userClickCountVector'] + 1.)],
      0)

  # Element-wise product of the category vector and the user history vector.
  user_news_features = tf.math.reduce_prod(
      [record['categoryVector'], record['userHistoryVector']], axis=0)

  inputs = dict(
      news_features=news_features,
      user_features=user_features,
      user_news_features=user_news_features,
      context_features=record['contextVector'],
  )

  return inputs, record['clickLabel']

def parse_inputs_targets_with_user_activeness(serialized, user_activeness_coef):
  """Decode one record into `(inputs, targets)` with an activeness-adjusted target.

  Args:
    serialized: A single serialized example, parsed with the module-level
      `features` schema.
    user_activeness_coef: Python number; weight applied to the record's
      `userActiveness` before it is added to the click label.

  Returns:
    A tuple `(inputs, targets)`. `inputs` is a dict keyed by `news_features`,
    `user_features`, `user_news_features` and `context_features`. `targets`
    is `float32(clickLabel) + user_activeness_coef * userActiveness`.
  """
  parsed = tf.io.parse_single_example(serialized, features)

  category = parsed['categoryVector']

  inputs = {}
  # Category vector followed by log(count + 1) of the news click counts.
  inputs['news_features'] = tf.concat(
      [category, tf.math.log(parsed['newsClickCountVector'] + 1.)], 0)
  # User profile vector followed by log(count + 1) of the user click counts.
  inputs['user_features'] = tf.concat(
      [parsed['userProfileVector'],
       tf.math.log(parsed['userClickCountVector'] + 1.)],
      0)
  # Element-wise product of the category vector and the user history vector.
  inputs['user_news_features'] = tf.math.reduce_prod(
      [category, parsed['userHistoryVector']], axis=0)
  inputs['context_features'] = parsed['contextVector']

  coef = tf.constant(user_activeness_coef, tf.float32)
  click = tf.dtypes.cast(parsed['clickLabel'], tf.float32)
  targets = click + coef * parsed['userActiveness']

  return inputs, targets

In [0]:
def build_train_dataset(filepaths, batch_size, epochs, user_activeness_coef=None):
  """Build the batched training dataset from GZIP-compressed TFRecord files.

  Args:
    filepaths: Path(s) of the TFRecord files to read.
    batch_size: Number of parsed records per batch.
    epochs: Number of times the batched dataset is repeated.
    user_activeness_coef: Optional number. When None, targets are the raw
      click labels (`parse_inputs_targets`). Otherwise targets are produced
      by `parse_inputs_targets_with_user_activeness` with this coefficient.

  Returns:
    A `tf.data` dataset yielding `(inputs, targets)` batches, with a
    prefetch buffer of one batch.
  """
  if user_activeness_coef is None:
    parse_fn = parse_inputs_targets
  else:
    # Bind the coefficient BY KEYWORD. functools.partial prepends positional
    # arguments, so a positional bind would fill the first parameter
    # (`serialized`) with the coefficient and hand the serialized record to
    # `user_activeness_coef` when the dataset calls the function.
    parse_fn = functools.partial(
        parse_inputs_targets_with_user_activeness,
        user_activeness_coef=user_activeness_coef)

  dataset = tf.data.TFRecordDataset(filepaths, 'GZIP')

  dataset = (
      dataset
      .map(parse_fn)
      .batch(batch_size)
      .repeat(epochs)
      .prefetch(1)
  )

  return dataset

In [0]:
# Shared by every dataset built in this notebook: records per batch and the
# number of times the dataset is repeated (passed to build_train_dataset).
batch_size = 1024
epochs = 1

In [0]:
# Every file under <DATA_PATH>/tfrecords/train, sorted so the read order is
# deterministic across runs.
filepaths = sorted(glob.glob(os.path.join(DATA_PATH, 'tfrecords', 'train', '*')))

# No user_activeness_coef is given, so the targets are the raw click labels.
train_dataset = build_train_dataset(filepaths, batch_size, epochs)

# Define models

In [0]:
from tensorflow.keras.activations import relu, sigmoid
from tensorflow.keras.layers import Add, Concatenate, Dense, Dot, Input, Lambda, Subtract
from tensorflow.keras.losses import BinaryCrossentropy, MeanSquaredError
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

## 1. Logistic Regression

In [0]:
def build_lr(input_info):
  """Build and compile the logistic-regression model.

  Args:
    input_info: Iterable of `(name, shape)` pairs; one Keras `Input` is
      created per pair.

  Returns:
    A compiled Keras `Model`: all inputs concatenated and fed to a single
    sigmoid unit, optimised with Adam on binary cross-entropy.
  """
  input_layers = []
  for name, shape in input_info:
    input_layers.append(Input(shape=shape, name=name))

  merged = Concatenate()(input_layers)
  prediction = Dense(1, activation=sigmoid)(merged)

  lr_model = Model(inputs=input_layers, outputs=prediction)
  lr_model.compile(Adam(), loss=BinaryCrossentropy())

  return lr_model

## 2. Factorization Machines

In [0]:
def build_fm(input_info, k_latent):
  """Build and compile the factorization-machine style model.

  Each scalar column of the concatenated input gets its own `Dense(1)`
  (first-order term) and its own `Dense(k_latent)` (latent factor f_i).
  The pairwise term is computed as sum_i <S - f_i, f_i> with S = sum_j f_j,
  i.e. the sum of <f_i, f_j> over all ordered pairs i != j. First-order and
  pairwise terms are added and passed through a final sigmoid unit.

  Args:
    input_info: Iterable of `(name, shape)` pairs; one Keras `Input` is
      created per pair.
    k_latent: Size of the latent factor produced for each input column.

  Returns:
    A compiled Keras `Model` (Adam optimizer, binary cross-entropy loss).
  """
  inputs = [Input(shape=shape, name=name) for name, shape in input_info]

  inputs_concat = Concatenate()(inputs)

  # int(...) accepts both a Dimension object and a plain int, whereas
  # `.value` only exists on Dimension objects.
  n_columns = int(inputs_concat.shape[1])

  # `i=i` freezes the column index per layer. A plain closure would look `i`
  # up when the function body runs, so any later re-execution of these
  # Lambda layers would slice the last column for every layer.
  inputs_flat = [
      Lambda(lambda x, i=i: x[:, i:i+1])(inputs_concat)
      for i in range(n_columns)
  ]

  # First-order term: one scalar per column.
  biases = [Dense(1)(x) for x in inputs_flat]

  # Latent factor of size k_latent per column.
  factors = [Dense(k_latent)(x) for x in inputs_flat]

  # S = sum of all factors; S - f_i is the sum of every *other* factor.
  s = Add()(factors)

  diffs = [Subtract()([s, x]) for x in factors]

  # <S - f_i, f_i> for each column i.
  dots = [Dot(axes=1)([d, x]) for d, x in zip(diffs, factors)]

  outputs = Add()(dots + biases)
  outputs = Dense(1, activation=sigmoid)(outputs)

  model = Model(inputs=inputs, outputs=outputs)

  model.compile(Adam(), loss=BinaryCrossentropy())

  return model

## 3. Wide & Deep

In [0]:
def build_wd(input_info):
  """Build and compile the Wide & Deep model.

  Args:
    input_info: Iterable of `(name, shape)` pairs; one Keras `Input` is
      created per pair.

  Returns:
    A compiled Keras `Model` (Adam optimizer, binary cross-entropy loss)
    whose sigmoid output is computed from the raw concatenated inputs (wide
    part) joined with a 256 -> 128 ReLU stack over the same inputs (deep
    part).
  """
  input_layers = []
  for name, shape in input_info:
    input_layers.append(Input(shape=shape, name=name))

  # Two separate concatenations of the same inputs: one feeds the deep
  # stack, the other is the wide part passed straight to the output unit.
  deep_in = Concatenate()(input_layers)
  wide_part = Concatenate()(input_layers)

  deep_part = deep_in
  for units in (256, 128):
    deep_part = Dense(units, activation=relu)(deep_part)

  joined = Concatenate()([wide_part, deep_part])
  prediction = Dense(1, activation=sigmoid)(joined)

  wd_model = Model(inputs=input_layers, outputs=prediction)
  wd_model.compile(Adam(), loss=BinaryCrossentropy())

  return wd_model

## 4. DN

In [0]:
def build_dqn(input_info, state_indices):
  """Build and compile the two-stream (value / advantage) regression model.

  Args:
    input_info: Iterable of `(name, shape)` pairs; one Keras `Input` is
      created per pair.
    state_indices: Positions (into `input_info`) of the inputs that feed the
      value stream. The advantage stream always sees every input.

  Returns:
    A compiled Keras `Model` (Adam optimizer, mean-squared-error loss). Each
    stream is a 256 -> 128 ReLU stack ending in a linear unit; the two
    scalars are concatenated and combined by a final linear unit.
  """
  input_layers = [Input(shape=shape, name=name) for name, shape in input_info]

  def tower(tensor):
    # 256 -> 128 ReLU hidden layers followed by one linear output unit.
    hidden = Dense(256, activation=relu)(tensor)
    hidden = Dense(128, activation=relu)(hidden)
    return Dense(1)(hidden)

  all_features = Concatenate()(input_layers)
  state_features = Concatenate()([input_layers[i] for i in state_indices])

  # Value stream is built first, then the advantage stream.
  value_out = tower(state_features)
  advantage_out = tower(all_features)

  streams = Concatenate()([value_out, advantage_out])
  q_value = Dense(1)(streams)

  dqn_model = Model(inputs=input_layers, outputs=q_value)
  dqn_model.compile(Adam(), loss=MeanSquaredError())

  return dqn_model

# Train models

In [0]:
# (name, shape) of every model input. Names match the keys of the `inputs`
# dict produced by the parse functions; sizes follow from the schema:
#   news_features      = 30 (categoryVector)    + 4 (log click counts)
#   user_features      = 120 (userProfileVector) + 4 (log click counts)
#   user_news_features = 30 (categoryVector * userHistoryVector)
#   context_features   = 32 (contextVector)
input_info = (
    ('news_features', (34,)),
    ('user_features', (124,)),
    ('user_news_features', (30,)),
    ('context_features', (32,)),
)

# Positions in input_info used by build_dqn's value stream:
# 1 -> user_features, 3 -> context_features.
state_indices = (1, 3)

## 1. Logistic Regression

In [0]:
# Fresh, compiled logistic-regression model over the inputs in input_info.
lr = build_lr(input_info)

In [0]:
# Train on the click-label dataset; batching and repetition are already
# baked into train_dataset by build_train_dataset.
lr.fit(train_dataset)

In [0]:
# Persist the trained weights to <DATA_PATH>/model/lr_weights.h5, replacing
# any existing file. NOTE(review): assumes the 'model' folder already exists.
lr.save_weights(os.path.join(DATA_PATH, 'model', 'lr_weights.h5'), overwrite=True)

## 2. Factorization Machines

In [0]:
# Fresh, compiled factorization-machine model with latent factors of size 2.
fm = build_fm(input_info, k_latent=2)

In [0]:
# Train on the same click-label dataset used for the other click models.
fm.fit(train_dataset)

In [0]:
# Persist the trained weights to <DATA_PATH>/model/fm_weights.h5, replacing
# any existing file. NOTE(review): assumes the 'model' folder already exists.
fm.save_weights(os.path.join(DATA_PATH, 'model', 'fm_weights.h5'), overwrite=True)

## 3. Wide & Deep

In [0]:
# Fresh, compiled Wide & Deep model over the inputs in input_info.
wd = build_wd(input_info)

In [0]:
# Train on the same click-label dataset used for the other click models.
wd.fit(train_dataset)

In [0]:
# Persist the trained weights to <DATA_PATH>/model/wd_weights.h5, replacing
# any existing file. NOTE(review): assumes the 'model' folder already exists.
wd.save_weights(os.path.join(DATA_PATH, 'model', 'wd_weights.h5'), overwrite=True)

## 4. DN

In [0]:
# Fresh, compiled two-stream model; the value stream uses the inputs selected
# by state_indices, the advantage stream uses all inputs.
dqn = build_dqn(input_info, state_indices)

In [0]:
# Regress (MSE loss) onto the raw click labels from train_dataset.
dqn.fit(train_dataset)

In [0]:
# Persist the trained weights to <DATA_PATH>/model/dqn_weights.h5, replacing
# any existing file. NOTE(review): assumes the 'model' folder already exists.
dqn.save_weights(os.path.join(DATA_PATH, 'model', 'dqn_weights.h5'), overwrite=True)

In [0]:
# Weight of the userActiveness term added to the click label in the targets.
user_activeness_coef = 0.05

# Same training files as before (re-globbed so this cell does not depend on
# the earlier `filepaths` value).
filepaths = sorted(glob.glob(os.path.join(DATA_PATH, 'tfrecords', 'train', '*')))

# Passing a coefficient switches build_train_dataset to the parser whose
# targets are clickLabel + user_activeness_coef * userActiveness.
train_dataset_with_user_activeness = build_train_dataset(filepaths, batch_size, epochs, user_activeness_coef)

In [0]:
# Second, independently initialised instance of the two-stream model; this
# one is trained on the activeness-adjusted targets.
dqnu = build_dqn(input_info, state_indices)

In [0]:
# Regress (MSE loss) onto the activeness-adjusted targets.
dqnu.fit(train_dataset_with_user_activeness)

In [0]:
# Persist the trained weights to <DATA_PATH>/model/dqnu_weights.h5, replacing
# any existing file. NOTE(review): assumes the 'model' folder already exists.
dqnu.save_weights(os.path.join(DATA_PATH, 'model', 'dqnu_weights.h5'), overwrite=True)