In [1]:
import os
# set device GPU
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

In [2]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.preprocessing import sequence
import numpy as np
from tqdm import tqdm
from IPython.display import clear_output
import time
from tensorflow import keras
from tensorflow.keras import layers

tf.enable_eager_execution()
tf.test.is_gpu_available()

import sys
sys.path.append('../')

from config import *

from tools import *



import warnings

warnings.filterwarnings('ignore')
tf.debugging.set_log_device_placement(True)

In [3]:
tf.config.experimental.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
COLS_NAME = ['ad_id', 'product_id', 'advertiser_id']
BATCH_SIZE = 256
BUFFER_SIZE = 1024

### 载入数据

In [5]:
# tr_user_log = pd.read_pickle(TRAIN_DIR+USER_LOG_PATH)
# ts_user_log = pd.read_pickle(TEST_DIR+USER_LOG_PATH)

# tr_ad_id_log = pd.read_pickle(TRAIN_DIR+AD_INFO_PATH)
# ts_ad_id_log = pd.read_pickle(TEST_DIR+AD_INFO_PATH)

# tr_df = pd.concat([tr_user_log, tr_ad_id_log], axis=1)
# ts_df = pd.concat([ts_user_log, ts_ad_id_log], axis=1)

# tr_df = tr_df[['user_id', 'age', 'gender', 'ad_id', 'product_id', 'advertiser_id', 'click_times']]
# ts_df = ts_df[['user_id', 'ad_id', 'product_id', 'advertiser_id', 'click_times']]

# tr_df['age'] = tr_df['age'] - 1
# tr_df['gender'] = tr_df['gender'] - 1

### 载入gensim预训练词典

In [6]:
vocab_sizes = []
wv_matrixes = []
for col in COLS_NAME:
    #load w2v matrix
    f = open(TRAIN_DIR+'gensim_%s_dict.js'%col,'r')
    a = f.read()
    vocab_dict = eval(a)
    f.close()
    filter_keys = set(vocab_dict.keys())
    vocab_size = len(filter_keys) + 1

    wv_matrix = np.load(TRAIN_DIR+'gensim_%s.npy'%col)
    row = np.random.uniform(size=(1, wv_matrix.shape[1]))

    wv_matrix = np.concatenate([row, wv_matrix], axis=0)

#     ### process and get click list
#     tr_df[col] = tr_df[col].astype(str)
#     ts_df[col] = ts_df[col].astype(str)

#     tr_df = tr_df[tr_df[col].isin(filter_keys)]
#     ts_df = ts_df[ts_df[col].isin(filter_keys)]

#     tr_df[col] = tr_df[col].map(lambda x: vocab_dict[x]) + 1
#     ts_df[col] = ts_df[col].map(lambda x: vocab_dict[x]) + 1
    
    vocab_sizes.append(vocab_size)
    wv_matrixes.append(wv_matrix)

### 提取点击列表和对应的广告信息

In [7]:
# def get_clk_list(df):
#     return list(df.values)

# tr_df = tr_df.groupby(['user_id']).agg({'age': 'first', 
#                                         'gender': 'first', 
#                                         'click_times': lambda x: get_clk_list(x), 
#                                         'ad_id': lambda x: get_clk_list(x),
#                                         'product_id': lambda x: get_clk_list(x),
#                                         'advertiser_id': lambda x: get_clk_list(x)}).reset_index()

# ts_df = ts_df.groupby(['user_id']).agg({'click_times': lambda x: get_clk_list(x), 
#                                         'ad_id': lambda x: get_clk_list(x),
#                                         'product_id': lambda x: get_clk_list(x),
#                                         'advertiser_id': lambda x: get_clk_list(x)}).reset_index()

# tr_df.to_pickle('/home/baode/huangzc/tencent/data/train_preliminary/3shuru.pkl')

# ts_df.to_pickle('/home/baode/huangzc/tencent/data/test/3shuru.pkl')

In [8]:
tr_df = pd.read_pickle('/home/baode/huangzc/tencent/data/train_preliminary/3shuru.pkl')
ts_df = pd.read_pickle('/home/baode/huangzc/tencent/data/test/3shuru.pkl')

### 切分 train 和 test 数据集

In [9]:
msk = np.random.rand(len(tr_df)) <= 0.8
vl_df = tr_df[~msk]
tr_df = tr_df[msk]

### 配置超参数

In [10]:
n1 = 4
n2 = 3

d_model = 128
dff = 256
num_heads = 4

dropout_rate = 0.1
EPOCHS = 1

### padding 和mask

In [11]:
### cut lenght and pad sequence
sentence_size = int(min(tr_df['ad_id'].map(lambda x: len(x)).quantile(0.99), ts_df['ad_id'].map(lambda x: len(x)).quantile(0.99)))
print('max len: ', sentence_size)

### pad or trunc
def pad_or_trunc(t):
    dim = tf.size(t)
    return tf.cond(tf.equal(dim, sentence_size), lambda: t,
                    lambda: tf.cond(tf.greater(dim, sentence_size), lambda: tf.slice(t, [0], [sentence_size]), 
                                     lambda: tf.concat([t, tf.zeros(dtype=tf.int64, shape=sentence_size-dim)], 0)))

### padding mask
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    # 添加额外的维度来将填充加到
    # 注意力对数（logits）。
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

### pool mask
def create_pooling_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    # 添加额外的维度来将填充加到
    # 注意力对数（logits）。
    return seq[:, :, tf.newaxis]  # (batch_size, seq_len, 1)

# # ## train pad_or_truc
# vals = tf.constant([[1, 1, 1], [2, 2, 2], [3, 3, 3]], dtype=tf.int64)
# dset1 = tf.data.Dataset.from_tensor_slices(vals)
# y1 = tf.data.Dataset.from_tensor_slices(tf.constant([1, 1, 1], dtype=tf.int64))
# y2 = tf.data.Dataset.from_tensor_slices(tf.constant([1, 1, 1], dtype=tf.int64))
# y = tf.data.Dataset.zip((y1, y2))
# dset1 = tf.data.Dataset.zip((dset1, y))
# dset2 = dset1.map(lambda x, pair: (pad_or_trunc(x), pair))

# for li in dset2.take(1):
#     print(li)

# # ## test pad_or_truc
# vals = tf.constant([[1, 1, 1], [2, 2, 2], [3, 3, 3]], dtype=tf.int64)
# dset1 = tf.data.Dataset.from_tensor_slices(vals)
# dset2 = dset1.map(pad_or_trunc)

# for li in dset2.take(1):
#     print(li)

max len:  153


### 生成dataset

In [12]:
### make train dataset
def gen():
    for row in tr_df.itertuples():
        ad_id_li, product_id_li, advertiser_id_li, clk_li, age, gender = getattr(row, COLS_NAME[0]),\
                                                                         getattr(row, COLS_NAME[1]),\
                                                                         getattr(row, COLS_NAME[2]),\
                                                                         getattr(row, 'click_times'), \
                                                                         getattr(row, 'age'), \
                                                                         getattr(row, 'gender')

        yield (ad_id_li, product_id_li, advertiser_id_li, clk_li, (age, gender))

tr_ds = tf.data.Dataset.from_generator(
     gen,
     (tf.int64, tf.int64, tf.int64, tf.int64, (tf.int64, tf.int64)), 
     (tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None]), (tf.TensorShape([]), tf.TensorShape([]))))

### make valid dataset
def gen():
    for row in vl_df.itertuples():
        ad_id_li, product_id_li, advertiser_id_li, clk_li, age, gender = getattr(row, COLS_NAME[0]),\
                                                                         getattr(row, COLS_NAME[1]),\
                                                                         getattr(row, COLS_NAME[2]),\
                                                                         getattr(row, 'click_times'), \
                                                                         getattr(row, 'age'), \
                                                                         getattr(row, 'gender')

        yield (ad_id_li, product_id_li, advertiser_id_li, clk_li, (age, gender))

vl_ds = tf.data.Dataset.from_generator(
     gen,
     (tf.int64, tf.int64, tf.int64, tf.int64, (tf.int64, tf.int64)), 
     (tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None]), (tf.TensorShape([]), tf.TensorShape([]))))

### make test dataset
def gen():
    for row in ts_df.itertuples():
        ad_id_li, product_id_li, advertiser_id_li, clk_li = getattr(row, COLS_NAME[0]),\
                                                             getattr(row, COLS_NAME[1]),\
                                                             getattr(row, COLS_NAME[2]),\
                                                             getattr(row, 'click_times')
        yield (ad_id_li, product_id_li, advertiser_id_li, clk_li)

ts_ds = tf.data.Dataset.from_generator(
     gen,
     (tf.int64, tf.int64, tf.int64, tf.int64), 
     (tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None])))

Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0


In [13]:
tr_ds = tr_ds.map(lambda ad, product, advertiser, clk_times, pair: (pad_or_trunc(ad), 
                                                                     pad_or_trunc(product), 
                                                                     pad_or_trunc(advertiser), 
                                                                     pad_or_trunc(clk_times), 
                                                                     pair), num_parallel_calls=tf.data.experimental.AUTOTUNE)

tr_ds = tr_ds.cache()
tr_ds = tr_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
tr_ds = tr_ds.prefetch(tf.data.experimental.AUTOTUNE)

vl_ds = vl_ds.map(lambda ad, product, advertiser, clk_times, pair: (pad_or_trunc(ad), 
                                                                     pad_or_trunc(product), 
                                                                     pad_or_trunc(advertiser), 
                                                                     pad_or_trunc(clk_times), 
                                                                     pair), num_parallel_calls=tf.data.experimental.AUTOTUNE)

vl_ds = vl_ds.cache()
vl_ds = vl_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
vl_ds = vl_ds.prefetch(tf.data.experimental.AUTOTUNE)

ts_ds = ts_ds.map(lambda ad, product, advertiser, clk_times: (pad_or_trunc(ad), 
                                                               pad_or_trunc(product), 
                                                               pad_or_trunc(advertiser), 
                                                               pad_or_trunc(clk_times)), num_parallel_calls=tf.data.experimental.AUTOTUNE)

Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op CacheDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ShuffleDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0


In [14]:
tf.reset_default_graph()

### 优化器和学习率

In [15]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [16]:
# with tf.device('/gpu:6'):
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

Executing op Cast in device /job:localhost/replica:0/task:0/device:GPU:0


### 创建模型

In [17]:
import transformer
import imp
imp.reload(transformer)

<module 'transformer' from '/home/baode/huangzc/tencent/code/tencent_competition/Transformer/transformer.py'>

In [18]:
from transformer import *

In [19]:
class TransLSTM(tf.keras.layers.Layer):
    def __init__(self, n1, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, weights,
                  n2, lstm_dims,
                  pool_size, pool_strides,
                  rate):
        super(TransLSTM, self).__init__()
            
        self.n2 = n2        
        self.transformer_enc = Encoder(num_layers=n1, 
                                       d_model=d_model, 
                                       num_heads=num_heads, 
                                       dff=dff, 
                                       input_vocab_size=input_vocab_size,
                                       maximum_position_encoding=maximum_position_encoding, 
                                       rate=rate, 
                                       weights=weights)
        
        self.lstm = [tf.keras.layers.LSTM(lstm_dims, return_sequences=True) for _ in range(n2)]
        self.max_pool = tf.keras.layers.MaxPooling1D(pool_size=pool_size, strides=pool_strides, padding='valid')
        self.flatten = tf.keras.layers.Flatten()   
        
    def call(self, inputs, training, mask, attention_mask, pool_mask):
        x = inputs
        x = self.transformer_enc(x, training, mask, attention_mask)
        for i in range(self.n2):
            x = self.lstm[i](x)  
        x *= pool_mask
        x = self.max_pool(x)
        x = self.flatten(x)   

        return x

In [20]:
### test
# temp = TransLSTM(n1=1, d_model=32, num_heads=2, dff=10, input_vocab_size=60, maximum_position_encoding=60, weights=None,
#                                             n2=1, lstm_dims=32,
#                                             pool_size=2, pool_strides=2,
#                                             rate=0.1)

# y = tf.random.uniform((1, 60))  # (batch_size, encoder_sequence, d_model)
# out = temp(y, training=False, mask=None, attention_mask=None, pool_mask=tf.ones((1,60,1)))
# out.shape

In [21]:
class MyModel(tf.keras.Model):
    def __init__(self, n1, n2, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding,
                 pool_size=2, pool_strides=1, rate=0.1, weights=None,
                 lstm_dims=32,
                 dim1=64, dim2=32, num_class1=10, num_class2=2):
        '''
         n1: number of layers in transformer encoder;
         n2: number of layers in lstm;
         d_model: dimension of embedding;
         num_heads: number of heads in multi-head attention;
         dff: dimension of feed forward innner layer;
         input_vocab_size: vocab size;
         max_position_encoding: ;
         rate: dropout rate;
         weights: pre-trained embedding weights;
        '''
        super(MyModel, self).__init__()
        self.transLSTM = []
        if weights is None:
            for i, _ in enumerate(COLS_NAME):
                self.transLSTM.append(TransLSTM(n1, d_model, num_heads, dff, input_vocab_size[i], maximum_position_encoding[i], weights,
                                                n2, lstm_dims,
                                                pool_size, pool_strides,
                                                rate))
        else:
            for i, _ in enumerate(COLS_NAME):
                self.transLSTM.append(TransLSTM(n1, d_model, num_heads, dff, input_vocab_size[i], maximum_position_encoding[i], weights[i],
                                                n2, lstm_dims,
                                                pool_size, pool_strides,
                                                rate))            
        
        self.concat = tf.keras.layers.Concatenate(axis=-1)
        self.dense1 = tf.keras.layers.Dense(dim1, activation='relu', name='dense1')
        self.dense2 = tf.keras.layers.Dense(dim2, activation='relu', name='dense2')
        self.dense3_age = tf.keras.layers.Dense(num_class1, activation='softmax', name='softmax1')
        self.dense_gender = tf.keras.layers.Dense(num_class2, activation='softmax', name='softmax2')

    def call(self, x, training, mask, attention_mask, pool_mask):
        res = []
        for i, layer in enumerate(self.transLSTM):
            res.append(layer(x[i], training, mask, attention_mask, pool_mask))

        x = self.concat(res)
        x = self.dense1(x)
        x = self.dense2(x)
        return self.dense3_age(x), self.dense_gender(x)

In [22]:
model = MyModel(n1=n1, n2=n2, d_model=d_model, num_heads=num_heads, dff=dff, 
                input_vocab_size=vocab_sizes, maximum_position_encoding=vocab_sizes, 
                weights=wv_matrixes)

Executing op Cast in device /job:localhost/replica:0/task:0/device:GPU:0


In [23]:
### test
# model((tf.random.uniform((1, 60)), 
#        tf.random.uniform((1, 60)), 
#        tf.random.uniform((1, 60)), 
#        tf.random.uniform((1, 60))), 
#       training=False,
#       mask=tf.random.uniform((1, 1, 1, 60)),       
#       attention_mask=tf.random.uniform((1, 1, 1, 60)),
#       pool_mask=tf.random.uniform((1,60,1)))

### 损失函数和metric

In [24]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')

def loss_function(real, pred):
    pred_age = pred[0]
    real_age = real[0]

    pred_gender = pred[1]
    real_gender = real[1]

    loss1 = loss_object(real_age, pred_age)
    loss2 = loss_object(real_gender, pred_gender)

    loss_ = 0.5*loss1 + 0.5*loss2
    return tf.reduce_mean(loss_)

### train metric
train_loss = tf.keras.metrics.Mean(name='train_loss')
age_train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
gender_train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

### test metric
valid_loss = tf.keras.metrics.Mean(name='valid_loss')
age_valid_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='valid_accuracy')
gender_valid_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='valid_accuracy')

Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op LogicalNot in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Assert in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0


### 开始训练

In [25]:
@tf.function
def train_step(inp, tar):

    mask = create_padding_mask(inp[0])
    attention_mask = tf.cast(inp[-1][:, tf.newaxis, tf.newaxis, :], tf.float32)
    pool_mask = create_pooling_mask(inp[0])

    with tf.GradientTape() as tape:
        predictions = model(inp, 
                            training=False, 
                            mask=mask, 
                            attention_mask=attention_mask,
                            pool_mask=pool_mask
                            )
        loss = loss_function(tar, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    age_train_accuracy(tar[0], predictions[0])
    gender_train_accuracy(tar[1], predictions[1])
    return loss 

@tf.function
def valid_step(inp, tar):

    mask = create_padding_mask(inp[0])
    attention_mask = tf.cast(inp[-1][:, tf.newaxis, tf.newaxis, :], tf.float32)
    pool_mask = create_pooling_mask(inp[0])

    predictions = model(inp, 
                        training=False, 
                        mask=mask, 
                        attention_mask=attention_mask,
                        pool_mask=pool_mask
                        )
    loss = loss_function(tar, predictions)

    valid_loss(loss)
    age_valid_accuracy(tar[0], predictions[0])
    gender_valid_accuracy(tar[1], predictions[1])
    return loss

In [27]:
for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()
    age_train_accuracy.reset_states()
    gender_train_accuracy.reset_states()

    '''
    inp1: ad_id list;
    inp2: product_id list;
    inp3: advertiser_id list;
    inp4: click_times list;
    '''
    for (batch, (inp1, inp2, inp3, inp4, tar)) in enumerate(tr_ds):
        train_step((inp1, inp2, inp3, inp4), tar)

        if batch % 100 == 0:
            print ('Epoch {} Batch {} Loss {:.4f} Age-Accuracy {:.4f} Gender-Accuracy {:.4f} Time taken for training: {} secs'
                   .format(epoch + 1, batch, train_loss.result(), 
                           age_train_accuracy.result(), gender_train_accuracy.result(),
                          time.time()-start))
            start = time.time()
            
    tmp = time.time()
    for inp1, inp2, inp3, inp4, tar in vl_ds:
        valid_step((inp1, inp2, inp3, inp4), tar)    
    print('########################################## valid ################################################')
    print('Epoch {} Batch {} Loss {:.4f} Age-Accuracy {:.4f} Gender-Accuracy {:.4f} \
          Time taken for validation: {} secs'
          .format(epoch + 1, batch, 
                  valid_loss.result(), age_valid_accuracy.result(), gender_valid_accuracy.result(),
                  time.time() - tmp))
    
       
#         if (epoch + 1) % 5 == 0:
#             ckpt_save_path = ckpt_manager.save()
#             print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
#                                                                  ckpt_save_path))

#             print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
#                                                         train_loss.result(), 
#                                                         train_accuracy.result()))

#             print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 1.4927 Age-Accuracy 0.1250 Gender-Accuracy 0.5000 Time taken for training: 2.544782876968384 secs
Epoch 1 Batch 100 Loss 1.2898 Age-Accuracy 0.2021 Gender-Accuracy 0.7806 Time taken for training: 192.59902548789978 secs
Epoch 1 Batch 200 Loss 1.1810 Age-Accuracy 0.2424 Gender-Accuracy 0.8381 Time taken for training: 191.30518531799316 secs
Epoch 1 Batch 300 Loss 1.1082 Age-Accuracy 0.2707 Gender-Accuracy 0.8629 Time taken for training: 189.51898741722107 secs
Epoch 1 Batch 400 Loss 1.0588 Age-Accuracy 0.2915 Gender-Accuracy 0.8753 Time taken for training: 190.8416042327881 secs
Epoch 1 Batch 500 Loss 1.0233 Age-Accuracy 0.3067 Gender-Accuracy 0.8841 Time taken for training: 190.17677426338196 secs
Epoch 1 Batch 600 Loss 0.9991 Age-Accuracy 0.3176 Gender-Accuracy 0.8894 Time taken for training: 189.46304559707642 secs
Epoch 1 Batch 700 Loss 0.9796 Age-Accuracy 0.3270 Gender-Accuracy 0.8940 Time taken for training: 193.56990027427673 secs
Epoch 1 Batch 800 Loss 0.964