In [1]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.preprocessing import sequence
import numpy as np
from tqdm import tqdm
from IPython.display import clear_output
import time
from tensorflow import keras
from tensorflow.keras import layers

tf.enable_eager_execution()
tf.test.is_gpu_available()

import sys
sys.path.append('../')

from config import *

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1" 

In [3]:
### Hyperparameter
###################
SAVE_PATH = '/home/huangzc/competition/tencent/model_ckpt/TRANSFORMER/model_all.ckpt'
EMB_SIZE = 50
BATCH_SIZE = 1024
EPOCHS = 100
TARGET = 'age'
COLS_NAME = ['creative_id', 'advertiser_id', 'ad_id', 'product_id', 'product_category', 'industry']

In [4]:
### Read data
### TARGET DF
tr_user_df = pd.read_pickle(TRAIN_DIR+USER_PATH)
tr_user_df = tr_user_df.groupby(['user_id']).agg({'age': 'first', 'gender': 'first'}).reset_index()

In [5]:
tr_ad_df = pd.read_pickle(TRAIN_DIR+AD_PATH)
ts_ad_df = pd.read_pickle(TEST_DIR+AD_PATH)

In [6]:
train_df = []
test_df = []
for col in tqdm(COLS_NAME):
    train_df.append(pd.read_pickle(TRAIN_DIR+CLK_PATH_DICT[col]))
    test_df.append(pd.read_pickle(TRAIN_DIR+CLK_PATH_DICT[col]))
    
train_df.append(tr_user_df)

100%|██████████| 6/6 [03:02<00:00, 30.42s/it]


In [7]:
def get_grid_df(creative_df, advertiser_df, ad_df, product_id_df, product_cat_df, industry_df, user_df=None):
    if user_df is  None:
        user_df = creative_df[['user_id']]
        user_df[TARGET] = np.nan
    assert user_df['user_id'].values.tolist() == creative_df['user_id'].values.tolist() \
    == ad_df['user_id'].values.tolist() == product_id_df['user_id'].values.tolist() \
    == product_cat_df['user_id'].values.tolist() == industry_df['user_id'].values.tolist() \
    == advertiser_df['user_id'].values.tolist()

    del advertiser_df['user_id'], ad_df['user_id'], product_id_df['user_id'], product_cat_df['user_id'], industry_df['user_id']
    
    grid_df = pd.concat([creative_df, advertiser_df, 
                         ad_df, product_id_df, 
                         product_cat_df, industry_df,
                         user_df[[TARGET]]], axis=1)

    return grid_df

In [8]:
grid_df = get_grid_df(*train_df)
grid_df_test = get_grid_df(*test_df)
grid_df[TARGET] = grid_df[TARGET] - 1

In [9]:
train, val = train_test_split(grid_df, test_size=0.2, random_state=2020)
test = grid_df_test

In [10]:
sentence_size = max(int(grid_df[COLS_NAME[0]].map(lambda x: len(x)).quantile(0.6)), 
                    int(grid_df_test[COLS_NAME[0]].map(lambda x: len(x)).quantile(0.6)))
print('choose sentences max len: %d' % (sentence_size))
print("Pad sequences (samples x time)") 

choose sentences max len: 29
Pad sequences (samples x time)


In [11]:
def pad_feature():  
    x_train = []
    x_val = []
    x_test = []
     
    for col in tqdm(COLS_NAME):
        x_train.append(sequence.pad_sequences(train[col],
                                             maxlen=sentence_size, 
                                             padding='post', 
                                             truncating='post',
                                             dtype='int64',
                                             value=0
                                             ))
        x_val.append(sequence.pad_sequences(val[col],
                                             maxlen=sentence_size, 
                                             padding='post', 
                                             truncating='post',
                                             dtype='int64',
                                             value=0
                                             ))
        x_test.append(sequence.pad_sequences(test[col], 
                                            maxlen=sentence_size, 
                                            padding='post',
                                            truncating='post',
                                            dtype='int64',
                                            value=0
                                           ))
    print('feature count: train->%d, valid->%d, test->%d' %(len(x_train), len(x_val), len(x_test)))
    return x_train, x_val, x_test

In [12]:
x_train, x_val, x_test = pad_feature()

100%|██████████| 6/6 [01:34<00:00, 15.68s/it]

feature count: train->6, valid->6, test->6





In [13]:
### Get tf dataset
def get_train_ds(x, y): 
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    dataset = dataset.shuffle(buffer_size=len(x))
    dataset = dataset.batch(BATCH_SIZE)
#     dataset = dataset.repeat(EPOCHS)
    return dataset

def get_test_ds(x, ): 
    dataset = tf.data.Dataset.from_tensor_slices(x).batch(BATCH_SIZE)
    return dataset

train_ds = get_train_ds(tuple(x_train), train[TARGET].values)
valid_ds = get_train_ds(tuple(x_val), val[TARGET].values)
test_ds = get_test_ds(tuple(x_test))

In [14]:
vocab_sizes = []
temp = pd.concat([tr_ad_df, ts_ad_df], axis=0)
for col in COLS_NAME:
    print(col)
    vocab_sizes.append(max(temp[col].unique().tolist()) + 1) ### padding 0 need add 1

creative_id
advertiser_id
ad_id
product_id
product_category
industry


In [15]:
# weights = np.load('/home/huangzc/competition/tencent/data/train_preliminary/gensim_dict.npy')

In [16]:
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=8, ):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output

In [17]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

In [18]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, emded_dim, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=emded_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=emded_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [19]:
### Construct Model
#################################
class MyModel(tf.keras.Model):
    def __init__(self):
        super(MyModel, self).__init__()
#         self.embedding = tf.keras.layers.Embedding(vocab_size, EMB_SIZE, weights=[weights])
        self.embeddings = []
        for s in vocab_sizes:
            self.embeddings.append(TokenAndPositionEmbedding(sentence_size, s, EMB_SIZE))
        self.concat = tf.keras.layers.Concatenate(axis=-1)
        
        ### Transformer
        self.transformer_block = TransformerBlock(EMB_SIZE*len(vocab_sizes), 5, 64)
        
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(32, activation='relu')
        self.dense3 = tf.keras.layers.Dense(10, activation='softmax')

    def call(self, inputs):
        embs = []
        for emb, inp in zip(self.embeddings, inputs):
            x = emb(inp)            
            embs.append(x)
        x = self.concat(embs)
        
        x = self.transformer_block(x)
        x = layers.GlobalAveragePooling1D()(x)
        
        x = self.dense1(x)
        x = self.dense2(x)
        return self.dense3(x)

model = MyModel()

In [20]:
# # Restore the weights
# model.load_weights(SAVE_PATH)

In [21]:
### Loss & Metric
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

optimizer = tf.keras.optimizers.Adam()

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

valid_loss = tf.keras.metrics.Mean(name='valid_loss')
valid_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='valid_accuracy')

In [22]:
@tf.function
def train_step(features, labels):
    with tf.GradientTape() as tape:
        predictions = model(features)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(labels, predictions)

In [23]:
@tf.function
def valid_step(features, labels):
    predictions = model(features)
    v_loss = loss_object(labels, predictions)

    valid_loss(v_loss)
    valid_accuracy(labels, predictions)

In [24]:
for epoch in range(EPOCHS):
  # 在下一个epoch开始时，重置评估指标
    train_loss.reset_states()
    train_accuracy.reset_states()
    valid_loss.reset_states()
    valid_accuracy.reset_states()

    CNT = 0
    for features, labels in train_ds:
        begin = time.time()
        train_step(features, labels)
        CNT += 1


        if CNT % 50 == 0: 
            for val_features, val_labels in valid_ds:
                valid_step(val_features, val_labels)
            print('##################################################')
            print('batch: {:d}, batch train loss: {:.2f}, train acc: {:.2f}%, consuming tine: {:.2f}'.
                  format(CNT, train_loss.result(), train_accuracy.result()*100, time.time()-begin))            
            print('batch: {:d}, batch valid loss: {:.2f}, valid acc: {:.2f}%'.
                  format(CNT, valid_loss.result(), valid_accuracy.result()*100)) 
            print('##################################################')
            
    template = 'Epoch {}, Loss: {:.2f}, Accuracy: {:.2f}%, Valid Loss: {:.2f}, Valid Accuracy: {:.2f}%'
    print (template.format(epoch+1,
                         train_loss.result(),
                         train_accuracy.result()*100,
                         valid_loss.result(),
                         valid_accuracy.result()*100))

##################################################
batch: 50, batch train loss: 1.93, train acc: 24.73%, consuming tine: 9.14
batch: 50, batch valid loss: 1.72, valid acc: 30.48%
##################################################
##################################################
batch: 100, batch train loss: 1.79, train acc: 28.73%, consuming tine: 8.31
batch: 100, batch valid loss: 1.66, valid acc: 32.62%
##################################################
##################################################
batch: 150, batch train loss: 1.72, train acc: 31.06%, consuming tine: 8.23
batch: 150, batch valid loss: 1.63, valid acc: 33.98%
##################################################
##################################################
batch: 200, batch train loss: 1.67, train acc: 32.72%, consuming tine: 8.14
batch: 200, batch valid loss: 1.60, valid acc: 35.04%
##################################################
##################################################
batch: 250, batch train

KeyboardInterrupt: 