# Introduction

**Demographic characteristics are important input features for many recommendation systems, including advertising platforms. Most validation approaches feed demographic attributes into the model as inputs and then compare recommendation performance with and without them, offline or online. It is worth trying to verify this hypothesis from the opposite direction: use a user's interactions in the advertising system as input to predict that user's demographic attributes.**

**This kernel predicts gender and age, which are very important parts of a user profile.**

# Part 1: Generate user history click sequence

In [None]:
import pandas as pd
import gc 
# Load the train and test click logs and stack them into one frame so that the
# encoding and per-user grouping below see every click.
train_data = pd.read_csv('tencent2020/train_preliminary/click_log.csv')
test_data = pd.read_csv('tencent2020/test/click_log.csv')
# DataFrame.append was removed in pandas 2.0; concat with ignore_index=True
# gives the same rows with a fresh 0..n-1 index in one step.
data = pd.concat([train_data, test_data], ignore_index=True)
print(data[:5])
print(len(data))
# Drop the per-split frames right away to keep peak memory down.
del train_data, test_data
gc.collect()

In [None]:
#reduce memory
import numpy as np
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered. Integer columns are cast to the smallest of
    int8/int16/int32/int64 whose range contains the column's min and max;
    float columns are cast to float16 or float32 when the range fits, otherwise
    float64. Columns whose min/max are NaN (e.g. empty columns) are left with
    an integer dtype untouched or fall back to float64.

    Args:
        df: DataFrame to shrink; it is modified in place.
        verbose: when true, print the final memory usage and the reduction.

    Returns:
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose max is exactly 127 still fits int8.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: float16 only checks the value range here; it keeps far fewer
            # significant digits than float32/float64.
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
# Downcast the numeric columns of the combined click log and print the memory saving.
data=reduce_mem_usage(data, verbose=True)

In [None]:
#label_encoder
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
# Re-index creative_id to dense integers starting at 1. Index 0 is reserved:
# the sequences are zero-padded later and the embeddings use mask_zero=True,
# so an encoded id of 0 would be treated as padding.
cols = ['creative_id']
for feat in tqdm(cols):
    lbe = LabelEncoder()  # or Hash
    data[feat] = lbe.fit_transform(data[feat]) + 1

In [None]:
#gen_session
from joblib import Parallel, delayed
def gen_session_list(user_id, t):
    """Return ``(user_id, creative_ids)`` with the ids ordered by click time.

    Args:
        user_id: key of the group; passed through unchanged.
        t: DataFrame with at least ``time`` and ``creative_id`` columns holding
            the clicks of one user. It is not modified.

    Returns:
        Tuple of ``user_id`` and a plain list of ``creative_id`` values sorted
        by ascending ``time``; rows with equal ``time`` keep their input order.
    """
    # Sort a copy (no inplace=True) so the caller's group is left untouched, and
    # use a stable sort so ties are deterministic. Reading the column directly
    # avoids building a Series per row with iterrows().
    ordered = t.sort_values('time', ascending=True, kind='mergesort')
    session = ordered['creative_id'].tolist()
    return user_id, session

In [None]:
def applyParallel(df_grouped, func, n_jobs, backend='multiprocessing'):
    """Run ``func(name, group)`` for every group through joblib and collect a dict.

    ``func`` must return a ``(key, value)`` pair; the pairs are gathered into
    the returned dictionary. ``backend`` is forwarded to ``Parallel``
    (for example 'threading').
    """
    jobs = (delayed(func)(key, frame) for key, frame in df_grouped)
    runner = Parallel(n_jobs=n_jobs, verbose=4, backend=backend)
    return dict(runner(jobs))

In [None]:
# One row per distinct user_id found in the combined click log, re-indexed from 0.
all_user = data[['user_id']].drop_duplicates().reset_index(drop=True)
print(len(all_user))
print(all_user.head(5))

In [None]:
# Build the per-user click sequences in batches of users and pickle one
# dictionary {user_id: [creative_id, ...]} per batch.
n_samples = all_user.shape[0]
print(n_samples)
batch_size = 500000
iters = (n_samples - 1) // batch_size + 1  # ceil(n_samples / batch_size)
print("total", iters, "iters", "batch_size", batch_size)
for i in range(0, iters):
    target_user = all_user['user_id'].values[i * batch_size:(i + 1) * batch_size]
    sub_data = data.loc[data.user_id.isin(target_user)]
    print(i, 'iter start')
    # Group by the column name, not a one-element list: with a list, newer
    # pandas yields 1-tuple group keys, which would become the dictionary keys
    # and no longer match plain user ids on lookup.
    df_grouped = sub_data.groupby('user_id')
    user_hist_session = applyParallel(df_grouped, gen_session_list, n_jobs=20, backend='loky')
    pd.to_pickle(user_hist_session, 'tencent2020/user_hist_session' + str(i) + '.pkl')
    print(i, 'pickled')

In [None]:
#get all the hist_session
import pandas as pd
import gc
# Merge the four pickled per-batch dictionaries back into one lookup table.
user_hist_session = {}
for i in range(4):
    user_hist_session_ = pd.read_pickle(f'tencent2020/user_hist_session{i}.pkl')
    user_hist_session.update(user_hist_session_)
    # Release each part before loading the next one.
    del user_hist_session_
    gc.collect()

# Part 2: Padding

In [None]:
# Training users come from user.csv. The test users are the distinct user_id
# values of the test click log.
train_user = pd.read_csv('tencent2020/train_preliminary/user.csv')
# Only user_id is needed, so skip parsing the other click-log columns.
test_user = pd.read_csv('tencent2020/test/click_log.csv', usecols=['user_id'])
test_user = test_user.drop_duplicates()
test_user = test_user.reset_index(drop=True)
print(len(test_user))
print(test_user[:5])

In [None]:
SESS_MAX_LEN = 64


def gen_sess_feature(row):
    """Return the most recent creative ids of one user and how many there are.

    Args:
        row: an ``(index, record)`` pair as produced by ``DataFrame.iterrows()``;
            ``record['user_id']`` selects the user.

    Returns:
        ``(creative_ids, length)``: the last ``SESS_MAX_LEN`` entries of the
        user's history in ``user_hist_session`` and their count. Users with no
        history, or an empty one, get ``([0], 0)``.
    """
    user = row[1]['user_id']
    history = user_hist_session.get(user)
    if not history:
        return [0], 0
    # The history holds plain creative ids, so the tail slice is the feature
    # itself (the old ``e[2]`` indexing into each id raised TypeError).
    creative = list(history[-SESS_MAX_LEN:])
    return creative, len(creative)

In [None]:
from tqdm import tqdm
# Collect the truncated click sequence of every training user.
sess_input_dict = {'creative': []}
sess_input_length = []
# total= lets tqdm show real progress; iterrows() has no length of its own.
for row in tqdm(train_user[['user_id']].iterrows(), total=len(train_user)):
    # gen_sess_feature is the helper defined in this notebook; the previous
    # name gen_sess_feature_din is not defined anywhere in it.
    a, b = gen_sess_feature(row)
    sess_input_dict['creative'].append(a)
    sess_input_length.append(b)
print('done')
train_user['creative'] = sess_input_dict['creative']

In [None]:
# Collect the truncated click sequence of every test user.
sess_input_dict = {'creative': []}
sess_input_length = []
# total= lets tqdm show real progress; iterrows() has no length of its own.
for row in tqdm(test_user[['user_id']].iterrows(), total=len(test_user)):
    # gen_sess_feature is the helper defined in this notebook; the previous
    # name gen_sess_feature_din is not defined anywhere in it.
    a, b = gen_sess_feature(row)
    sess_input_dict['creative'].append(a)
    sess_input_length.append(b)
print('done')
test_user['creative'] = sess_input_dict['creative']

In [None]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
# Pad every creative-id sequence at the end (padding='post') up to SESS_MAX_LEN.
# Each padded matrix is wrapped in a one-element list.
train_input=[pad_sequences(train_user['creative'].values, maxlen=SESS_MAX_LEN, padding='post')]
test_input=[pad_sequences(test_user['creative'].values, maxlen=SESS_MAX_LEN, padding='post')]

In [None]:
# Persist the padded inputs next to the other artefacts. The directory is
# 'tencent2020/' like every other path in this notebook (was misspelled 'tecent2020/').
pd.to_pickle(train_input, 'tencent2020/train_input_creative_64' + '.pkl')
pd.to_pickle(test_input, 'tencent2020/test_input_creative_64' + '.pkl')

# Part 3: BiLSTM model

In [None]:
# Number of distinct creative ids; used below to size the embedding tables.
creative_id_max = data['creative_id'].nunique()

In [None]:
from keras.layers import *
import numpy as np
from keras.models import Sequential
from keras.initializers import Constant,RandomNormal
from keras.regularizers import l2
# Single-input BiLSTM classifier over the padded creative-id sequence with a
# 10-way softmax output.
SESS_MAX_LEN = 64
em = 128  # embedding width
model = Sequential()
# Index 0 is the padding value and is masked out (mask_zero=True).
model.add(Embedding(creative_id_max + 1,
                    em,
                    embeddings_initializer=RandomNormal(mean=0.0, stddev=0.0001, seed=2020),
                    embeddings_regularizer=l2(1e-6),
                    input_length=SESS_MAX_LEN,
                    trainable=True,
                    mask_zero=True))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.25))
model.add(Dense(10, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Part 4: BiLSTM with multiple inputs and outputs

In [None]:
from keras.models import Sequential
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers.core import Activation
from keras.layers.core import Dropout
from keras.layers.core import Dense
from keras.models import Model
from keras.layers import *
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.initializers import Constant,RandomNormal
from keras.regularizers import l2

# Three parallel branches (inputA/inputB/inputC), each: Embedding -> SpatialDropout1D
# -> two stacked BiLSTMs -> global average pooling -> Dropout. The branch outputs are
# concatenated and fed to a shared Dense layer with two softmax heads (2-way and 10-way).
# NOTE(review): ad_id_max and advertiser_id_max are not defined in the visible part of
# this notebook — confirm they are set before this cell runs.
# NOTE(review): this cell only builds the model; it does not call model.compile().
SESS_MAX_LEN=64
em=128

# One integer sequence of length SESS_MAX_LEN per branch.
inputA = Input(shape=(SESS_MAX_LEN,))
inputB = Input(shape=(SESS_MAX_LEN,))
inputC = Input(shape=(SESS_MAX_LEN,))

# Branch A: embedding table sized by ad_id_max.
x=Embedding(ad_id_max+1,
            em,
            embeddings_initializer=RandomNormal(mean=0.0, stddev=0.0001, seed=2020),
            embeddings_regularizer=l2(1e-6),
            input_length=SESS_MAX_LEN,
            trainable=True,
            mask_zero=True)(inputA)
x=SpatialDropout1D(0.2)(x)
x=Bidirectional(LSTM(128, return_sequences=True))(x)
x=Bidirectional(LSTM(64, return_sequences=True))(x)
x_avg_pool = GlobalAveragePooling1D()(x)
# x_avg_pool = GlobalMaxPooling1D()(x)
# x_conc= concatenate([x_avg_pool,x_max_pool])
x_last=Dropout(0.25)(x_avg_pool)
# Wrap the branch in its own Model so its output tensor can be referenced below.
x_last= Model(inputs=inputA, outputs=x_last)


# Branch B: embedding table sized by advertiser_id_max.
y=Embedding(advertiser_id_max+1,
            em,
            embeddings_initializer=RandomNormal(mean=0.0, stddev=0.0001, seed=2020),
            embeddings_regularizer=l2(1e-6),
            input_length=SESS_MAX_LEN,
            trainable=True,
            mask_zero=True)(inputB)
y=SpatialDropout1D(0.2)(y)
y=Bidirectional(LSTM(128, return_sequences=True))(y)
y=Bidirectional(LSTM(64, return_sequences=True))(y)
y_avg_pool = GlobalAveragePooling1D()(y)
# y_avg_pool = GlobalMaxPooling1D()(y)
# y_conc= concatenate([y_avg_pool,y_max_pool])
y_last=Dropout(0.25)(y_avg_pool)
y_last= Model(inputs=inputB, outputs=y_last)

# Branch C: embedding table sized by creative_id_max.
z=Embedding(creative_id_max+1,
            em,
            embeddings_initializer=RandomNormal(mean=0.0, stddev=0.0001, seed=2020),
            embeddings_regularizer=l2(1e-6),
            input_length=SESS_MAX_LEN,
            trainable=True,
            mask_zero=True)(inputC)
z=SpatialDropout1D(0.2)(z)
z=Bidirectional(LSTM(128, return_sequences=True))(z)
z=Bidirectional(LSTM(64, return_sequences=True))(z)
z_avg_pool = GlobalAveragePooling1D()(z)
# z_avg_pool = GlobalMaxPooling1D()(z)
# z_conc= concatenate([z_avg_pool,z_max_pool])
z_last=Dropout(0.25)(z_avg_pool)
z_last= Model(inputs=inputC, outputs=z_last)

# combine the output of the three branches
combined = concatenate([x_last.output, y_last.output,z_last.output])

# Shared hidden layer, then one softmax head per task.
w = Dense(64, activation="relu")(combined)
w = Dropout(0.25)(w)
w1 = Dense(2, activation="softmax")(w)
w2 = Dense(10, activation="softmax")(w)

# our model will accept the inputs of the three branches and
# then output two values
model = Model(inputs=[inputA, inputB,inputC], outputs=[w1,w2])
model.summary()

# Part 4: Transformer model

In [None]:
import numpy as np
from keras.models import *
from keras.layers import *
from keras.callbacks import *
from keras.initializers import *
import tensorflow as tf
from keras.engine.topology import Layer

try:
    from dataloader import TokenList, pad_to_longest
    # for transformer
except ImportError:
    # dataloader is an optional helper module; only a missing module is
    # tolerated here, any other failure while importing it is surfaced.
    pass

embed_size = 128  # default model width (d_model) of the Transformer below
class LayerNormalization(Layer):
    """Normalise the last axis to zero mean / unit std, then apply a learned scale and shift."""

    def __init__(self, eps=1e-6, **kwargs):
        # eps is added to the standard deviation to avoid division by zero.
        self.eps = eps
        super().__init__(**kwargs)

    def build(self, input_shape):
        feature_shape = input_shape[-1:]
        self.gamma = self.add_weight(name='gamma', shape=feature_shape,
                                     initializer=Ones(), trainable=True)
        self.beta = self.add_weight(name='beta', shape=feature_shape,
                                    initializer=Zeros(), trainable=True)
        super().build(input_shape)

    def call(self, x):
        centered = x - K.mean(x, axis=-1, keepdims=True)
        spread = K.std(x, axis=-1, keepdims=True) + self.eps
        return self.gamma * centered / spread + self.beta

    def compute_output_shape(self, input_shape):
        # Normalisation keeps the input shape.
        return input_shape

class ScaledDotProductAttention():
    """Softmax attention over q·k scores divided by sqrt(d_model), with dropout on the weights."""

    def __init__(self, d_model, attn_dropout=0.1):
        self.temper = np.sqrt(d_model)
        self.dropout = Dropout(attn_dropout)

    def __call__(self, q, k, v, mask):
        """Return ``(output, attention_weights)`` for the given query/key/value tensors.

        ``mask`` may be None; otherwise positions where it is 0 receive a large
        negative score before the softmax.
        """
        def _scaled_scores(pair):
            return K.batch_dot(pair[0], pair[1], axes=[2, 2]) / self.temper

        def _weighted_sum(pair):
            return K.batch_dot(pair[0], pair[1])

        scores = Lambda(_scaled_scores)([q, k])
        if mask is not None:
            penalty = Lambda(lambda m: (-1e+10) * (1 - m))(mask)
            scores = Add()([scores, penalty])
        weights = Activation('softmax')(scores)
        weights = self.dropout(weights)
        context = Lambda(_weighted_sum)([weights, v])
        return context, weights

class MultiHeadAttention():
    """Multi-head attention built from Keras layers, followed by an output projection.

    With ``use_norm=True`` the projected result is added to ``q`` (residual) and
    layer-normalised; otherwise the projected result is returned as is.
    """
    # mode 0 - one big projection matrix per q/k/v, faster;
    # mode 1 - one small projection per head, clearer implementation
    def __init__(self, n_head, d_model, d_k, d_v, dropout, mode=0, use_norm=True):
        self.mode = mode
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v
        self.dropout = dropout
        if mode == 0:
            # A single Dense per q/k/v produces all heads at once.
            self.qs_layer = Dense(n_head*d_k, use_bias=False)
            self.ks_layer = Dense(n_head*d_k, use_bias=False)
            self.vs_layer = Dense(n_head*d_v, use_bias=False)
        elif mode == 1:
            # One Dense per head and per q/k/v.
            self.qs_layers = []
            self.ks_layers = []
            self.vs_layers = []
            for _ in range(n_head):
                self.qs_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
                self.ks_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
                self.vs_layers.append(TimeDistributed(Dense(d_v, use_bias=False)))
        # NOTE(review): the attention is constructed with d_model, so scores are
        # scaled by sqrt(d_model) rather than sqrt(d_k) — confirm this is intended.
        self.attention = ScaledDotProductAttention(d_model)
        self.layer_norm = LayerNormalization() if use_norm else None
        self.w_o = TimeDistributed(Dense(d_model))

    def __call__(self, q, k, v, mask=None):
        """Apply attention and return ``(outputs, attn)``.

        NOTE(review): for a ``mode`` other than 0 or 1 neither branch runs and
        ``head`` is undefined when the output projection is applied.
        """
        d_k, d_v = self.d_k, self.d_v
        n_head = self.n_head

        if self.mode == 0:
            qs = self.qs_layer(q)  # [batch_size, len_q, n_head*d_k]
            ks = self.ks_layer(k)
            vs = self.vs_layer(v)

            # Split the last axis into heads and fold the heads into the batch axis.
            # NOTE(review): reshape1 uses d_k for the value tensor as well — it
            # relies on d_k == d_v; confirm.
            def reshape1(x):
                s = tf.shape(x)   # [batch_size, len_q, n_head * d_k]
                x = tf.reshape(x, [s[0], s[1], n_head, d_k])
                x = tf.transpose(x, [2, 0, 1, 3])  
                x = tf.reshape(x, [-1, s[1], d_k])  # [n_head * batch_size, len_q, d_k]
                return x
            qs = Lambda(reshape1)(qs)
            ks = Lambda(reshape1)(ks)
            vs = Lambda(reshape1)(vs)

            if mask is not None:
                # Repeat the mask n_head times along axis 0 to match the folded batch axis.
                mask = Lambda(lambda x:K.repeat_elements(x, n_head, 0))(mask)
            head, attn = self.attention(qs, ks, vs, mask=mask)  
                
            # Inverse of reshape1: unfold the heads and merge them back into the last axis.
            def reshape2(x):
                s = tf.shape(x)   # [n_head * batch_size, len_v, d_v]
                x = tf.reshape(x, [n_head, -1, s[1], s[2]]) 
                x = tf.transpose(x, [1, 2, 0, 3])
                x = tf.reshape(x, [-1, s[1], n_head*d_v])  # [batch_size, len_v, n_head * d_v]
                return x
            head = Lambda(reshape2)(head)
        elif self.mode == 1:
            # Run every head separately, then concatenate outputs and attention maps.
            heads = []; attns = []
            for i in range(n_head):
                qs = self.qs_layers[i](q)   
                ks = self.ks_layers[i](k) 
                vs = self.vs_layers[i](v) 
                head, attn = self.attention(qs, ks, vs, mask)
                heads.append(head); attns.append(attn)
            head = Concatenate()(heads) if n_head > 1 else heads[0]
            attn = Concatenate()(attns) if n_head > 1 else attns[0]

        # Project the merged heads back to d_model and apply dropout.
        outputs = self.w_o(head)
        outputs = Dropout(self.dropout)(outputs)
        if not self.layer_norm: return outputs, attn
        # Residual connection with the query input, then layer normalisation.
        outputs = Add()([outputs, q])
        return self.layer_norm(outputs), attn

class PositionwiseFeedForward():
    """Two kernel-size-1 convolutions with dropout, a residual connection and layer norm."""

    def __init__(self, d_hid, d_inner_hid, dropout=0.1):
        self.w_1 = Conv1D(d_inner_hid, 1, activation='relu')
        self.w_2 = Conv1D(d_hid, 1)
        self.layer_norm = LayerNormalization()
        self.dropout = Dropout(dropout)

    def __call__(self, x):
        hidden = self.w_1(x)
        projected = self.w_2(hidden)
        projected = self.dropout(projected)
        # Residual: add the block input back before normalising.
        residual = Add()([projected, x])
        return self.layer_norm(residual)

class EncoderLayer():
    """Self-attention followed by the position-wise feed-forward block."""

    def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1):
        self.self_att_layer = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn_layer  = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout)

    def __call__(self, enc_input, mask=None):
        """Return ``(output, self_attention)`` for ``enc_input``."""
        attended, slf_attn = self.self_att_layer(enc_input, enc_input, enc_input, mask=mask)
        return self.pos_ffn_layer(attended), slf_attn

class DecoderLayer():
    """Self-attention, attention over the encoder output, then the feed-forward block."""

    def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1):
        self.self_att_layer = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.enc_att_layer  = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn_layer  = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout)

    def __call__(self, dec_input, enc_output, self_mask=None, enc_mask=None):
        """Return ``(output, self_attention, encoder_attention)``."""
        hidden, slf_attn = self.self_att_layer(dec_input, dec_input, dec_input, mask=self_mask)
        hidden, enc_attn = self.enc_att_layer(hidden, enc_output, enc_output, mask=enc_mask)
        hidden = self.pos_ffn_layer(hidden)
        return hidden, slf_attn, enc_attn

def GetPosEncodingMatrix(max_len, d_emb):
    """Return the ``(max_len, d_emb)`` sinusoidal position table.

    Row 0 is all zeros. For every other row, even columns hold the sine and
    odd columns the cosine of ``pos / 10000 ** (2 * (j // 2) / d_emb)``.
    """
    def _angles(pos):
        if pos == 0:
            return np.zeros(d_emb)
        return [pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)]

    pos_enc = np.array([_angles(pos) for pos in range(max_len)])
    # Row 0 is skipped on purpose so it stays all zeros.
    pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2])  # dim 2i
    pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2])  # dim 2i+1
    return pos_enc

def GetPadMask(q, k):
    """Return a float mask that is 1 where the key token is non-zero, repeated for every query position."""
    query_ones = K.expand_dims(K.ones_like(q, 'float32'), -1)
    key_is_token = K.expand_dims(K.not_equal(k, 0), 1)
    key_mask = K.cast(key_is_token, 'float32')
    return K.batch_dot(query_ones, key_mask, axes=[2, 1])

def GetSubMask(s):
    """Return the cumulative sum along axis 1 of a batched identity matrix sized by ``s``."""
    dims = tf.shape(s)
    identity = tf.eye(dims[1], batch_shape=dims[:1])
    return K.cumsum(identity, 1)

class Encoder():
    """Token embedding (+ optional position embedding), dropout, then a stack of encoder layers."""

    def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, \
                layers=6, dropout=0.1, word_emb=None, pos_emb=None):
        self.emb_layer = word_emb
        self.pos_layer = pos_emb
        self.emb_dropout = Dropout(dropout)
        self.layers = [EncoderLayer(d_model, d_inner_hid, n_head, d_k, d_v, dropout)
                       for _ in range(layers)]

    def __call__(self, src_seq, src_pos, return_att=False, active_layers=999):
        """Encode ``src_seq``; also return the per-layer attentions when ``return_att`` is true."""
        hidden = self.emb_layer(src_seq)
        if src_pos is not None:
            pos = self.pos_layer(src_pos)
            hidden = Add()([hidden, pos])
        hidden = self.emb_dropout(hidden)
        # Padding mask derived from the token ids themselves.
        mask = Lambda(lambda seq: GetPadMask(seq, seq))(src_seq)
        atts = []
        for enc_layer in self.layers[:active_layers]:
            hidden, att = enc_layer(hidden, mask)
            atts.append(att)
        if return_att:
            return hidden, atts
        return hidden


class Transformer():
    """Encoder-only Transformer classifier with a 2-way softmax head.

    ``len_limit`` is the longest supported sequence length. The token embedding
    is sized by the module-level ``max_features``, which must be defined before
    an instance is created.
    """

    def __init__(self, len_limit, d_model=embed_size, \
              d_inner_hid=512, n_head=10, d_k=64, d_v=64, layers=2, dropout=0.1, \
              share_word_emb=False, **kwargs):
        self.name = 'Transformer'
        self.len_limit = len_limit
        self.src_loc_info = True
        self.d_model = d_model
        self.decode_model = None
        d_emb = d_model

        # get_pos_seq() numbers tokens 1..len_limit and keeps 0 for padding, so
        # the table needs len_limit + 1 rows; with only len_limit rows the last
        # position of a full-length sequence indexes past the table.
        pos_emb = Embedding(len_limit + 1, d_emb, trainable=False, \
                            weights=[GetPosEncodingMatrix(len_limit + 1, d_emb)])

        i_word_emb = Embedding(max_features, d_emb) # Add embedding here

        self.encoder = Encoder(d_model, d_inner_hid, n_head, d_k, d_v, layers, dropout, \
                               word_emb=i_word_emb, pos_emb=pos_emb)

    def get_pos_seq(self, x):
        """Return 1-based positions for non-zero tokens of ``x`` and 0 for zero tokens."""
        mask = K.cast(K.not_equal(x, 0), 'int32')
        pos = K.cumsum(K.ones_like(x, 'int32'), 1)
        return pos * mask

    def compile(self, active_layers=999):
        """Build and compile ``self.model`` (encoder -> average pooling -> 2-way softmax)."""
        src_seq_input = Input(shape=(None,))
        src_seq = src_seq_input
        src_pos = Lambda(self.get_pos_seq)(src_seq)
        if not self.src_loc_info: src_pos = None

        x = self.encoder(src_seq, src_pos, active_layers=active_layers)
        # x = GlobalMaxPool1D()(x) # Not sure about this layer. Just wanted to reduce dimension
        x = GlobalAveragePooling1D()(x)
        outp = Dense(2, activation="softmax")(x)
        self.model = Model(inputs=src_seq_input, outputs=outp)
        self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
# Build the single-layer Transformer classifier over the creative-id sequences.
maxlen=SESS_MAX_LEN
# max_features is read as a global inside Transformer.__init__, so it must be set first.
max_features=creative_id_max+1
model= Transformer(maxlen, layers=1)
model.compile()
# Keep only the underlying Keras model from here on.
model =model.model
model.summary()

# Part 5: Transformer with gradient penalty and customized attention mask

In [None]:
import numpy as np
from keras.models import *
from keras.layers import *
from keras.callbacks import *
from keras.initializers import *
import tensorflow as tf
from keras.engine.topology import Layer
from keras import backend as K
from keras.losses import categorical_crossentropy
import keras
from keras.optimizers import Adam

try:
    from dataloader import TokenList, pad_to_longest
    # for transformer
except ImportError:
    # dataloader is an optional helper module; only a missing module is
    # tolerated here, any other failure while importing it is surfaced.
    pass

embed_size = 128            # model width (default d_model)
heads = 8                   # default number of attention heads
deep = embed_size // heads  # default per-head size (d_k and d_v)
maxlen = 64                 # sequence length of every model input
drop = 0.2                  # default dropout rate of the Transformer blocks
p = 0.2                     # weight of the gradient-penalty term in the loss
class LayerNormalization(Layer):
    """Normalise the last axis to zero mean / unit std, then apply a learned scale and shift."""

    def __init__(self, eps=1e-6, **kwargs):
        # eps is added to the standard deviation to avoid division by zero.
        self.eps = eps
        super().__init__(**kwargs)

    def build(self, input_shape):
        feature_shape = input_shape[-1:]
        self.gamma = self.add_weight(name='gamma', shape=feature_shape,
                                     initializer=Ones(), trainable=True)
        self.beta = self.add_weight(name='beta', shape=feature_shape,
                                    initializer=Zeros(), trainable=True)
        super().build(input_shape)

    def call(self, x):
        centered = x - K.mean(x, axis=-1, keepdims=True)
        spread = K.std(x, axis=-1, keepdims=True) + self.eps
        return self.gamma * centered / spread + self.beta

    def compute_output_shape(self, input_shape):
        # Normalisation keeps the input shape.
        return input_shape

class ScaledDotProductAttention():
    """Softmax attention over q·k scores divided by sqrt(d_model), with an additive mask."""

    def __init__(self, d_model, attn_dropout=drop):
        self.temper = np.sqrt(d_model)
        self.dropout = Dropout(attn_dropout)

    def __call__(self, q, k, v, mask):
        """Return ``(output, attention_weights)``.

        When ``mask`` is not None it is added to the scores unchanged before
        the softmax.
        """
        def _scaled_scores(pair):
            return K.batch_dot(pair[0], pair[1], axes=[2, 2]) / self.temper

        def _weighted_sum(pair):
            return K.batch_dot(pair[0], pair[1])

        scores = Lambda(_scaled_scores)([q, k])
        if mask is not None:
            bias = Lambda(lambda m: m)(mask)
            scores = Add()([scores, bias])
        weights = Activation('softmax')(scores)
        weights = self.dropout(weights)
        context = Lambda(_weighted_sum)([weights, v])
        return context, weights

class MultiHeadAttention():
    """Multi-head attention built from Keras layers, followed by an output projection.

    With ``use_norm=True`` the projected result is added to ``q`` (residual) and
    layer-normalised; otherwise the projected result is returned as is.
    """
    # mode 0 - one big projection matrix per q/k/v, faster;
    # mode 1 - one small projection per head, clearer implementation
    def __init__(self, n_head, d_model, d_k, d_v, dropout, mode=0, use_norm=True):
        self.mode = mode
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v
        self.dropout = dropout
        if mode == 0:
            # A single Dense per q/k/v produces all heads at once.
            self.qs_layer = Dense(n_head*d_k, use_bias=False)
            self.ks_layer = Dense(n_head*d_k, use_bias=False)
            self.vs_layer = Dense(n_head*d_v, use_bias=False)
        elif mode == 1:
            # One Dense per head and per q/k/v.
            self.qs_layers = []
            self.ks_layers = []
            self.vs_layers = []
            for _ in range(n_head):
                self.qs_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
                self.ks_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
                self.vs_layers.append(TimeDistributed(Dense(d_v, use_bias=False)))
        # NOTE(review): the attention is constructed with d_model, so scores are
        # scaled by sqrt(d_model) rather than sqrt(d_k) — confirm this is intended.
        self.attention = ScaledDotProductAttention(d_model)
        self.layer_norm = LayerNormalization() if use_norm else None
        self.w_o = TimeDistributed(Dense(d_model))

    def __call__(self, q, k, v, mask=None):
        """Apply attention and return ``(outputs, attn)``.

        NOTE(review): for a ``mode`` other than 0 or 1 neither branch runs and
        ``head`` is undefined when the output projection is applied.
        """
        d_k, d_v = self.d_k, self.d_v
        n_head = self.n_head

        if self.mode == 0:
            qs = self.qs_layer(q)  # [batch_size, len_q, n_head*d_k]
            ks = self.ks_layer(k)
            vs = self.vs_layer(v)

            # Split the last axis into heads and fold the heads into the batch axis.
            # NOTE(review): reshape1 uses d_k for the value tensor as well — it
            # relies on d_k == d_v; confirm.
            def reshape1(x):
                s = tf.shape(x)   # [batch_size, len_q, n_head * d_k]
                x = tf.reshape(x, [s[0], s[1], n_head, d_k])
                x = tf.transpose(x, [2, 0, 1, 3])  
                x = tf.reshape(x, [-1, s[1], d_k])  # [n_head * batch_size, len_q, d_k]
                return x
            qs = Lambda(reshape1)(qs)
            ks = Lambda(reshape1)(ks)
            vs = Lambda(reshape1)(vs)

            if mask is not None:
                # Repeat the mask n_head times along axis 0 to match the folded batch axis.
                mask = Lambda(lambda x:K.repeat_elements(x, n_head, 0))(mask)
            head, attn = self.attention(qs, ks, vs, mask=mask)  
                
            # Inverse of reshape1: unfold the heads and merge them back into the last axis.
            def reshape2(x):
                s = tf.shape(x)   # [n_head * batch_size, len_v, d_v]
                x = tf.reshape(x, [n_head, -1, s[1], s[2]]) 
                x = tf.transpose(x, [1, 2, 0, 3])
                x = tf.reshape(x, [-1, s[1], n_head*d_v])  # [batch_size, len_v, n_head * d_v]
                return x
            head = Lambda(reshape2)(head)
        elif self.mode == 1:
            # Run every head separately, then concatenate outputs and attention maps.
            heads = []; attns = []
            for i in range(n_head):
                qs = self.qs_layers[i](q)   
                ks = self.ks_layers[i](k) 
                vs = self.vs_layers[i](v) 
                head, attn = self.attention(qs, ks, vs, mask)
                heads.append(head); attns.append(attn)
            head = Concatenate()(heads) if n_head > 1 else heads[0]
            attn = Concatenate()(attns) if n_head > 1 else attns[0]

        # Project the merged heads back to d_model and apply dropout.
        outputs = self.w_o(head)
        outputs = Dropout(self.dropout)(outputs)
        if not self.layer_norm: return outputs, attn
        # Residual connection with the query input, then layer normalisation.
        outputs = Add()([outputs, q])
        return self.layer_norm(outputs), attn

class PositionwiseFeedForward():
    """Two kernel-size-1 convolutions, each followed by dropout, plus residual and layer norm."""

    def __init__(self, d_hid, d_inner_hid, dropout=drop):
        self.w_1 = Conv1D(d_inner_hid, 1, activation='relu')
        self.w_2 = Conv1D(d_hid, 1)
        self.layer_norm = LayerNormalization()
        self.dropout = Dropout(dropout)

    def __call__(self, x):
        # The same Dropout layer is applied after each convolution.
        hidden = self.dropout(self.w_1(x))
        projected = self.dropout(self.w_2(hidden))
        # Residual: add the block input back before normalising.
        residual = Add()([projected, x])
        return self.layer_norm(residual)

class EncoderLayer():
    """Self-attention followed by the position-wise feed-forward block."""

    def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=drop):
        self.self_att_layer = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn_layer  = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout)

    def __call__(self, enc_input, mask=None):
        """Return ``(output, self_attention)`` for ``enc_input``."""
        attended, slf_attn = self.self_att_layer(enc_input, enc_input, enc_input, mask=mask)
        return self.pos_ffn_layer(attended), slf_attn

class DecoderLayer():
    """Self-attention, attention over the encoder output, then the feed-forward block."""

    def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=drop):
        self.self_att_layer = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.enc_att_layer  = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn_layer  = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout)

    def __call__(self, dec_input, enc_output, self_mask=None, enc_mask=None):
        """Return ``(output, self_attention, encoder_attention)``."""
        hidden, slf_attn = self.self_att_layer(dec_input, dec_input, dec_input, mask=self_mask)
        hidden, enc_attn = self.enc_att_layer(hidden, enc_output, enc_output, mask=enc_mask)
        hidden = self.pos_ffn_layer(hidden)
        return hidden, slf_attn, enc_attn

def GetPosEncodingMatrix(max_len, d_emb):
    """Return the ``(max_len, d_emb)`` sinusoidal position table.

    Row 0 is all zeros. For every other row, even columns hold the sine and
    odd columns the cosine of ``pos / 10000 ** (2 * (j // 2) / d_emb)``.
    """
    rows = []
    for pos in range(max_len):
        if pos == 0:
            rows.append(np.zeros(d_emb))
        else:
            rows.append([pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)])
    pos_enc = np.array(rows)
    # Row 0 is skipped on purpose so it stays all zeros.
    pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2])  # dim 2i
    pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2])  # dim 2i+1
    return pos_enc

def GetPadMask(q, k):
    """Broadcast the values of ``k`` (cast to float) across every query position of ``q``.

    Unlike a 0/1 padding mask, the raw values of ``k`` are used as the mask
    entries (no ``not_equal(k, 0)`` test is applied).
    """
    query_ones = K.expand_dims(K.ones_like(q, 'float32'), -1)
    key_values = K.cast(K.expand_dims(k, 1), 'float32')
    return K.batch_dot(query_ones, key_values, axes=[2, 1])

def GetSubMask(s):
    """Return the cumulative sum along axis 1 of a batched identity matrix sized by ``s``."""
    dims = tf.shape(s)
    identity = tf.eye(dims[1], batch_shape=dims[:1])
    return K.cumsum(identity, 1)

class Encoder():
    """Fuse four embedded id sequences and run them through a stack of encoder layers.

    The ad, advertiser, industry and creative embeddings each get the position
    embedding added, are concatenated and projected back to ``embed_size``.
    The attention mask is derived from ``click_seq``.
    """
    def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, \
                layers=6, dropout=drop, ad_emb=None,adv_emb=None,ind_emb=None,creative_emb=None,pos_emb=None):
        # Embedding layers are created by the caller and only stored here.
        self.ad_emb_layer = ad_emb
        self.adv_emb_layer = adv_emb
        self.ind_emb_layer = ind_emb
        self.creative_emb_layer = creative_emb
        self.pos_layer = pos_emb
        self.emb_dropout = Dropout(dropout)
        self.layers = [EncoderLayer(d_model, d_inner_hid, n_head, d_k, d_v, dropout) for _ in range(layers)]
        
    def __call__(self,ad_seq,adv_seq,ind_seq,creative_seq,click_seq,src_pos,return_att=False, active_layers=999):
        """Return the encoded sequence, plus the per-layer attentions when ``return_att`` is true."""
        ad = self.ad_emb_layer(ad_seq)
        adv = self.adv_emb_layer(adv_seq)
        ind = self.ind_emb_layer(ind_seq)
        creative = self.creative_emb_layer(creative_seq)
    
        # x = Add()([ad,adv])
        # x = Add()([x,ind])

        if src_pos is not None:
            # The same position embedding is added to each of the four streams.
            pos = self.pos_layer(src_pos)
            ad = Add()([ad, pos])
            adv= Add()([adv, pos])
            ind= Add()([ind, pos])
            creative= Add()([creative, pos])
        # Concatenate on the last axis and project back to embed_size.
        # NOTE(review): the projection uses the global embed_size rather than
        # d_model; the residual connections in the encoder layers rely on both
        # being equal — confirm.
        x=keras.layers.concatenate([ad, adv, ind,creative])
        x=keras.layers.Dense(embed_size)(x)
        # Two dropouts in a row: a fixed 0.25 one, then the configurable emb_dropout.
        x=Dropout(0.25)(x)
        x = self.emb_dropout(x)
        if return_att: atts = []
        # m_click=keras.layers.concatenate([click_seq, click_seq, click_seq])
        # The mask is built from click_seq by GetPadMask (defined elsewhere in this cell).
        mask = Lambda(lambda x:GetPadMask(x, x))(click_seq)
        for enc_layer in self.layers[:active_layers]:
            x, att = enc_layer(x, mask)
            if return_att: atts.append(att)
        return (x, atts) if return_att else x


class Transformer():
    """Transformer encoder over four id sequences, followed by two BiLSTMs and a 10-way softmax.

    The embedding tables are sized by the module-level ``ad_id_max``,
    ``advertiser_id_max``, ``industry_id_max`` and ``creative_id_max``, which
    must be defined before an instance is created.
    """

    def __init__(self, len_limit=maxlen, d_model=embed_size, \
              d_inner_hid=512, n_head=heads, d_k=deep, d_v=deep, layers=2, dropout=drop, \
              share_word_emb=False, **kwargs):
        self.name = 'Transformer'
        self.len_limit = len_limit
        self.src_loc_info = True
        self.d_model = d_model
        self.decode_model = None
        d_emb = d_model

        # get_pos_seq() numbers tokens 1..len_limit and keeps 0 for padding, so
        # the table needs len_limit + 1 rows; with only len_limit rows the last
        # position of a full-length sequence indexes past the table.
        pos_emb = Embedding(len_limit + 1, d_emb, trainable=False, \
                            weights=[GetPosEncodingMatrix(len_limit + 1, d_emb)])

        # The layer names are looked up by search_layer() in the penalty loss.
        ad_word_emb = Embedding(ad_id_max+1, d_emb, name='ad_em')
        adv_word_emb = Embedding(advertiser_id_max+1, d_emb, name='adv_em')
        ind_word_emb = Embedding(industry_id_max+1, d_emb, name='ind_em')
        creative_word_emb = Embedding(creative_id_max+1, d_emb, name='creative_em')

        self.encoder = Encoder(d_model, d_inner_hid, n_head, d_k, d_v, layers, dropout, \
                               ad_emb=ad_word_emb, adv_emb=adv_word_emb, ind_emb=ind_word_emb,
                               creative_emb=creative_word_emb, pos_emb=pos_emb)

    def get_pos_seq(self, x):
        """Return 1-based positions for non-zero tokens of ``x`` and 0 for zero tokens."""
        mask = K.cast(K.not_equal(x, 0), 'int32')
        pos = K.cumsum(K.ones_like(x, 'int32'), 1)
        return pos * mask

    def search_layer(self, inputs, name, exclude=None):
        """Walk the graph backwards from ``inputs`` and return the layer called ``name``.

        ``inputs`` is either a layer or a tensor produced by one. ``exclude``
        collects the layers already visited. Returns None when no layer with
        that name is reachable.
        """
        if exclude is None:
            exclude = set()
        if isinstance(inputs, keras.layers.Layer):
            layer = inputs
        else:
            layer = inputs._keras_history[0]

        if layer.name == name:
            return layer
        if layer in exclude:
            return None

        exclude.add(layer)
        inbound_layers = layer._inbound_nodes[0].inbound_layers
        if not isinstance(inbound_layers, list):
            inbound_layers = [inbound_layers]
        for inbound in inbound_layers:
            found = self.search_layer(inbound, name, exclude)
            if found is not None:
                return found
        return None

    def loss_with_gradient_penalty(self, y_true, y_pred, epsilon=1):
        """Cross-entropy plus ``p * epsilon`` times the squared gradient norm w.r.t. the 'ad_em' embeddings.

        NOTE(review): compile() below still passes the plain
        'categorical_crossentropy' loss, so this method is not used unless it
        is handed to ``model.compile`` explicitly.
        """
        loss = K.mean(categorical_crossentropy(y_true, y_pred))
        ad_embeddings = self.search_layer(y_pred, 'ad_em').embeddings
        # adv_embeddings = self.search_layer(y_pred, 'adv_em').embeddings
        # ind_embeddings = self.search_layer(y_pred, 'ind_em').embeddings
        ad_gp = K.sum(K.gradients(loss, [ad_embeddings])[0].values**2)
        # adv_gp = K.sum(K.gradients(loss, [adv_embeddings])[0].values**2)
        # ind_gp = K.sum(K.gradients(loss, [ind_embeddings])[0].values**2)
        return loss + p*epsilon *ad_gp

    def compile(self, active_layers=999):
        """Build and compile ``self.model`` with five sequence inputs of length ``maxlen``."""
        ad_seq= Input(shape=(maxlen,))
        adv_seq= Input(shape=(maxlen,))
        ind_seq= Input(shape=(maxlen,))
        creative_seq= Input(shape=(maxlen,))
        click_seq= Input(shape=(maxlen,))
        # Positions are derived from the ad-id sequence.
        src_seq=ad_seq
        src_pos = Lambda(self.get_pos_seq)(src_seq)
        if not self.src_loc_info: src_pos = None
        x = self.encoder(ad_seq,adv_seq,ind_seq,creative_seq,click_seq,src_pos,active_layers=active_layers)
        x=Bidirectional(LSTM(128, return_sequences=True))(x)
        x=Bidirectional(LSTM(64, return_sequences=True))(x)
        x=GlobalMaxPool1D()(x) # Not sure about this layer. Just wanted to reduce dimension
        x=Dropout(0.25)(x)
        # x = GlobalAveragePooling1D()(x)
        # outp1 = Dense(10, activation="softmax")(x)
        # outp2 = Dense(2, activation="softmax")(x)
        outp= Dense(10, activation="softmax")(x)

        self.model = Model(inputs=[ad_seq,adv_seq,ind_seq,creative_seq,click_seq], outputs=outp)
        self.model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])

# Part 6: Reduce memory

In [None]:
def get_train(train_input_ad,train_input_advertiser,train_input_creative,train_set,target,batch_size):
    """Yield ``([ad, advertiser, creative], one_hot_labels)`` batches indefinitely.

    Args:
        train_input_ad, train_input_advertiser, train_input_creative:
            one-element lists whose first item is the padded input matrix.
        train_set: DataFrame holding the label column.
        target: name of the label column; it is one-hot encoded with
            ``pd.get_dummies``.
        batch_size: number of rows per batch.

    Yields:
        A tuple of the three input slices (as a list) and the matching label
        slice. Rows beyond the last full batch are never used. After the last
        full batch the generator starts again from the first one, because
        ``fit_generator`` pulls ``steps_per_epoch`` batches for every epoch.

    Raises:
        ValueError: if ``train_set`` has fewer rows than ``batch_size``.
    """
    x_samples_ad = train_input_ad[0]
    x_samples_advertiser = train_input_advertiser[0]
    x_samples_creative = train_input_creative[0]
    y_samples = pd.get_dummies(train_set[target]).values

    batch_num = len(train_set) // batch_size
    if batch_num == 0:
        # Without this guard the endless loop below would spin without yielding.
        raise ValueError('train_set has fewer rows than batch_size')

    print('the length of samples:', len(y_samples[:batch_num * batch_size]))

    while True:
        for i in range(batch_num):
            rows = slice(i * batch_size, (i + 1) * batch_size)
            x1 = x_samples_ad[rows]
            x2 = x_samples_advertiser[rows]
            x3 = x_samples_creative[rows]
            y = y_samples[rows]
            yield ([x1, x2, x3], y)
        
 
batch_size = 2048
# Train from the batch generator so only one batch at a time is handed to Keras.
# fit_generator() has no validation_split argument (the old call also used a
# full-width comma, which is a syntax error); pass validation_data=... with a
# held-out set if validation metrics are needed.
model.fit_generator(
    get_train(train_input_ad, train_input_advertiser, train_input_creative, train_set, 'gender', batch_size=batch_size),
    epochs=10,
    steps_per_epoch=int(len(train_set) / batch_size),
    max_queue_size=300,
    shuffle=True)