In [1]:
import evaluate3 as evaluate
import pandas as pd
import preprocessing as preprocess
import numpy as np
from tensorflow.keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping,CSVLogger
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Input, Dense,Dropout,Embedding,LSTM,Bidirectional, Masking, TimeDistributed, Conv1D, MaxPooling1D, Flatten, concatenate, GRU
# from tensorflow_addons.layers.crf import CRF
from crf import CRF
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
# Load the character index mappings and the pretrained character embeddings
# selected by the 'CKIP' key.
# NOTE(review): presumably idx2word/word2idx are inverse dicts and
# char_embeddings maps a character to its vector (it is used with
# `.get(word)` and `['我']` below) -- confirm in preprocessing.py.
idx2word, word2idx,char_embeddings = preprocess.get_pretrain_char_emb('CKIP')
# Flat list of the vocabulary entries; its length sizes the Embedding layers.
char_vocabs = list(idx2word.values())

In [2]:
import tensorflow as tf
# Pin TensorFlow to the first physical GPU (if any) and let it allocate GPU
# memory on demand instead of reserving everything up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print("No GPUs visible")
else:
    try:
        # Only the first GPU stays visible to this process.
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized.
        print(e)

1 Physical GPUs, 1 Logical GPU


# Read Data (as BERT format)

In [3]:
# Load every corpus in BERT input format and build the label <-> index maps.
# Everything runs under the CPU device scope.
with tf.device('/cpu:0'):
    import time
    from sklearn.metrics import accuracy_score
    from tensorflow.keras.utils import to_categorical
    from random import shuffle
    # Each *_dest list is [train_path, test_path].
    msra_dest = ['./data/MSRA/msra_train_bioes.txt','./data/MSRA/msra_test_bioes.txt']
    # NOTE(review): both entries point at the same train file and the variable
    # is not used anywhere in the visible notebook -- confirm whether the
    # second path should be a test file.
    msra_zhNERTF = ['./data/MSRA/train_data_zh-NER-TF_bioes.txt','./data/MSRA/train_data_zh-NER-TF_bioes.txt']
    peopledaily_dest = ['./data/PeopleDaily/example_bioes.train','./data/PeopleDaily/example_bioes.test']
    weibo_dest = ['./data/Weibo/weiboNER_2nd_conll_bioes.train', './data/Weibo/weiboNER_2nd_conll_bioes.test']
    # homeapp and singer have a single file; it is passed as both train and
    # test and only the "train" half of the result is kept below.
    homeapp_dest = ['./data/homeapp/家電NER.txt', './data/homeapp/家電NER.txt']
    singer_dest=['./data/Singer/SingerData.txt', './data/Singer/SingerData.txt']
    # NOTE(review): the third positional argument of load_data appears to be
    # the same flag as `use_bert=True` in the homeapp call -- confirm.
    print('msra bert')
    (msra_train_x_bert, msra_train_y_bert, _), (msra_test_x_bert, msra_test_y_bert, _),  msra_tags = preprocess.load_data(msra_dest[0], msra_dest[1], True)
    print('people bert')
    (people_train_x_bert, people_train_y_bert, _), (people_test_x_bert, people_test_y_bert, _), people_tags = preprocess.load_data(peopledaily_dest[0], peopledaily_dest[1], True)
    print('weibo bert')
    (weibo_train_x_bert, weibo_train_y_bert, _), (weibo_test_x_bert, weibo_test_y_bert, _),  weibo_tags = preprocess.load_data(weibo_dest[0], weibo_dest[1], True)
    print('singer bert')
    (singer_train_x_bert, singer_train_y_bert, _), (_, _, _),  singer_tags = preprocess.load_data(singer_dest[0], singer_dest[1], True)
    print('homeapp bert')
    (homeapp_train_x_bert, homeapp_train_y_bert, _), (_, _, _),  homeapp_tags = preprocess.load_data(homeapp_dest[0], homeapp_dest[1], use_bert=True)
    # Label maps: position in the tag list is the integer id.
    msra_label2idx = {char: idx for idx, char in enumerate(msra_tags)}
    msra_idx2label = {idx: label for label, idx in msra_label2idx.items()}
    people_label2idx = {char: idx for idx, char in enumerate(people_tags)}
    people_idx2label = {idx: label for label, idx in people_label2idx.items()}
    weibo_label2idx = {char: idx for idx, char in enumerate(weibo_tags)}
    weibo_idx2label = {idx: label for label, idx in weibo_label2idx.items()}
    singer_label2idx = {char: idx for idx, char in enumerate(singer_tags)}
    singer_idx2label = {idx: label for label, idx in singer_label2idx.items()}
    # Add the 'S-PB' tag, de-duplicate and sort before indexing.
    # NOTE(review): sorting changes the tag order after the labels in
    # homeapp_train_y_bert were already encoded by load_data -- confirm the
    # encoded ids still line up with homeapp_label2idx.
    homeapp_tags.append('S-PB')
    homeapp_tags=list(set(homeapp_tags))
    homeapp_tags.sort()
    homeapp_label2idx = {char: idx for idx, char in enumerate(homeapp_tags)}
    homeapp_idx2label = {idx: label for label, idx in homeapp_label2idx.items()}
    print('done!')

msra bert
people bert
weibo bert
singer bert
homeapp bert
done!


In [4]:
# Relabel every OW / AT entity tag in the home-appliance data as 'O', so only
# the remaining entity types are trained and evaluated.
for sentence_labels in homeapp_train_y_bert:
    for position in range(len(sentence_labels)):
        tag_name = homeapp_idx2label[sentence_labels[position]]
        # Single-character tags (e.g. 'O') carry no entity type suffix.
        if len(tag_name) <= 1:
            continue
        if tag_name[-2:] in ('OW', 'AT'):
            sentence_labels[position] = homeapp_label2idx['O']

# Create Model

In [6]:
def create_model(mytag, domain_num=2, crf_layer=True, compile_ = True, use_bert=False, embed_size = 50, pretrain_emb=False): #compile first
    """Build a tagger and a domain discriminator that share one encoder.

    The encoder is either the TF-Hub Chinese BERT layer or a character
    embedding, followed in both cases by a BiLSTM. The tagger ("decoding
    model") puts a per-timestep Dense layer plus a CRF or softmax on the
    encoder output. The discriminator puts a gradient-reversal layer, a
    1x1 convolution, max pooling and a softmax over ``domain_num`` classes on
    the same encoder output.

    Args:
        mytag: sequence of tag names; only its length is used.
        domain_num: number of classes of the domain discriminator.
        crf_layer: use a CRF output layer when True, a softmax Dense otherwise.
        compile_: compile both models before returning.
        use_bert: use the BERT encoder (three int32 inputs of length 128)
            instead of the embedding encoder (one input of length 128).
        embed_size: embedding width for the non-BERT encoder; overwritten by
            the pretrained vector length when ``pretrain_emb`` is True.
        pretrain_emb: initialise the embedding from ``char_embeddings``.

    Returns:
        dict with keys ``'decoding_model'`` and ``'discriminator_model'``.

    Notes:
        Reads the module-level names ``word2idx``, ``char_embeddings``,
        ``char_vocabs`` and ``GradReverse``. When ``compile_``, ``crf_layer``
        and ``use_bert`` are all True it also reads the global ``epochs``,
        which must be defined before the call.
    """
    if use_bert:
        input1 = Input(shape=(128,), name='input_word_ids', dtype=tf.int32)
        input2 = Input(shape=(128,), name='input_mask', dtype=tf.int32)
        input3 = Input(shape=(128,), name='input_type_ids', dtype=tf.int32)
        # Frozen-BERT variant (no fine-tuning), kept for reference:
#        bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/3', trainable=False, output_key='sequence_output', name='bert')
        # Fine-tuned BERT: weights are trainable.
        bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/3', trainable=True, output_key='sequence_output', name='encoding_bert')
        output = bert_layer({'input_word_ids':input1, 'input_mask':input2, 'input_type_ids':input3})
    else:
        inputs = Input(shape=(128,), name='encoding_input')
        output = Masking(mask_value=word2idx['<PAD>'], name='encoding_mask')(inputs)
        if pretrain_emb:
            # The pretrained vector length replaces the embed_size argument.
            embed_size = len(char_embeddings['我'])
            embedding_matrix = np.zeros((len(word2idx), embed_size))
            for word, i in word2idx.items():
                embedding_vector = char_embeddings.get(word)
                if embedding_vector is not None:
                    # words not found in embedding index will be all-zeros.
                    embedding_matrix[i] = embedding_vector
            output = Embedding(len(char_vocabs), embed_size, weights=[embedding_matrix], trainable=True, name='encoding_emb')(output)  
        else:
            output = Embedding(len(char_vocabs), embed_size, trainable=True, name='encoding_emb')(output)  
        
    # Shared encoder output; every layer named 'encoding_*' belongs to the
    # encoder, which is what the keyword='enc' weight helpers select.
    encoding_output = Bidirectional(LSTM(200 // 2, return_sequences=True, trainable=True), name='encoding_lstm')(output)
        
    decoding_output = TimeDistributed(Dense(len(mytag)), name='decoding_timedistribute')(encoding_output)
    if crf_layer:
        crf=CRF(len(mytag),name='decoding_crf_layer')
        decoding_output = crf(decoding_output)
    else:
        decoding_output = Dense(len(mytag), activation='softmax', name='decoding_softmax')(decoding_output)
    if use_bert:
        decoding_model = Model(inputs = [input1, input2, input3], outputs = decoding_output)
    else:    
        decoding_model = Model(inputs = inputs, outputs = decoding_output)
    
    # Domain discriminator on the gradient-reversed encoder output.
    temp_output = GradReverse()(encoding_output)
    cnn = Conv1D(1, kernel_size=1, strides=1,activation='relu', name='dis_conv')(temp_output)
    cnn = MaxPooling1D(10, name='dis_maxpooling1d')(cnn)
    flat = Flatten(name='dis_flatten')(cnn)
    discriminator_output = Dense(domain_num, activation='softmax', name = 'discriminator')(flat) # softmax makes the outputs sum to 1
    if use_bert:
        discriminator_model = Model(inputs = [input1, input2, input3], outputs = discriminator_output)
    else:
        discriminator_model = Model(inputs = inputs, outputs = discriminator_output)
    
    if compile_:
        if crf_layer:
            if use_bert:
                import model.optimization as optimization
                # Schedule lengths are derived from the global `epochs` and
                # hard-coded 1280 / 32 values.
                # NOTE(review): presumably 1280 samples and batch size 32 --
                # confirm against the training cells, which use batch_size=16.
                optimizer = optimization.create_optimizer(5e-5, (1280//32)*epochs, int((epochs*1280*0.1)//32), 0.0, 'adamw')
                decoding_model.compile(optimizer=optimizer,loss = crf.get_loss, metrics=[crf.get_accuracy])
            else:
                decoding_model.compile(optimizer="adam",loss = crf.get_loss, metrics=[crf.get_accuracy])
        else:
            decoding_model.compile(optimizer="adam",loss = 'categorical_crossentropy', metrics=['accuracy'])
        discriminator_model.compile(optimizer="adam",loss = 'categorical_crossentropy', metrics=['accuracy'])
    output_model = {'decoding_model':decoding_model, 'discriminator_model':discriminator_model}    
    
    print('【create model】')
    if not use_bert:        
        print('embedding size:', embed_size)
        print('use pretrain embedding:', pretrain_emb)
    print('domain number:', domain_num)
    print('tag size:', len(mytag))
    print('use BERT?', use_bert)

    return output_model

### with BERT

In [8]:
# Load a previously saved tagging model from disk.
# A hub.KerasLayer is created first so that the saved 'KerasLayer' entries can
# be resolved through custom_objects.
# NOTE(review): custom_objects is given a layer *instance* rather than the
# hub.KerasLayer class -- confirm this is intended.
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/3', trainable=True, output_key='sequence_output', name='encoding_bert')
from tensorflow.keras.models import load_model
print('loading model...')
# Alternative checkpoint, kept for reference:
# decoding_final_model = load_model('./MetaNER_weight_save/bert531_homeapp_sgd.h5', custom_objects={'KerasLayer':bert_layer})
decoding_final_model = load_model('./MetaNER_weight_save/bert531_test_on_singer.h5', custom_objects={'KerasLayer':bert_layer})






In [16]:
# Split the home-appliance data: sentence indices stored in the .npy file form
# the evaluation set, the remaining indices below 2000 form the training pool.
test_samples = list(np.load('LC_test1000samples_idx.npy'))
train_samples = [i for i in range(2000) if i not in test_samples]


def _gather_homeapp(sample_indices):
    """Collect the BERT inputs and labels of the given sentence indices."""
    gathered_x = {}
    for key in homeapp_train_x_bert.keys():
        gathered_x[key] = np.array([homeapp_train_x_bert[key][i] for i in sample_indices])
    gathered_y = np.array([homeapp_train_y_bert[i] for i in sample_indices])
    return {'X': gathered_x, 'y': gathered_y}


test_bert = _gather_homeapp(test_samples)
train_bert = _gather_homeapp(train_samples)

 # Training

In [37]:
# Learning-curve experiment: for each training-set size, build a fresh BERT
# tagger, optionally initialise its encoder from a source-trained model, train
# it on the first `i` home-appliance sentences and evaluate on test_bert.
#with tf.device('/cpu:0'):
#with meta
from tensorflow.keras import callbacks
import time
meta = True
print('use meta?', meta)
# `epochs` is also read as a global inside create_model when it builds the
# BERT optimizer schedule, so it must be set before the loop.
epochs = 8
all_f1_table = 0
data_sizes = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
# data_sizes = [1000]
for i in data_sizes:
    starttime = time.time()
    model = create_model(homeapp_tags, use_bert=True)['decoding_model']
    # random_=False takes the first `i` training sentences.
    x, y = sample_data([train_bert['X'], train_bert['y']], random_=False, datasize=i)
    print('training data:', len(y))
    if meta:        
        # Copy the weights of every layer whose name starts with 'enc' from
        # the source-trained model into the new model.
        # NOTE(review): `temp_model` is not defined by any live code in this
        # notebook (only inside a commented-out block in the fine-tuning
        # cell) -- this relies on earlier kernel state.
        temp_weight = model_get_weight(temp_model[0], keyword='enc')
        update_weights(model, temp_weight, keyword='enc')
    print('meta:{}, train size:{}, test size:{}'.format(meta, len(y), len(test_bert['y'])))
#     history = model.fit(train_bert['X'],train_bert['y'],batch_size=16, epochs=epochs, verbose=1)
    history = model.fit(x,y,batch_size=16, epochs=epochs, verbose=1)
    # Convert the prediction array into a list of per-sentence lists.
    pred = [[j for j in i] for i in model.predict(test_bert['X'])]   

    _, _, f1_test, f1_table, result_count = evaluate.evaluation(pred, test_bert['y'], True,homeapp_idx2label, homeapp_idx2label)
    print('f1:', f1_test)
    print(f1_table)
    print('result count:')
    print(pd.DataFrame(result_count))
    print('spend', int(time.time()-starttime),'sec.')
    # Drop the reference so the next iteration starts from a new model.
    del model
#     if type(all_f1_table)==int:
#         all_f1_table=f1_table.copy()
#     else:
#         all_f1_table+=f1_table
print('--------------------------------------------------')        
# print(all_f1_table/itrs)


use meta? True
【create model】
domain number: 2
tag size: 17
use BERT? True
training data: 100
meta:True, train size:100, test size:1000
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
label type: BIOES
Accuracy:  124005 / 128000 = 0.9687890625
gold_num =  3194  pred_num =  2962  right_num =  1367
f1: 0.4441195581546459
            P       R       F
AT     0.0000  0.0000  0.0000
OW     0.0000  0.0000  0.0000
PB     0.4021  0.2062  0.2726
PN     0.4866  0.6842  0.5687
total  0.4600  0.4300  0.4400
result count:
         AT  OW    PB    PN  total
true      0   0   353  1014   1367
predict   0   0   878  2084   2962
answer    0   0  1712  1482   3194
spend 57 sec.
【create model】
domain number: 2
tag size: 17
use BERT? True
training data: 200
meta:True, train size:200, test size:1000
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
label type: BIOES
Accuracy:  126727 / 128000 = 0.9900546875
gold_num =  3194  pred_num =  3471  ri

【create model】
domain number: 2
tag size: 17
use BERT? True
training data: 700
meta:True, train size:700, test size:1000
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
label type: BIOES
Accuracy:  127169 / 128000 = 0.9935078125
gold_num =  3194  pred_num =  3326  right_num =  3010
f1: 0.9233128834355828
            P       R       F
AT     0.0000  0.0000  0.0000
OW     0.0000  0.0000  0.0000
PB     0.8978  0.9340  0.9155
PN     0.9133  0.9521  0.9323
total  0.9000  0.9400  0.9200
result count:
         AT  OW    PB    PN  total
true      0   0  1599  1411   3010
predict   0   0  1781  1545   3326
answer    0   0  1712  1482   3194
spend 188 sec.
【create model】
domain number: 2
tag size: 17
use BERT? True
training data: 800
meta:True, train size:800, test size:1000
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
label type: BIOES
Accuracy:  127214 / 128000 = 0.993859375
gold_num =  3194  pred_num =  3256  right_num =  3006

# Transfer Learning

原本是想要試在NER上用不同的transfer learning方法

但發現幾乎都沒用，所以放棄 

下面的程式碼的效能一樣沒有放在論文中

# 基本設定

In [5]:
# Gradient reversal op
@tf.custom_gradient
def grad_reverse(x):
    """Return ``x`` unchanged in the forward pass and negate its gradient.

    The second returned value is the custom gradient function that
    ``tf.custom_gradient`` calls with the upstream gradient.
    """
    def _negated(upstream):
        return -upstream

    return tf.identity(x), _negated

class GradReverse(tf.keras.layers.Layer):
    """Keras layer wrapper around ``grad_reverse``.

    The forward pass is the identity; during backpropagation the gradient
    flowing through this layer is negated.
    """

    def __init__(self, **kwargs):
        # Forward the standard Layer keyword arguments (name, trainable,
        # dtype, ...). Without this the layer cannot be given a name and
        # cannot be rebuilt from its config when a saved model is reloaded.
        super().__init__(**kwargs)

    def call(self, x):
        return grad_reverse(x)

In [7]:
# Multi-source variant: one shared encoder, one decoder per source domain.
def create_src_model(mytag, crf_layer=True, compile_ = True, use_bert=False, src_num = 2, embed_size=50, pretrain_emb=False): #compile first
    """Build one tagger per source domain on a shared encoder, plus a discriminator.

    Args:
        mytag: either one tag list (reused for every source) or a list of tag
            lists, one per source domain. In the second case ``src_num`` is
            replaced by the number of tag lists.
        crf_layer: use a CRF output layer when True, a softmax Dense otherwise.
        compile_: compile every decoder and the discriminator.
        use_bert: use the TF-Hub Chinese BERT encoder (three int32 inputs of
            length 128) instead of the character-embedding encoder.
        src_num: number of source domains when ``mytag`` is a single tag list.
        embed_size: embedding width for the non-BERT encoder; overwritten by
            the pretrained vector length when ``pretrain_emb`` is True.
        pretrain_emb: initialise the embedding from ``char_embeddings``.

    Returns:
        dict with ``'decoding_model'`` (list of Keras models, one per source)
        and ``'discriminator_model'`` (softmax over ``src_num + 1`` domains).

    Notes:
        Reads the module-level names ``word2idx``, ``char_embeddings``,
        ``char_vocabs`` and ``GradReverse``. When compiling BERT + CRF
        decoders it also reads the global ``epochs``, which must be defined
        before the call.
    """
    if not isinstance(mytag[0], (list, tuple)):
        mytag = [mytag]*src_num
    elif len(mytag)!=src_num:
        src_num=len(mytag)

    if use_bert:
        input1 = Input(shape=(128,), name='input_word_ids', dtype=tf.int32)
        input2 = Input(shape=(128,), name='input_mask', dtype=tf.int32)
        input3 = Input(shape=(128,), name='input_type_ids', dtype=tf.int32)
        model_inputs = [input1, input2, input3]
        # Fine-tuned BERT; called once, both outputs are taken from the result.
        bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/3', trainable=True, name='encoding_bert')
        bert_outputs = bert_layer({'input_word_ids':input1, 'input_mask':input2, 'input_type_ids':input3})
        output = bert_outputs['sequence_output']
    else:
        inputs = Input(shape=(128,), name='encoding_input')
        model_inputs = inputs
        output = Masking(mask_value=word2idx['<PAD>'], name='encoding_mask')(inputs)
        if pretrain_emb:
            # The pretrained vector length replaces the embed_size argument.
            embed_size = len(char_embeddings['我'])
            embedding_matrix = np.zeros((len(word2idx), embed_size))
            for word, i in word2idx.items():
                embedding_vector = char_embeddings.get(word)
                if embedding_vector is not None:
                    # words not found in embedding index will be all-zeros.
                    embedding_matrix[i] = embedding_vector
            output = Embedding(len(char_vocabs), embed_size, weights=[embedding_matrix], trainable=True, name='encoding_emb')(output)
        else:
            output = Embedding(len(char_vocabs), embed_size, trainable=True, name='encoding_emb')(output)

    encoding_output = Bidirectional(LSTM(200 // 2, return_sequences=True, trainable=True), name='encoding_lstm')(output)

    # One decoder head per source domain, all reading the shared encoder.
    decoding_models = []
    for i in range(src_num):
        decoding_output = TimeDistributed(Dense(len(mytag[i])), name='decoding_timedistribute')(encoding_output)
        if crf_layer:
            crf=CRF(len(mytag[i]),name='decoding_crf_layer')
            decoding_output = crf(decoding_output)
        else:
            decoding_output = Dense(len(mytag[i]), activation='softmax', name='decoding_softmax')(decoding_output)
        decoding_model = Model(inputs = model_inputs, outputs = decoding_output)
        if compile_:
            # Same optimizer/loss choices as create_model. Previously only the
            # BERT decoders were compiled, and the BERT path referenced `crf`
            # even when crf_layer was False.
            if crf_layer:
                if use_bert:
                    import model.optimization as optimization
                    # A new optimizer per decoder; schedule lengths come from
                    # the global `epochs` and hard-coded 1280 / 32 values.
                    optimizer = optimization.create_optimizer(5e-5, (1280//32)*epochs, int((epochs*1280*0.1)//32), 0.0, 'adamw')
                else:
                    optimizer = 'adam'
                decoding_model.compile(optimizer=optimizer,loss = crf.get_loss, metrics=[crf.get_accuracy])
            else:
                decoding_model.compile(optimizer='adam',loss = 'categorical_crossentropy', metrics=['accuracy'])
        decoding_models.append(decoding_model)

    # Domain discriminator behind a gradient-reversal layer.
    if use_bert:
        temp_output = GradReverse()(bert_outputs['pooled_output'])
    else:
        # There is no pooled output without BERT, so pool the BiLSTM sequence
        # the same way create_model does. Previously this path raised a
        # NameError because it referenced the BERT layer and inputs.
        temp_output = GradReverse()(encoding_output)
        cnn = Conv1D(1, kernel_size=1, strides=1,activation='relu', name='dis_conv')(temp_output)
        cnn = MaxPooling1D(10, name='dis_maxpooling1d')(cnn)
        temp_output = Flatten(name='dis_flatten')(cnn)
    # src_num source domains plus one extra class.
    # NOTE(review): presumably the extra class is the target domain -- confirm.
    discriminator_output = Dense(src_num+1, activation='softmax', name = 'discriminator')(temp_output)
    discriminator_model = Model(inputs = model_inputs, outputs = discriminator_output)

    if compile_:
        discriminator_model.compile(optimizer="adam",loss = 'categorical_crossentropy', metrics=['accuracy'])

    return_model = {'decoding_model':decoding_models, 'discriminator_model':discriminator_model}
    print('【create source model】')
    if not use_bert:
        print('embedding size:', embed_size)
        print('use pretrain embedding:', pretrain_emb)
    print('domain number:', src_num)
    # Report the tag count of each source (len(mytag) is the domain count).
    print('tag size:', [len(tags) for tags in mytag])
    print('use crf?', crf_layer)
    print('use BERT?', use_bert)
    return return_model



In [8]:
def model_get_weight(model, keyword='', not_=False):
    """Collect the weights of the layers selected by a name prefix.

    Args:
        model: object with a ``layers`` sequence; each layer exposes ``name``
            and ``get_weights()``.
        keyword: layer-name prefix to match.
        not_: when True, select the layers whose name does NOT start with
            ``keyword`` instead.

    Returns:
        1-D object ndarray with one entry per selected layer (in model order).
        Each entry is itself a 1-D object ndarray holding that layer's weight
        arrays, so element-wise arithmetic (``a - b``, ``a * scalar``) applies
        to every weight tensor.
    """
    def _object_array(items):
        # Build the container explicitly: np.array() on arrays of different
        # shapes raises ValueError on NumPy >= 1.24 and only worked through a
        # deprecated ragged-array path before that.
        packed = np.empty(len(items), dtype=object)
        for idx, item in enumerate(items):
            packed[idx] = item
        return packed

    selected = []
    for layer in model.layers:
        # Select when "name matches the prefix" differs from "invert".
        if layer.name.startswith(keyword) != bool(not_):
            selected.append(_object_array(layer.get_weights()))
    return _object_array(selected)

def update_weights(model, update_weight, keyword='', not_=False):
    """Write ``update_weight`` into the layers selected by a name prefix.

    Layers are visited in model order; the n-th selected layer receives
    ``update_weight[n]`` through ``set_weights``. With ``not_`` set, the
    layers whose name does NOT start with ``keyword`` are selected instead.
    """
    cursor = 0
    for layer in model.layers:
        has_prefix = layer.name.startswith(keyword)
        # Skip when the prefix test agrees with the inversion flag.
        if has_prefix == bool(not_):
            continue
        layer.set_weights(update_weight[cursor])
        cursor += 1
def update_weights_forsame(model, model_src):
    """Copy weights from ``model_src`` into same-named layers of ``model``.

    For every layer of ``model`` the source layers are scanned in order; the
    first one with the same name and the same number of weight arrays whose
    weights can be set is used. Layers without a usable counterpart are
    reported on stdout and left unchanged.
    """
    for layer in model.layers:
        copied = False
        for layer_src in model_src.layers:
            if layer.name != layer_src.name:
                continue
            source_weights = layer_src.get_weights()
            if len(layer.get_weights()) != len(source_weights):
                continue
            try:
                layer.set_weights(source_weights)
            except Exception as err:
                # Best effort: report the failure and keep scanning. A bare
                # `except:` here would also swallow KeyboardInterrupt.
                print('error!', layer.name, err)
                continue
            copied = True
            break
        if not copied:
            print('model layer: "', layer.name, '" not in source model')

In [9]:
#資料設定
import random
def sample_data(data_list, datasize, random_=True):
    """Pick ``datasize`` examples from a ``[BERT_x, label]`` pair.

    ``BERT_x`` is a dict of per-key sequences and ``label`` a sequence aligned
    with them. With ``random_`` the indices are drawn without replacement,
    otherwise the first ``datasize`` examples are taken.

    Returns:
        ``(bert_x, label)``: a dict of ndarrays with the same keys, and an
        ndarray of the selected labels.
    """
    features, labels = data_list[0], data_list[1]
    if random_:
        chosen = random.sample(range(len(labels)), datasize)
    else:
        chosen = [idx for idx in range(datasize)]

    picked_x = {}
    for key in features.keys():
        column = features[key]
        picked_x[key] = np.array([column[idx] for idx in chosen])

    all_labels = np.array(labels)
    picked_y = np.array([all_labels[idx] for idx in chosen])
    return picked_x, picked_y

def sample_data_dann(data_list, datasize, random_=True, start_=0):
    """Assemble one DANN batch from several domains.

    ``data_list`` is ``[[BERT_x, y], [BERT_x, y], ...]``, one entry per
    domain; the last entry is treated differently from the others (it is
    excluded from the per-domain source batches). Domains other than the last
    are sampled at random when ``random_`` is set and ``start_ != -1``; in
    every other case the contiguous slice ``start_ .. start_ + datasize``
    (clipped to the domain length) is taken.

    Returns:
        ``(bert_x, label, domains, bert_x_wo_tgt, bert_y_wo_tgt)``: the merged
        BERT inputs, merged labels, one-hot domain ids, and the per-domain
        inputs / labels of every domain except the last.
    """
    merged_x, merged_y, domain_ids = [], [], []
    source_x_batches, source_y_batches = [], []
    last_idx = len(data_list) - 1
    for domain_idx, data in enumerate(data_list):
        is_last = domain_idx == last_idx
        if random_ and not is_last and start_ != -1:
            picked = random.sample(range(len(data[1])), datasize)
        else:
            stop = min(start_ + datasize, len(data[1]))
            picked = list(range(start_, stop))
        # One dict per selected example.
        domain_x = [{key: data[0][key][row] for key in data[0].keys()} for row in picked]
        domain_y = [data[1][row] for row in picked]
        merged_x.extend(domain_x)
        merged_y.extend(domain_y)
        if not is_last:
            source_x_batches.append(transBERTtype(domain_x, True))
            source_y_batches.append(np.array(domain_y))
        domain_ids.extend([domain_idx] * len(picked))
    bert_x = transBERTtype(merged_x, True)
    label = np.array(merged_y)
    domains = np.array(to_categorical(domain_ids))
    return bert_x, label, domains, source_x_batches, source_y_batches

def sample_task(train, test=None, support_size=50, query_size=50, domain_num=0):    
    """Draw a support set and a query set for one meta-learning task.

    Args:
        train: ``[X, y]``. ``X`` is either a BERT input dict or a sequence of
            examples; ``y`` is aligned with it.
        test: optional ``[X, y]`` in the same format. When given, the query
            set is drawn from it; otherwise it is drawn from the training
            examples not used for the support set.
        support_size: number of support examples.
        query_size: number of query examples.
        domain_num: domain id repeated as the third element of each result.

    Returns:
        ``(return_train, return_test)``, each ``[X, y, domain_ids]``. For BERT
        input ``X`` is converted back to a dict of arrays.

    The input lists are not modified. (The previous version overwrote
    ``train[0]``/``train[1]`` in place, so a second call on the same list no
    longer saw a dict and silently returned non-BERT output.)
    """
    import random

    is_bert = isinstance(train[0], dict)
    # BERT dicts are first expanded into one record (dict) per example.
    train_x = transBERTtype(train[0], False) if is_bert else train[0]
    train_x = np.array(train_x)
    train_y = np.array(train[1])

    support_idx = random.sample(range(len(train_x)), support_size)
    return_train = [train_x[support_idx], train_y[support_idx], [domain_num]*support_size]

    if test is not None:
        test_x = transBERTtype(test[0], False) if is_bert else test[0]
        test_x = np.array(test_x)
        test_y = np.array(test[1])
        query_idx = random.sample(range(len(test_x)), query_size)
        return_test = [test_x[query_idx], test_y[query_idx], [domain_num]*query_size]
    else:
        # Query examples must not overlap with the support set.
        used = set(support_idx)
        remaining = [i for i in range(len(train_x)) if i not in used]
        query_idx = random.sample(remaining, query_size)
        return_test = [train_x[query_idx], train_y[query_idx], [domain_num]*query_size]

    if is_bert:
        return_train[0] = transBERTtype(return_train[0], True)
        return_test[0] = transBERTtype(return_test[0], True)
    return return_train, return_test

def multiply_grads(lamb, grads):
    """Scale every entry of ``grads`` by ``lamb`` in place and return ``grads``."""
    for position, gradient in enumerate(grads):
        gradient *= lamb
        grads[position] = gradient
    return grads

def add_grads(grad1, grad2):
    """Return the element-wise sum of two gradient sequences.

    The result starts as ``grad1.copy()``; ``grad1`` itself is not rebound.
    If an addition fails (for example ``grad2`` is shorter than ``grad1``),
    the failure is reported on stdout and the partially summed result is
    returned: entries before the failure hold the sum, the rest keep the
    values of ``grad1``.
    """
    grad = grad1.copy()
    try:
        for i in range(len(grad)):
            grad[i]=grad1[i]+grad2[i]
    except Exception as err:
        # Best effort, but do not swallow KeyboardInterrupt/SystemExit and do
        # say what went wrong.
        print('error!', err)
    return grad

def filter_sentence(my_x, my_y, myidx2label=None, filt=False, datasize=1500):
    """Optionally keep only sentences that contain at least one entity tag.

    Args:
        my_x: BERT input dict or a sequence of examples.
        my_y: label-id sequences aligned with ``my_x``.
        myidx2label: mapping from label id to tag name. A sentence is kept
            when any of its tags differs from ``'O'``. When None, no sentence
            is dropped.
        filt: apply the filter and the ``datasize`` cap. When False the data
            is returned unfiltered and uncapped.
        datasize: maximum number of sentences returned when ``filt`` is True.

    Returns:
        ``[x, y]``. For BERT input ``x`` is a dict of arrays (the input goes
        through a record round trip even when ``filt`` is False).
    """
    is_bert = isinstance(my_x, dict)
    if is_bert:
        # Work on one record (dict) per sentence.
        my_x = transBERTtype(my_x, False)

    if not filt:
        return [transBERTtype(my_x, True) if is_bert else my_x, my_y]

    if myidx2label is not None:
        keep = [
            idx for idx, tags in enumerate(my_y)
            if any(myidx2label[tag] != 'O' for tag in tags)
        ]
    else:
        # Without a label map every sentence is kept. The previous code used
        # the label sequences themselves as indices here, which raised a
        # TypeError.
        keep = list(range(len(my_y)))
    keep = keep[:datasize]

    kept_x = [my_x[i] for i in keep]
    kept_y = [my_y[i] for i in keep]
    if is_bert:
        return [transBERTtype(kept_x, True), kept_y]
    return [kept_x, kept_y]
    
def transBERTtype(data, toBERT=True):
    """Convert between the two layouts used for BERT inputs.

    ``toBERT=True``: ``data`` is a sequence of per-example dicts; the result
    is one dict mapping each key of the first example to an ndarray stacking
    that key over all examples.

    ``toBERT=False``: ``data`` is a dict of per-key sequences; the result is a
    list with one dict per example. The number of examples is taken from
    ``data['input_word_ids']``.
    """
    if not toBERT:
        # Dict of columns -> list of per-example records.
        example_count = len(data['input_word_ids'])
        records = []
        for row in range(example_count):
            records.append({key: data[key][row] for key in data.keys()})
        return records

    # List of per-example records -> dict of stacked columns.
    columns = {}
    for key in data[0].keys():
        columns[key] = np.array([data[row][key] for row in range(len(data))])
    return columns

## FineTuning

In [34]:
# Train each source-domain decoder for one epoch on its own corpus.
# During DANN training the target-domain x is train_x; after training, the
# model still has to be fine-tuned the same way as with meta learning.
import warnings
warnings.simplefilter('ignore')

import time
from sklearn.metrics import f1_score, accuracy_score, classification_report
filt_sentence = False

my_tags = [msra_tags, people_tags, weibo_tags, singer_tags]
# NOTE(review): the block below, which creates `data_list` and `temp_model`,
# is commented out, yet the loop after it uses both names. The cell therefore
# depends on values left in the kernel by an earlier run -- confirm and
# restore the setup before relying on Restart & Run All.
# with tf.device('/cpu:0'):
#     data_list = [filter_sentence(msra_train_x_bert, msra_train_y_bert, msra_idx2label,filt_sentence), 
#                 filter_sentence(people_train_x_bert, people_train_y_bert, people_idx2label,filt_sentence), 
#                 filter_sentence(weibo_train_x_bert, weibo_train_y_bert, weibo_idx2label, filt_sentence), 
#                 filter_sentence(singer_train_x_bert, singer_train_y_bert, singer_idx2label, filt_sentence)]
#     temp_model = create_src_model(my_tags, crf_layer=True, compile_ = True, use_bert=True)['decoding_model']

# One pass over every source corpus; data_list[k] is [x, y] for decoder k.
for dta_idx in range(len(data_list)):
    history = temp_model[dta_idx].fit(data_list[dta_idx][0],np.array(data_list[dta_idx][1]),batch_size=16, epochs=1, verbose=1)




## Reptile 

In [None]:
# Reptile meta-training of the shared encoder over the four source corpora.
outer_iteration = 100
inner_iteration = 5
# create_src_model reads the global `epochs` for its optimizer schedule.
epochs = inner_iteration
datasize_per_task = 16
meta_step_size = 0.1
import time
starttime = time.time()
filt_sentence=False
my_tags = [msra_tags, people_tags, weibo_tags, singer_tags]
# One entry per source domain. The People's Daily test split previously used
# msra_idx2label; each split now uses its own label map. The singer corpus
# has no separate test file, so its train data is reused.
source_data = [{'train':filter_sentence(msra_train_x_bert, msra_train_y_bert, msra_idx2label,filt_sentence), 'test':filter_sentence(msra_test_x_bert, msra_test_y_bert, msra_idx2label, filt_sentence), 'tags':msra_tags},
               {'train':filter_sentence(people_train_x_bert, people_train_y_bert, people_idx2label,filt_sentence), 'test':filter_sentence(people_test_x_bert, people_test_y_bert, people_idx2label, filt_sentence), 'tags':people_tags},
               {'train':filter_sentence(weibo_train_x_bert, weibo_train_y_bert, weibo_idx2label, filt_sentence), 'test':filter_sentence(weibo_test_x_bert, weibo_test_y_bert, weibo_idx2label, filt_sentence), 'tags':weibo_tags},
               {'train':filter_sentence(singer_train_x_bert, singer_train_y_bert, singer_idx2label, filt_sentence), 'test':filter_sentence(singer_train_x_bert, singer_train_y_bert, singer_idx2label, filt_sentence), 'tags':singer_tags}]

tmp_model = create_src_model(my_tags, crf_layer=True, compile_ = True, use_bert=True)['decoding_model'] 

print('start!')
minloss = 100000000.0
flagcount = 0
# Stop after this many consecutive outer iterations without a new best loss.
threshold = 5
for itr in range(outer_iteration):    
    if itr%10==0:
        print('itr =',itr)
    # Linearly decay the meta step size to zero over the outer iterations.
    done_step = itr/outer_iteration
    cur_meta_step_size = (1-done_step)*meta_step_size
    # Encoder weights at the start of this outer step.
    origin_weights = model_get_weight(tmp_model[0], 'enc')
    new_weights = []
    losses = []
    for src_idx in range(len(source_data)):
        # The shallow copy keeps the stored [x, y] pair untouched.
        train, _ = sample_task(source_data[src_idx]['train'].copy(), domain_num=src_idx, support_size=datasize_per_task)   
        for i in range(inner_iteration):
            loss = tmp_model[src_idx].train_on_batch(x=train[0], y=train[1])[0]
            loss = round(loss, 5)
        new_weights.append(model_get_weight(tmp_model[src_idx], keyword='enc'))
        # Reset the encoder before the next task. Which decoder is used for
        # the reset does not matter: every decoder shares the same encoder.
        update_weights(tmp_model[src_idx], origin_weights, keyword='enc')
        losses.append(loss)
    # Reptile update: move the encoder towards the mean of the task-adapted
    # weights by cur_meta_step_size.
    new_weights = np.array(new_weights)
    new_weight = new_weights[0]
    for i in range(len(new_weights)-1):
        new_weight+=new_weights[i+1]
    new_weight/=len(new_weights)    
    new_weight = origin_weights + ((new_weight-origin_weights)*cur_meta_step_size)
    update_weights(tmp_model[0], new_weight, keyword='enc')
    print('lr', round(cur_meta_step_size, 5), '\tloss', losses, '\t spend', int(time.time()-starttime))
    if sum(losses)<minloss:
        minloss = sum(losses)
        flagcount = 0
    else:
        flagcount+=1
    # Use the configured patience instead of a second hard-coded 5.
    if flagcount>=threshold:
        break
    del new_weight, origin_weights, new_weights
print('total spend {} seconds'.format(int(time.time()-starttime)))

## DANN

In [11]:
# DANN training, one train_on_batch step per batch.
# During DANN training the target-domain x is train_x; after training, the
# model still has to be fine-tuned the same way as with meta learning.
import warnings
warnings.simplefilter('ignore')

import time
from sklearn.metrics import f1_score, accuracy_score, classification_report
# create_src_model reads the global `epochs` for its optimizer schedule.
epochs = 5
datasize_per_domain = 50
# Stop after this many consecutive epochs without a better mean label loss.
update_times =2
filt_sentence = False
my_tags = [msra_tags, people_tags, weibo_tags, singer_tags]

with tf.device('/cpu:0'):
    # Four source domains followed by the target domain (home appliances).
    data_list = [filter_sentence(msra_train_x_bert, msra_train_y_bert, msra_idx2label,filt_sentence), 
                filter_sentence(people_train_x_bert, people_train_y_bert, people_idx2label,filt_sentence), 
                filter_sentence(weibo_train_x_bert, weibo_train_y_bert, weibo_idx2label, filt_sentence), 
                filter_sentence(singer_train_x_bert, singer_train_y_bert, singer_idx2label, filt_sentence),
                filter_sentence(homeapp_train_x_bert, homeapp_train_y_bert, homeapp_idx2label, filt_sentence)]
    tmp_model = create_src_model(my_tags, crf_layer=True, compile_ = True, use_bert=True)

print('maximum epochs:', epochs)
print('datasize per batch:', datasize_per_domain)
print('no opt. max times:', update_times)
print('start to train DANN!')
starttime = time.time()
flag_count = 0
best_loss = 1000.0
for epoch in range(epochs):
    # Every batch holds data from every domain, e.g. with 3 sources and 1
    # target a batch of 40 examples has 10 per domain.
    # The target domain may or may not have labels.
    loss, adv_loss = [], []
    acc, adv_acc = [], []
    # The number of batches is set by the size of the target domain.
    for batches in range(int(len(data_list[-1][1])/datasize_per_domain)+1):
#     for batches in range(1):
        start_ = time.time()
        start = batches*datasize_per_domain
        x, _, domain, x_wo_tgt, label_wo_tgt = sample_data_dann(data_list, datasize_per_domain, start_=start)
        # Skip batches whose one-hot domain matrix does not cover every
        # domain (e.g. the target slice came back empty).
        if len(domain[0])==len(data_list):
            # First train the NER decoder of each source domain.
            for dm in range(len(label_wo_tgt)):
                # Predict before the update to track accuracy.
                # NOTE(review): argmax is taken over each whole sentence
                # here -- confirm this matches the CRF output and label format.
                pred_label = tmp_model['decoding_model'][dm].predict(x_wo_tgt[dm])
                pred_label = [np.argmax(i) for i in pred_label]
                ans_label = [np.argmax(i) for i in label_wo_tgt[dm]]
                acc.append(accuracy_score(ans_label, pred_label))
                # Then take one training step.
                with tf.device('/cpu:0'): # runs out of memory unless placed on the CPU
                    tmp_loss = tmp_model['decoding_model'][dm].train_on_batch(x=x_wo_tgt[dm], y=label_wo_tgt[dm])[0]
                loss.append(round(tmp_loss, 5))

            # Then train the domain discriminator on all domains together.
            pred_domain = tmp_model['discriminator_model'].predict(x)
            with tf.device('/cpu:0'): # runs out of memory unless placed on the CPU
                tmp_adv_loss = tmp_model['discriminator_model'].train_on_batch(x=x, y=domain)[0]        
            adv_loss.append(round(tmp_adv_loss, 5))
            pred_domain = [np.argmax(i) for i in pred_domain]
            ans_domain = [np.argmax(i) for i in domain]
            adv_acc.append(accuracy_score(ans_domain, pred_domain))
            print('epoch {} in batch'.format(epoch), batches, 'spend', int(time.time()-start_))        
            # tmp_loss is the loss of the last source domain in this batch.
            print('label loss:{}; label acc.:{}'.format(round(tmp_loss, 5), round(np.mean(acc[-len(label_wo_tgt):]), 5)))
            print('adv. loss:{}; adv. acc.:{}'.format(round(tmp_adv_loss, 5), round(adv_acc[-1], 5)))
            print()
            
    print('#epoch ', epoch)
    print('label loss:{}; label acc.:{}; spend {} sec.'.format(round(np.mean(loss), 5), round(np.mean(acc), 5), int(time.time()-starttime)))
    print('adv. loss:{}; adv. acc.:{}; spend {} sec.'.format(round(np.mean(adv_loss), 5), round(np.mean(adv_acc), 5), int(time.time()-starttime)))
    print('------------------------------')
    
#    tmp_model['decoding_model'][dm].save('./MetaNER_weight_save/DANN_decoding.h5')
    # Checkpoint the per-layer weights of the first decoder after every epoch.
    model_weight = []
    for layer in tmp_model['decoding_model'][0].layers:
        model_weight.append(layer.get_weights())
    # Layers hold different numbers and shapes of weights, so store them in an
    # explicit object array; np.array() on such a ragged list raises on
    # NumPy >= 1.24. Load with np.load(..., allow_pickle=True).
    weights_to_save = np.empty(len(model_weight), dtype=object)
    for layer_idx, layer_weights in enumerate(model_weight):
        weights_to_save[layer_idx] = layer_weights
    np.save('./MetaNER_weight_save/dann_weight.npy', weights_to_save)
    del model_weight, weights_to_save
    if np.mean(loss)>=best_loss:
        flag_count+=1
    else:
        flag_count=0
        best_loss=np.mean(loss)
    if flag_count>=update_times:
        break
    # Restart the timer so the per-epoch report shows the time of one epoch.
    starttime = time.time()
# del tmp_model
print('done!')  

【create source model】
domain number: 4
tag size: 4
use crf? True
use BERT? True
maximum epochs: 5
datasize per batch: 50
no opt. max times: 2
start to train DANN!








epoch 0 in batch 0 spend 138
label loss:311.48398; label acc.:0.095
adv. loss:2.18905; adv. acc.:0.22

epoch 0 in batch 1 spend 68
label loss:302.11102; label acc.:0.415
adv. loss:4.59204; adv. acc.:0.2

epoch 0 in batch 2 spend 68
label loss:239.79359; label acc.:0.16
adv. loss:2.31555; adv. acc.:0.168

epoch 0 in batch 3 spend 67
label loss:220.27699; label acc.:0.265
adv. loss:9.90711; adv. acc.:0.2

epoch 0 in batch 4 spend 67
label loss:198.94414; label acc.:0.235
adv. loss:6.69642; adv. acc.:0.2

epoch 0 in batch 5 spend 69
label loss:200.56319; label acc.:0.26
adv. loss:11.24656; adv. acc.:0.2

epoch 0 in batch 6 spend 69
label loss:227.44557; label acc.:0.205
adv. loss:13.20655; adv. acc.:0.2

epoch 0 in batch 7 spend 69
label loss:211.23969; label acc.:0.45
adv. loss:15.58029; adv. acc.:0.2

epoch 0 in batch 8 spend 69
label loss:187.51132; label acc.:0.445
adv. loss:20.25161; adv. acc.:0.2

epoch 0 in batch 9 spend 68
label loss:162.62886; label acc.:0.455
adv. loss:25.10936;

epoch 2 in batch 0 spend 69
label loss:32.38158; label acc.:0.835
adv. loss:2.4789; adv. acc.:0.2

epoch 2 in batch 1 spend 69
label loss:28.67718; label acc.:0.875
adv. loss:2.46108; adv. acc.:0.2

epoch 2 in batch 2 spend 72
label loss:28.29774; label acc.:0.865
adv. loss:2.50388; adv. acc.:0.2

epoch 2 in batch 3 spend 74
label loss:25.81173; label acc.:0.85
adv. loss:2.61372; adv. acc.:0.2

epoch 2 in batch 4 spend 75
label loss:28.05194; label acc.:0.87
adv. loss:2.73051; adv. acc.:0.2

epoch 2 in batch 5 spend 74
label loss:29.65903; label acc.:0.84
adv. loss:2.78554; adv. acc.:0.2

epoch 2 in batch 6 spend 73
label loss:29.12881; label acc.:0.885
adv. loss:2.75606; adv. acc.:0.2

epoch 2 in batch 7 spend 72
label loss:30.59589; label acc.:0.845
adv. loss:2.67766; adv. acc.:0.2

epoch 2 in batch 8 spend 73
label loss:27.49424; label acc.:0.875
adv. loss:2.60048; adv. acc.:0.2

epoch 2 in batch 9 spend 73
label loss:28.2778; label acc.:0.87
adv. loss:2.55974; adv. acc.:0.2

epoch 

epoch 4 in batch 0 spend 67
label loss:24.28326; label acc.:0.87
adv. loss:1.61002; adv. acc.:0.2

epoch 4 in batch 1 spend 68
label loss:27.0445; label acc.:0.845
adv. loss:1.60995; adv. acc.:0.2

epoch 4 in batch 2 spend 68
label loss:27.77853; label acc.:0.87
adv. loss:1.61014; adv. acc.:0.2

epoch 4 in batch 3 spend 68
label loss:28.65083; label acc.:0.855
adv. loss:1.6101; adv. acc.:0.2

epoch 4 in batch 4 spend 68
label loss:26.35558; label acc.:0.855
adv. loss:1.61013; adv. acc.:0.2

epoch 4 in batch 5 spend 68
label loss:27.21803; label acc.:0.845
adv. loss:1.61007; adv. acc.:0.2

epoch 4 in batch 6 spend 68
label loss:24.81693; label acc.:0.885
adv. loss:1.60997; adv. acc.:0.2

epoch 4 in batch 7 spend 68
label loss:26.39206; label acc.:0.905
adv. loss:1.60988; adv. acc.:0.2

epoch 4 in batch 8 spend 68
label loss:28.37292; label acc.:0.86
adv. loss:1.60978; adv. acc.:0.2

epoch 4 in batch 9 spend 68
label loss:26.62097; label acc.:0.88
adv. loss:1.60956; adv. acc.:0.2

epoch 