這是在 SentiHood 這份 TABSA 資料集上，用 BERT-pair 與 adversarial Reptile 進行測試的程式

效果沒有在論文中提及(因為也不是我們要討論的重點)，參考就好

In [1]:
import os
import re
import sys
import nltk
import json
import random
import operator
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
import xml.etree.ElementTree
from tqdm import tqdm, trange
import tensorflow_text as text 
import model.tokenization as tokenization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping,CSVLogger
from tensorflow.keras.layers import Input, Dense,Dropout,Embedding,LSTM,Bidirectional, Masking, TimeDistributed, Conv1D, MaxPooling1D, Flatten, concatenate, GRU

# Seed the Python, NumPy and TensorFlow random generators with a fixed value.
random.seed(0)
np.random.seed(0)
tf.random.set_seed(0)

# Download the NLTK "punkt" package (presumably needed by nltk.word_tokenize,
# which the tokenize() helper below calls).
nltk.download('punkt')
# Root folder holding the sentihood-{train,dev,test}.json files.
data_dir = './data/sentihood/'
# TF-Hub handle of the BERT encoder used throughout this notebook.
BERT_src = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
# Non-trainable instance of the encoder; used here to locate the vocabulary
# file and later passed as a custom object when a saved model is loaded.
BERT_LAYER = hub.KerasLayer(BERT_src, trainable=False)
VOCAB_FILE = BERT_LAYER.resolved_object.vocab_file.asset_path.numpy()
# NOTE(review): the second positional argument is presumably `do_lower_case`
# (the encoder handle is an "uncased" model) — confirm against model/tokenization.py.
tokenizer = tokenization.FullTokenizer(VOCAB_FILE, True) 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Data loading

In [2]:
def get_aspect_idx(data, aspect2idx):
    """Map every record to the integer index of its aspect.

    Args:
        data: Sized collection of 5-tuples whose fourth field is the aspect
            name.
        aspect2idx: Mapping from aspect name to integer index.

    Returns:
        A NumPy array holding one index per record, in input order.

    Raises:
        KeyError: If a record's aspect is missing from ``aspect2idx``.
    """
    indices = [aspect2idx[aspect_name] for _, _, _, aspect_name, _ in data]
    assert len(indices) == len(data)
    return np.array(indices)

In [3]:
def parse_sentihood_json(in_file):
    """Read a SentiHood JSON file into a list of plain tuples.

    Args:
        in_file: Path of a JSON file containing a list of objects with the
            keys ``id``, ``text`` and ``opinions``; every opinion carries
            ``target_entity``, ``aspect`` and ``sentiment``.

    Returns:
        A list of ``(sent_id, text, opinions)`` tuples where ``opinions`` is a
        list of ``(target_entity, aspect, sentiment)`` tuples.

    Raises:
        KeyError: If an entry or an opinion lacks one of the expected keys.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(in_file, encoding="utf-8") as f:
        data = json.load(f)

    ret = []
    for d in data:
        opinions = [
            (opinion['target_entity'], opinion['aspect'], opinion['sentiment'])
            for opinion in d['opinions']
        ]
        ret.append((d['id'], d['text'], opinions))
    return ret



In [4]:
def convert_input(data, all_aspects, allow_none=True):
    """Flatten parsed sentences into one record per (target, aspect) pair.

    Args:
        data: Iterable of ``(sent_id, text, opinions)`` tuples, where
            ``opinions`` is a list of ``(target_entity, aspect, sentiment)``.
        all_aspects: Collection of the aspect names to keep; opinions about
            any other aspect are dropped. Its iteration order fixes the order
            of the generated ``'None'`` records.
        allow_none: When true, every target found in the text additionally
            gets a record labelled ``'None'`` for each aspect in
            ``all_aspects`` that none of its opinions mentions.

    Returns:
        A list of ``(sent_id, text, target_entity, aspect, sentiment)`` tuples.
        For each sentence the annotated opinions come first, followed by the
        ``'None'`` records of LOCATION1 and then those of LOCATION2.

    Raises:
        AssertionError: If a sentence does not contain ``'LOCATION1'``.
    """
    ret = []
    for sent_id, text, opinions in data:
        for target_entity, aspect, sentiment in opinions:
            if aspect not in all_aspects:
                continue
            ret.append((sent_id, text, target_entity, aspect, sentiment))

        assert 'LOCATION1' in text
        if not allow_none:
            continue

        # A list (rather than a set) keeps the output order independent of
        # string-hash randomization between interpreter runs.
        targets = ['LOCATION1']
        if 'LOCATION2' in text:
            targets.append('LOCATION2')

        for target in targets:
            mentioned = {a for t, a, _ in opinions if t == target}
            for aspect in all_aspects:
                if aspect not in mentioned:
                    ret.append((sent_id, text, target, aspect, 'None'))
    return ret

In [5]:
def tokenize(data):
    """Tokenize the sentence and split the aspect name of every record.

    Args:
        data: Iterable of ``(sent_id, text, target_entity, aspect, sentiment)``
            tuples.

    Returns:
        A list of tuples of the same shape in which ``text`` is replaced by
        the result of ``nltk.word_tokenize(text)`` and ``aspect`` by
        ``aspect.split('-')``.
    """
    return [
        (sent_id, nltk.word_tokenize(text), target_entity, aspect.split('-'), sentiment)
        for sent_id, text, target_entity, aspect, sentiment in data
    ]

In [6]:
def load_task(data_dir, aspect2idx, allow_none=True):
    """Load and prepare the train, dev and test splits of SentiHood.

    Args:
        data_dir: Folder containing ``sentihood-train.json``,
            ``sentihood-dev.json`` and ``sentihood-test.json``.
        aspect2idx: Mapping from aspect name to integer index; also used as
            the set of aspects to keep.
        allow_none: Forwarded to ``convert_input``.

    Returns:
        Three ``(records, aspect_idx)`` pairs, ordered train, dev, test.
        ``records`` is the tokenized record list and ``aspect_idx`` the array
        returned by ``get_aspect_idx`` for the same records.
    """
    # Parse all three files up front, then post-process them split by split.
    raw_splits = [
        parse_sentihood_json(os.path.join(data_dir, 'sentihood-' + split + '.json'))
        for split in ('train', 'dev', 'test')
    ]

    prepared = []
    for raw in raw_splits:
        records = convert_input(raw, aspect2idx, allow_none)
        aspect_idx = get_aspect_idx(records, aspect2idx)
        prepared.append((tokenize(records), aspect_idx))
    return tuple(prepared)

## create NLI_M

In [8]:
# Build the NLI_M (sentence-pair) TSV files consumed by Sentihood_NLI_M_Processor.
aspect2idx = {
    'general': 0,
    'price': 1,
    'transit-location': 2,
    'safety': 3,
}
# Each record is: id / sentence tokens / target / aspect part(s) / sentiment
(train, train_aspect_idx), (val, val_aspect_idx), (test, test_aspect_idx) = load_task(data_dir, aspect2idx)
print("len(train) = ", len(train))
print("len(val) = ", len(val))
print("len(test) = ", len(test))

# Order rows by target, then id (compared as text), then first aspect word.
# The later split into per-target / per-aspect files addresses rows by their
# position in the written file, so this ordering must not change.
train.sort(key=lambda x:x[2]+str(x[0])+x[3][0])
val.sort(key=lambda x:x[2]+str(x[0])+x[3][0])
test.sort(key=lambda x:x[2]+str(x[0])+x[3][0])

dir_path = data_dir+'bert-pair/'
os.makedirs(dir_path, exist_ok=True)


def _nli_m_token(token):
    """Lower-case one token and rewrite it the way the NLI_M files expect."""
    word = token.lower()
    if word == 'location1':
        return 'location - 1'
    if word == 'location2':
        return 'location - 2'
    if word.startswith("'"):
        # Detach a leading apostrophe from the rest of the token.
        return "' " + word[1:]
    return word


def _write_nli_m(path, records):
    """Write ``records`` to ``path`` as an NLI_M TSV file (header + one row each).

    Columns: id, sentence1 (the rewritten sentence), sentence2 (target and
    aspect written as a pseudo sentence), label (the sentiment).
    """
    with open(path, "w", encoding="utf-8") as f:
        f.write("id\tsentence1\tsentence2\tlabel\n")
        for sent_id, tokens, target, aspect_parts, sentiment in records:
            sentence = " ".join(_nli_m_token(tok) for tok in tokens)
            if target == 'LOCATION1':
                prefix = 'location - 1 - '
            elif target == 'LOCATION2':
                prefix = 'location - 2 - '
            else:
                prefix = ''
            # The only multi-part aspect in aspect2idx is 'transit-location'.
            aspect = aspect_parts[0] if len(aspect_parts) == 1 else "transit location"
            f.write(f"{sent_id}\t{sentence}\t{prefix}{aspect}\t{sentiment}\n")


_write_nli_m(dir_path+"train_NLI_M.tsv", train)
_write_nli_m(dir_path+"dev_NLI_M.tsv", val)
_write_nli_m(dir_path+"test_NLI_M.tsv", test)

len(train) =  15008
len(val) =  3748
len(test) =  7516


In [7]:
import os
from contextlib import ExitStack

# Split every NLI_M file into eight single-sentence files, one per
# (location, aspect) combination.
data_dir='./data/sentihood/'
aspect2idx = {
    'general': 0,
    'price': 1,
    'transit-location': 2,
    'safety': 3,
}

location_name = ['loc1', 'loc2']
aspect_name = ['general', 'price', 'safety', 'transit']
# Folder order: loc1_general, loc1_price, loc1_safety, loc1_transit, loc2_general, ...
dir_path = [data_dir + 'bert-single/' + i + '_' + j + '/' for i in location_name for j in aspect_name]
for path in dir_path:
    os.makedirs(path, exist_ok=True)


def _split_nli_m(split, loc1_rows):
    """Distribute the rows of ``<split>_NLI_M.tsv`` over the eight folders.

    Rows are addressed purely by position: the first ``loc1_rows`` data rows
    go to the loc1 folders and the rest to the loc2 folders, and within each
    group the rows cycle through general, price, safety, transit. Only the
    id, sentence1 and label columns are copied. Reading stops at the first
    blank line.

    Args:
        split: ``"train"``, ``"dev"`` or ``"test"``.
        loc1_rows: Number of leading data rows treated as LOCATION1 rows.

    Returns:
        The number of data rows that were written.
    """
    n_aspects = len(aspect_name)
    count = 0
    with ExitStack() as stack:
        outputs = [
            stack.enter_context(open(folder + split + ".tsv", "w", encoding="utf-8"))
            for folder in dir_path
        ]
        source = stack.enter_context(
            open(data_dir + "bert-pair/" + split + "_NLI_M.tsv", "r", encoding="utf-8"))
        source.readline()  # skip the header row
        for raw in source:
            s = raw.strip()
            if not s:
                break
            count += 1
            tmp = s.split("\t")
            line = tmp[0]+"\t"+tmp[1]+"\t"+tmp[3]+"\n"
            location_offset = 0 if count <= loc1_rows else n_aspects
            outputs[location_offset + (count - 1) % n_aspects].write(line)
    return count


# NOTE(review): these row counts are hard-coded; presumably they are the number
# of LOCATION1 rows in the NLI_M files generated from this dataset release —
# re-check them whenever the data or the sort order changes.
_LOC1_ROWS = {"train": 11908, "dev": 2988, "test": 5964}
for _split, _rows in _LOC1_ROWS.items():
    _split_nli_m(_split, _rows)

print("Finished!")

Finished!


## data processor

In [8]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
            
class InputExample:
    """One raw example for sequence (or sequence-pair) classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of the example unchanged.

        Args:
            guid: Unique id of the example.
            text_a: Untokenized text of the first sequence; the only text
                needed for single-sequence tasks.
            text_b: (Optional) untokenized text of the second sequence, used
                for sequence-pair tasks.
            label: (Optional) label string; expected for train and dev
                examples, may be omitted for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

class DataProcessor(object):
    """Base class for data converters for sequence classification data sets.

    Subclasses override the ``get_*`` methods; ``_read_tsv`` is a shared
    helper for loading tab-separated files.
    """

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the test set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file.

        Args:
            input_file: Path of the TSV file.
            quotechar: Quote character handed to ``csv.reader``; with the
                default ``None`` quote characters are read literally.

        Returns:
            A list of rows, each row being a list of strings.
        """
        # Imported here because the module-level imports do not provide `csv`.
        import csv

        # newline="" is what the csv module asks for when it is handed a file;
        # UTF-8 matches the encoding the TSV files are written with.
        with open(input_file, "r", encoding="utf-8", newline="") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            return list(reader)
        
        
class Sentihood_single_Processor(DataProcessor):
    """Processor for the single-sentence SentiHood files.

    Reads the header-less ``train.tsv`` / ``dev.tsv`` / ``test.tsv`` files of
    one folder; column 1 holds the sentence and column 2 the label.
    """

    def _examples_from_file(self, data_dir, set_type):
        """Load ``<set_type>.tsv`` from ``data_dir`` and turn it into examples."""
        tsv_path = os.path.join(data_dir, set_type + ".tsv")
        rows = pd.read_csv(tsv_path, header=None, sep="\t").values
        return self._create_examples(rows, set_type)

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._examples_from_file(data_dir, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._examples_from_file(data_dir, "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._examples_from_file(data_dir, "test")

    def get_labels(self):
        """See base class."""
        return ['None', 'Positive', 'Negative']

    def _create_examples(self, lines, set_type):
        """Build one single-sentence `InputExample` per row of ``lines``.

        The guid is ``"<set_type>-<row index>"``; ``text_b`` is always None.
        """
        return [
            InputExample(
                guid=f"{set_type}-{i}",
                text_a=tokenization.convert_to_unicode(str(row[1])),
                text_b=None,
                label=tokenization.convert_to_unicode(str(row[2])),
            )
            for i, row in enumerate(lines)
        ]
    
class Sentihood_NLI_M_Processor(DataProcessor):
    """Processor for the sentence-pair (NLI_M) SentiHood files.

    Reads ``train_NLI_M.tsv`` / ``dev_NLI_M.tsv`` / ``test_NLI_M.tsv``, which
    start with a header row; column 1 holds the sentence, column 2 the
    target/aspect pseudo sentence and column 3 the label.
    """

    def _examples_from_file(self, data_dir, set_type):
        """Load ``<set_type>_NLI_M.tsv`` from ``data_dir`` and build examples."""
        tsv_path = os.path.join(data_dir, set_type + "_NLI_M.tsv")
        rows = pd.read_csv(tsv_path, sep="\t").values
        return self._create_examples(rows, set_type)

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._examples_from_file(data_dir, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._examples_from_file(data_dir, "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._examples_from_file(data_dir, "test")

    def get_labels(self):
        """See base class."""
        return ['None', 'Positive', 'Negative']

    def _create_examples(self, lines, set_type):
        """Build one sentence-pair `InputExample` per row of ``lines``.

        The guid is ``"<set_type>-<row index>"``.
        """
        return [
            InputExample(
                guid=f"{set_type}-{i}",
                text_a=tokenization.convert_to_unicode(str(row[1])),
                text_b=tokenization.convert_to_unicode(str(row[2])),
                label=tokenization.convert_to_unicode(str(row[3])),
            )
            for i, row in enumerate(lines)
        ]


    
# Select the processor used by the following cells: the sentence-pair (NLI_M)
# variant. The single-sentence alternative is kept here for reference:
# processor = Sentihood_single_Processor()
processor = Sentihood_NLI_M_Processor()

In [9]:
def sentihood_macro_F1(y_true_, y_pred_):
    """Calculate the "Macro-F1" of the aspect detection task of Sentihood.

    The label sequences are read in consecutive groups of four entries (one
    per aspect). Within a group, an aspect counts as detected when its label
    is not '無'. Groups without any gold aspect are skipped. Precision and
    recall are averaged over the remaining groups and combined into F1.

    Args:
        y_true_: Gold labels, each one of '無', '正向', '負向'.
        y_pred_: Predicted labels, same vocabulary and ordering as ``y_true_``.

    Returns:
        The macro F1 as a float. 0.0 is returned when no group has a gold
        aspect or when averaged precision and recall are both zero (these
        cases used to raise ZeroDivisionError).

    Raises:
        KeyError: If a label is outside the three known values.
    """
    sent2idx = {'無':0, '正向':1, '負向':2}
    y_true = [sent2idx[i] for i in y_true_]
    y_pred = [sent2idx[i] for i in y_pred_]

    p_all = 0.0
    r_all = 0.0
    count = 0
    # A trailing partial group (fewer than four entries) is ignored.
    for i in range(len(y_pred)//4):
        predicted = {j for j in range(4) if y_pred[i*4+j] != 0}
        gold = {j for j in range(4) if y_true[i*4+j] != 0}
        if not gold:
            continue
        hits = predicted & gold
        if hits:
            p_all += len(hits)/len(predicted)
            r_all += len(hits)/len(gold)
        count += 1

    if count == 0:
        return 0.0
    Ma_p = p_all/count
    Ma_r = r_all/count
    if Ma_p + Ma_r == 0:
        return 0.0
    return 2*Ma_p*Ma_r/(Ma_p+Ma_r)

## create features

In [10]:
class InputFeatures:
    """Model-ready features of a single example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        """Keep the token ids, attention mask, segment ids and label id as given."""
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id
        
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Convert `InputExample`s into fixed-length `InputFeatures`.

    Every example becomes ``[CLS] text_a [SEP]`` (segment id 0), optionally
    followed by ``text_b [SEP]`` (segment id 1), truncated to fit and then
    zero-padded to ``max_seq_length``.

    Args:
        examples: Iterable of `InputExample`s.
        label_list: Label strings; a label's position is its id.
        max_seq_length: Length of every produced id / mask / segment list.
        tokenizer: Object offering ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.

    Returns:
        A list with one `InputFeatures` per example, in input order.

    Raises:
        KeyError: If an example's label is not in ``label_list``.
    """
    label_to_id = {label: idx for idx, label in enumerate(label_list)}

    features = []
    for example in tqdm(examples):
        first = tokenizer.tokenize(example.text_a)
        second = tokenizer.tokenize(example.text_b) if example.text_b else None

        if second:
            # Reserve three positions for [CLS], [SEP], [SEP]; both token
            # lists are shortened in place.
            _truncate_seq_pair(first, second, max_seq_length - 3)
        elif len(first) > max_seq_length - 2:
            # Reserve two positions for [CLS] and [SEP].
            first = first[0:(max_seq_length - 2)]

        # Segment ids tell the encoder which sequence a token belongs to:
        # 0 for [CLS], the first sequence and its [SEP]; 1 for the second
        # sequence and its [SEP].
        tokens = ["[CLS]", *first, "[SEP]"]
        segment_ids = [0] * len(tokens)
        if second:
            tokens.extend(second)
            tokens.append("[SEP]")
            segment_ids.extend([1] * (len(second) + 1))

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # 1 marks a real token, 0 marks padding.
        input_mask = [1] * len(input_ids)

        # Zero-pad all three lists up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids.extend(padding)
        input_mask.extend(padding)
        segment_ids.extend(padding)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                label_id=label_to_id[example.label]))
    return features

In [11]:
from tqdm import tqdm, trange
# Load the NLI_M examples of every split and convert them into BERT features
# of length 128.
processor = Sentihood_NLI_M_Processor()
dta_dir = './data/sentihood/bert-pair/'
label_list = processor.get_labels()
train_examples = processor.get_train_examples(dta_dir)
test_examples = processor.get_test_examples(dta_dir)
dev_examples = processor.get_dev_examples(dta_dir)
train_features, dev_features, test_features = [
    convert_examples_to_features(split_examples, label_list, 128, tokenizer)
    for split_examples in (train_examples, dev_examples, test_examples)
]
len(train_features), len(dev_features), len(test_features)

100%|██████████| 15008/15008 [00:07<00:00, 1984.44it/s]
100%|██████████| 3748/3748 [00:01<00:00, 2013.66it/s]
100%|██████████| 7516/7516 [00:04<00:00, 1839.63it/s]


(15008, 3748, 7516)

In [12]:
def _stack_bert_inputs(features):
    """Stack per-example feature lists into the three arrays fed to the model.

    The dictionary keys match the names of the Keras `Input` layers declared
    by the model builders further down.
    """
    return {
        'input_word_ids': np.array([f.input_ids for f in features]),
        'input_mask': np.array([f.input_mask for f in features]),
        'input_type_ids': np.array([f.segment_ids for f in features]),
    }


def _one_hot_labels(features):
    """One-hot encode the label ids of ``features``.

    The number of classes is pinned to ``len(label_list)`` so that every
    split gets the same number of columns, even if the highest label id does
    not occur in it.
    """
    label_ids = [f.label_id for f in features]
    return np.array(to_categorical(label_ids, num_classes=len(label_list)))


train_x = _stack_bert_inputs(train_features)
test_x = _stack_bert_inputs(test_features)
dev_x = _stack_bert_inputs(dev_features)

train_y = _one_hot_labels(train_features)
dev_y = _one_hot_labels(dev_features)
test_y = _one_hot_labels(test_features)

We evaluate our method on the SentiHood dataset, which consists of 5,215 sentences, 3,862 of which contain a single target,  
while the remainder contain multiple targets. Each sentence comes with a list of target-aspect pairs {t, a}, each annotated with a  
sentiment polarity y. Ultimately, given a sentence and a target t in that sentence, we need to:  
(1) detect the mention of an aspect a for the target t;  
(2) determine the positive or negative sentiment polarity y for the detected target-aspect pairs.  

 ## modeling

In [13]:
#sentence base, multi-task approach
def create_classify_model(data_size, batch_size = 16, epochs=10, category_len = 8, sentiment_len = 2):
    """Build and compile a BERT classifier with a small dense head.

    The trainable BERT encoder's pooled output feeds a 64-unit ReLU layer
    ('sentiment_pre') and a softmax layer ('sentiment').

    Args:
        data_size: Number of training examples; used to size the optimizer
            schedule.
        batch_size: Batch size assumed when sizing the optimizer schedule.
        epochs: Number of epochs assumed when sizing the optimizer schedule.
        category_len: Not used by this function.
        sentiment_len: Number of units of the softmax output layer.

    Returns:
        The compiled Keras model, trained with categorical cross-entropy.
    """
    import model.optimization as optimization

    seq_len = 128
    word_ids = Input(shape=(seq_len,), name='input_word_ids', dtype=tf.int32)
    mask = Input(shape=(seq_len,), name='input_mask', dtype=tf.int32)
    type_ids = Input(shape=(seq_len,), name='input_type_ids', dtype=tf.int32)

    encoder = hub.KerasLayer(BERT_src, trainable=True, output_key='pooled_output', name='bert_layer')
    pooled = encoder({'input_word_ids': word_ids, 'input_mask': mask, 'input_type_ids': type_ids})

    hidden = Dense(
        64, activation='relu', name='sentiment_pre',
        kernel_initializer=keras.initializers.glorot_normal(0),
        bias_initializer='zeros')(pooled)
    # Softmax makes the outputs sum to 1.
    probabilities = Dense(
        sentiment_len, activation='softmax', name='sentiment',
        kernel_initializer=keras.initializers.glorot_normal(0),
        bias_initializer='zeros')(hidden)

    classifier = Model(inputs=[word_ids, mask, type_ids], outputs=probabilities)

    # NOTE(review): the second and third arguments are presumably the total
    # number of training steps and the number of warm-up steps (10% of the
    # total) — confirm against model/optimization.py.
    schedule_steps = (data_size//batch_size)*epochs
    warmup_steps = int((epochs*data_size*0.1)//batch_size)
    optimizer = optimization.create_optimizer(5e-5, schedule_steps, warmup_steps, 0.0, 'adamw')

    classifier.compile(optimizer=optimizer,
                       loss={'sentiment': 'categorical_crossentropy'})
    return classifier

In [14]:
#sentence base, only sentence approach
#Gradient Reverse Layer
@tf.custom_gradient
def grad_reverse(x):
    """Return ``x`` unchanged while negating the gradient that flows back through it."""
    def _negate(upstream):
        # Backward pass: flip the sign of the incoming gradient.
        return -upstream

    return tf.identity(x), _negate

class GradReverse(tf.keras.layers.Layer):
    """Keras layer wrapping `grad_reverse`.

    The forward pass is the identity; during back-propagation the gradient is
    negated.
    """

    def __init__(self, **kwargs):
        """Create the layer.

        Args:
            **kwargs: Standard Keras layer arguments (for example ``name`` or
                ``dtype``), forwarded to the base class. Calling
                ``GradReverse()`` without arguments keeps working.
        """
        super().__init__(**kwargs)

    def call(self, x):
        """Apply the gradient-reversal operation to ``x``."""
        return grad_reverse(x)
    
def create_temp_model(data_size, batch_size = 16, epochs=10, domain_size = 1, sentiment_len = 2):
    """Build a BERT sentiment model, optionally with an adversarial domain head.

    The trainable BERT encoder's pooled output feeds a 64-unit ReLU layer
    ('sentiment_pre') and a softmax layer ('sentiment'). When
    ``domain_size > 1`` a second head ('dis_pre' -> 'discriminator') is
    attached behind a gradient-reversal layer and trained jointly with loss
    weights 1.0 (sentiment) and 0.3 (discriminator).

    Args:
        data_size: Number of training examples; used to size the optimizer
            schedule.
        batch_size: Batch size assumed when sizing the optimizer schedule.
        epochs: Number of epochs assumed when sizing the optimizer schedule.
        domain_size: Number of units of the discriminator output; the
            discriminator is only built when this is greater than 1.
        sentiment_len: Number of units of the sentiment output.

    Returns:
        A tuple ``(output_model, only_sentiment_model)``. With a
        discriminator, ``output_model`` is the compiled two-output model and
        ``only_sentiment_model`` an uncompiled model sharing the same layers
        that exposes only the sentiment output. Without a discriminator both
        entries are the same compiled sentiment model.
    """
    import model.optimization as optimization
    input1 = Input(shape=(128,), name='input_word_ids', dtype=tf.int32)
    input2 = Input(shape=(128,),name='input_mask', dtype=tf.int32)
    input3 = Input(shape=(128,),name='input_type_ids', dtype=tf.int32)
    bert_layer = hub.KerasLayer(BERT_src, trainable=True, output_key='pooled_output', name='bert_layer')
    output = bert_layer({'input_word_ids':input1, 'input_mask':input2, 'input_type_ids':input3})

    sentiment_output = Dense(64, activation='relu', name = 'sentiment_pre',
                             kernel_initializer=keras.initializers.he_normal(0), bias_initializer='zeros')(output)
    # Softmax makes the outputs sum to 1.
    sentiment_output = Dense(sentiment_len, activation='softmax', name = 'sentiment',
                             kernel_initializer=keras.initializers.he_normal(0), bias_initializer='zeros')(sentiment_output)

    adversarial = domain_size > 1
    if adversarial:
        # The discriminator sees the pooled output through the gradient
        # reversal layer, so its gradient reaches the encoder with flipped sign.
        temp_output = GradReverse()(output)
        dis_output = Dense(64, activation='relu', name = 'dis_pre',
                             kernel_initializer=keras.initializers.he_normal(0), bias_initializer='zeros')(temp_output)
        dis_output = Dense(domain_size, activation='softmax', name = 'discriminator',
                             kernel_initializer=keras.initializers.he_normal(0), bias_initializer='zeros')(dis_output)

    optimizer = optimization.create_optimizer(
    5e-5, (data_size//batch_size)*epochs, int((epochs*data_size*0.1)//batch_size), 0.0, 'adamw')

    if adversarial:
        output_model = Model(inputs = [input1, input2, input3], outputs = [sentiment_output, dis_output])
        output_model.compile(optimizer=optimizer,
                             loss={'sentiment':'categorical_crossentropy', 'discriminator':'categorical_crossentropy'},
                            loss_weights={'sentiment':1., 'discriminator':.3})
        only_sentiment_model = Model(inputs = [input1, input2, input3], outputs = sentiment_output)
    else:
        output_model = Model(inputs = [input1, input2, input3], outputs = sentiment_output)
        output_model.compile(optimizer=optimizer,
                             loss={'sentiment':'categorical_crossentropy'})
        # Without a discriminator the full model already is the sentiment-only
        # model; previously this name was left unbound and the return raised
        # UnboundLocalError.
        only_sentiment_model = output_model

    return output_model, only_sentiment_model

In [15]:
import model.optimization as optimization
from tensorflow.keras.models import load_model

# Drop a model left over from an earlier run of this cell; on the first run
# the name does not exist yet, which is fine.
try:
    del tmp_model
except NameError:
    pass

# These values only size the optimizer object handed to load_model below.
data_size=1600
batch_size=32
epochs=5
optimizer = optimization.create_optimizer(5e-5, (data_size//batch_size)*epochs, int((epochs*data_size*0.1)//batch_size), 0.0, 'adamw')
print('loading model...')
# NOTE(review): custom_objects maps the names to already-built instances
# (BERT_LAYER, optimizer) rather than to classes — confirm this is what the
# saved file needs for deserialization.
tmp_model = load_model('./Meta-ACS_weight_save/eng_rep_adv.h5', custom_objects={'KerasLayer':BERT_LAYER, 'AdamWeightDecay':optimizer})
print('done!')

loading model...




done!


In [19]:
# Train one yes/no model per training-set ratio and evaluate each on the test set.
# data_ratios = [.01, .05, .1, .2, .3, .4, .5, .8, 1]
from sklearn.metrics import f1_score, accuracy_score, classification_report

data_ratios = [.8, 1]
epochs = 20
# Class index -> label string: none / positive / negative.
idx2sent = ['無', '正向', '負向']
for meta in range(2):
    # meta == 0 (no weight transfer from tmp_model) is currently skipped.
    if meta == 0:
        continue
    for data_ratio in data_ratios:
        datasize = int(data_ratio * len(train_y))
        x, y = sample_data([train_x, train_y], datasize, random_=False)
        print('meta:', meta)
        print('train size:', datasize)
        # Release the previous model, if one is still bound, before building a new one.
        try:
            del model
        except NameError:
            pass
        callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
        model = create_classify_model(data_size=len(y), epochs=epochs, sentiment_len=3)
        if meta:
            # Start from the weights of the loaded meta-trained model.
            update_weights_forsame(model, tmp_model)
        model.fit(x, y, epochs=epochs, verbose=1, batch_size=32,
                  validation_data=(dev_x, dev_y), callbacks=[callback])
        sent_pred = model.predict(test_x)
        del model

        # evaluate
        sent_predict = [idx2sent[np.argmax(i)] for i in sent_pred]
        sent_ans = [idx2sent[np.argmax(i)] for i in test_y]

        # Strict accuracy over consecutive groups of 4 rows: a group counts as
        # correct only when all four predictions match (presumably the four
        # aspects of one sentence/target pair -- confirm against data loading).
        total_cases = len(sent_predict) / 4
        true_cases = 0
        for i in range(int(total_cases)):
            if sent_predict[i * 4:i * 4 + 4] == sent_ans[i * 4:i * 4 + 4]:
                true_cases += 1
        print('aspect acc.:', true_cases / total_cases)
        print('aspect macro-f1:', sentihood_macro_F1(sent_ans, sent_predict))

        # Sentiment metrics: keep only rows whose gold label is not '無' and
        # force a positive/negative decision from the two sentiment scores.
        sent_predict, sent_answer = [], []
        for gold, scores in zip(sent_ans, sent_pred):
            if gold != '無':
                sent_answer.append(gold)
                sent_predict.append('正向' if scores[1] >= scores[2] else '負向')
        print('sentiment acc.:', accuracy_score(sent_answer, sent_predict))
        print(classification_report(sent_answer, sent_predict))

meta: 1
train size: 12006
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
aspect acc.: 0.7264502394890899
aspect macro-f1: 0.8136581017590614
sentiment acc.: 0.8930921052631579
              precision    recall  f1-score   support

          正向       0.89      0.96      0.92       810
          負向       0.91      0.76      0.83       406

    accuracy                           0.89      1216
   macro avg       0.90      0.86      0.87      1216
weighted avg       0.89      0.89      0.89      1216

meta: 1
train size: 15008
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
aspect acc.: 0.7626397019691326
aspect macro-f1: 0.8302260527938087
sentiment acc.: 0.928453947368421
              precision    recall  f1-score   support

          正向       0.95      0.94      0.95       810
          負向       0.88      0.91      0.89       406

    accuracy                           0.93      1216
   macro avg       0.92      0.92      0.92      1216
weighted avg       0.93      0.93      0.93  

## Source data

In [19]:
class src_Processor(DataProcessor):
    """Processor for the source data files read through ``parse_src_json``.

    ``data_dir`` is used as a path *prefix*: 'train.json' or 'test.json' is
    appended to it directly, with no path separator added.
    """

    def get_train_examples(self, data_dir):
        """Return the examples loaded from ``data_dir + 'train.json'``."""
        records = parse_src_json(data_dir + 'train.json')
        return self._create_examples(records, "train")

    def get_test_examples(self, data_dir):
        """Return the examples loaded from ``data_dir + 'test.json'``."""
        records = parse_src_json(data_dir + 'test.json')
        return self._create_examples(records, "test")

    def get_labels(self):
        """Return the list of label names used for this data."""
        return ['Neutral', 'Positive', 'Negative']

    def _create_examples(self, lines, set_type):
        """Turn parsed records into ``InputExample`` objects.

        Every record is indexed by 'sentence', 'target' and 'sentiment'; the
        sentiment tag is translated to its English label name, and an unknown
        tag raises ``KeyError``. The guid is "<set_type>-<position>".
        """
        tag_to_label = {'中立':'Neutral', '正向':'Positive', '負向':'Negative'}
        to_unicode = tokenization.convert_to_unicode
        return [
            InputExample(
                guid="%s-%s" % (set_type, position),
                text_a=to_unicode(str(record['sentence'])),
                text_b=to_unicode(str(record['target'])),
                label=to_unicode(str(tag_to_label[record['sentiment']])),
            )
            for position, record in enumerate(lines)
        ]

def parse_src_json(in_file):
    """Read a JSON-lines file and return its records as a list.

    Args:
        in_file: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with the decoded object of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(in_file, 'r', encoding='utf8') as f:
        # Blank lines (typically a trailing newline at the end of the file)
        # hold no record; skip them instead of letting json.loads fail on them.
        return [json.loads(line) for line in f if line.strip()]
# Path prefix of the laptop data; src_Processor appends 'train.json' /
# 'test.json' directly to whatever prefix it is given.
in_file = './data/ABSA_Eng/laptop_' 
# Processor instance used to read the source data files.
processor = src_Processor()

In [20]:
def get_x_y(in_file, processor):
    """Load the train and test split of one data set as BERT-ready arrays.

    Args:
        in_file: Path prefix handed to ``processor.get_train_examples`` and
            ``processor.get_test_examples``.
        processor: Object providing ``get_train_examples``,
            ``get_test_examples`` and ``get_labels``.

    Returns:
        A ``(train, test)`` tuple. Each element is a dict with
        ``'x'``: a dict of arrays keyed 'input_word_ids', 'input_mask' and
        'input_type_ids', and ``'y'``: the one-hot encoded label ids.
    """
    train_data = processor.get_train_examples(in_file)
    test_data = processor.get_test_examples(in_file)
    print('train:{}, test:{}'.format(len(train_data), len(test_data)))

    label_list = processor.get_labels()

    def build_split(examples):
        # 128 is passed through unchanged; presumably the maximum sequence
        # length -- confirm against convert_examples_to_features.
        features = convert_examples_to_features(examples, label_list, 128, tokenizer)
        inputs = {
            'input_word_ids': np.array([np.array(f.input_ids) for f in features]),
            'input_mask': np.array([np.array(f.input_mask) for f in features]),
            'input_type_ids': np.array([np.array(f.segment_ids) for f in features]),
        }
        # Fix the one-hot width to the number of labels so train and test get
        # the same number of columns even when a split lacks the highest label.
        # Assumes label_id is an index into label_list -- TODO confirm.
        targets = np.array(to_categorical(
            [f.label_id for f in features], num_classes=len(label_list)))
        return {'x': inputs, 'y': targets}

    train = build_split(train_data)
    test = build_split(test_data)

    return train, test

# Load the train/test splits of the two source data sets (laptop and
# restaurant); each result is a dict with 'x' (model inputs) and 'y' (labels).
processor = src_Processor()
laptop_train, laptop_test = get_x_y('./data/ABSA_Eng/laptop_', processor )
restaurant_train, restaurant_test = get_x_y('./data/ABSA_Eng/restaurant_', processor )

  8%|▊         | 183/2328 [00:00<00:01, 1829.50it/s]

train:2328, test:638


100%|██████████| 2328/2328 [00:01<00:00, 1770.93it/s]
100%|██████████| 638/638 [00:00<00:00, 2100.40it/s]
  6%|▌         | 214/3608 [00:00<00:01, 2127.78it/s]

train:3608, test:1120


100%|██████████| 3608/3608 [00:01<00:00, 2025.84it/s]
100%|██████████| 1120/1120 [00:00<00:00, 1765.22it/s]


## Adv_reptile

In [17]:
import random
def sample_data(data_list, datasize, random_=True):
    """Pick ``datasize`` rows from a (features, labels) pair.

    Args:
        data_list: ``[bert_x, sentiment]`` where ``bert_x`` is a dict of
            indexable columns and ``sentiment`` is an indexable of labels.
        datasize: Number of rows to return.
        random_: When true, draw distinct row indices with ``random.sample``;
            otherwise take the first ``datasize`` rows.

    Returns:
        ``(bert_x, sentiment)`` restricted to the chosen rows: a dict of numpy
        arrays with the same keys as the input, and a numpy array of labels.
    """
    if random_:
        picked = random.sample(range(len(data_list[1])), datasize)
    else:
        picked = list(range(datasize))

    features = data_list[0]
    sampled_x = {}
    for key in features.keys():
        column = features[key]
        sampled_x[key] = np.array([column[row] for row in picked])

    labels = np.array(data_list[1])
    sampled_y = np.array([labels[row] for row in picked])
    return sampled_x, sampled_y
def _as_object_array(items):
    """Pack *items* into a 1-D ``dtype=object`` array, one slot per item."""
    packed = np.empty(len(items), dtype=object)
    for slot, item in enumerate(items):
        packed[slot] = item
    return packed


def model_get_weight(model, keyword='', not_=False):
    """Collect the weights of the layers selected by name prefix.

    Args:
        model: Object exposing ``layers``; each layer provides ``name`` and
            ``get_weights()``.
        keyword: Name prefix to select on. The default ``''`` matches every
            layer.
        not_: When true, select the layers whose name does NOT start with
            ``keyword`` instead.

    Returns:
        A 1-D object array with one entry per selected layer, in layer order.
        Each entry is itself a 1-D object array holding that layer's weight
        arrays, so element-wise arithmetic (``a + b``, ``a * 0.5``) between
        two results of the same model works layer by layer.
    """
    # Weight tensors of one layer (and of different layers) have different
    # shapes. np.array(...) on such ragged input is deprecated and raises on
    # recent NumPy, so the object arrays are built explicitly.
    selected = [
        _as_object_array(layer.get_weights())
        for layer in model.layers
        if layer.name.startswith(keyword) != bool(not_)
    ]
    return _as_object_array(selected)

def update_weights(model, update_weight, keyword='', not_=False):
    """Write ``update_weight`` back into the layers selected by name prefix.

    Layers are selected the same way for both modes: with ``not_`` false the
    layers whose name starts with ``keyword`` are used, with ``not_`` true the
    ones whose name does not. The n-th selected layer receives
    ``update_weight[n]`` through ``set_weights``.
    """
    invert = bool(not_)
    targets = [
        layer for layer in model.layers
        if layer.name.startswith(keyword) != invert
    ]
    for position, layer in enumerate(targets):
        layer.set_weights(update_weight[position])
def update_weights_forsame(model, model_src):
    """Copy weights from ``model_src`` into the same-named layers of ``model``.

    For every layer of ``model``, the layers of ``model_src`` are searched for
    one with the same name and the same number of weight arrays; the first one
    whose weights can be set is used. A failing ``set_weights`` prints
    'error!' and the search continues. Layers that end up without a copy are
    reported on stdout.

    Args:
        model: Destination model; its layers are modified in place.
        model_src: Source model; only read.
    """
    for layer in model.layers:
        copied = False
        for layer_src in model_src.layers:
            if layer.name != layer_src.name:
                continue
            # Fetch the source weights once and reuse them for the copy.
            src_weights = layer_src.get_weights()
            if len(layer.get_weights()) != len(src_weights):
                continue
            try:
                layer.set_weights(src_weights)
            except Exception:
                # Best effort: an incompatible layer must not abort the whole
                # transfer. KeyboardInterrupt/SystemExit still propagate.
                print('error!')
            else:
                copied = True
                break
        if not copied:
            print('model layer: "', layer.name, '" not in source model')
        

In [26]:
# Adversarial Reptile-style meta-training over the two source data sets.
# Each outer iteration: remember the current weights, train on one sampled
# batch per data set (restoring the remembered weights in between), then move
# the weights a fraction of the way toward the average of the per-data-set
# results.
outer_iteration = 30 # one iteration takes roughly 3.1 seconds (per domain)
inner_iteration = 20
epochs = inner_iteration
datasize_per_task = 32
meta_step_size = 0.1
domain_adver = 2
import time
starttime = time.time()
src_x = [laptop_train['x'], restaurant_train['x']]
src_y = [laptop_train['y'], restaurant_train['y']]
tmp_model, save_model = create_temp_model(data_size=len(laptop_train['y']), epochs=epochs, batch_size=datasize_per_task, domain_size = domain_adver, sentiment_len = 3)

print('start to train!')
for itr in range(outer_iteration):
    if itr%10==0:
        print('itr =',itr)
    # The meta step size decays linearly from meta_step_size toward 0.
    done_step = itr/outer_iteration
    cur_meta_step_size = (1-done_step)*meta_step_size
    # Snapshot of the weights every data set starts from in this iteration.
    origin_weights = model_get_weight(tmp_model)
    
    new_weights = []
    losses = []
    advloss = []
    adv_loss = 0
    # Index of the current data set; used as the domain label below.
    domain_num = 0
    
    for x, y in zip(src_x, src_y):     
        # One random batch per data set, reused for every inner step.
        tmp_train_x, tmp_sentiment = sample_data([x, y], datasize_per_task)  
        loss = []
        for i in range(inner_iteration):
            if domain_adver>1:
                # One-hot domain target: every sample gets the current domain index.
                domain_ans = to_categorical(len(tmp_sentiment)*[domain_num], num_classes=domain_adver)
                total_loss = tmp_model.train_on_batch(x=tmp_train_x, y=[tmp_sentiment, domain_ans])                
                # Entries 1 and 2 are recorded as sentiment loss and adversarial
                # loss -- presumably [total, sentiment, discriminator]; confirm
                # against the model's output order.
                loss = round(total_loss[1], 5) 
                adv_loss = round(total_loss[2], 5)
            else:
                loss = tmp_model.train_on_batch(x=tmp_train_x, y=tmp_sentiment)
                loss = round(loss, 5)
                
        # Keep the weights reached on this data set, then rewind the model so
        # the next data set starts from the same snapshot.
        new_weights.append(model_get_weight(tmp_model))
        update_weights(tmp_model, origin_weights)
        losses.append(loss)
        advloss.append(adv_loss) # only the loss of the last inner step is kept
        domain_num+=1
    # update weights: average the per-data-set weights, then interpolate
    # between the snapshot and that average by cur_meta_step_size.
    new_weights = np.array(new_weights)
    new_weight = new_weights[0]
    for i in range(len(new_weights)-1):
        new_weight+=new_weights[i+1]
    new_weight/=len(new_weights)    
    new_weight = origin_weights + ((new_weight-origin_weights)*cur_meta_step_size)
    print('lr', round(cur_meta_step_size, 5), '\tloss', losses, '\t adv loss', round(np.mean(advloss), 5), '\t spend', int(time.time()-starttime))
    update_weights(tmp_model, new_weight)
    # Drop the weight copies before the next iteration allocates new ones.
    del new_weight, origin_weights, new_weights
print('total spend {} seconds'.format(int(time.time()-starttime)))


start to train!
itr = 0




lr 0.1 	loss [1.04505, 0.69711] 	 adv loss 1.14007 	 spend 44
lr 0.09667 	loss [0.54643, 0.60665] 	 adv loss 1.8317 	 spend 64
lr 0.09333 	loss [0.19355, 0.28034] 	 adv loss 3.52664 	 spend 84
lr 0.09 	loss [0.28936, 0.38339] 	 adv loss 8.42871 	 spend 104
lr 0.08667 	loss [0.22859, 0.43503] 	 adv loss 8.10102 	 spend 125
lr 0.08333 	loss [0.34958, 0.41372] 	 adv loss 10.74382 	 spend 146
lr 0.08 	loss [0.4784, 0.53215] 	 adv loss 12.07064 	 spend 167
lr 0.07667 	loss [0.42843, 0.42905] 	 adv loss 12.06102 	 spend 189
lr 0.07333 	loss [0.52688, 0.41826] 	 adv loss 13.21854 	 spend 210
lr 0.07 	loss [0.4524, 0.52519] 	 adv loss 13.71357 	 spend 232
itr = 10
lr 0.06667 	loss [0.55659, 0.38213] 	 adv loss 14.01343 	 spend 254
lr 0.06333 	loss [0.56347, 0.35216] 	 adv loss 14.31878 	 spend 276
lr 0.06 	loss [0.5411, 0.56021] 	 adv loss 14.09672 	 spend 298
lr 0.05667 	loss [0.60183, 0.42578] 	 adv loss 13.82195 	 spend 321
lr 0.05333 	loss [0.63901, 0.38216] 	 adv loss 13.99214 	 spend 343