In [4]:
# encoding = utf8
'''
    @Author: King
    @Date: 2019.05.28
    @Purpose: Neural Relation Extraction with Selective Attention over Instances
    @Introduction:  Neural Relation Extraction with Selective Attention over Instances
    @Datasets: Chinese relation extraction datasets
    @Link : 论文研读笔记作业-
    @Reference : 
    @paper ： https://aclweb.org/anthology/P/P16/P16-1200.pdf
'''

'\n    @Author: King\n    @Date: 2019.05.28\n    @Purpose: Neural Relation Extraction with Selective Attention over Instances\n    @Introduction:  Neural Relation Extraction with Selective Attention over Instances\n    @Datasets: Chinese relation extration datasets\n    @Link : 论文研读笔记作业-\n    @Reference : \n    @paper ： https://aclweb.org/anthology/P/P16/P16-1200.pdf\n'

## Neural Relation Extraction with Selective Attention over Instances


Original paper [Neural Relation Extraction with Selective Attention over Instances](https://aclweb.org/anthology/P/P16/P16-1200.pdf) 

    In this paper, we propose a sentence-level attention-based convolutional neural network (CNN) for distantly supervised relation extraction. 

    As illustrated in Fig. 1, we employ a CNN to embed the semantics of sentences. Afterward, to utilize all informative sentences, we represent the relationship as the semantic composition of sentence embeddings. 

    To address the wrong labeling problem, we build sentence-level attention over multiple instances, which is expected to dynamically reduce the weights of those noisy instances. Finally, we extract relation with the relation vector weighted by sentence-level attention. (为了解决错误的标签问题，我们在多个实例上建立了句子级别的注意力，这有望动态地减少那些噪声实例的权重。 最后，我们提取与句子级注意加权的关系向量的关系。)

    We evaluate our model on a real-world dataset in the task of relation extraction. The experimental results show that our model achieves significant and consistent improvements in relation extraction as compared with the state-of-the-art methods.

<p align="center">
	<img width="500" height="300" src="img/sentence_level_attention_based_CNN.png">
</p>

### Requirements

* Python (>=3.5)

* TensorFlow (>=r1.0)

* scikit-learn (>=0.18)

## 1、导入包

In [5]:
import numpy as np
import tensorflow as tf
import random
import os,sys
import datetime
from collections import Counter

def set_seed(seed=2019):
    """Seed every random number generator used by this notebook.

    Args:
        seed: Integer passed to NumPy, Python's ``random`` module and
            TensorFlow's graph-level seed. Defaults to 2019, the value
            that used to be hard-coded, so ``set_seed()`` behaves as before.
    """
    # NOTE: Python reads PYTHONHASHSEED when the interpreter starts, so this
    # assignment cannot change string hashing in the already-running kernel;
    # it only reaches processes spawned afterwards. Export the variable
    # before launching Jupyter if deterministic hashing is required.
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(seed)
    random.seed(seed)
    # Applies to the graph that is the default at call time, which is why the
    # driver cell calls set_seed() again inside its own `tf.Graph()` context.
    tf.set_random_seed(seed)


set_seed()

  from ._conv import register_converters as _register_converters


## 2、编写 Settings Class

In [6]:
class Settings(object):
    """Configuration holder for data locations and ACNN hyper-parameters.

    Every option is a plain instance attribute initialised to the notebook's
    default. Any default can be replaced at construction time with a keyword
    argument of the same name, e.g. ``Settings(mode='test', batch_size=32)``.
    """

    def __init__(self, **overrides):
        """Populate the defaults, then apply the keyword overrides.

        Args:
            **overrides: Optional ``name=value`` pairs; each name must match
                one of the attributes defined below.

        Raises:
            TypeError: If a keyword does not name an existing setting
                (guards against silently ignored typos).
        """
        # ---- Data loading ----
        # Directory that the file names below are resolved against.
        # NOTE(review): machine-specific absolute path; override it via
        # Settings(data_path=...) on other machines.
        self.data_path = "E:/pythonWp/game/CCKS2019/RelationshipExtraction/open_data/"
        self.relation2id_path = "relation2id.txt"

        self.sent_train_path = "sent_train.txt"
        self.sent_relation_train_path = "sent_relation_train.txt"

        self.sent_dev_path = "sent_dev.txt"
        self.sent_relation_dev_path = "sent_relation_dev.txt"

        self.sent_test_path = "sent_test.txt"
        self.sent_relation_test_path = "sent_relation_test.txt"

        # Prediction file written by ACNN.run_test.
        self.result_sent_file = "result_sent.txt"

        # ---- Model hyper-parameters ----
        # 1. Run parameters
        self.cuda = '0'                   # gpu id
        self.batch_size = 50              # batch size
        self.epochs = 200                 # max train epochs
        self.model_path = 'model'         # save model dir
        self.level = 'sent'               # bag level or sentence level, option: bag/sent
        self.mode = 'train'               # train or test

        # 2. Embeddings
        self.embed_path = "E:/pythonWp/game/CCKS2019/RelationshipExtraction/origin_data/vec.txt"
        self.pre_embed = True             # load pre-trained word2vec
        self.word_dim = 200               # dimension of word embedding
        self.pos_dim = 5                  # dimension of position embedding

        # 3. Training parameters
        self.hidden_dim = 100             # dimension of hidden embedding
        self.dropout = 0.5                # fed to the keep_prob placeholder during training
        self.lr = 0.001                   # learning rate handed to the Adam optimizer
        self.word_frequency = 5           # minimum word frequency when constructing vocabulary list
        self.pos_limit = 15               # max distance of position embedding
        self.sen_len = 60                 # sentence length
        self.window = 3                   # window size

        # Apply caller overrides last so they win over the defaults above.
        for name, value in overrides.items():
            if not hasattr(self, name):
                raise TypeError('Unknown setting: %r' % (name,))
            setattr(self, name, value)
        
        

In [11]:
class ACNN:
    def __init__(self, settins):
        """Copy the configuration, load the vocabularies and build the graph.

        Args:
            settins: Configuration object (such as ``Settings``) exposing
                every attribute named in ``copied_fields`` below.
        """
        copied_fields = (
            'lr', 'sen_len', 'pre_embed', 'pos_limit', 'pos_dim', 'window',
            'word_dim', 'hidden_dim', 'batch_size', 'data_path', 'model_path',
            'mode', 'epochs', 'dropout', 'word_frequency',
            'relation2id_path', 'embed_path',
            'sent_train_path', 'sent_relation_train_path',
            'sent_dev_path', 'sent_relation_dev_path',
            'sent_test_path', 'sent_relation_test_path',
            'result_sent_file',
        )
        for field in copied_fields:
            setattr(self, field, getattr(settins, field))

        # pos_index() maps offsets onto ids 0 .. 2 * pos_limit + 2, so the
        # position tables need 2 * pos_limit + 3 rows.
        self.pos_num = 2 * self.pos_limit + 3
        self.relation2id = self.load_relation()
        self.num_classes = len(self.relation2id)

        if not self.pre_embed:
            # No pre-trained vectors: vocabulary from the training corpus and
            # a trainable embedding table.
            self.wordMap = self.load_wordMap()
            self.word_embedding = tf.get_variable(
                name='word_embedding',
                shape=[len(self.wordMap), self.word_dim],
                trainable=True)
        else:
            # Pre-trained vectors are loaded and kept frozen.
            self.wordMap, pretrained = self.load_wordVec()
            self.word_embedding = tf.get_variable(
                name='word_embedding', initializer=pretrained, trainable=False)

        position_shape = [self.pos_num, self.pos_dim]
        self.pos_e1_embedding = tf.get_variable(name='pos_e1_embedding', shape=position_shape)
        self.pos_e2_embedding = tf.get_variable(name='pos_e2_embedding', shape=position_shape)

        # Weights and bias of the final linear layer over sentence vectors.
        self.relation_embedding = tf.get_variable(
            name='relation_embedding', shape=[self.hidden_dim, self.num_classes])
        self.relation_embedding_b = tf.get_variable(
            name='relation_embedding_b', shape=[self.num_classes])

        self.sentence_reps = self.CNN_encoder()

        # Defines self.probability and self.classifier_loss.
        self.sentence_level()
        optimizer = tf.train.AdamOptimizer(self.lr)
        self._classifier_train_op = optimizer.minimize(self.classifier_loss)

    def pos_index(self, x):
        if x < -self.pos_limit:
            return 0
        if x >= -self.pos_limit and x <= self.pos_limit:
            return x + self.pos_limit + 1
        if x > self.pos_limit:
            return 2 * self.pos_limit + 2

    # 加载词向量及词典
    def load_wordVec(self):
        wordMap = {}
        wordMap['PAD'] = len(wordMap)
        wordMap['UNK'] = len(wordMap)
        word_embed = []
        for line in open(os.path.join(self.data_path, self.embed_path),'r',encoding='utf8'):
            content = line.strip().split()
            if len(content) != self.word_dim + 1:
                continue
            wordMap[content[0]] = len(wordMap)
            word_embed.append(np.asarray(content[1:], dtype=np.float32))

        #print(word_embed)
        word_embed = np.stack(word_embed)
        embed_mean, embed_std = word_embed.mean(), word_embed.std()

        pad_embed = np.random.normal(embed_mean, embed_std, (2, self.word_dim))
        word_embed = np.concatenate((pad_embed, word_embed), axis=0)
        word_embed = word_embed.astype(np.float32)
        return wordMap, word_embed

    # 加载词典，在没用预训练词向量的时候，需要利用该方法加载词典
    def load_wordMap(self):
        wordMap = {}
        wordMap['PAD'] = len(wordMap)
        wordMap['UNK'] = len(wordMap)
        all_content = []
        for line in open(os.path.join(self.data_path, self.sent_train_path),encoding='utf-8'):
            all_content += line.strip().split('\t')[3].split()
        for item in Counter(all_content).most_common():
            if item[1] > self.word_frequency:
                wordMap[item[0]] = len(wordMap)
            else:
                break
        return wordMap
    
    # 加载关系
    def load_relation(self):
        relation2id = {}
        for line in open(os.path.join(self.data_path, self.relation2id_path),encoding='utf8'):
            relation, id_ = line.strip().split()
            relation2id[relation] = int(id_)
        return relation2id

    # 加载句子文件，并计算位置嵌入向量
    def load_sent(self, filename):
        sentence_dict = {}
        nums = 0
        with open(os.path.join(self.data_path, filename), 'r',encoding='utf8') as fr:
            for line in fr:
                id_, en1, en2, sentence = line.strip().split('\t')
                sentence = sentence.split()
                en1_pos = 0
                en2_pos = 0
                for i in range(len(sentence)):
                    if sentence[i] == en1:
                        en1_pos = i
                    if sentence[i] == en2:
                        en2_pos = i
                words = []
                pos1 = []
                pos2 = []

                length = min(self.sen_len, len(sentence))

                for i in range(length):
                    words.append(self.wordMap.get(sentence[i], self.wordMap['UNK']))
                    pos1.append(self.pos_index(i - en1_pos))
                    pos2.append(self.pos_index(i - en2_pos))

                if length < self.sen_len:
                    for i in range(length, self.sen_len):
                        words.append(self.wordMap['PAD'])
                        pos1.append(self.pos_index(i - en1_pos))
                        pos2.append(self.pos_index(i - en2_pos))
                sentence_dict[id_] = np.reshape(np.asarray([words, pos1, pos2], dtype=np.int32), (1, 3, self.sen_len))
                
        return sentence_dict

    def data_batcher(self, sentence_dict, filename, padding=False, shuffle=True):
        all_sent_ids = []
        all_sents = []
        all_labels = []
        with open(os.path.join(self.data_path, filename), 'r',encoding='utf-8') as fr:
            for line in fr:
                rel = [0] * self.num_classes
                try:
                    sent_id, types = line.strip().split('\t')
                    type_list = types.split()
                    for tp in type_list:
                        if len(type_list) > 1 and tp == '0': # if a sentence has multiple relations, we only consider non-NA relations
                            continue
                        rel[int(tp)] = 1
                except:
                    sent_id = line.strip()

                all_sent_ids.append(sent_id)
                all_sents.append(sentence_dict[sent_id])

                all_labels.append(np.reshape(np.asarray(rel, dtype=np.float32), (-1, self.num_classes)))

        self.data_size = len(all_sent_ids)
        self.datas = all_sent_ids

        all_sents = np.concatenate(all_sents, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)
        data_order = list(range(self.data_size))
        if shuffle:
            np.random.shuffle(data_order)
        if padding:
            if self.data_size % self.batch_size != 0:
                data_order += [data_order[-1]] * (self.batch_size - self.data_size % self.batch_size)     

        for i in range(len(data_order) // self.batch_size):
            idx = data_order[i * self.batch_size:(i + 1) * self.batch_size]
            yield all_sents[idx], all_labels[idx], None
            
    # ACNN 模型
    def CNN_encoder(self):
        """Create the input placeholders and the convolutional sentence encoder.

        Word and position embeddings are concatenated per token, convolved
        with a window of ``self.window`` tokens, max-pooled over every window
        position, passed through tanh and finally through dropout.

        Returns:
            Tensor reshaped to ``[-1, hidden_dim]``: one vector per sentence.
        """
        self.keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')
        token_shape = [None, self.sen_len]
        self.input_word = tf.placeholder(dtype=tf.int32, shape=token_shape, name='input_word')
        self.input_pos_e1 = tf.placeholder(dtype=tf.int32, shape=token_shape, name='input_pos_e1')
        self.input_pos_e2 = tf.placeholder(dtype=tf.int32, shape=token_shape, name='input_pos_e2')
        self.input_label = tf.placeholder(
            dtype=tf.float32, shape=[None, self.num_classes], name='input_label')

        # Per-token features: word vector plus one position vector per entity.
        word_feats = tf.nn.embedding_lookup(self.word_embedding, self.input_word)
        e1_feats = tf.nn.embedding_lookup(self.pos_e1_embedding, self.input_pos_e1)
        e2_feats = tf.nn.embedding_lookup(self.pos_e2_embedding, self.input_pos_e2)
        features = tf.concat([word_feats, e1_feats, e2_feats], axis=2)
        # conv2d expects a trailing channel axis.
        features = tf.expand_dims(features, -1)

        feature_dim = self.word_dim + 2 * self.pos_dim
        # Number of window positions of a stride-1 VALID convolution; pooling
        # over all of them leaves a single value per filter.
        pool_height = self.sen_len - self.window + 1

        with tf.name_scope('conv-maxpool'):
            # NOTE(review): per the TF1 docs tf.get_variable ignores
            # tf.name_scope, so 'w' and 'b' are presumably named by the
            # enclosing variable scope rather than by 'conv-maxpool'.
            kernel = tf.get_variable(
                name='w', shape=[self.window, feature_dim, 1, self.hidden_dim])
            bias = tf.get_variable(name='b', shape=[self.hidden_dim])
            conv_out = tf.nn.conv2d(
                features, kernel, strides=[1, 1, 1, 1], padding='VALID', name='conv')
            biased = tf.nn.bias_add(conv_out, bias)
            pooled = tf.nn.max_pool(
                biased,
                ksize=[1, pool_height, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name='pool')

        flat = tf.reshape(pooled, [-1, self.hidden_dim])
        activated = tf.tanh(flat)
        return tf.nn.dropout(activated, self.keep_prob)

    def sentence_level(self):
        """Attach the softmax classifier and its loss to the sentence vectors.

        Sets ``self.probability`` (softmax over relations) and
        ``self.classifier_loss`` (batch mean of the label-weighted negative
        log-probabilities).
        """
        logits = tf.matmul(self.sentence_reps, self.relation_embedding) + self.relation_embedding_b
        self.probability = tf.nn.softmax(logits, 1)

        # Clip before the log so a zero probability cannot produce -inf.
        safe_prob = tf.clip_by_value(self.probability, 1.0e-10, 1.0)
        neg_log_prob = -tf.log(safe_prob)
        per_example = tf.reduce_sum(neg_log_prob * self.input_label, 1)
        self.classifier_loss = tf.reduce_mean(per_example)

    def run_train(self, sess, batch):

        sent_batch, label_batch, sen_num_batch = batch

        feed_dict = {}
        feed_dict[self.keep_prob] = self.dropout
        feed_dict[self.input_word] = sent_batch[:, 0, :]
        feed_dict[self.input_pos_e1] = sent_batch[:, 1, :]
        feed_dict[self.input_pos_e2] = sent_batch[:, 2, :]
        feed_dict[self.input_label] = label_batch
        _, classifier_loss = sess.run([self._classifier_train_op, self.classifier_loss], feed_dict)

        return classifier_loss

    def run_dev(self, sess, dev_batchers):
        all_labels = []
        all_probs = []
        for batch in dev_batchers:
            sent_batch, label_batch, sen_num_batch = batch
            all_labels.append(label_batch)

            feed_dict = {}
            feed_dict[self.keep_prob] = 1.0
            feed_dict[self.input_word] = sent_batch[:, 0, :]
            feed_dict[self.input_pos_e1] = sent_batch[:, 1, :]
            feed_dict[self.input_pos_e2] = sent_batch[:, 2, :]
            prob = sess.run([self.probability], feed_dict)
            all_probs.append(np.reshape(prob, (-1, self.num_classes)))

        all_labels = np.concatenate(all_labels, axis=0)[:self.data_size]
        all_probs = np.concatenate(all_probs, axis=0)[:self.data_size]
 
        all_preds = np.eye(self.num_classes)[np.reshape(np.argmax(all_probs, 1), (-1))]

        return all_preds, all_labels

    def run_test(self, sess, test_batchers):
        all_probs = []
        for batch in test_batchers:
            sent_batch, _, sen_num_batch = batch

            feed_dict = {}
            feed_dict[self.keep_prob] = 1.0
            feed_dict[self.input_word] = sent_batch[:, 0, :]
            feed_dict[self.input_pos_e1] = sent_batch[:, 1, :]
            feed_dict[self.input_pos_e2] = sent_batch[:, 2, :]
            all_probs.append(np.reshape(prob, (-1, self.num_classes)))

        all_probs = np.concatenate(all_probs,axis=0)[:self.data_size]
        all_preds = np.eye(self.num_classes)[np.reshape(np.argmax(all_probs, 1), (-1))]
       
        with open(self.result_sent_file , 'w',encoding='utf-8') as fw:
            for i in range(self.data_size):
                rel_one_hot = [int(num) for num in all_preds[i].tolist()]
                rel_list = []
                for j in range(0, self.num_classes):
                    if rel_one_hot[j] == 1:
                        rel_list.append(str(j))
                fw.write(self.datas[i] + '\t' + ' '.join(rel_list) + '\n')

    def run_model(self, sess, saver):
        if self.mode == 'train':
            global_step = 0
            sent_train = self.load_sent(self.sent_train_path)
            sent_dev = self.load_sent(self.sent_dev_path)
            max_f1 = 0.0

            if not os.path.isdir(self.model_path):
                os.mkdir(self.model_path)

            for epoch in range(self.epochs):
                train_batchers = self.data_batcher(sent_train, self.sent_relation_train_path, padding=False, shuffle=True)
                for batch in train_batchers:
                    losses = self.run_train(sess, batch)
                    global_step += 1
                    if global_step % 50 == 0:
                        time_str = datetime.datetime.now().isoformat()
                        tempstr = "{}: step {}, classifier_loss {:g}".format(time_str, global_step, losses)
                        print(tempstr)
                    if global_step % 200 == 0:
                        dev_batchers = self.data_batcher(sent_dev, self.sent_relation_dev_path, padding=True, shuffle=False)
                        all_preds, all_labels = self.run_dev(sess, dev_batchers)

                        # when calculate f1 score, we don't consider whether NA results are predicted or not
                        # the number of non-NA answers in test is counted as n_std
                        # the number of non-NA answers in predicted answers is counted as n_sys
                        # intersection of two answers is counted as n_r
                        n_r = int(np.sum(all_preds[:, 1:] * all_labels[:, 1:]))
                        n_std = int(np.sum(all_labels[:,1:]))
                        n_sys = int(np.sum(all_preds[:,1:]))
                        try:
                            precision = n_r / n_sys
                            recall = n_r / n_std
                            f1 = 2 * precision * recall / (precision + recall)
                        except ZeroDivisionError:
                            f1 = 0.0

                        if f1 > max_f1:
                            max_f1 = f1
                            print('f1: %f' % f1)
                            print('saving model')
                            path = saver.save(sess, os.path.join(self.model_path, 'ipre_bag_%d' % (self.bag)), global_step=0)
                            tempstr = 'have saved model to ' + path
                            print(tempstr)

        else:
            path = os.path.join(self.model_path, 'ipre_bag_%d' % self.bag) + '-0'
            tempstr = 'load model: ' + path
            print(tempstr)
            try:
                saver.restore(sess, path)
            except:
                raise ValueError('Unvalid model name')

            sent_test = self.load_sent(self.sent_test_path)
            test_batchers = self.data_batcher(sent_test, self.sent_relation_test_path, padding=True, shuffle=False)

            self.run_test(sess, test_batchers)



In [None]:
# Drop the nodes that pile up in TensorFlow each time this cell is re-run and
# reset the whole default graph.
tf.reset_default_graph()
print('build model')
settings = Settings()

# Select the GPU.
gpu_options = tf.GPUOptions(visible_device_list=settings.cuda, allow_growth=True)

# intra_op_parallelism_threads and inter_op_parallelism_threads control how
# many threads ops may use in parallel:
#   * intra_op_parallelism_threads: parallelism inside a single op. When one
#     op can be parallelised internally (matrix multiplication, reduce_sum,
#     ...), this sets how many threads it may use.
#   * inter_op_parallelism_threads: parallelism across ops. Independent ops
#     with no path between them are run concurrently by TensorFlow from a
#     thread pool whose size is this parameter.
# Reference: https://blog.csdn.net/s_sunnyy/article/details/71422264
session_config = tf.ConfigProto(
    gpu_options=gpu_options,
    allow_soft_placement=True,
    intra_op_parallelism_threads=1,
    inter_op_parallelism_threads=1,
)

with tf.Graph().as_default():
    # Seed again so the graph-level seed applies to this fresh graph.
    set_seed()
    sess = tf.Session(config=session_config)
    with sess.as_default():
        # Xavier initialisation for every variable created without an
        # explicit initializer.
        initializer = tf.contrib.layers.xavier_initializer()
        with tf.variable_scope('', initializer=initializer):
            model = ACNN(settings)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=None)
        model.run_model(sess, saver)

build model
