## Distant Supervision for Relation Extraction via Piecewise Convolutional Neural Networks EMNLP 2015

# 1前言

### 1.1课程回顾

### 1.2 模型结构

In [47]:
import os
import models
import dataset
import torch
import numpy as np
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from utils import save_pr, now, eval_metric

### 1.3 代码结构

# 2 准备工作
### 2.1项目环境配置

* Python3.8
* jupyter notebook
* torch            1.6.0+cu10.2
* numpy            1.18.5

代码运行环境建议使用Visual Studio Code(VScode)

### 2.2 数据集下载

# 3 项目代码结构（VScode中演示）

>1）是什么？

　　我们首先会在VScode环境中让代码跑一下，直观感受到项目的训练，并展示前向推断的输出，让大家看到模型的效果。
>2）怎么构成的？

　　然后介绍项目代码的构成，介绍项目有哪些文件夹，包含哪些文件，这些文件构成了什么功能模块如：数据预处理模块，模型设计模块，损失函数模块，推断与评估模块。
>3）小结

　　在主文件中再过一下启动训练的流程。

# 4 算法模块及细节（jupyter和VScode中演示）

　　在jupyter notebook中细致地讲解每一个模块。
  
　　以实现模块功能为目的，来讲解每个函数的执行流程，呈现中间数据，方便同学们理解学习。
  
　　内容分为以下几个模块：**超参数设置，数据读取与处理，模型定义，模型训练，模型评价**。

### 4.1 超参数设置

In [2]:
# Per-dataset resources, keyed by dataset name. Each entry holds the dataset
# root, the three .npy embedding files stored under that root, and the
# vocabulary / relation counts for that dataset.
data_dic = {
    name: {
        'data_root': './dataset/' + name + '/',
        'w2v_path': './dataset/' + name + '/w2v.npy',
        'p1_2v_path': './dataset/' + name + '/p1_2v.npy',
        'p2_2v_path': './dataset/' + name + '/p2_2v.npy',
        'vocab_size': vocab_size,
        'rel_num': rel_num,
    }
    for name, vocab_size, rel_num in (
        ('NYT', 114043, 53),
        ('FilterNYT', 160695 + 2, 27),
    )
}

In [3]:
class DefaultConfig(object):
    '''
    Default hyperparameters, stored as class attributes.

    The dataset-dependent values (paths, vocab_size, rel_num) are read from
    data_dic using the dataset selected by ``data``.
    '''

    model = 'PCNN_ONE'  # the name of used model, in  <models/__init__.py>
    data = 'FilterNYT'  # key into data_dic; the entries defined there are 'NYT' and 'FilterNYT'

    result_dir = './out'
    data_root = data_dic[data]['data_root']  # the data dir
    w2v_path = data_dic[data]['w2v_path']  # .npy file with the word embedding matrix
    p1_2v_path = data_dic[data]['p1_2v_path']  # .npy file with the first position embedding matrix
    p2_2v_path = data_dic[data]['p2_2v_path']  # .npy file with the second position embedding matrix
    load_model_path = 'checkpoints/model.pth'  # the trained model

    seed = 3435
    batch_size = 128  # batch size
    use_gpu = True  # use GPU or not
    gpu_id = 1  # device index handed to torch.cuda.set_device when use_gpu is set
    num_workers = 0  # how many workers for loading data

    max_len = 80 + 2  # max_len for each sentence + two padding
    limit = 50  # the position range <-limit, limit>

    vocab_size = data_dic[data]['vocab_size']  # vocab + UNK + BLANK
    rel_num = data_dic[data]['rel_num']  # number of relation classes (output size of the classifier)
    word_dim = 50  # word embedding dimension
    pos_dim = 5  # dimension of each position embedding
    pos_size = limit * 2 + 2  # number of rows in each position embedding table

    norm_emb=True  # L2-normalise every row of the loaded embedding matrices

    num_epochs = 16  # the number of epochs for training
    drop_out = 0.5  # dropout probability
    lr = 0.0003  # initial learning rate
    lr_decay = 0.95  # when val_loss increase, lr = lr*lr_decay
    weight_decay = 0.0001  # optimizer parameter

    # Conv
    filters = [3]  # convolution kernel heights, one Conv2d per entry
    filters_num = 230  # output channels of each convolution
    sen_feature_dim = filters_num  # NOTE(review): not read by the code in this notebook section

    rel_dim = filters_num * len(filters)  # NOTE(review): not read by the code in this notebook section
    rel_filters_num = 100  # NOTE(review): not read by the code in this notebook section

    print_opt = 'DEF'  # NOTE(review): purpose not visible in this notebook section
    use_pcnn=True  # piecewise max pooling when True, plain max pooling otherwise


In [4]:
def parse(self, kwargs):
    '''
    Apply user overrides to the default hyperparameters and print the result.

    Dataset-dependent options (paths, vocabulary size, relation count) are
    refreshed from ``data_dic`` for the selected ``data``, except for the
    ones the caller passed explicitly in ``kwargs``.

    Args:
        kwargs: dict mapping option name -> new value. Every key must
            already exist as an attribute of the config.

    Raises:
        AttributeError: if ``kwargs`` names an option the config lacks.
        KeyError: if the selected dataset has no entry in ``data_dic``.
    '''
    # Validate everything up front so that a bad option cannot leave the
    # config half-updated.
    for k in kwargs:
        if not hasattr(self, k):
            raise AttributeError('opt has No key: {}'.format(k))
    data_name = kwargs.get('data', self.data)
    if data_name not in data_dic:
        raise KeyError('unknown dataset {!r}; expected one of {}'.format(
            data_name, sorted(data_dic)))

    for k, v in kwargs.items():
        setattr(self, k, v)

    # Re-derive dataset-dependent options, but never clobber a value the
    # caller supplied on purpose.
    data_list = ['data_root', 'w2v_path', 'rel_num', 'vocab_size', 'p1_2v_path', 'p2_2v_path']
    for r in data_list:
        if r not in kwargs:
            setattr(self, r, data_dic[self.data][r])

    print('*************************************************')
    print('user config:')
    for k in self.__class__.__dict__:
        if k.startswith('__'):
            continue
        v = getattr(self, k)
        # Methods attached to the class (such as parse itself) are not options.
        if callable(v):
            continue
        print("{} => {}".format(k, v))

    print('*************************************************')

In [8]:
DefaultConfig.parse = parse

In [10]:
opt = DefaultConfig()

In [16]:
k = {}

In [17]:
opt.parse(k)

*************************************************
user config:
model => PCNN_ONE
data => FilterNYT
result_dir => ./out
data_root => ./dataset/FilterNYT/
w2v_path => ./dataset/FilterNYT/w2v.npy
p1_2v_path => ./dataset/FilterNYT/p1_2v.npy
p2_2v_path => ./dataset/FilterNYT/p2_2v.npy
load_model_path => checkpoints/model.pth
seed => 3435
batch_size => 128
use_gpu => True
gpu_id => 1
num_workers => 0
max_len => 82
limit => 50
vocab_size => 160697
rel_num => 27
word_dim => 50
pos_dim => 5
pos_size => 102
norm_emb => True
num_epochs => 16
drop_out => 0.5
lr => 0.0003
lr_decay => 0.95
weight_decay => 0.0001
filters => [3]
filters_num => 230
sen_feature_dim => 230
rel_dim => 230
rel_filters_num => 100
print_opt => DEF
use_pcnn => True
parse => <bound method parse of <__main__.DefaultConfig object at 0x7f55ccf07340>>
*************************************************


### 4.2 数据读取与处理
* 数据处理细节
* 构建dataset类

#### 4.2.1数据处理细节

##### 载入原始数据集并且预处理

In [56]:
# Locations of the raw text files under the selected dataset root.
# vector.txt is parsed further down as whitespace-separated floats, one vector per line.
w2v_path = os.path.join(opt.data_root, 'vector.txt')
# dict.txt is read further down as one vocabulary token per line.
word_path = os.path.join(opt.data_root, 'dict.txt')
# Train / test text files; they are not opened in this section of the notebook.
train_path = os.path.join(opt.data_root, 'train', 'train.txt')
test_path = os.path.join(opt.data_root, 'test', 'test.txt')

载入word2vector

In [57]:
wordlist = []
vecs = []

In [58]:
wordlist.append('BLANK')

In [59]:
# Read the vocabulary, one token per line. The context manager closes the
# file handle instead of leaving that to the garbage collector.
with open(word_path) as word_file:
    wlist = [word.strip('\n') for word in word_file]

In [60]:
wlist

[',',
 'the',
 '.',
 'of',
 'to',
 'a',
 'and',
 "''",
 'in',
 'that',
 "'s",
 'for',
 'is',
 'The',
 'said',
 'on',
 'was',
 'with',
 'at',
 'he',
 'Mr.',
 'it',
 'as',
 'by',
 'his',
 'from',
 'be',
 'are',
 'have',
 'not',
 'I',
 'an',
 'has',
 'who',
 '$',
 ':',
 'had',
 'they',
 '``',
 'or',
 'their',
 'would',
 '-RRB-',
 '-LRB-',
 'were',
 'will',
 'but',
 'this',
 '--',
 'about',
 'more',
 'which',
 'one',
 'been',
 'its',
 'But',
 ';',
 'In',
 'It',
 "n't",
 'He',
 'her',
 'than',
 'you',
 'when',
 'up',
 'out',
 'all',
 'she',
 'do',
 'two',
 'we',
 'like',
 'can',
 'years',
 'other',
 'last',
 'A',
 'also',
 'there',
 'year',
 'into',
 'people',
 "'",
 'new',
 'some',
 'first',
 'them',
 'after',
 'what',
 'time',
 'could',
 'no',
 'so',
 'over',
 'only',
 'if',
 'most',
 '?',
 'him',
 'percent',
 'did',
 'because',
 'million',
 'We',
 'many',
 'now',
 'And',
 'New_York',
 'just',
 'Ms.',
 'American',
 'company',
 'where',
 'made',
 'through',
 'They',
 'three',
 'before',
 '

In [61]:
wordlist.extend(wlist)

In [None]:
# Parse the embedding file: every line is a whitespace-separated list of
# floats. The context manager guarantees the file handle is closed.
with open(w2v_path) as vec_file:
    for line in vec_file:
        # split() without arguments also discards the trailing newline
        vec = [float(tok) for tok in line.split()]
        vecs.append(vec)

##### 载入预处理后的数据集

In [48]:
path = os.path.join(opt.data_root, 'train/')

In [None]:
data = np.load(path + 'bags_feature.npy', allow_pickle=True)

In [53]:
data[0][0]

[1122, 53041]

In [54]:
labels = np.load(path + 'labels.npy')

In [55]:
labels

array([[ 1, -1, -1, -1],
       [ 3, -1, -1, -1],
       [ 5,  2, -1, -1],
       ...,
       [ 0, -1, -1, -1],
       [ 0, -1, -1, -1],
       [ 0, -1, -1, -1]])

#### 4.2.2构建dataset类

In [29]:
DataModel = getattr(dataset, opt.data + 'Data')

In [31]:
DataModel

dataset.filternyt.FilterNYTData

In [34]:
from torch.utils.data import Dataset

In [35]:
class FilterNYTData(Dataset):
    '''
    Dataset backed by two numpy files, ``labels.npy`` and
    ``bags_feature.npy``, stored in ``<root_path>/train`` or
    ``<root_path>/test``.

    Args:
        root_path: dataset root directory.
        train: load the ``train`` split when True, the ``test`` split otherwise.
    '''

    def __init__(self, root_path, train=True):
        if train:
            path = os.path.join(root_path, 'train')
            print('loading train data')
        else:
            path = os.path.join(root_path, 'test')
            print('loading test data')

        self.labels = np.load(os.path.join(path, 'labels.npy'))
        # NOTE: allow_pickle=True unpickles arbitrary Python objects, so only
        # load feature files that come from a trusted source.
        self.data = np.load(os.path.join(path, 'bags_feature.npy'), allow_pickle=True)

        print('loading finish')

    def __getitem__(self, idx):
        '''
        Return ``(data[idx], labels[idx])``.

        Raises:
            IndexError: if ``idx`` is past the end. IndexError (rather than
                an assert) lets plain ``for item in dataset`` loops stop
                cleanly and survives ``python -O``.
        '''
        if idx >= len(self.data):
            raise IndexError('index {} is out of range for {} items'.format(idx, len(self.data)))
        return self.data[idx], self.labels[idx]

    def __len__(self):
        '''Number of items in the loaded split.'''
        return len(self.data)

In [32]:
train_data = DataModel(opt.data_root, train=True)

loading train data
loading finish


In [38]:
def collate_fn(batch):
    '''
    Regroup a batch of ``(data, label)`` samples into ``(all_data, all_labels)``.

    No stacking or tensor conversion happens here; each returned element is a
    tuple with one entry per sample, in batch order.
    '''
    columns = zip(*batch)
    bags, bag_labels = columns
    return bags, bag_labels

In [39]:
train_data_loader = DataLoader(train_data, opt.batch_size, shuffle=True, num_workers=opt.num_workers, collate_fn=collate_fn)

In [41]:
test_data = DataModel(opt.data_root, train=False)
test_data_loader = DataLoader(test_data, batch_size=opt.batch_size, shuffle=False, num_workers=opt.num_workers, collate_fn=collate_fn)
print('train data: {}; test data: {}'.format(len(train_data), len(test_data)))

loading test data
loading finish
train data: 65726; test data: 93574


### 4.3 模型定义

In [25]:
import time

In [21]:
if opt.use_gpu:
    torch.cuda.set_device(opt.gpu_id)

In [22]:
model = getattr(models, 'PCNN_ONE')(opt)

In [24]:
class BasicModule(torch.nn.Module):
    '''
    Thin wrapper around nn.Module that adds ``save`` and ``load`` helpers.
    '''

    def __init__(self):
        super(BasicModule, self).__init__()
        # Defaults to the class repr ("<class '...'>"); subclasses such as
        # PCNN_ONE replace it with a filename-friendly name.
        self.model_name = str(type(self))  # model name

    def load(self, path, map_location=None):
        '''
        Load model weights from ``path``.

        Args:
            path: checkpoint file written by ``save``.
            map_location: forwarded to ``torch.load``; pass e.g. ``'cpu'`` to
                load a checkpoint saved on a GPU onto a CPU-only machine.
                The default (None) keeps torch.load's default behaviour.
        '''
        self.load_state_dict(torch.load(path, map_location=map_location))

    def save(self, name=None):
        '''
        Save the model weights under ``checkpoints/`` and return the file path.

        The file is named "<model_name>_<name>.pth"; when ``name`` is None a
        "%m%d_%H:%M:%S" timestamp is used in its place.
        '''
        prefix = 'checkpoints/'
        # torch.save does not create missing directories.
        os.makedirs(prefix, exist_ok=True)
        if name is None:
            # Format only the timestamp so that a '%' inside model_name is
            # never interpreted as a strftime directive.
            stamp = time.strftime('%m%d_%H:%M:%S')
            name = prefix + self.model_name + '_' + stamp + '.pth'
        else:
            name = prefix + self.model_name + '_' + str(name) + '.pth'
        torch.save(self.state_dict(), name)
        return name

In [26]:
class PCNN_ONE(BasicModule):
    '''
    Zeng 2015 DS PCNN

    Word and two position embeddings are concatenated, convolved, max-pooled
    (piecewise when ``opt.use_pcnn`` is set) and projected to ``opt.rel_num``
    unnormalised scores.
    '''
    def __init__(self, opt):
        '''
        Build the layers described by ``opt`` and load the pretrained
        embeddings from the ``.npy`` files it points to.
        '''
        super(PCNN_ONE, self).__init__()

        self.opt = opt

        self.model_name = 'PCNN_ONE'

        self.word_embs = nn.Embedding(self.opt.vocab_size, self.opt.word_dim)
        self.pos1_embs = nn.Embedding(self.opt.pos_size, self.opt.pos_dim)
        self.pos2_embs = nn.Embedding(self.opt.pos_size, self.opt.pos_dim)

        feature_dim = self.opt.word_dim + self.opt.pos_dim * 2

        # One convolution per kernel height; each kernel spans the whole
        # feature dimension and is padded along the sequence axis only.
        self.convs = nn.ModuleList([
            nn.Conv2d(1, self.opt.filters_num, (k, feature_dim), padding=(k // 2, 0))
            for k in self.opt.filters
        ])

        all_filter_num = self.opt.filters_num * len(self.opt.filters)

        if self.opt.use_pcnn:
            # Piecewise pooling yields three values per filter.
            all_filter_num = all_filter_num * 3
            # Frozen lookup table: mask id 0 -> all zeros, ids 1..3 -> one-hot
            # selector of a piece. copy_ works across devices, so the table
            # is built on the CPU and moved together with the module.
            masks = torch.FloatTensor([[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]])
            self.mask_embedding = nn.Embedding(4, 3)
            self.mask_embedding.weight.data.copy_(masks)
            self.mask_embedding.weight.requires_grad = False

        self.linear = nn.Linear(all_filter_num, self.opt.rel_num)
        self.dropout = nn.Dropout(self.opt.drop_out)

        self.init_model_weight()
        self.init_word_emb()

    def init_model_weight(self):
        '''
        use xavier to init the conv and linear weights; biases are set to 0
        '''
        for conv in self.convs:
            nn.init.xavier_uniform_(conv.weight)
            nn.init.constant_(conv.bias, 0.0)

        nn.init.xavier_uniform_(self.linear.weight)
        nn.init.constant_(self.linear.bias, 0.0)

    def init_word_emb(self):
        '''
        Copy the pretrained word / position embeddings stored at
        ``opt.w2v_path``, ``opt.p1_2v_path`` and ``opt.p2_2v_path`` into the
        embedding layers, L2-normalising each row when ``opt.norm_emb`` is set.
        '''

        def p_2norm(path):
            v = torch.from_numpy(np.load(path))
            if self.opt.norm_emb:
                v = torch.div(v, v.norm(2, 1).unsqueeze(1))
                # All-zero rows divide 0 by 0; reset the resulting NaNs to 0.
                v[torch.isnan(v)] = 0.0
            return v

        # copy_ converts dtype and device as needed, so the source tensors do
        # not have to be moved to the GPU first. This also lets the model be
        # built on a machine without CUDA.
        self.word_embs.weight.data.copy_(p_2norm(self.opt.w2v_path))
        self.pos1_embs.weight.data.copy_(p_2norm(self.opt.p1_2v_path))
        self.pos2_embs.weight.data.copy_(p_2norm(self.opt.p2_2v_path))

    def mask_piece_pooling(self, x, mask):
        '''
        refer: https://github.com/thunlp/OpenNRE
        A fast piecewise pooling using mask

        Positions belonging to a piece get +100 in that piece's channel before
        the max over the sequence axis, and the offset is removed afterwards,
        so each channel holds the maximum over its own piece.
        Assumes x is (batch, filters, seq_len) and mask is (batch, seq_len)
        with ids in 0..3 -- TODO confirm against the data pipeline.
        '''
        x = x.unsqueeze(-1).permute(0, 2, 1, -1)
        masks = self.mask_embedding(mask).unsqueeze(-2) * 100
        x = masks.float() + x
        # A plain scalar follows x's device; the previous hard-coded
        # ``.cuda()`` tensor crashed on CPU-only runs.
        x = torch.max(x, 1)[0] - 100
        x = x.view(-1, x.size(1) * x.size(2))
        return x

    def piece_max_pooling(self, x, insPool):
        '''
        old version piecewise

        Slower per-sample alternative to ``mask_piece_pooling``: every sample
        is cut at the two positions given in ``insPool`` and each of the
        three segments is max-pooled separately.
        '''
        batch_res = []
        for ins, pool in zip(torch.split(x, 1, 0), torch.split(insPool, 1, 0)):
            ins = ins.squeeze()  # all_filter_num * max_len
            pool = pool.squeeze().data    # 2
            seg_1 = ins[:, :pool[0]].max(1)[0].unsqueeze(1)          # all_filter_num * 1
            seg_2 = ins[:, pool[0]: pool[1]].max(1)[0].unsqueeze(1)  # all_filter_num * 1
            seg_3 = ins[:, pool[1]:].max(1)[0].unsqueeze(1)
            piece_max_pool = torch.cat([seg_1, seg_2, seg_3], 1).view(1, -1)    # 1 * 3all_filter_num
            batch_res.append(piece_max_pool)

        out = torch.cat(batch_res, 0)
        assert out.size(1) == 3 * self.opt.filters_num
        return out

    def forward(self, x, train=False):
        '''
        Compute relation scores for a batch.

        Args:
            x: 6-tuple ``(insEnt, _, insX, insPFs, insPool, insMasks)``; only
                insX (token ids), insPFs (the two position-id sequences,
                split along dim 1) and insMasks (piece ids) are read here.
            train: accepted for interface compatibility; not used.

        Returns:
            Output of the final linear layer, one row per instance with
            ``opt.rel_num`` columns.
        '''
        insEnt, _, insX, insPFs, insPool, insMasks = x
        insPF1, insPF2 = [i.squeeze(1) for i in torch.split(insPFs, 1, 1)]

        word_emb = self.word_embs(insX)
        pf1_emb = self.pos1_embs(insPF1)
        pf2_emb = self.pos2_embs(insPF2)

        x = torch.cat([word_emb, pf1_emb, pf2_emb], 2)
        x = x.unsqueeze(1)
        x = self.dropout(x)

        x = [conv(x).squeeze(3) for conv in self.convs]
        if self.opt.use_pcnn:
            # piece_max_pooling(i, insPool) is the slower equivalent.
            x = [self.mask_piece_pooling(i, insMasks) for i in x]
        else:
            x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1).tanh()
        x = self.dropout(x)
        x = self.linear(x)

        return x

In [28]:
if opt.use_gpu:
    model.cuda()