# Part 2
# 预测人物关系

In [1]:
import re
import sys, json
import torch
import os
import numpy as np
import opennre
from opennre import encoder, model, framework
import argparse
import pandas as pd
import itertools

In [2]:
os.chdir("OpenNRE")

In [3]:
# 导入关系抽取模型

parser = argparse.ArgumentParser()
parser.add_argument('--mask_entity', action='store_true', help='Mask entity mentions')
args = parser.parse_known_args()[0]

root_path = '.'
sys.path.append(root_path)
if not os.path.exists('ckpt'):
    os.mkdir('ckpt')
ckpt = 'ckpt/people_chinese_bert_softmax.pth.tar'

rel2id = json.load(open(os.path.join(root_path, 'benchmark/data_use/rel2id.json'), encoding='utf-8'))

In [4]:
rel2id

{'不知道': 0,
 '父亲': 1,
 '儿子': 2,
 '奴才': 3,
 '主人': 4,
 '夫人': 5,
 '丈夫': 6,
 '丫环': 7,
 '女儿': 8,
 '母亲': 9,
 '兄弟': 10,
 '儿媳': 11,
 '婆婆': 12,
 '外祖母亲': 13,
 '外孙女': 14,
 '被抚养': 15,
 '侄女': 16,
 '姐妹': 17,
 '好友': 18,
 '养子': 19,
 '哥哥': 20,
 '买办': 21,
 '嫂子': 22,
 '陪房': 23,
 '乳母亲': 24,
 '相好': 25,
 '孙子': 26,
 '姑舅哥哥': 27,
 '侄儿': 28,
 '姑母亲': 29,
 '兄妹': 30,
 '岳母亲': 31,
 '老师': 32,
 '岳父亲': 33,
 '朋友': 34,
 '好兄弟': 35,
 '女婿': 36,
 '乾娘': 37,
 '暧昧': 38,
 '养父亲': 39,
 '孙女': 40,
 '伯父亲': 41,
 '弟弟': 42,
 '爷爷': 43,
 '奶奶': 44}

In [5]:
sentence_encoder = opennre.encoder.BERTEncoder(
    max_length=100, 
    pretrain_path=os.path.join(root_path, 'pretrain/chinese_wwm_pytorch'),
    mask_entity=args.mask_entity
)

model = opennre.model.SoftmaxNN(sentence_encoder, len(rel2id), rel2id)

2021-12-07 16:50:37,959 - root - INFO - Loading BERT pre-trained checkpoint.
Some weights of the model checkpoint at ./pretrain/chinese_wwm_pytorch were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
model.load_state_dict(torch.load(ckpt)['state_dict'])

<All keys matched successfully>

In [7]:
model.to("cuda")

SoftmaxNN(
  (sentence_encoder): BERTEncoder(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(21128, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
        

In [8]:
model.infer({"text": "她再到宁府，找上贾珍父子与尤氏大闹一场，吓得贾珍借故躲出，贾蓉自打耳光。", "h": {"pos": [9, 10]}, "t": {"pos": [29, 31]}}
)

('父亲', 0.9840774536132812)

In [None]:
name_all = pd.read_csv("../data/name_clean_sorted.csv", index_col=0)
name_all.head()

In [None]:
name_text_all = list(name_all['文中名字'])

In [None]:
relation_all = []
for sentence in sentence_raw:
    name_each_sentence = []
    for name in name_text_all:
        if name in sentence:
            name_each_sentence.append(name)
    if len(name_each_sentence) >= 2:
        for i in range(len(name_each_sentence)):
            for j in range(i+1, len(name_each_sentence)):
                name1 = name_each_sentence[i]
                name2 = name_each_sentence[j]

                start1 = sentence.index(name1)
                end1 = start1 + len(name1)
                start2 = sentence.index(name2)
                end2 = start2 + len(name2)

                # create dict for data
                data_set = {}
                data_set['text'] = sentence
                data_set['h'] = {}
                data_set['h']['pos'] = tuple([start1, end1])
                data_set['t'] = {}
                data_set['t']['pos'] = tuple([start2, end2])

                (rel, p) = model.infer(data_set)
                relation_all.append([name1, name2, rel, p])

In [None]:
relation_all = pd.DataFrame(relation_all)
relation_all = relation_all[relation_all[3]>0.8]
relation_all.to_csv('../data/relation_unknown.csv')