In [1]:
# coding: utf-8
from nltk.tag import StanfordNERTagger
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
import nltk

import uuid
import json

# Tell NLTK which Java binary to use for the Stanford tools.
# NOTE(review): this is a machine-specific absolute path; confirm it exists on the
# machine running the notebook.
nltk.internals.config_java("/usr/lib/jvm/java-8-openjdk-amd64/bin/java")

# Add the jar and model via their path (instead of setting environment variables).
# The paths are relative, so they are resolved against the notebook's working directory.
pos_jar = 'stanford-nlp/stanford-postagger-2018-10-16/stanford-postagger.jar'
pos_model = 'stanford-nlp/stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger'
ner_jar = 'stanford-nlp/stanford-ner-2018-10-16/stanford-ner.jar'
ner_model = 'stanford-nlp/stanford-ner-2018-10-16/classifiers/english.muc.7class.distsim.crf.ser.gz'

# Module-level tagger instances shared by the functions below.
# `eng_tagger` is the NER tagger used by extract_entity(); `pos_tagger` is created
# here but is not referenced by any of the visible cells.
pos_tagger = StanfordPOSTagger(pos_model, pos_jar, encoding='utf8')
eng_tagger = StanfordNERTagger(ner_model, ner_jar, encoding='utf8')

def sentence_token_nltk(str):
    """Split ``str`` into sentences with NLTK's ``sent_tokenize``.

    Args:
        str: The text to segment. The parameter name shadows the built-in
            ``str`` inside this function; it is kept as-is so existing
            keyword callers continue to work.

    Returns:
        Whatever ``sent_tokenize`` returns for the text (one entry per
        detected sentence).
    """
    return sent_tokenize(str)

def extract_entity(text):
    """Tokenize ``text`` sentence by sentence and attach a NER label to each token.

    The text is split into sentences with ``sentence_token_nltk``, each sentence
    is split into tokens with ``word_tokenize``, and the module-level
    ``eng_tagger`` labels the tokens.

    Args:
        text: Raw input text.

    Returns:
        A list with one dict per sentence, each of the form
        ``{"token": [...], "stanford_ner": [...]}``, where ``stanford_ner``
        holds the second element of every pair returned by the tagger.
    """
    tagged_sentences = []
    for sentence in sentence_token_nltk(text):
        words = word_tokenize(sentence)
        # The tagger returns (token, label) pairs; only the labels are kept.
        labels = [pair[1] for pair in eng_tagger.tag(words)]
        tagged_sentences.append({"token": words, "stanford_ner": labels})
    return tagged_sentences

def _entity_spans(ner_tags, last_index):
    """Collapse a per-token NER tag sequence into ``(start, end, type)`` spans.

    A span is a maximal run of consecutive tokens carrying the same non-``'O'``
    tag, so two adjacent entities of the same type are merged into one span.

    Args:
        ner_tags: Sequence of NER labels, one per token; ``'O'`` marks tokens
            outside any entity.
        last_index: Inclusive end index assigned to a span that is still open
            when the tag sequence ends.

    Returns:
        List of ``(start, end, type)`` tuples with inclusive indices, in order
        of appearance.
    """
    spans = []
    run_start = 0
    run_type = 'O'
    for idx, tag in enumerate(ner_tags):
        if tag == run_type:
            continue
        # The tag changed: close the previous run if it was an entity.
        if run_type != 'O':
            spans.append((run_start, idx - 1, run_type))
        run_start, run_type = idx, tag
    if run_type != 'O':
        spans.append((run_start, last_index, run_type))
    return spans


def transfer_to_multi_relation(new_dict):
    """Build one ``no_relation`` candidate record per ordered entity pair.

    For every sentence, the NER tags are grouped into entity spans and each
    pair of spans (earlier span as subject, later span as object) produces one
    record. Sentences with fewer than two entity spans produce no records.

    Every record carries the same keys, including ``stanford_ner``. Previously
    the record emitted for a sentence with exactly two entities lacked
    ``stanford_ner`` while records for three or more entities had it.

    Args:
        new_dict: Iterable of dicts with keys ``'token'`` (list of tokens) and
            ``'stanford_ner'`` (list of NER labels), as produced by
            ``extract_entity``.

    Returns:
        List of dicts with keys ``id``, ``relation``, ``token``,
        ``subj_start``, ``subj_end``, ``obj_start``, ``obj_end``,
        ``subj_type``, ``obj_type`` and ``stanford_ner``. Start/end indices
        are inclusive token positions.
    """
    result = []
    for sentence in new_dict:
        tokens = sentence['token']
        ner_tags = sentence['stanford_ner']
        # A trailing entity is closed at the last token index, as before.
        spans = _entity_spans(ner_tags, len(tokens) - 1)
        for subj_idx in range(len(spans) - 1):
            subj_start, subj_end, subj_type = spans[subj_idx]
            for obj_start, obj_end, obj_type in spans[subj_idx + 1:]:
                result.append({
                    'id': str(uuid.uuid1()),
                    'relation': 'no_relation',
                    'token': tokens,
                    'subj_start': subj_start,
                    'subj_end': subj_end,
                    'obj_start': obj_start,
                    'obj_end': obj_end,
                    'subj_type': subj_type,
                    'obj_type': obj_type,
                    'stanford_ner': ner_tags,
                })
    return result

In [2]:
# Sample input passage for the pipeline. The same `text` global is also passed to
# spaCy in the displacy rendering cells further down. Note that some punctuation in
# the later sentences is already separated from the preceding word by a space.
text = "The National Congress of American Indians was founded in 1944 in response to assimilation policies being imposed on tribes by the federal government. Founded in 1951, PATA is a non-profit membership association dedicated to building responsible development of the Asia Pacific travel and tourism industry. The Securities and Exchange Commission scheduled a news conference Thursday in Washington, DC, to discuss the allegations against Mozilo , who founded Countrywide in 1969 and was its chief executive until Bank of America purchased it last year as its financial condition deteriorated. Panama poised to withdraw from Central American Parliament PARLACEN , founded in 1991 , is based in Guatemala City and has six member states ."

# Tag the sample text sentence by sentence, then expand every sentence into
# candidate subject/object pairs labelled 'no_relation'.
res = extract_entity(text)
result = transfer_to_multi_relation(res)
# Persist the candidate pairs as JSON in the working directory.
# NOTE(review): the evaluation cell passes `--data_dir tacred`; how test1014.json
# reaches that directory is not shown in this notebook — confirm.
with open('test1014.json', 'w', encoding='utf-8') as f:
    json.dump(result, f)

In [None]:
# Shell out to the project's run_tacred.py in evaluation mode (--do_eval --eval_test),
# loading the model from tacred_dir1 and reading data from the tacred directory.
# NOTE(review): presumably this writes tacred_dir1/predictions.json, which a later
# cell loads — confirm against the script.
!python3 code/run_tacred.py --model tacred_dir1  --do_eval --eval_test  --data_dir tacred  --eval_batch_size 32   --learning_rate 2e-5   --max_seq_length 128   --output_dir tacred_dir1 --fp16

10/17/2019 05:42:38 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file tacred_dir1/vocab.txt
10/17/2019 05:42:56 - INFO - pytorch_pretrained_bert.modeling - loading archive file tacred_dir1
10/17/2019 05:42:56 - INFO - pytorch_pretrained_bert.modeling - Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 512,
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "type_vocab_size": 2,
  "vocab_size": 28996
}



In [8]:
# Load the predictions file from the tacred_dir1 directory. The encoding is given
# explicitly so decoding does not depend on the platform's locale default, matching
# how this notebook writes its own JSON file.
with open('tacred_dir1/predictions.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Each prediction is read positionally: [0] entities, [1] relations, [2] sentence.
for d in data:
    output = {
        'entities': d[0],
        'relations': d[1],
        'sentence': d[2],
    }
    # Pretty-print with sorted keys so the per-sentence dumps are easy to compare.
    print(json.dumps(output, sort_keys=True, indent=2))
    

# NOTE(review): these imports sit in the middle of the notebook, and this whole
# snippet is repeated verbatim in the later In [24] cell. Consider keeping a single
# copy and moving the imports to the first cell.
import spacy
from spacy import displacy
from collections import Counter  # not referenced in any visible cell

# Load the 'en_core_web_sm' spaCy model.
nlp = spacy.load('en_core_web_sm')

# Run spaCy over the sample `text` defined in the In [2] cell and render its
# entity annotations inline in the notebook.
displacy.render(nlp(text), jupyter=True, style='ent')

{
  "entities": [
    {
      "DATE": [
        "1944"
      ],
      "ORGANIZATION": [
        "National Congress of American Indians"
      ]
    }
  ],
  "relations": [
    {
      "entity1": "National Congress of American Indians",
      "entity2": "1944",
      "relation": "org:founded"
    }
  ],
  "sentence": [
    {
      "sentence": "The National Congress of American Indians was founded in 1944 in response to assimilation policies being imposed on tribes by the federal government ."
    }
  ]
}
{
  "entities": [
    {
      "DATE": [
        "1951"
      ],
      "LOCATION": [
        "Asia Pacific"
      ],
      "ORGANIZATION": [
        "PATA"
      ]
    }
  ],
  "relations": [
    {
      "entity1": "1951",
      "entity2": "PATA",
      "relation": "org:alternate_names"
    },
    {
      "entity1": "1951",
      "entity2": "Asia Pacific",
      "relation": "org:country_of_headquarters"
    },
    {
      "entity1": "PATA",
      "entity2": "Asia Pacific",
      "relatio

In [24]:
# NOTE(review): this cell repeats, line for line, the spaCy snippet at the end of
# the In [8] cell. Consider keeping a single copy and moving the imports to the
# notebook's first cell.
import spacy
from spacy import displacy
from collections import Counter  # not referenced in any visible cell

# Load the 'en_core_web_sm' spaCy model.
nlp = spacy.load('en_core_web_sm')

# Run spaCy over the sample `text` defined in the In [2] cell and render its
# entity annotations inline in the notebook.
displacy.render(nlp(text), jupyter=True, style='ent')