# Extract triples using ClausIE, OLLIE, Stanford OpenIE and OpenIE 5

In [2]:
import json
import os
import tempfile
from pathlib import Path
from subprocess import Popen
import sys

sys.path.insert(1, '../ClausIE')
import clausie_api

sys.path.insert(1, '../StanfordOpenIE')
from openie import StanfordOpenIE

from pyopenie import OpenIE5

def read_text(filename):
    """Return the entire contents of the text file ``filename`` as one string."""
    with open(filename) as handle:
        return handle.read()

def write_text(text, filename):
    """Write ``text`` to ``filename``, replacing any existing content.

    ``text`` is iterated and every item is written in order, so it can be a
    single string or any iterable of strings. No separators or newlines are
    added between items.
    """
    with open(filename, 'w') as handle:
        handle.writelines(text)
            
def read_json(filename):
    """Parse the JSON document stored in ``filename`` and return the result."""
    with open(filename) as handle:
        return json.load(handle)

def write_json(data, filename):
    """Serialize ``data`` as JSON into ``filename``, overwriting the file."""
    with open(filename, 'w') as handle:
        json.dump(data, fp=handle)

## ClausIE

In [3]:
# The preprocessed corpus is loaded first, but the hard-coded demo passage
# assigned right after it is what actually gets processed.
text = read_text('../data/preprocessed_text.txt')
text = 'Barack Hussein Obama II is an American politician and attorney who served as the 44th president of the United States from 2009 to 2017.Barack Hussein Obama II previously served as a U.S. senator from Illinois from 2005 to 2008 and an Illinois state senator from 1997 to 2004. After graduating from Columbia University in 1983, he worked as a community organizer in Chicago. In 1988, he enrolled in Harvard Law School, where he was the first black person to be president of the Harvard Law Review. Turning to elective politics, he represented the 13th district from 1997 until 2004 in the Illinois Senate, when he ran for the Illinois Senate. Obama received national attention in 2004 with Obama the Illinois Senate the Illinois Senate primary win, Obama well-received July Democratic National Convention keynote address, and Obama landslide November election to the Illinois Senate. In 2008, Obama was nominated for president a year after Obama presidential campaign began, and after a close primary campaign against Hillary Clinton, Obama was elected over Republican nominee John McCain and was inaugurated alongside Joe Biden on January 20, 2009. Nine months later, Obama was named the 2009 Nobel Peace Prize laureate.'
sent_list = clausie_api.prepare_raw_text(text)
sentences = clausie_api.clausie(text_list=sent_list)

# Maps each raw sentence to the list of subject/relation/object dicts that
# ClausIE produced for it.
triples = {}
for parsed in sentences:
    print(parsed)
    sentence_triples = []
    triples[parsed.raw_sentence] = sentence_triples
    for clause in parsed.clauses:
        print('Clause:', clause)
    for raw_triple in parsed.triples:
        # Positions 0, 1 and 2 of each ClausIE triple are stored as
        # subject, relation and object respectively.
        record = {
            'subject': raw_triple[0],
            'relation': raw_triple[1],
            'object': raw_triple[2]
        }
        print('Triple:', record)
        sentence_triples.append(record)

write_json(triples, '../data/demo/clausie_triples.json')

UnboundLocalError: local variable 'num_clauses' referenced before assignment

## OLLIE

In [None]:
# run `java -Xmx512m -jar ollie-app-latest.jar ../data/preprocessed_text.txt -o ../data/ollie_output.txt -t 0.5 --output-format tabbed`
# Load OLLIE's tab-separated output and regroup the extractions into a JSON
# mapping keyed by column 6 of each row (presumably the source sentence).
ollie_output = read_text('../data/ollie_output.txt')
triples = {}
# The first line is treated as a header; everything after it is data rows.
title, *rows = ollie_output.split('\n')
print(title)
for row in rows:
    print(row)
    # skip empty line
    if not row:
        continue
    fields = row.split('\t')
    # The literal text 'None' in columns 4 and 5 is stored as a real null.
    enabler = None if fields[4] == 'None' else fields[4]
    attribution = None if fields[5] == 'None' else fields[5]
    triples.setdefault(fields[6], []).append({
        'subject': [fields[1]],
        'relation': [fields[2]],
        'object': [fields[3]],
        'enabler': enabler,
        'attribution': attribution
    })

write_json(triples, '../data/demo/ollie_triples.json')


## Stanford OpenIE

In [4]:
# The preprocessed corpus is loaded first, but the single demo sentence
# assigned right after it is what actually gets annotated.
text = read_text('../data/preprocessed_text.txt')
text = 'Any problem shall be referred to the UG committee which may refer it to the Senate.'

# Collect every triple the Stanford OpenIE client returns for the text,
# echoing each one as it is gathered.
triples = []
with StanfordOpenIE() as openie:
    print('Text: %s.' % text)
    annotations = openie.annotate(text)
    for extracted in annotations:
        print('|-', extracted)
        triples.append(extracted)

# The output file maps the input text to its list of triples.
write_json({text: triples}, '../data/demo/stanfordopenie_triples.json')

Text: Barack Hussein Obama II is an American politician and attorney who served as the 44th president of the United States from 2009 to 2017.Barack Hussein Obama II previously served as a U.S. senator from Illinois from 2005 to 2008 and an Illinois state senator from 1997 to 2004. After graduating from Columbia University in 1983, he worked as a community organizer in Chicago. In 1988, he enrolled in Harvard Law School, where he was the first black person to be president of the Harvard Law Review. Turning to elective politics, he represented the 13th district from 1997 until 2004 in the Illinois Senate, when he ran for the Illinois Senate. Obama received national attention in 2004 with Obama the Illinois Senate the Illinois Senate primary win, Obama well-received July Democratic National Convention keynote address, and Obama landslide November election to the Illinois Senate. In 2008, Obama was nominated for president a year after Obama presidential campaign began, and after a close pr

## OpenIE5

In [7]:
# The preprocessed corpus is loaded first, but the single demo sentence
# assigned right after it is what actually gets processed.
text = read_text('../data/preprocessed_text.txt')
text = 'Any problem shall be referred to the UG committee which may refer it to the Senate.'

# One sentence per line. Blank lines are filtered out instead of slicing off
# the last element, so text that does not end with a newline (such as the demo
# sentence above) keeps its final sentence.
sentences = [line for line in text.split('\n') if line.strip()]

# Get triples from OpenIE5 for the preprocessed text
# `edges` holds [sentence node, label, text] lists; `triples` maps each
# sentence to the list of extraction dicts produced for it.
edges = []

triples = {}

extractor = OpenIE5('http://localhost:8000')
for s_no, sentence in enumerate(sentences):
    # Every edge of this sentence hangs off the same '$<index>' node.
    sentence_node = '$' + str(s_no)
    extractions = extractor.extract(sentence)
    triples[sentence] = []
    for extraction in extractions:
        extraction = extraction['extraction']
        subject = extraction['arg1']['text']
        relation = extraction['rel']['text']
        objects = [arg['text'] for arg in extraction['arg2s']]
        # NOTE(review): the previous version set a `bad_extraction` flag when
        # the subject or any object was longer than five words, but never read
        # it. The dead flag was dropped; add a filter here if such extractions
        # should be excluded.

        triples[sentence].append({
            'subject': subject,
            'relation': relation,
            'object': objects,
            'negated': extraction['negated']
        })

        # A list (not a set) keeps the order, keeps duplicate values and
        # matches the shape of the other edges appended below.
        edges.append([sentence_node, relation, subject])
        for obj in objects:
            edges.append([sentence_node, 'object', obj])
        if extraction['negated']:
            edges.append([sentence_node, 'type', 'Don\'t'])

In [10]:
# Write the current `triples` mapping to the OpenIE5 output file as JSON.
write_json(triples, '../data/openie5_triples.json')