In [1]:
import pandas as pd
from openie import StanfordOpenIE

The function below writes a relation (predicate) string in camel case to match the formatting of the WebNLG dataset.
input: is located in (passed as a list of words: ['is', 'located', 'in'])
output: isLocatedIn

In [2]:
def relation_camel_casing(words):
    """Join the words of a relation into a single lowerCamelCase token.

    Example: ['is', 'located', 'in'] -> 'isLocatedIn'.

    Args:
        words: the words of the relation, either as a list of strings
            (e.g. ``relation.split()``) or as one whitespace-separated string.

    Returns:
        The camel-cased relation, or '' when there are no words to join.
    """
    # A plain string would otherwise be iterated character by character.
    if isinstance(words, str):
        words = words.split()
    # Skip empty strings so that word[0] can never raise IndexError.
    s = "".join(word[0].upper() + word[1:].lower() for word in words if word)
    if not s:
        # Nothing to lower-case; indexing s[0] would raise IndexError.
        return s
    return s[0].lower() + s[1:]

The function below merges the objects of triplets that have the same relation (predicate).
input: list of triplet dictionaries


In [3]:
def triplet_merge(triplet_list):
    """Merge triplets that share a subject and relation into a single triplet.

    The objects of the merged triplets are joined with ',' in the order in
    which they appear in ``triplet_list``.

    Args:
        triplet_list: list of dicts, each with 'subject', 'relation' and
            'object' keys.

    Returns:
        A new list of dicts with the same three keys, one per distinct
        (subject, relation) pair, ordered by first appearance in the input.
        An empty input gives an empty list.
    """
    # Keying on (subject, relation) rather than relation alone keeps objects
    # attached to the subject they were extracted with. A dict is used instead
    # of a set so the output order follows the input and is reproducible
    # between runs.
    merged = {}
    for triplet in triplet_list:
        key = (triplet['subject'], triplet['relation'])
        merged.setdefault(key, []).append(triplet['object'])
    return [
        {'subject': subject, 'relation': relation, 'object': ','.join(objects)}
        for (subject, relation), objects in merged.items()
    ]

The function below takes a sentence as input and returns the triplets present in it.
input: "Fred Astaire was born in Omaha, Nebraska, to Johanna (Geilus) and Fritz Austerlitz, a brewer. "
output: 'Fred_Astaire | wasBornTo | Geilus,Johanna && Fred_Astaire | wasBornIn | Omaha,Nebraska && Fred_Astaire | was | born_in_Omaha_to_Johanna,born.'

In [4]:
def setence_to_triplet(sen, client=None, max_triplets=5):
    """Extract the triplets of one sentence as a single formatted string.

    Each merged triplet is rendered as ``subject | relation | object``, with
    whitespace in the subject and object replaced by '_' and the relation
    camel-cased. Triplets are joined with ' && ' and the result ends with '.'.

    Args:
        sen: the sentence text to annotate.
        client: optional already-open annotator exposing ``annotate(text)``
            that returns a list of dicts with 'subject', 'relation' and
            'object' keys. When omitted, a new ``StanfordOpenIE`` context is
            opened for this call only. Pass a shared client when processing
            many sentences to avoid opening one per sentence.
        max_triplets: sentences yielding more merged triplets than this are
            rejected (default 5).

    Returns:
        The formatted triplet string, or None when no triplets were extracted
        or when there are more than ``max_triplets`` of them.
    """
    if client is None:
        # No shared client supplied: open one just for this sentence.
        with StanfordOpenIE() as own_client:
            return setence_to_triplet(sen, client=own_client, max_triplets=max_triplets)

    triplet_list = triplet_merge(client.annotate(sen))
    # An empty result used to come back as the truthy string '.', which callers
    # then stored as if it were a real triplet.
    if not triplet_list or len(triplet_list) > max_triplets:
        return None

    partial_triplet_list = []
    for triple in triplet_list:
        sub = '_'.join(triple['subject'].split())
        rel = relation_camel_casing(triple['relation'].split())
        obj = '_'.join(triple['object'].split())
        partial_triplet_list.append(sub + ' | ' + rel + ' | ' + obj)
    return ' && '.join(partial_triplet_list) + '.'
# sen = "Fred Astaire was born in Omaha, Nebraska, to Johanna (Geilus) and Fritz Austerlitz, a brewer. "
# setence_to_triplet(sen)

The function below takes a paragraph (text) as input, splits it into sentences, and passes each sentence to the setence_to_triplet function.
input: text
output: appends the extracted triplets and their corresponding sentences to the global lists


In [5]:
def text_to_triplet_sentence(text):
    """Split ``text`` into sentences and collect a triplet string for each.

    The text is split on '.'; every non-blank fragment is passed to
    ``setence_to_triplet``. When that yields a usable triplet string, the
    string is appended to the global ``triplet_list`` and the fragment to the
    global ``sentence_list`` (so the two lists stay aligned index by index).

    Args:
        text: the paragraph to process. Non-string values (for example a NaN
            from an empty cell) are ignored.

    Returns:
        None. Results are accumulated in the global lists.
    """
    global sentence_list
    global triplet_list
    if not isinstance(text, str):
        return
    for sent in text.split('.'):
        # split('.') leaves an empty fragment after the final period; there is
        # nothing to extract from it, so do not send it to the annotator.
        if not sent.strip():
            continue
        triplet = setence_to_triplet(sent)
        # A bare '.' is what the formatter yields for zero triplets.
        if triplet and triplet != '.':
            triplet_list.append(triplet)
            sentence_list.append(sent)

The function below takes a dataframe containing coreference-resolved text as its first argument; the second argument is the path where the output dataframe is to be saved.
input: dataframe, output_file
output: a dataframe having input_text and target_text columns, in a format similar to the WebNLG format.

In [11]:
# Module-level accumulators filled by text_to_triplet_sentence; the two lists
# are kept aligned (triplet_list[i] belongs to sentence_list[i]).
sentence_list = []
triplet_list = []
def dataframe_formatting(df,output_file):
    """Build the triplet/sentence frame for ``df`` and save it.

    Applies ``text_to_triplet_sentence`` to every value of
    ``df['resolved_text']``, then pairs the collected triplet strings
    ('input_text') with their sentences ('target_text') in a new DataFrame.
    The first rows and the row count are shown.

    Args:
        df: DataFrame with a 'resolved_text' column.
        output_file: path of the CSV file to write (without the index).
            A falsy value (None or '') skips writing.

    Returns:
        None.
    """
    global sentence_list
    global triplet_list
    # Start each call from empty accumulators; otherwise a second call would
    # also contain every row collected by the previous one.
    sentence_list = []
    triplet_list = []
    df['resolved_text'].apply(text_to_triplet_sentence)
    final_df = pd.DataFrame(list(zip(triplet_list,sentence_list)),columns=['input_text','target_text'])
    if output_file:
        final_df.to_csv(output_file,index = False)
    display(final_df.head())
    print(len(final_df))

In [10]:
# Run the triplet extraction on resolved_bio.csv.
output_file = '../data/imdb_bio/final_bio_with_merged_triplets.csv'
df = pd.read_csv('../data/imdb_bio/resolved_bio.csv')
display(df.head())  # preview of the input frame
dataframe_formatting(df, output_file)

Unnamed: 0,resolved_text
0,"Fred Astaire was born in Omaha, Nebraska, to J..."
1,One of Hollywood's preeminent male stars of al...
2,Born to Alice Cooper and Charles Cooper. Gary ...
3,"Georges Delerue was born on March 12, 1925 in ..."
4,"Born in London, England, John Gielgud trained ..."


Starting server with command: java -Xmx8G -cp /home/anurag/.stanfordnlp_resources/stanford-corenlp-4.1.0/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-92ddd4e321314f71.props -preload openie


KeyboardInterrupt: 

In [12]:
# Run the triplet extraction on wiki_summary_resolved.csv.
output_file = '../data/wikipedia/wiki_summary_with_merged_triplets.csv'
df = pd.read_csv('../data/wikipedia/wiki_summary_resolved.csv')
display(df.head())  # preview of the input frame
dataframe_formatting(df, output_file)

Unnamed: 0,resolved_text
0,Macrocyclops is a genus of copepods belonging ...
1,David Tartakover (דוד טרטקובר) (born 1944) is...
2,"Kullamäe is a village in Veriora Parish, Põlv..."
3,"Beatrice Morgari (1858, Turin – 1936, Turin) w..."
4,YouTubers are people mostly known for people m...


Starting server with command: java -Xmx8G -cp /home/anurag/.stanfordnlp_resources/stanford-corenlp-4.1.0/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-ebf016ff01a64ff6.props -preload openie


KeyboardInterrupt: 

#references 
#1.https://www.tutorialspoint.com/camelcase-in-python