### Greek to English translation

In [None]:
# Translate Greek sentences (read one per line from the input file below) into English

from transformers import FSMTTokenizer, FSMTForConditionalGeneration
import pandas as pd


# Pretrained Greek -> English FSMT checkpoint (Hugging Face model id).
mname = "lighteternal/SSE-TUC-mt-el-en-cased"

tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)

# Parallel lists: raw[i] is the source sentence for translated[i].
raw = []
translated = []

# Add your own sentences (one per line) using the following filepath.
# The encoding is passed explicitly because the platform default is not
# always UTF-8 and the file contains Greek text.
# NOTE(review): assumes the input file is UTF-8 encoded - confirm.
with open('data/demo_greek.txt', encoding='utf-8') as file:
    for row in file:
        # Drop the trailing newline so it does not end up in the 'raw' column.
        row = row.strip()
        if not row:
            # Blank lines carry nothing to translate.
            continue
        encoded = tokenizer.encode(row, return_tensors='pt')
        outputs = model.generate(encoded, num_beams=5, num_return_sequences=1, early_stopping=True)
        for output in outputs:
            raw.append(row)
            translated.append(tokenizer.decode(output, skip_special_tokens=True))

decoded = pd.DataFrame({'raw': raw, 'translated': translated})

print(decoded)

# Read back by the information-extraction cell.
decoded.to_csv('data/translated.csv')

### Information Extraction

In [None]:
import pandas as pd

# Note: Make sure that the ClausIE and OpenIE libraries are set up correctly
# (the OpenIE 5 client below expects a server at http://localhost:9000).
# Triple extraction is applied directly to the translated texts.
# The extractive summarization and coreference resolution modules are not included in this version.

# Third-party extractor bindings (pyclausie, pyopenie, allennlp).
from pyclausie import ClausIE
from pyopenie import OpenIE5
from allennlp.predictors.predictor import Predictor

# Endpoint for the OpenIE 5 client and archive for the AllenNLP Open IE model.
openie5_url = 'http://localhost:9000'
allennlp_model_archive = "https://s3-us-west-2.amazonaws.com/allennlp/models/openie-model.2018-08-20.tar.gz"

# One instance per extractor, reused for every sentence in the loop below.
cl = ClausIE.get_instance()
extractor = OpenIE5(openie5_url)
predictor1 = Predictor.from_path(allennlp_model_archive)


from tqdm import tqdm
from itertools import islice
import spacy
import re
import numpy as np

# Parallel lists: position i across the three lists forms one triple.
# NOTE(review): `object` shadows the Python builtin; the name is kept because
# the back-translation cell further down reads these three lists by name.
subject = []
predicate = []
object = []

import nltk


def _add_triple(subj, pred, obj):
    """Append one complete triple so the three lists always stay the same length."""
    subject.append(subj)
    predicate.append(pred)
    object.append(obj)


df = pd.read_csv('data/translated.csv')

for index, row in tqdm(df.iterrows(), total=len(df)):
    text = row['translated']
    if not isinstance(text, str):
        # An empty translation is read back from the CSV as NaN (a float).
        print("Skipping row without a translated sentence: ", index)
        continue

    # Drop every non-ASCII character before handing the text to the extractors.
    sentence = text.encode('utf-8').decode('ascii', 'ignore')

    print(sentence)

    # ClausIE triple extraction
    try:
        for triple in cl.extract_triples([sentence]):
            # Positions 1-3 are used as subject, predicate and object;
            # position 0 is not used.
            _add_triple(triple[1], triple[2], triple[3])
    except Exception as exc:
        print("ClausIE failed at extracting a triple from sentence: ", text, "-", repr(exc))

    # OpenIE triple extraction
    try:
        for extraction in extractor.extract(sentence):
            fields = extraction['extraction']
            # All arg2 texts are joined into a single object string.
            arg2_texts = [arg['text'] for arg in fields['arg2s']]
            _add_triple(fields['arg1']['text'], fields['rel']['text'], ' '.join(arg2_texts))
    except Exception as exc:
        print("OpenIE failed at extracting a triple from sentence: ", text, "-", repr(exc))

    # AllenNLP triple extraction
    try:
        extracted = predictor1.predict(sentence)
        # Iterate the 'verbs' entries themselves: len(extracted) would count
        # the keys of the result mapping, not the number of verbs.
        for verb in extracted['verbs']:
            result = verb['description']
            _add_triple(
                ' '.join(re.findall(r"\[ARG0: (.*?)\]", result)),
                ' '.join(re.findall(r"\[V: (.*?)\]", result)),
                # Same "LABEL: text" spacing as the other patterns, so the
                # captured object has no leading space.
                ' '.join(re.findall(r"\[ARG1: (.*?)\]", result)),
            )
    except Exception as exc:
        print("AllenNLP failed at extracting a triple from sentence: ", text, "-", repr(exc))


extracted_triples = pd.DataFrame(
    {'subject': subject,
     'predicate': predicate,
     'object': object,
    })

# A triple with any empty field is discarded.
extracted_triples = extracted_triples.replace('', np.nan).dropna(how='any')

extracted_triples.to_csv('data/extracted_triples_english.csv')


In [None]:
# Display the extracted English triples; rows with an empty or missing
# subject, predicate or object were already dropped in the extraction cell.
extracted_triples

### English to Greek back-translation

In [None]:
from transformers import FSMTTokenizer, FSMTForConditionalGeneration

# Pretrained English -> Greek FSMT checkpoint (Hugging Face model id).
mname = "lighteternal/SSE-TUC-mt-en-el-cased"

# These assignments rebind the names used earlier for the Greek -> English model.
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)


def translate(text):
    """Translate one string with the module-level ``tokenizer`` and ``model``.

    In this cell those names are bound to the English -> Greek checkpoint
    loaded just above.

    Args:
        text: The text to translate.

    Returns:
        The decoded translation with special tokens removed.
    """
    encoded = tokenizer.encode(text, return_tensors='pt')
    outputs = model.generate(encoded, num_beams=5, num_return_sequences=1, early_stopping=True)
    # Exactly one sequence is requested (num_return_sequences=1), so decode it
    # directly instead of looping and keeping only the last result.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Back-translate every field of the filtered English triples. Each column is
# built straight from `extracted_triples`, so the result keeps its row index.
triple_columns = ('subject', 'predicate', 'object')
final = pd.DataFrame(
    {column: extracted_triples[column].apply(translate) for column in triple_columns}
)

# A field that translates to an empty string invalidates the whole triple.
final = final.replace('', np.nan).dropna(how='any')
final.to_csv('data/final_triples.csv')

In [None]:
# Display the triples after back-translation into Greek; rows with any empty
# or missing field have already been dropped.
final