In [1]:
#Script to load trained model and perform translations

import itertools

import torch
import torch.optim as optim
from allennlp.data.dataset_readers.seq2seq import Seq2SeqDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.nn.activations import Activation
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
from allennlp.modules.attention import DotProductAttention
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper, StackedSelfAttentionEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.predictors import SimpleSeq2SeqPredictor
from allennlp.training.trainer import Trainer

EN_EMBEDDING_DIM = 256
ZH_EMBEDDING_DIM = 256
HIDDEN_DIM = 256


CUDA_DEVICE = 0

#Loading the reader, vocab, embeddings and model structure 
reader = Seq2SeqDatasetReader(
    source_tokenizer=WordTokenizer(),
    target_tokenizer=CharacterTokenizer(),
    source_token_indexers={'tokens': SingleIdTokenIndexer()},
    target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')},
    lazy = True)

vocab = Vocabulary.from_files("/home/infili/translation/Translation/trained/20200202/paracrawl_vocabulary_20200202")


en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                         embedding_dim=EN_EMBEDDING_DIM)

encoder = StackedSelfAttentionEncoder(input_dim=EN_EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, projection_dim=128, feedforward_hidden_dim=128, num_layers=1, num_attention_heads=8)

source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

attention = DotProductAttention()

max_decoding_steps = 300  
model_pred = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                      target_embedding_dim=ZH_EMBEDDING_DIM,
                      target_namespace='target_tokens',
                      attention=attention,
                      beam_size=8,
                      use_bleu=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_pred.to(device) # without this there is no error, but it runs in CPU (instead of GPU). 


# Reload the trained model.
with open("/home/infili/translation/Translation/trained/20200202/paracrawl_model_20200202.th", 'rb') as f:
    model_pred.load_state_dict(torch.load(f))
    model_pred.eval()
    

In [None]:
#Predict on new text using loaded model

predictor = SimpleSeq2SeqPredictor(model_pred, dataset_reader=reader)


#To translate, write comma-separated sentences in quotes:

test = [
    "European Union law is a body of treaties and legislation, such as Regulations and Directives, which have direct effect or indirect effect on the laws of European Union member states. ",
    "What is European Union Law?",
    "a body of treaties and legislation",
    "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
    "Computational complexity theory",
    "Consciousness at its simplest form is sentience or awareness of internal or external existence",
    "She studied medicine but also became an expert of legal matters, by working as lawyer assistant when she was young .",
    "He was born in 1979 in Prague and was a member of the royal family.",
    "How are you feeling today?",
    "As far as I know, the economic crisis doesn't allow for military expenses.",
    "Yes I will do it",
    "I am thirsty for knowledge.",
    "Thousands reports from all over the world have confirmed the incident.", 
    "Three beers and a steak please.",
    "At least let me pay the bill this time!",
    "This product meets the requirements set by the clients.",
    "I do not intend to leave the city anytime soon.",
    "The socioeconomic problems have hurt the local industry the most.",
    "The food is really tasty, but the environment needs improvement.",
    "I would definitely recommend this place for vacation.",
    "According to Christopher Tolkien, it is no longer possible to trace the exact date of the work's composition. "
]

#Parsing the output to remove quotes and irrelevant characters

import re
regex = r"', '"
regex2 = r", \"'\", '"
subst = ""
subst2 = ""

for i in test:
    p = predictor.predict(i)['predicted_tokens']
    result = re.sub(regex, subst, str(p), 0, re.MULTILINE | re.IGNORECASE)
    result = re.sub(regex2, subst2, str(result), 0, re.MULTILINE | re.IGNORECASE)

    print(i,result) 

In [2]:
#Predict on SQUAD

import pandas as pd

predictor = SimpleSeq2SeqPredictor(model_pred, dataset_reader=reader)

squad = pd.read_csv('/home/infili/translation/Translation/SQUAD/squad_train_1.1.csv', sep='\t') 
squad = squad.drop(squad.index[75207])


print(squad.head())
print(squad.shape)
from nltk.tokenize import sent_tokenize
#print(squad.iloc[0,1])



                                             context  \
0  Architecturally, the school has a Catholic cha...   
1  Architecturally, the school has a Catholic cha...   
2  Architecturally, the school has a Catholic cha...   
3  Architecturally, the school has a Catholic cha...   
4  Architecturally, the school has a Catholic cha...   

                                            question  \
0  To whom did the Virgin Mary allegedly appear i...   
1  What is in front of the Notre Dame Main Building?   
2  The Basilica of the Sacred heart at Notre Dame...   
3                  What is the Grotto at Notre Dame?   
4  What sits on top of the Main Building at Notre...   

                                    answer  
0               Saint Bernadette Soubirous  
1                a copper statue of Christ  
2                        the Main Building  
3  a Marian place of prayer and reflection  
4       a golden statue of the Virgin Mary  
(87598, 3)


In [3]:
# Create the pandas DataFrame of the translated SQuAD
squad_el = pd.DataFrame(columns = [ 'context', 'question', 'answer']) 

text_el = []
questions_el = []
answers_el = []

from tqdm import tqdm

for i in tqdm(range(0,len(squad))):
    text = sent_tokenize(squad.iloc[i,0])
    question = squad.iloc[i,1]
    answer = squad.iloc[i,2]

#Parsing the output to remove quotes and irrelevant characters

    import re
    regex = r"', '"
    regex2 = r", \"'\", '"
    subst = ""
    subst2 = ""
    translated1 = []

# Iterative translation of the original SQuAD:
    for j in text:
        p = predictor.predict(j)['predicted_tokens']
        result1 = re.sub(regex, subst, str(p), 0, re.MULTILINE | re.IGNORECASE)
        result1 = re.sub(regex2, subst2, str(result1), 0, re.MULTILINE | re.IGNORECASE)
        translated1.append(result1)
        
    text_el.append(' '.join(translated1))
    
    p2 = predictor.predict(question)['predicted_tokens']
    result2 = re.sub(regex, subst, str(p2), 0, re.MULTILINE | re.IGNORECASE)
    questions_el.append(re.sub(regex2, subst2, str(result2), 0, re.MULTILINE | re.IGNORECASE))
        
    p3 = predictor.predict(answer)['predicted_tokens']
    result3 = re.sub(regex, subst, str(p3), 0, re.MULTILINE | re.IGNORECASE)
    answers_el.append(re.sub(regex2, subst2, str(result3), 0, re.MULTILINE | re.IGNORECASE))
        
    
squad_el['context'] = text_el
squad_el['question'] = questions_el
squad_el['answer'] = answers_el
squad_el.head(10)

#Save to CSV
squad_el.to_csv('~/translation/Translation/SQUAD/squad_train_1.1_el_XXX.csv')

100%|██████████| 87598/87598 [14:55:47<00:00,  1.63it/s]   
