## Test-time preprocessing code adapted from https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

test process and train

In [2]:
import unicodedata
import string
import re
import random
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

In [3]:
from Model import Transformer

from torch import  optim
import numpy as np
import copy
import argparse
from Mask import create_masks


# Pick the first CUDA GPU when one is available, otherwise fall back to the CPU,
# and print the choice so it is visible in the notebook output.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [4]:
# Global switch read by variable_from_sentence: when True, the tensors it
# builds are moved to the GPU with .cuda().
USE_CUDA = False

In [5]:
# Reserved vocabulary ids shared by every Lang instance.
PAD_token = 0
SOS_token = 1
EOS_token = 2


class Lang:
    """Vocabulary for one language.

    Keeps three lookup tables that are filled incrementally:
    ``word2index`` (word -> id), ``index2word`` (id -> word) and
    ``word2count`` (word -> number of times it was indexed). Ids 0-2 are
    reserved for the PAD/SOS/EOS markers, so the first real word gets id 3.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        # The three reserved markers (PAD, SOS, EOS) are already counted.
        self.n_words = 3

    def index_words(self, sentence):
        """Register every token of ``sentence``.

        Tokens are obtained with ``split(' ')``, so consecutive spaces yield
        empty-string tokens, which are indexed like any other word.
        """
        for token in sentence.split(' '):
            self.index_word(token)

    def index_word(self, word):
        """Assign the next free id to a new word, or bump its count if known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_id = self.n_words
        self.word2index[word] = new_id
        self.word2count[word] = 1
        self.index2word[new_id] = word
        self.n_words = new_id + 1

In [6]:

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    """Return ``s`` with combining marks removed.

    The string is decomposed with NFD normalization and every character in
    Unicode category ``Mn`` (non-spacing mark) is dropped, so accented letters
    collapse to their base letter. Characters without a decomposition are
    kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    """Lowercase ``s``, split off punctuation and squash everything else.

    Steps, in order:
      1. lowercase the text (no stripping and no accent removal is done);
      2. put a space in front of each ``. ! ? , '`` so they become tokens;
      3. replace every run of characters other than ASCII letters, German
         umlauts / sharp s and that punctuation with a single space.
    """
    lowered = s.lower()
    punctuation_spaced = re.sub(r"([.!?,'])", r" \1", lowered)
    return re.sub(r"[^a-zA-Z.!?,ÄäÖöÜüẞß']+", r" ", punctuation_spaced)

In [7]:
def read_langs(lang1, lang2, reverse=False,
               source_path='data/train.de', target_path='data/train.en'):
    """Read a line-aligned parallel corpus and return normalized sentence pairs.

    Args:
        lang1: Name given to the Lang built for the ``source_path`` side.
        lang2: Name given to the Lang built for the ``target_path`` side.
        reverse: When True, each pair is flipped and the two Lang names are
            swapped, so the ``target_path`` language becomes the input.
        source_path: UTF-8 text file with one sentence per line
            (defaults to the training German file).
        target_path: UTF-8 text file aligned line-by-line with
            ``source_path`` (defaults to the training English file).

    Returns:
        ``(input_lang, output_lang, pairs)`` where both Lang objects are still
        empty (no words indexed yet) and ``pairs`` is a list of
        ``[input_sentence, output_sentence]`` lists.

    Raises:
        IndexError: if ``target_path`` has fewer lines than ``source_path``.
    """
    print("Reading lines...")

    # Context managers guarantee the handles are closed even if reading fails.
    with open(source_path, encoding='utf-8') as source_file:
        line1 = source_file.read().strip().split('\n')
    with open(target_path, encoding='utf-8') as target_file:
        line2 = target_file.read().strip().split('\n')

    # The source file drives the iteration; extra target lines are ignored.
    pairs = [
        [normalize_string(source_line), normalize_string(line2[i])]
        for i, source_line in enumerate(line1)
    ]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [8]:
def prepare_data(lang1_name, lang2_name, reverse=False):
    """Load the training pairs via read_langs and index their words.

    Returns ``(input_lang, output_lang, pairs)``; element 0 of every pair has
    been indexed into ``input_lang`` and element 1 into ``output_lang``.
    """
    source_vocab, target_vocab, sentence_pairs = read_langs(
        lang1_name, lang2_name, reverse
    )
    print("Read %s sentence pairs" % len(sentence_pairs))

    print("Indexing words...")
    for sentence_pair in sentence_pairs:
        source_vocab.index_words(sentence_pair[0])
        target_vocab.index_words(sentence_pair[1])

    return source_vocab, target_vocab, sentence_pairs

# Build the training vocabularies: 'ger' names the input Lang and 'en' the
# output Lang; `pairs` holds the normalized [input, output] sentence pairs.
input_lang, output_lang, pairs = prepare_data('ger', 'en')

# Print an example pair
print(random.choice(pairs))


Reading lines...
Read 196884 sentence pairs
Indexing words...
['dieser prozess fühlt sich toll und befriedigend an .', 'now that process feels great . it feels really satisfying .']


In [9]:
# Show one fixed training pair as [input sentence, output sentence].
print(pairs[13])

['das meiste ist unerforscht , und doch gibt es schönheiten wie diese , die uns fesseln und uns vertrauter mit ihm machen .', "it 's mostly unexplored , and yet there are beautiful sights like this that captivate us and make us become familiar with it ."]


In [10]:
# Return a list of indexes, one for each word in the sentence
def indexes_from_sentence(lang, sentence):
    """Map each space-separated token of ``sentence`` to its id in ``lang``.

    Tokens come from ``split(' ')``. A token missing from
    ``lang.word2index`` raises ``KeyError``.
    """
    lookup = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lookup[token])
    return indexes

def variable_from_sentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of word ids ending in EOS.

    Args:
        lang: Vocabulary used to look the words up.
        sentence: Space-separated sentence; every word must be known to
            ``lang`` (unknown words raise ``KeyError`` in the lookup).

    Returns:
        A ``torch.long`` tensor of shape ``(n_tokens + 1, 1)``; the extra row
        is ``EOS_token``. It is moved to the GPU when ``USE_CUDA`` is true.
    """
    indexes = indexes_from_sentence(lang, sentence)
    indexes.append(EOS_token)
    # torch.autograd.Variable is deprecated and merely returns the tensor on
    # PyTorch >= 0.4, so the tensor is built directly instead.
    var = torch.tensor(indexes, dtype=torch.long).view(-1, 1)
    if USE_CUDA:
        var = var.cuda()
    return var

def variables_from_pair(pair, input_lang, output_lang):
    """Encode both sides of ``pair`` with variable_from_sentence.

    ``pair[0]`` is encoded with ``input_lang`` and ``pair[1]`` with
    ``output_lang``; the two results are returned as a tuple in that order.
    """
    return (
        variable_from_sentence(input_lang, pair[0]),
        variable_from_sentence(output_lang, pair[1]),
    )

In [11]:
def find_max_len(pair):
    """Return the largest token count of any sentence in a list of pairs.

    ``pair`` is an iterable of sentence groups (e.g. ``[source, target]``
    lists). Tokens are counted with ``str.split()``, i.e. on any whitespace,
    which is not the same tokenization as the ``split(' ')`` used when
    indexing words. Returns 0 when there are no sentences.
    """
    token_counts = (
        len(sentence.split())
        for group in pair
        for sentence in group
    )
    return max(token_counts, default=0)

def find_len(element):
    """Return the token count of the longest sentence in ``element``.

    ``element`` is an iterable of sentences (for instance one
    ``[source, target]`` pair). Tokens are counted with ``str.split()``;
    an empty iterable gives 0.
    """
    return max((len(sentence.split()) for sentence in element), default=0)

In [12]:
def paddingSOS(vector, max_len):
    """Return a new list: SOS, then ``vector``, right-padded with PAD.

    Padding is added until the list has ``max_len`` entries. A list that is
    already that long or longer is returned unpadded (never truncated).
    The caller's ``vector`` is not modified.
    """
    padded = [SOS_token] + vector
    shortfall = max_len - len(padded)
    if shortfall > 0:
        padded.extend([PAD_token] * shortfall)
    return padded

In [13]:
def paddingEOS(vector, max_len):
    """Return a new list: ``vector``, then EOS, right-padded with PAD.

    Padding is added until the list has ``max_len`` entries. A list that is
    already that long or longer is returned unpadded (never truncated).
    The caller's ``vector`` is not modified.
    """
    padded = vector + [EOS_token]
    shortfall = max_len - len(padded)
    if shortfall > 0:
        padded.extend([PAD_token] * shortfall)
    return padded

In [14]:
def padding_both(vector, max_len):
    """Return a new list: SOS, ``vector``, EOS, right-padded with PAD.

    Padding is added until the list has ``max_len`` entries. A list that is
    already that long or longer is returned unpadded (never truncated).
    The caller's ``vector`` is not modified.
    """
    padded = [SOS_token] + vector + [EOS_token]
    shortfall = max_len - len(padded)
    if shortfall > 0:
        padded.extend([PAD_token] * shortfall)
    return padded

In [15]:
def padding(vector, max_len):
    """Return a copy of ``vector`` right-padded with PAD up to ``max_len``.

    No SOS or EOS marker is added. A sequence that already has ``max_len``
    or more entries is copied unchanged (never truncated).

    The input is left untouched: the result is always a new list, matching
    the other padding helpers, so a caller that reuses its id list does not
    see padding appear in it.

    Args:
        vector: Sequence of word ids.
        max_len: Desired minimum length of the result.

    Returns:
        A new list of at least ``max_len`` ids.
    """
    # Multiplying a list by a non-positive count yields [], so no padding is
    # added when the sequence is already long enough.
    return list(vector) + [PAD_token] * (max_len - len(vector))

In [16]:

# Sanity check: ids 0, 1 and 2 of the input vocabulary should be the reserved
# PAD / SOS / EOS entries.
print('input_lang 0: ', input_lang.index2word[0])
print('input_lang 1: ', input_lang.index2word[1])
print('input_lang 2: ', input_lang.index2word[2])

input_lang 0:  PAD
input_lang 1:  SOS
input_lang 2:  EOS


In [17]:
# Longest training sentence (in whitespace-separated tokens) plus 2 —
# presumably head-room for the SOS and EOS markers.
max_len = find_max_len(pairs)+2
print('max_len: ', max_len)

max_len:  623


In [18]:
# Confirm which language ended up on the input side and which on the output side.
print('input: ', input_lang.name)
print('output: ', output_lang.name)

input:  ger
output:  en


## `data` stores the sentence pairs converted to indexes

In [19]:
# print('convert sentence to indexes......')
# data = pair_to_indexes(pairs, max_len, input_lang, output_lang)

# print('all sentence convert finish!')

In [20]:
from Text import indexes_from_sentence
from Text import padding_both

from Mask import create_masks


In [21]:
def pair_to_indexes(pairs, max_len, input_lang, output_lang):
    """Convert sentence pairs into two padded id matrices.

    Args:
        pairs: Sequence of ``[input_sentence, output_sentence]`` items.
        max_len: Row width of the returned matrices. A padded sentence longer
            than this cannot be assigned to its row and makes NumPy raise.
        input_lang: Vocabulary for element 0 of each pair.
        output_lang: Vocabulary for element 1 of each pair.

    Returns:
        ``(source, target)``: two float NumPy arrays of shape
        ``(len(pairs), max_len)``. Target rows are built with
        ``padding_both`` and source rows with ``padding``.
    """
    n_pairs = len(pairs)
    source = np.zeros((n_pairs, max_len))
    target = np.zeros((n_pairs, max_len))

    for row, pair in enumerate(pairs):
        # Output side: padding_both wraps the ids before padding.
        target_ids = indexes_from_sentence(output_lang, pair[1])
        target[row] = torch.Tensor(padding_both(target_ids, max_len))

        # Input side: ids are only right-padded by padding().
        source_ids = indexes_from_sentence(input_lang, pair[0])
        source[row] = torch.Tensor(padding(source_ids, max_len))

    return source, target

## TODO: 1. add a BLEU score for each epoch 2. change `data` to `pairs`

## restore model from .pt file

## translate

In [22]:
#print(data_1[0])
# Hyperparameters handed to the Transformer constructor. They have to match
# the configuration the checkpoint was trained with, otherwise
# load_state_dict rejects the weights.
dim_model = 128
H = 8  # presumably the number of attention heads — confirm in Model.py
N = 6  # presumably the number of encoder/decoder layers — confirm in Model.py
# Vocabulary sizes come from the training-set Lang objects.
src_vocab = input_lang.n_words
trg_vocab = output_lang.n_words


model = Transformer(src_vocab, trg_vocab, dim_model, N, H)
# NOTE: torch.load unpickles the file, so only load checkpoints from a
# trusted source. map_location='cpu' lets a GPU-trained checkpoint load on a
# CPU-only machine.
model.load_state_dict(torch.load('models/mytraining59.pt', map_location='cpu'))
# The model is used for inference only from here on: switch off
# training-time behaviour such as dropout so translations are deterministic.
model.eval()


In [23]:
def traslante_sentence(curr_sent, max_len_1, input_lang, output_lang):
    """Greedily translate one sentence pair with the module-level ``model``.

    The (misspelled) name is kept so existing callers continue to work.

    Args:
        curr_sent: A list holding exactly one ``[input_sentence,
            reference_sentence]`` pair. The reference is only encoded by
            pair_to_indexes and otherwise ignored, but all of its words must
            still be present in ``output_lang``.
        max_len_1: Fixed sequence length of the encoder/decoder inputs.
        input_lang: Vocabulary of the input language.
        output_lang: Vocabulary of the output language.

    Returns:
        The translated words, each followed by a single space (so a
        non-empty result ends with a trailing space). The result is empty
        when the first prediction is PAD/EOS or when ``max_len_1 <= 2``.

    Notes:
        * Decoding stops as soon as PAD or EOS is predicted instead of always
          running ``max_len_1 - 2`` forward passes. The returned words are the
          tokens that were fed back to the decoder; this matches re-reading
          the predictions of the last pass as long as the decoder is causal
          (create_masks presumably builds a look-ahead mask — TODO confirm).
        * Forward passes run under ``torch.no_grad()`` because no gradients
          are needed for inference.
    """
    source, _ = pair_to_indexes(curr_sent, max_len_1, input_lang, output_lang)
    sou = torch.from_numpy(source)

    # Decoder input: SOS in slot 0, every other slot still PAD (zero).
    target_fake = np.zeros((1, max_len_1))
    target_fake[0][0] = SOS_token
    # from_numpy shares memory with target_fake, so ids written into the
    # array below are visible through `tar` on the next forward pass.
    tar = torch.from_numpy(target_fake)

    words = []
    with torch.no_grad():
        for i in range(max_len_1 - 2):
            source_mask, target_mask = create_masks(sou, tar)
            preds = model(sou, tar, source_mask, target_mask)

            # Drop the last position and flatten to (positions, vocab).
            preds = preds[:, :-1, :].contiguous().view(-1, model.target_vocab)
            best_ids = torch.max(torch.softmax(preds, dim=-1), dim=-1)[1]
            next_index = best_ids[i].item()

            # PAD or EOS ends the sentence; nothing after it is ever emitted.
            if next_index in (PAD_token, EOS_token):
                break

            words.append(output_lang.index2word[next_index])
            target_fake[0][i + 1] = next_index

    return ''.join(word + ' ' for word in words)



In [24]:
# data_1 = [element for element in pairs if find_len(element) < 100]
# data_1 = data_1[:20]
# max_len_1 = find_max_len(data_1) + 2
# print(max_len_1)
# data_curr = data_1[9:10]
# print(data_curr)
# traslante_sentence(data_curr, max_len_1, input_lang, output_lang)

In [25]:
# Fixed sequence length handed to traslante_sentence when translating the
# dev sentences.
max_len_1 = 100

In [26]:
def read_langs_test(lang1, lang2, reverse=False,
                    source_path='data/dev.de', target_path='data/dev.en'):
    """Read the line-aligned dev corpus and return normalized sentence pairs.

    Args:
        lang1: Name given to the Lang built for the ``source_path`` side.
        lang2: Name given to the Lang built for the ``target_path`` side.
        reverse: When True, each pair is flipped and the two Lang names are
            swapped, so the ``target_path`` language becomes the input.
        source_path: UTF-8 text file with one sentence per line
            (defaults to the dev German file).
        target_path: UTF-8 text file aligned line-by-line with
            ``source_path`` (defaults to the dev English file).

    Returns:
        ``(input_lang, output_lang, pairs)`` where both Lang objects are still
        empty (no words indexed yet) and ``pairs`` is a list of
        ``[input_sentence, output_sentence]`` lists.

    Raises:
        IndexError: if ``target_path`` has fewer lines than ``source_path``.
    """
    print("Reading lines...")

    # Context managers guarantee the handles are closed even if reading fails.
    with open(source_path, encoding='utf-8') as source_file:
        line1 = source_file.read().strip().split('\n')
    with open(target_path, encoding='utf-8') as target_file:
        line2 = target_file.read().strip().split('\n')

    # The source file drives the iteration; extra target lines are ignored.
    pairs = [
        [normalize_string(source_line), normalize_string(line2[i])]
        for i, source_line in enumerate(line1)
    ]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [27]:
def prepare_data(lang1_name, lang2_name, reverse=False):
    """Load the dev pairs via read_langs_test and index their words.

    NOTE: this definition replaces the earlier ``prepare_data`` in this file,
    which read the training files; from this point on the name refers to the
    dev-set loader.

    Returns ``(input_lang, output_lang, pairs)``; element 0 of every pair has
    been indexed into ``input_lang`` and element 1 into ``output_lang``.
    """
    source_vocab, target_vocab, sentence_pairs = read_langs_test(
        lang1_name, lang2_name, reverse
    )
    print("Read %s sentence pairs" % len(sentence_pairs))

    print("Indexing words...")
    for sentence_pair in sentence_pairs:
        source_vocab.index_words(sentence_pair[0])
        target_vocab.index_words(sentence_pair[1])

    return source_vocab, target_vocab, sentence_pairs

# Load the dev set with the redefined prepare_data. The *_test vocabularies
# are built from the dev files only and are separate from the training
# vocabularies input_lang / output_lang.
input_lang_test, output_lang_test, pairs_test = prepare_data('ger', 'en')

# Print an example pair
print(random.choice(pairs_test))


Reading lines...
Read 7883 sentence pairs
Indexing words...
['weil all diese neuen technologien von heute sich von sich selber ernähren .', 'because all of these technologies are feeding back on themselves .']


In [28]:
# Longest dev sentence (in whitespace-separated tokens) plus 2 — presumably
# head-room for the SOS and EOS markers.
max_len_test = find_max_len(pairs_test)+2
print('max_len test: ', max_len_test)

max_len test:  169


In [29]:
import time

# Keep only the dev pairs in which both sentences have fewer than 95
# whitespace-separated tokens.
data_test = [element for element in pairs_test if find_len(element) < 95]



In [30]:
# Keep only the dev pairs whose words are all known to the training
# vocabularies: element 0 is checked against input_lang and element 1 against
# output_lang, so every kept pair can be converted to ids without a KeyError.
data_test_processed = [
    candidate
    for candidate in data_test
    if all(token in input_lang.word2index for token in candidate[0].split(' '))
    and all(token in output_lang.word2index for token in candidate[1].split(' '))
]




In [31]:
# Number of dev pairs that survived the length and vocabulary filters.
print(len(data_test_processed))

5589


In [32]:
# Translate up to the first 200 filtered dev sentences and write two aligned
# files: the reference translations and the model output, one sentence per line.
num_sentences = min(200, len(data_test_processed))

# The `with` block closes (and flushes) both files even when the slow
# translation loop is interrupted, so the lines written so far are kept.
# UTF-8 is set explicitly to match the encoding used to read the corpora.
with open('data/test2/ref.en', 'w', encoding='utf-8') as file_ref, \
        open('data/test2/output.en', 'w', encoding='utf-8') as file_out:
    for i in range(num_sentences):
        # traslante_sentence expects a one-element list of pairs.
        curr_sent = data_test_processed[i:i+1]
        file_ref.write(curr_sent[0][1]+'\n')
        translated = traslante_sentence(curr_sent, max_len_1, input_lang, output_lang)
        file_out.write(translated+'\n')
        if i % 50 == 0:
            print('number of sentence is ready: ', i)



number of sentence is ready:  0


KeyboardInterrupt: 