In [2]:
import pandas as pd
import numpy as np

In [3]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# Sentinel words placed around every destination sentence. The embedded
# spaces keep them apart from the neighbouring words after concatenation.
mark_start, mark_end = "ssss ", " eeee"

In [5]:
# Parallel lists: data_src[i] and data_dest[i] hold the two sides of pair i.
data_src, data_dest = [], []

In [6]:
# Read the parallel corpus: each line is expected to hold exactly two
# tab-separated fields (source text, then destination text).
# The `with` block guarantees the file handle is closed, even if a malformed
# line makes the unpacking below raise ValueError.
with open("tur.txt", encoding="utf-8") as corpus_file:
    for line in corpus_file:
        en_text, tr_text = line.rstrip().split("\t")

        # Wrap the destination sentence in the start/end marker words.
        tr_text = mark_start + tr_text + mark_end
        data_src.append(en_text)
        data_dest.append(tr_text)

In [7]:
# Number of source sentences loaded from the corpus file.
len(data_src)

473035

In [8]:
# Number of destination sentences; should equal len(data_src) since both
# lists are appended to once per corpus line.
len(data_dest)

473035

In [25]:
class TokenizerWrap(Tokenizer):
    """Keras ``Tokenizer`` that also keeps the tokenized, padded corpus.

    On construction the tokenizer is fitted on ``texts``, the texts are
    converted to integer sequences, optionally reversed, and padded or
    truncated to a common length.

    Attributes:
        index_to_word: dict mapping integer token -> word (inverse of
            ``word_index``).
        tokens: list of integer sequences, one per input text (reversed when
            ``reverse=True``).
        num_tokens: list with the length of every sequence in ``tokens``.
        max_tokens: sequence length used for padding/truncation, computed as
            ``int(mean + 2 * std)`` of ``num_tokens``.
        tokens_padded: 2-D integer array returned by ``pad_sequences`` with
            ``max_tokens`` columns.
    """

    def __init__(self, texts, padding, reverse=False, num_words=None):
        """Fit the tokenizer on ``texts`` and pre-compute padded sequences.

        Args:
            texts: list of strings to fit on and tokenize.
            padding: ``"pre"`` or ``"post"``; forwarded to ``pad_sequences``.
            reverse: if true, reverse the token order of every sequence.
            num_words: vocabulary size limit forwarded to ``Tokenizer``
                (``None`` keeps all words).
        """
        super().__init__(num_words=num_words)

        self.fit_on_texts(texts)

        # Inverse lookup so integer tokens can be decoded back into words.
        self.index_to_word = {index: word for word, index in self.word_index.items()}

        self.tokens = self.texts_to_sequences(texts)

        # A reversed sequence has the start of the original sentence at its
        # end, so truncating at the front ("pre") drops the tail of the
        # original sentence, the same words "post" drops when not reversed.
        if reverse:
            self.tokens = [list(reversed(x)) for x in self.tokens]
            truncating = "pre"
        else:
            truncating = "post"

        self.num_tokens = [len(x) for x in self.tokens]
        # Cap the length at mean + 2 standard deviations (truncated to int)
        # rather than at the longest sequence.
        self.max_tokens = int(np.mean(self.num_tokens) + 2 * np.std(self.num_tokens))

        self.tokens_padded = pad_sequences(
            self.tokens,
            maxlen=self.max_tokens,
            padding=padding,
            truncating=truncating,
        )

    def token_to_word(self, token):
        """Return the word for an integer token.

        Token 0 (the padding value) maps to a single space. Any other token
        missing from ``index_to_word`` raises ``KeyError``.
        """
        word = " " if token == 0 else self.index_to_word[token]
        return word

    def tokens_to_string(self, tokens):
        """Decode a sequence of integer tokens into a space-joined string.

        Padding tokens (0) are skipped. A non-zero token missing from
        ``index_to_word`` raises ``KeyError``.
        """
        words = [self.index_to_word[token] for token in tokens if token != 0]
        text = " ".join(words)
        return text

    def text_to_tokens(self, text, padding, reverse=False):
        """Tokenize a single string the same way the corpus was tokenized.

        Args:
            text: the string to convert.
            padding: ``"pre"`` or ``"post"``; forwarded to ``pad_sequences``.
            reverse: if true, reverse the token order before padding.

        Returns:
            Array with one row and ``max_tokens`` columns, as produced by
            ``pad_sequences``.
        """
        # Fix: the Keras method is `texts_to_sequences` (plural) and takes a
        # list of texts; the previous `text_to_sequences` does not exist and
        # raised AttributeError on every call.
        tokens = self.texts_to_sequences([text])
        tokens = np.array(tokens)

        if reverse:
            # Flip along the token axis (axis 1); axis 0 is the single text.
            tokens = np.flip(tokens, axis=1)
            truncating = "pre"
        else:
            truncating = "post"

        tokens = pad_sequences(
            tokens,
            maxlen=self.max_tokens,
            padding=padding,
            truncating=truncating,
        )

        return tokens
        

In [26]:
# Source-side tokenizer: sequences are reversed and zero-padded at the front.
tokenizer_src = TokenizerWrap(
    texts=data_src,
    padding="pre",
    reverse=True,
    num_words=None,
)

In [27]:
# Destination-side tokenizer: original word order, zero-padded at the end.
tokenizer_dest = TokenizerWrap(
    texts=data_dest,
    padding="post",
    reverse=False,
    num_words=None,
)

In [29]:
# Padded integer matrices produced by the two tokenizers.
tokens_src, tokens_dest = tokenizer_src.tokens_padded, tokenizer_dest.tokens_padded
# One shape per line: (number of sentences, padded sequence length).
print(tokens_src.shape, tokens_dest.shape, sep="\n")

(473035, 11)
(473035, 10)


In [31]:
# Spot-check one padded destination row (index 200000 is an arbitrary sample).
tokens_dest[200000]

array([   1, 2391,    4,   18, 4127,   48,    2,    0,    0,    0])

In [34]:
# Decode the same destination row back to text; padding zeros are skipped.
tokenizer_dest.tokens_to_string(tokens_dest[200000])

'ssss eksik bir şey görebiliyor musun eeee'

In [36]:
# Spot-check the matching padded source row (same sample index).
tokens_src[200000]

array([   0,    0,    0,    0,    0,    0, 1028,  113,   95,    5,   39])

In [38]:
# Decode the source row; the source tokenizer was built with reverse=True,
# so the words come back in reversed order.
tokenizer_src.tokens_to_string(tokens_src[200000])

'missing anything see you can'

In [40]:
# Integer token the destination tokenizer assigned to the start marker.
# strip() removes the marker's trailing space so it matches the vocabulary key.
token_start = tokenizer_dest.word_index[mark_start.strip()]
token_start

1

In [41]:
# Integer token the destination tokenizer assigned to the end marker.
# strip() removes the marker's leading space so it matches the vocabulary key.
token_end = tokenizer_dest.word_index[mark_end.strip()]
token_end

2