In [3]:
import tensorflow as tf
tf.compat.v1.enable_eager_execution()

import numpy as np

In [4]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Sentinel words wrapped around every target sentence so that the beginning
# and end of a sequence appear as ordinary words in the target text.
start_marker = 'ssssss '
end_marker = ' eeeeee'

In [6]:
# Parallel lists: src_data[i] is a source sentence and tgt_data[i] is the
# matching target sentence wrapped in the start/end markers.
src_data = []
tgt_data = []

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Read the tab-separated parallel corpus: each line holds a source sentence and
# its target translation. The context manager closes the file handle even when
# a malformed line makes the two-value unpacking below raise.
with open('/content/drive/My Drive/Colab Notebooks/Research Techniques I/tur.txt', encoding='UTF8') as corpus_file:
  for line in corpus_file:
    src_line, tgt_line = line.rstrip().split('\t')
    src_data.append(src_line)
    # Targets carry the start/end markers.
    tgt_data.append(start_marker + tgt_line + end_marker)

In [9]:
src_data[100]

'I drove.'

In [10]:
tgt_data[100]

'ssssss Araba sürdüm. eeeeee'

In [11]:
len(src_data)

473035

In [12]:
class TokenizerWrapper(Tokenizer):
  """Tokenizer that also converts and pads the corpus it was fitted on.

  Attributes set by the constructor:
    index_word     -- mapping from integer token to word
    tokens         -- one integer sequence per input text
    token_sizes    -- length of every sequence in `tokens`
    padding_size   -- common length: round(mean + 2 * std) of `token_sizes`
    tokens_padded  -- `tokens` padded/truncated to `padding_size`
  """

  def __init__(self, texts, padding, reverse=False, num_words=None):
    """Fit the vocabulary on `texts` and build the padded token matrix.

    Args:
      texts: list of strings to fit on and convert.
      padding: padding mode forwarded to pad_sequences.
      reverse: when True, every token sequence is reversed and over-long
        sequences are truncated with 'pre' instead of 'post'.
      num_words: forwarded to the Tokenizer constructor.
    """
    super().__init__(num_words=num_words)
    self.fit_on_texts(texts)

    # Inverse of word_index: integer token -> word.
    self.index_word = {index: word for word, index in self.word_index.items()}

    sequences = self.texts_to_sequences(texts)
    if reverse:
      sequences = [seq[::-1] for seq in sequences]
    self.tokens = sequences
    truncating = 'pre' if reverse else 'post'

    # Sequence length used for padding: mean length plus two standard deviations.
    self.token_sizes = list(map(len, self.tokens))
    self.padding_size = int(np.round(np.mean(self.token_sizes) + 2 * np.std(self.token_sizes)))

    self.tokens_padded = pad_sequences(self.tokens,
                                       maxlen=self.padding_size,
                                       padding=padding,
                                       truncating=truncating)

  def token_to_word(self, token):
    """Return the word for `token`; token 0 maps to a single space."""
    if token == 0:
      return " "
    return self.index_word[token]

  def tokens_to_string(self, tokens):
    """Join the words of all non-zero `tokens` with single spaces."""
    return " ".join(self.index_word[t] for t in tokens if t != 0)

  def text_to_tokens(self, text, reverse=False, padding='post', truncating='post'):
    """Convert one sentence into a padded token array.

    Args:
      text: the sentence to convert.
      reverse: when True, flip the token order before padding.
      padding: padding mode forwarded to pad_sequences.
      truncating: truncation mode forwarded to pad_sequences.

    Returns:
      The result of pad_sequences for the single sentence, using
      `self.padding_size` as maxlen.
    """
    sequence = np.array(self.texts_to_sequences([text]))
    if reverse:
      sequence = np.flip(sequence, axis=1)
    return pad_sequences(sequence, maxlen=self.padding_size, padding=padding, truncating=truncating)

In [13]:
# Source side: sequences are reversed and padded with 'pre'.
# Target side: sequences keep their order and are padded with 'post'.
tokenizer_eng = TokenizerWrapper(texts = src_data, padding = 'pre', reverse = True, num_words=None)
tokenizer_tur = TokenizerWrapper(texts = tgt_data, padding = 'post', reverse = False, num_words=None)

In [14]:
tokens_eng = tokenizer_eng.tokens_padded
tokens_tur = tokenizer_tur.tokens_padded

In [15]:
print(tokens_eng.shape)
print(tokens_tur.shape)

(473035, 11)
(473035, 11)


In [16]:
tokens_tur[300000]

array([   1,    3,  212,   12,  214, 2351,    2,    0,    0,    0,    0],
      dtype=int32)

In [17]:
token_start = tokenizer_tur.word_index[start_marker.strip()]
token_start

1

In [18]:
token_end = tokenizer_tur.word_index[end_marker.strip()]
token_end

2

In [19]:
# The decoder input drops the last column and the decoder target drops the
# first one, so position t of the target holds the token that follows
# position t of the input.
encoder_input_data = tokens_eng
decoder_input_data = tokens_tur[:, :-1]
decoder_output_data = tokens_tur[:, 1:]

In [20]:
encoder_input_data[100000]

array([  0,   0,   0,   0,   0,   0, 206,  36,   3,  59, 109], dtype=int32)

In [21]:
number_encoder_words = len(tokenizer_eng.word_index)
number_decoder_words = len(tokenizer_tur.word_index)

In [22]:
number_encoder_words

21315

In [23]:
number_decoder_words

94058

In [24]:
# word2vec maps a word to its pretrained vector.
# vocab_size is one more than the number of source words; the tokenizer's word
# indices appear to start at 1, presumably leaving 0 for padding -- confirm.
word2vec = {}
vocab_size = number_encoder_words + 1

In [25]:
# Parse the GloVe text file: the first whitespace-separated field of every
# line is the word, the remaining fields are its vector components.
with open('/content/drive/My Drive/Colab Notebooks/Research Techniques I/glove.6B.100d.txt', encoding='UTF8') as glove_file:
  for row in glove_file:
    fields = row.split()
    word2vec[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [26]:
embedding_size = 100

In [27]:
# Start every row with uniform random values in [-1, 1); no random seed is set
# in this cell, so the values differ between runs.
embedding_vector = np.random.uniform(-1, 1, (vocab_size, embedding_size))

In [28]:
# Fill in the word vectors: rows whose word has a pretrained vector are
# overwritten, all other rows keep their initial values.
for word, i in tokenizer_eng.word_index.items():
    # embedding_vector has vocab_size rows, so valid indices are
    # 0 .. vocab_size - 1. The bound must be strict: `i <= vocab_size` would
    # let i == vocab_size through and raise an IndexError on assignment.
    if i < vocab_size:
        vec = word2vec.get(word)
        if vec is not None:
            embedding_vector[i] = vec

In [29]:
embedding_vector.shape

(21316, 100)

In [30]:
encoder_input = Input(shape=(None,), name='encoder_input')

In [31]:
# Encoder embedding initialised from embedding_vector; trainable=True lets
# training update these weights.
encoder_embedding = Embedding(input_dim=vocab_size,
                              output_dim=embedding_size,
                              trainable=True,
                              weights=[embedding_vector],
                              name='encoder_embedding')

In [32]:
state_size = 256

In [33]:
# Three encoder GRUs of width state_size. The first two return the full
# sequence; the last one has return_sequences=False.
encoder_gru1 = GRU(state_size, name='encoder_gru1', return_sequences=True)
encoder_gru2 = GRU(state_size, name='encoder_gru2', return_sequences=True)
encoder_gru3 = GRU(state_size, name='encoder_gru3', return_sequences=False)

In [34]:
def encoder_model_connector():
  """Chain encoder_input through the embedding and the three encoder GRUs.

  Returns:
    The output of encoder_gru3.
  """
  net = encoder_input
  for layer in (encoder_embedding, encoder_gru1, encoder_gru2, encoder_gru3):
    net = layer(net)
  return net

In [35]:
encoder_output = encoder_model_connector()

**Decoder layers**

In [36]:
decoder_initial_state = Input(shape=(state_size,), name='decoder_initial_state')

In [37]:
decoder_input = Input(shape=(None,), name='decoder_input')

In [38]:
# Tokenizer word indices run from 1 to number_decoder_words, so the embedding
# table needs number_decoder_words + 1 rows for the largest index to be a
# valid lookup (the encoder side sizes its table the same way).
decoder_embedding = Embedding(input_dim=number_decoder_words + 1,
                              output_dim=embedding_size,
                              name='decoder_embedding')

In [39]:
# Three decoder GRUs of width state_size; all of them return the full sequence.
decoder_gru1 = GRU(state_size, name='decoder_gru1', return_sequences=True)
decoder_gru2 = GRU(state_size, name='decoder_gru2', return_sequences=True)
decoder_gru3 = GRU(state_size, name='decoder_gru3', return_sequences=True)

In [40]:
# One linear output (logit) per possible token id. Ids run from 0 to
# number_decoder_words inclusive, and the loss uses the target ids as class
# indices, so number_decoder_words + 1 units are needed for the largest id to
# be a valid class.
decoder_dense = Dense(number_decoder_words + 1,
                      activation='linear',
                      name='decoder_output')

In [41]:
def connect_decoder(initial_state):
    """Wire decoder_input through the decoder layers.

    Args:
        initial_state: tensor passed as initial_state to each of the three
            decoder GRUs.

    Returns:
        The output of decoder_dense applied to the last GRU's output.
    """
    net = decoder_embedding(decoder_input)
    for gru in (decoder_gru1, decoder_gru2, decoder_gru3):
        net = gru(net, initial_state=initial_state)
    return decoder_dense(net)

In [42]:
# Training model: the encoder output is used as the initial state of every
# decoder GRU, so encoder and decoder are connected end to end.
decoder_output = connect_decoder(initial_state=encoder_output)

model_train = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_output])

In [43]:
model_encoder = Model(inputs=[encoder_input], outputs=[encoder_output])

In [44]:
# Stand-alone decoder: the same decoder layers, but the initial state comes
# from the decoder_initial_state input instead of the encoder.
# Note: this rebinds the module-level name decoder_output.
decoder_output = connect_decoder(initial_state=decoder_initial_state)

model_decoder = Model(inputs=[decoder_input, decoder_initial_state], outputs=[decoder_output])

In [45]:
def sparse_cross_entropy(y_true, y_pred):
    """Mean sparse softmax cross-entropy between target ids and logits.

    Args:
        y_true: target class ids.
        y_pred: unnormalised scores (logits) passed to the TF op as `logits`.

    Returns:
        Scalar tensor: the mean of the element-wise loss.
    """
    # The TF op only accepts integer labels; cast so the loss keeps working if
    # the targets are handed over as floats.
    labels = tf.cast(y_true, tf.int64)
    per_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=y_pred)
    return tf.reduce_mean(per_token_loss)

In [46]:
from tensorflow.keras.optimizers import RMSprop

optimizer = RMSprop(learning_rate=1e-3)

In [47]:
# NOTE(review): decoder_target is not referenced by any other visible cell;
# it looks like a leftover -- confirm before removing.
decoder_target = tf.keras.Input(shape=(None,), dtype='int32', name='decoder_target')

In [48]:
model_train.compile(optimizer=optimizer,
                    loss=sparse_cross_entropy)

In [49]:
from tensorflow.keras.callbacks import ModelCheckpoint
path_checkpoint = 'checkpoint.weights.h5'
checkpoint = ModelCheckpoint(filepath=path_checkpoint, save_weights_only=True)

In [50]:
# Try to load weights from path_checkpoint. Any failure (for example a missing
# file) is printed and execution continues with the model's current weights.
try:
    model_train.load_weights(path_checkpoint)
except Exception as error:
    print('Error, failed to load checkpoint')
    print(error)

Error, failed to load checkpoint
[Errno 2] Unable to synchronously open file (unable to open file: name = 'checkpoint.weights.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


In [51]:
# Keys match the names given to the two Input layers of model_train.
x_data = {'encoder_input': encoder_input_data, 'decoder_input': decoder_input_data}

In [52]:
# Key matches the name given to the decoder's Dense output layer.
y_data = {'decoder_output': decoder_output_data}

In [72]:
# Trial run on a small dataset - slice the dictionary structures
# x_train_subset = {key: value[:10000] for key, value in x_data.items()}
# y_train_subset = {key: value[:10000] for key, value in y_data.items()}


In [73]:
# Train on the full dataset; the checkpoint callback is passed in so weights
# are written to path_checkpoint during training.
model_train.fit(x=x_data,
                y=y_data,
                batch_size=256,
                epochs=5,
                callbacks=[checkpoint])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7eb8d162afe0>

In [None]:
# Save the model to Drive
model_train.save("/content/drive/My Drive/Colab Notebooks/Research Techniques I/model_complete.keras")

#model_train = load_model("/content/drive/My Drive/Colab Notebooks/Research Techniques I/model_complete.keras", custom_objects={'sparse_cross_entropy': sparse_cross_entropy})
