In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import unicodedata
import re
import numpy as np
import os
import io
import time

In [16]:
# Fetch the Spanish-English sentence-pair archive (Keras caches it locally)
# and have Keras unpack it.
# NOTE(review): the origin is plain HTTP and no file_hash is supplied, so the
# download is not integrity-checked.
path_to_zip = tf.keras.utils.get_file(
    fname='spa-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True,
)

# NOTE(review): this join assumes get_file returned the extraction directory,
# as the recorded output of this notebook shows -- confirm for the installed
# Keras version.
path_to_file = os.path.join(path_to_zip, "spa-eng", "spa.txt")
print(path_to_file)

/Users/ceylinekinci/.keras/datasets/spa-eng_extracted/spa-eng/spa.txt


In [9]:
def unicode_to_ascii(s):
    """Remove combining marks (accents) from ``s``.

    The string is NFD-normalized so accented letters split into a base
    character plus combining marks, then every character in Unicode category
    'Mn' (nonspacing mark) is dropped. Characters without such a decomposition
    are left as they are, so the result is not guaranteed to be pure ASCII.

    Args:
        s: Text to strip accents from.

    Returns:
        The NFD-normalized text with all 'Mn' characters removed.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

In [10]:
def preprocess_sentence(w):
    """Normalize one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps, in order:
      1. Lowercase, trim, and strip accents via ``unicode_to_ascii``.
      2. Put a space on both sides of each of ``? . ! , ¿``.
      3. Collapse runs of spaces and double quotes into a single space.
      4. Replace every run of characters other than ``a-z A-Z ? . ! , ¿`` with
         a single space (this drops digits and all other symbols).
      5. Trim surrounding whitespace and add the markers.

    Args:
        w: Raw sentence text.

    Returns:
        The cleaned sentence as ``'<start> ... <end>'``.
    """
    text = unicode_to_ascii(w.lower().strip())

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    text = text.strip()

    return f'<start> {text} <end>'

In [11]:
# One sample sentence per language to sanity-check the preprocessing.
en_sentence = "May I borrow this book?"
sp_sentence = "¿Puedo tomar prestado este libro?"

print(preprocess_sentence(en_sentence))
# The Spanish result is shown as UTF-8 bytes, which makes the surviving
# non-ASCII '¿' visible as an escape sequence.
print(preprocess_sentence(sp_sentence).encode('utf-8'))

<start> may i borrow this book ? <end>
b'<start> \xc2\xbf puedo tomar prestado este libro ? <end>'


In [12]:
def create_dataset(path, num_examples):
    """Read a tab-separated parallel corpus and return its columns.

    Each line of the file is split on tabs and every field is passed through
    ``preprocess_sentence``. The rows are then transposed so that each returned
    tuple holds all sentences of one column.

    Args:
        path: Path to a UTF-8 text file with one example per line and fields
            separated by tabs.
        num_examples: Number of leading lines to use; ``None`` uses every line
            (it is applied as the slice ``lines[:num_examples]``).

    Returns:
        A ``zip`` iterator that yields one tuple per column. Because ``zip``
        stops at the shortest row, fields beyond the smallest per-line field
        count are dropped.
    """
    # Use a context manager so the file handle is closed as soon as the text
    # has been read, instead of staying open until garbage collection.
    with io.open(path, encoding='UTF-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')

    word_pairs = [
        [preprocess_sentence(field) for field in line.split('\t')]
        for line in lines[:num_examples]
    ]

    # Transpose rows of fields into per-column tuples.
    return zip(*word_pairs)

In [13]:
# Show the location returned by tf.keras.utils.get_file.
# NOTE(review): this cell's execution count (13) is lower than that of the cell
# assigning path_to_zip (16); confirm the notebook runs top to bottom on a
# fresh kernel.
print(path_to_zip)


/Users/ceylinekinci/.keras/datasets/spa-eng_extracted


In [21]:
# Load every line of the corpus (num_examples=None applies no cap) and show
# the last pair of each column.
en, sp = create_dataset(path_to_file, num_examples=None)
print(en[-1], sp[-1], sep='\n')

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [22]:
# Character lengths (markers included) of the first 100 sentence pairs,
# printed as "<english> <spanish>".
for i in range(100):
    print(f"{len(en[i])} {len(sp[i])}")

18 18
18 20
18 20
18 22
18 20
19 21
19 22
19 23
20 21
20 24
20 24
20 21
20 33
20 23
20 21
20 21
20 21
20 20
20 20
20 22
20 23
21 24
21 24
21 20
21 21
21 22
21 26
21 25
21 23
21 33
21 22
22 25
22 22
22 23
22 30
22 24
22 27
22 28
22 25
22 30
22 24
22 22
22 24
22 20
22 21
22 21
22 22
22 24
22 32
19 32
22 31
22 23
22 23
22 24
22 28
22 33
22 33
22 25
22 30
22 27
22 26
22 22
22 21
22 26
22 26
22 27
22 23
22 23
22 24
22 29
22 23
22 28
23 32
23 21
23 33
23 32
23 24
23 29
23 28
23 22
23 23
23 24
23 23
23 24
23 21
23 20
23 22
23 21
23 22
23 21
23 24
23 28
23 22
23 21
23 19
23 20
23 21
23 22
23 28
23 23


In [23]:
# Number of preprocessed sentences in the first (English) column.
len(en)

118964

In [18]:
def max_length(tensor):
    """Return the length of the longest element of ``tensor``.

    Args:
        tensor: Iterable of sized items (anything ``len`` accepts).

    Returns:
        The largest ``len(item)`` found.

    Raises:
        ValueError: If ``tensor`` is empty, since ``max`` has nothing to compare.
    """
    return max(map(len, tensor))

In [19]:
def tokenize(lang):
    """Fit a Keras ``Tokenizer`` on ``lang`` and encode it as padded sequences.

    Args:
        lang: Collection of sentences (strings). It is iterated twice: once to
            fit the vocabulary and once to encode.

    Returns:
        A ``(tensor, lang_tokenizer)`` pair: the result of ``pad_sequences``
        with ``padding='post'`` applied to the encoded sentences, and the
        fitted tokenizer.
    """
    # filters='' passes an empty filter string, so the tokenizer is not asked
    # to strip any characters; the sentences are expected to be cleaned already.
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    sequences = lang_tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

    return padded, lang_tokenizer

In [20]:
def load_dataset(path, num_examples=None):
    """Load a parallel corpus and tokenize both of its columns.

    The first column returned by ``create_dataset`` is treated as the target
    language and the second as the input language.

    Args:
        path: Path to the tab-separated corpus file.
        num_examples: Number of leading lines to use; ``None`` uses all lines.

    Returns:
        ``(input_tensor, target_tensor, inp_lang_tokenizer,
        targ_lang_tokenizer)`` as produced by ``tokenize`` for each column.
    """
    target_sentences, input_sentences = create_dataset(path, num_examples)

    # Input column is tokenized first, then the target column.
    inp_tensor, inp_tok = tokenize(input_sentences)
    tgt_tensor, tgt_tok = tokenize(target_sentences)

    return inp_tensor, tgt_tensor, inp_tok, tgt_tok