----

CHECK TENSORFLOW

In [1]:
import tensorflow as tf
print(tf.test.gpu_device_name())

/device:GPU:0


In [2]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
tf.config.experimental.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [5]:
tf.__version__

'2.3.0'

---

In [45]:
import string
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [2]:
# Path to translation file
path_to_data = 'spa.txt'

# Read file
translation_file = open(path_to_data,"r", encoding='utf-8') 
raw_data = translation_file.read()
translation_file.close()

# Parse data
raw_data = raw_data.split('\n')
pairs = [sentence.split('\t') for sentence in  raw_data]
pairs = pairs[:-1] # skip last empty element

In [3]:
pairs2 = pairs[1000:20000]

for idx_sample in range(5,10):
    print('English example in pair {}:  {}'.format(idx_sample + 1, pairs2[idx_sample][0]))
    print('Spanish example in pair {}:  {}'.format(idx_sample + 1, pairs2[idx_sample][1]))

English example in pair 6:  Tom works.
Spanish example in pair 6:  Tomás trabaja.
English example in pair 7:  Tom'll go.
Spanish example in pair 7:  Tom irá.
English example in pair 8:  Tom's fat.
Spanish example in pair 8:  Tom está gordo.
English example in pair 9:  Tom's mad.
Spanish example in pair 9:  Tom está loco.
English example in pair 10:  Tom's sad.
Spanish example in pair 10:  Tom está triste.


# Tokenizer

In [4]:
def clean_sentence(sentence):
    # Lower case the sentence
    lower_case_sent = sentence.lower()
    # Strip punctuation
    string_punctuation = string.punctuation + "¡" + '¿'
    clean_sentence = lower_case_sent.translate(str.maketrans('', '', string_punctuation))
   
    return clean_sentence

In [5]:
print(clean_sentence("I will surf today !!"))

i will surf today 


In [6]:
text_examples = [
    'i will surf today',
    'this week i will travel to the beach',
    'he went to his house by the beach',]

# Create tokenizer
exp_text_tokenizer = Tokenizer()
# Create word index
exp_text_tokenizer.fit_on_texts(text_examples)

for key, value in exp_text_tokenizer.word_index.items():
    print("Word: {} is converted to number {}".format(key, value))

Word: i is converted to number 1
Word: will is converted to number 2
Word: to is converted to number 3
Word: the is converted to number 4
Word: beach is converted to number 5
Word: surf is converted to number 6
Word: today is converted to number 7
Word: this is converted to number 8
Word: week is converted to number 9
Word: travel is converted to number 10
Word: he is converted to number 11
Word: went is converted to number 12
Word: his is converted to number 13
Word: house is converted to number 14
Word: by is converted to number 15


In [9]:
exp_text_tokenized = exp_text_tokenizer.texts_to_sequences(text_examples)
exp_text_tokenized

[[1, 2, 6, 7], [8, 9, 1, 2, 10, 3, 4, 5], [11, 12, 3, 13, 14, 15, 4, 5]]

In [10]:
for sent, token_sent in zip(text_examples, exp_text_tokenized):
    print('Input sentence:  {}'.format(sent))
    print('Output vector: {} \n'.format(token_sent))

Input sentence:  i will surf today
Output vector: [1, 2, 6, 7] 

Input sentence:  this week i will travel to the beach
Output vector: [8, 9, 1, 2, 10, 3, 4, 5] 

Input sentence:  he went to his house by the beach
Output vector: [11, 12, 3, 13, 14, 15, 4, 5] 



In [11]:
def tokenize(sentences):
    # Create tokenizer
    text_tokenizer = Tokenizer()
    # Fit texts
    text_tokenizer.fit_on_texts(sentences)
    return text_tokenizer.texts_to_sequences(sentences), text_tokenizer

In [18]:
# Clean sentences
english_sentences = [clean_sentence(pair[0]) for pair in pairs]
spanish_sentences = [clean_sentence(pair[1]) for pair in pairs]

# Tokenize words
spa_text_tokenized, spa_text_tokenizer = tokenize(spanish_sentences)
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)

print('Maximum length spanish sentence: {}'.format(len(max(spa_text_tokenized,key=len))))
print('Maximum length english sentence: {}'.format(len(max(eng_text_tokenized,key=len))))

# Check language length
spanish_vocab = len(spa_text_tokenizer.word_index) + 1
english_vocab = len(eng_text_tokenizer.word_index) + 1
print("Spanish vocabulary is of {} unique words".format(spanish_vocab))
print("English vocabulary is of {} unique words".format(english_vocab))

Maximum length spanish sentence: 68
Maximum length english sentence: 70
Spanish vocabulary is of 27613 unique words
English vocabulary is of 14305 unique words


# Padding

In [21]:
print('Maximum length of example sentence: {}'.format(len(max(exp_text_tokenized,key=len))))
# Pad tokenize vectors
exp_pad_sentence = pad_sequences(exp_text_tokenized, 8, padding = "post") # 8 is the max length
for index, pad_sentence in enumerate(exp_pad_sentence):
    print("Example sentence {}:".format(index+1))
    print("  -Input:{}".format(exp_text_tokenized[index]))
    print("  -Output:{}".format(pad_sentence))

Maximum length of example sentence: 8
Example sentence 1:
  -Input:[1, 2, 6, 7]
  -Output:[1 2 6 7 0 0 0 0]
Example sentence 2:
  -Input:[8, 9, 1, 2, 10, 3, 4, 5]
  -Output:[ 8  9  1  2 10  3  4  5]
Example sentence 3:
  -Input:[11, 12, 3, 13, 14, 15, 4, 5]
  -Output:[11 12  3 13 14 15  4  5]


In [36]:
max_sentence_length = max(len(max(spa_text_tokenized,key=len)), len(max(eng_text_tokenized,key=len)))
spa_pad_sentence = pad_sequences(spa_text_tokenized, max_sentence_length, padding = "post")
eng_pad_sentence = pad_sequences(eng_text_tokenized, max_sentence_length, padding = "post")

# Reshape data
spa_pad_sentence = spa_pad_sentence.reshape(*spa_pad_sentence.shape, 1)
eng_pad_sentence = eng_pad_sentence.reshape(*eng_pad_sentence.shape, 1)

In [41]:
input_shape = (max_sentence_length, 1)
input_sequence = Input(input_shape, name='InputLayer')

In [42]:
rnn = LSTM(256, return_sequences=True, dropout=0.5, name='RNNLayer')(input_sequence)

In [43]:
logits = TimeDistributed(Dense(spanish_vocab), name='TimeDistributed')(rnn)

In [46]:
model = Model(input_sequence, Activation('softmax')(logits))
model.compile(loss=sparse_categorical_crossentropy,
              optimizer=Adam(1e-2),
              metrics=['accuracy'])

In [47]:
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
InputLayer (InputLayer)      [(None, 70, 1)]           0         
_________________________________________________________________
RNNLayer (LSTM)              (None, 70, 256)           264192    
_________________________________________________________________
TimeDistributed (TimeDistrib (None, 70, 27613)         7096541   
_________________________________________________________________
activation_1 (Activation)    (None, 70, 27613)         0         
Total params: 7,360,733
Trainable params: 7,360,733
Non-trainable params: 0
_________________________________________________________________


In [48]:
model_results = model.fit(eng_pad_sentence, spa_pad_sentence, batch_size=30, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [49]:
model.save("rnn_wholeDataset")

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: rnn_wholeDataset\assets


In [50]:
def logits_to_sentence(logits, tokenizer):
    
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = '<empty>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [51]:
index = 10
print("The english sentence is: {}".format(english_sentences[index]))
print("The spanish sentence is: {}".format(spanish_sentences[index]))
print('The predicted sentence is :')
print(logits_to_sentence(model.predict(eng_pad_sentence[index:index+1])[0], spa_text_tokenizer))                                   

The english sentence is: who
The spanish sentence is: quién
The predicted sentence is :
es un <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty>


In [52]:
prueba = ["I will try to translate this sentence never trained before, I think.",
          "Here is another one, maybe it was trained or maybe not, who knows?"]

prueba = [clean_sentence(s) for s in prueba]

prueba_tokenizada = eng_text_tokenizer.texts_to_sequences(prueba)
prueba_padding= pad_sequences(prueba_tokenizada, max_sentence_length, padding = "post")
prueba_padding = prueba_padding.reshape(*prueba_padding.shape, 1)

for i, s in enumerate(prueba):
    print(f"La frase a traducir es: {s}")
    pred = model.predict(eng_pad_sentence[i:i+1])
    print("valor numerico predicho")
    print(pred)
    logits = pred[0]
    pred_sent = logits_to_sentence(logits, spa_text_tokenizer)
    print(f"Lo traducido sería: {pred_sent}")
    print("\n")

La frase a traducir es: i will try to translate this sentence never trained before i think
valor numerico predicho
[[[4.1163930e-09 1.1301072e-03 1.8155495e-04 ... 1.5372554e-07
   2.3025748e-08 2.4044414e-07]
  [9.8846783e-04 4.3910826e-03 4.8491065e-03 ... 8.2510603e-08
   3.7283610e-07 9.3152641e-09]
  [1.1989490e-01 5.1146280e-02 2.7203370e-02 ... 1.2704072e-06
   8.3803587e-07 9.6696844e-08]
  ...
  [1.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  [1.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  [1.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]]]
Lo traducido sería: no te <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <

In [63]:
pred[0]

array([[4.1163930e-09, 1.1301072e-03, 1.8155495e-04, ..., 1.5372554e-07,
        2.3025748e-08, 2.4044414e-07],
       [9.8846783e-04, 4.3910826e-03, 4.8491065e-03, ..., 8.2510603e-08,
        3.7283610e-07, 9.3152641e-09],
       [1.1989490e-01, 5.1146280e-02, 2.7203370e-02, ..., 1.2704072e-06,
        8.3803587e-07, 9.6696844e-08],
       ...,
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00]], dtype=float32)

In [64]:
np.argmax(pred[0], 1)

array([ 3, 24,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0], dtype=int64)