In [11]:
from zipfile import ZipFile
# Unpack the Spanish-English archive stored on Drive; with no target path given,
# extractall() writes into the current working directory.
with ZipFile('/content/drive/My Drive/Colab Notebooks/spa-eng.zip', mode='r') as archive:
  archive.extractall()

In [2]:
# Read the whole corpus into memory and cut it into records at every "\n".
# NOTE(review): if the file ends with a newline, the last element of `lines`
# is an empty string and is included in the count printed below.
with open('/content/spa.txt', mode="r", encoding='utf-8') as corpus_file:
  lines = corpus_file.read().split("\n")
print("No. of sentences in the set:", len(lines))

No. of sentences in the set: 124326


In [39]:
# Peek at one raw record by splitting it into its tab-separated fields.
# NOTE(review): the recorded output for this cell shows an English sentence far
# longer than encoder_seq_length (41 over the first 100000 records), so it was
# presumably produced with a different index than 100 — re-run to confirm.
text=lines[100].split("\t")
text

['If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo.',
 'Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.',
 'CC-BY 2.0 (France) Attribution: tatoeba.org #953962 (CK) & #1218695 (marcelostockle)']

In [11]:
# Upper bound on how many sentence pairs are taken from the corpus.
NUM_SAMPLES=100000

eng_texts=[]
spa_texts=[]
# NOTE(review): eng_chars / spa_chars are not referenced by any visible cell;
# they are kept only so existing notebook state is unchanged.
eng_chars=[]
spa_chars=[]
# Slicing (instead of indexing range(100000)) cannot run past the end of a
# shorter corpus.
for line in lines[:NUM_SAMPLES]:
  text=line.split("\t")
  # A usable record needs at least an English and a Spanish field; skip
  # anything else (e.g. the empty string left by a trailing newline).
  if len(text)<2:
    continue
  eng_text=text[0]
  # "\t" marks start-of-sequence and "\n" end-of-sequence for the decoder.
  spa_text="\t"+text[1]+"\n"
  eng_texts.append(eng_text)
  spa_texts.append(spa_text)

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
def _fit_char_tokeniser(texts):
  """Fit a case-sensitive, character-level Tokenizer on `texts`.

  Returns a tuple of (fitted tokenizer, its character -> id mapping,
  `texts` converted to lists of ids).
  """
  tokeniser=Tokenizer(char_level=True,lower=False,filters=None)
  tokeniser.fit_on_texts(texts)
  return tokeniser,tokeniser.word_index,tokeniser.texts_to_sequences(texts)

# English (source) and Spanish (target) sides get independent vocabularies.
tokeniser_en,eng_index,seq_eng=_fit_char_tokeniser(eng_texts)
tokeniser_spa,spa_index,seq_spa=_fit_char_tokeniser(spa_texts)

In [13]:
# Spot-check: the id sequence for sentence 1 next to the text it was built from.
seq_eng[1],eng_texts[1]

([49, 3, 11], 'Go.')

In [14]:
# Vocabulary size on each side = number of entries in the tokenizer's index.
max_eng_tokens = len(eng_index)
max_spa_tokens = len(spa_index)
# Longest sentence, in characters, on each side. The Spanish strings already
# carry their "\t" / "\n" markers, so those are counted too.
encoder_seq_length = max(map(len, eng_texts))
decoder_seq_length = max(map(len, spa_texts))

In [15]:
# Order: English vocab size, Spanish vocab size, max English length, max Spanish length.
max_eng_tokens,max_spa_tokens,encoder_seq_length,decoder_seq_length

(86, 104, 41, 77)

In [16]:
import numpy as np
# One row per sentence pair; each tensor is (pairs, time steps, vocabulary size)
# and starts out all zeros so unused positions stay empty.
num_pairs = len(eng_texts)
encoder_shape = (num_pairs, encoder_seq_length, max_eng_tokens)
decoder_shape = (num_pairs, decoder_seq_length, max_spa_tokens)

encoder_input_data = np.zeros(encoder_shape, dtype='float32')
decoder_input_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')

In [19]:
# Tokenizer ids start at 1, so a character's one-hot column is its id minus 1.
for row, sentence in enumerate(eng_texts):
  for pos, ch in enumerate(sentence):
    encoder_input_data[row, pos, eng_index[ch] - 1] = 1

for row, sentence in enumerate(spa_texts):
  for pos, ch in enumerate(sentence):
    col = spa_index[ch] - 1
    decoder_input_data[row, pos, col] = 1
    # The target is the decoder input shifted one step earlier: at step t the
    # model is asked to predict the character the decoder sees at step t+1.
    if pos > 0:
      decoder_target_data[row, pos - 1, col] = 1


# Model

In [27]:
import tensorflow as tf
## Encoder
# Consumes one-hot English characters; only the final hidden and cell states
# are kept, the per-step output is discarded.
encoder_inputs = tf.keras.Input(shape=(None, max_eng_tokens))
encoder = tf.keras.layers.LSTM(units=256, return_state=True)
encoder_result = encoder(encoder_inputs)
state_h, state_c = encoder_result[1], encoder_result[2]
encoder_states = [state_h, state_c]

## Decoder
# Consumes one-hot Spanish characters, starts from the encoder's final states,
# and emits a softmax over the Spanish vocabulary at every time step.
decoder_inputs = tf.keras.Input(shape=(None, max_spa_tokens))
decoder_lstm = tf.keras.layers.LSTM(units=256, return_sequences=True, return_state=True)
decoder_outputs = decoder_lstm(decoder_inputs, initial_state=encoder_states)[0]
decoder_dense = tf.keras.layers.Dense(units=max_spa_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
# NOTE(review): training is commented out and no weight loading is visible in
# this notebook, so on a fresh run the inference cells use untrained weights.
# model.fit([encoder_input_data, decoder_input_data], decoder_target_data,batch_size=100, epochs=50)


# Inference Model

In [33]:
# Encoder inference: maps an input sequence to its final [h, c] states.
encoder_inf_model = tf.keras.Model(inputs=encoder_inputs, outputs=encoder_states)

# Decoder inference: the states are fed in explicitly so the decoder can be run
# one step at a time. It reuses the decoder_lstm / decoder_dense layers of the
# training model; 256 matches the width those LSTM layers were created with.
decoder_input_state_h = tf.keras.Input(shape=(256,))
decoder_input_state_c = tf.keras.Input(shape=(256,))
decoder_input_states = [decoder_input_state_h, decoder_input_state_c]

decoder_step = decoder_lstm(decoder_inputs, initial_state=decoder_input_states)
decoder_outputs, decoder_h, decoder_c = decoder_step
decoder_states = [decoder_h, decoder_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_inf_model = tf.keras.Model(
    inputs=[decoder_inputs, decoder_input_state_h, decoder_input_state_c],
    outputs=[decoder_outputs, decoder_h, decoder_c],
)

In [37]:
# Inverse lookups (tokenizer id -> character) for the source and target sides.
reverse_input_char_index = dict(zip(eng_index.values(), eng_index.keys()))
reverse_target_char_index = dict(zip(spa_index.values(), spa_index.keys()))

def decode_seq(input_sequence):
  """Greedily translate one encoded English sentence into Spanish text.

  Args:
    input_sequence: one-hot encoded source sentence as passed to
      encoder_inf_model.predict; callers in this notebook pass an array of
      shape (1, time steps, max_eng_tokens).

  Returns:
    The generated characters as a string. Generation stops after the "\n"
    end marker is produced (it is included in the result) or once the text
    is longer than decoder_seq_length.
  """
  # Encode once; the resulting [h, c] pair seeds the decoder. list() keeps the
  # concatenation below valid whether predict returns a list or a tuple.
  states=list(encoder_inf_model.predict(input_sequence))

  # The decoder consumes one-hot vectors over the *Spanish* vocabulary.
  # Tokenizer ids start at 1, so the column for a character is id - 1, the
  # same convention used when the training tensors were filled.
  target_seq=np.zeros((1,1,max_spa_tokens),dtype='float32')
  target_seq[0,0,spa_index["\t"]-1]=1  # start-of-sequence marker

  translated_text=""
  stop_condn=False

  while not stop_condn:
    decoder_out,state_h,state_c=decoder_inf_model([target_seq]+states)
    # Carry the recurrent state forward to the next step.
    states=[state_h,state_c]

    # Greedy choice: most probable column, mapped back to its 1-based id.
    sampled_token_index=int(np.argmax(decoder_out[0,-1,:]))
    sampled_char=reverse_target_char_index[sampled_token_index+1]
    translated_text+=sampled_char

    # Exit condition: either hit max length or find the stop character.
    if sampled_char=='\n' or len(translated_text)>decoder_seq_length:
      stop_condn=True

    # Feed the character just produced back in as the next decoder input.
    target_seq=np.zeros((1,1,max_spa_tokens),dtype='float32')
    target_seq[0,0,sampled_token_index]=1

  return translated_text


In [None]:
## example from training set
# Translate the first few training sentences and show each next to its source.
for seq_index in range(5):
  # Slice (rather than index) so the leading batch dimension of size 1 is kept.
  input_seq = encoder_input_data[seq_index: seq_index + 1]
  decoded_sentence = decode_seq(input_seq)
  print('*********************')
  # The English sources are stored in eng_texts; the name input_texts is never
  # defined in this notebook.
  print('Input sentence:', eng_texts[seq_index])
  print('tranlated sentence:', decoded_sentence)

In [34]:
## our own example
# Sentences to translate; replace the placeholder with your own English text.
example=[""]
no_of_sentences=len(example)
example_seq=tokeniser_en.texts_to_sequences(example)
for i,sentence in enumerate(example):
  # decode_seq starts its decoder with a batch of one, so each sentence gets
  # its own single-row input tensor.
  input_seq=np.zeros((1,encoder_seq_length, max_eng_tokens),dtype='float32')
  # Characters beyond encoder_seq_length have no slot in the tensor; drop them.
  for j,char in enumerate(sentence[:encoder_seq_length]):
    # Characters absent from the training vocabulary have no one-hot column;
    # their time step is left all-zero instead of raising KeyError.
    if char in eng_index:
      input_seq[0,j,eng_index[char]-1]=1
  decoded_sentence = decode_seq(input_seq)
  print('*********************')
  print('Input sentence:', sentence)
  print('tranlated sentence:', decoded_sentence)