# Chinese English Translation

In [None]:
# Install OpenCC; it is imported below (`from opencc import OpenCC`) and used to build the `t2s` converter.
pip install opencc

Collecting opencc
  Downloading OpenCC-1.1.2-cp37-cp37m-manylinux1_x86_64.whl (765 kB)
[?25l[K     |▍                               | 10 kB 24.2 MB/s eta 0:00:01[K     |▉                               | 20 kB 29.1 MB/s eta 0:00:01[K     |█▎                              | 30 kB 12.5 MB/s eta 0:00:01[K     |█▊                              | 40 kB 9.4 MB/s eta 0:00:01[K     |██▏                             | 51 kB 5.2 MB/s eta 0:00:01[K     |██▋                             | 61 kB 5.7 MB/s eta 0:00:01[K     |███                             | 71 kB 5.5 MB/s eta 0:00:01[K     |███▍                            | 81 kB 6.2 MB/s eta 0:00:01[K     |███▉                            | 92 kB 4.6 MB/s eta 0:00:01[K     |████▎                           | 102 kB 5.0 MB/s eta 0:00:01[K     |████▊                           | 112 kB 5.0 MB/s eta 0:00:01[K     |█████▏                          | 122 kB 5.0 MB/s eta 0:00:01[K     |█████▋                          | 133 kB 5.0 MB/s e

In [None]:
# Install BPEmb (pulls in sentencepiece, see the log below); it provides the subword tokenizers used throughout.
pip install bpemb

Collecting bpemb
  Downloading bpemb-0.3.3-py3-none-any.whl (19 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.2 MB/s 
Installing collected packages: sentencepiece, bpemb
Successfully installed bpemb-0.3.3 sentencepiece-0.1.96


In [None]:
import tensorflow as tf
import jieba
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
from opencc import OpenCC
import unicodedata
import re
import numpy as np
import os
import io
import time
import tensorflow_datasets as tfds
from bpemb import BPEmb

In [None]:
# Load the BPEmb subword models for English, Chinese and German, each requested with
# vs=100000 and dim=100. The model files are downloaded on this call (see the log below).
# The 100000 here matches the vocabulary size hard-coded in the encoder/decoder layers.
bpemb_en = BPEmb(lang="en", dim=100,vs=100000)
bpemb_zh = BPEmb(lang="zh", dim=100,vs=100000)
bpemb_de = BPEmb(lang="de", dim=100,vs=100000)

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs100000.model


100%|██████████| 1987533/1987533 [00:00<00:00, 2879840.54B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs100000.d100.w2v.bin.tar.gz


100%|██████████| 37969196/37969196 [00:02<00:00, 14913049.29B/s]


downloading https://nlp.h-its.org/bpemb/zh/zh.wiki.bpe.vs100000.model


100%|██████████| 1849493/1849493 [00:00<00:00, 3073755.81B/s]


downloading https://nlp.h-its.org/bpemb/zh/zh.wiki.bpe.vs100000.d100.w2v.bin.tar.gz


100%|██████████| 37914051/37914051 [00:02<00:00, 17706096.45B/s]


downloading https://nlp.h-its.org/bpemb/de/de.wiki.bpe.vs100000.model


100%|██████████| 2068307/2068307 [00:00<00:00, 3436210.40B/s]


downloading https://nlp.h-its.org/bpemb/de/de.wiki.bpe.vs100000.d100.w2v.bin.tar.gz


100%|██████████| 38035802/38035802 [00:02<00:00, 15588865.01B/s]


In [None]:
class Encoder_cn(tf.keras.Model):
  """GRU encoder for the Chinese side of the Chinese->English model.

  Token ids are embedded and fed through a single GRU that returns both the
  per-step outputs and the final state.

  Args:
    enc_units: number of GRU units (also the width of the hidden state).
    batch_sz: batch size used to build the zero initial state.
    vocab_size: number of rows in the embedding table. Defaults to 100000,
      the `vs` value the BPEmb models in this notebook are loaded with.
    embedding_dim: width of the learned embedding. Defaults to 256.
  """

  def __init__(self, enc_units, batch_sz, vocab_size=100000, embedding_dim=256):
    super(Encoder_cn, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    """Embed `x` and run the GRU from `hidden`; returns (outputs, final_state)."""
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    """Return an all-zero state of shape (batch_sz, enc_units)."""
    return tf.zeros((self.batch_sz, self.enc_units))

In [None]:
class Encoder_en_de(tf.keras.Model):
  """GRU encoder for the English side of the English->German model.

  Token ids are embedded and fed through a single GRU that returns both the
  per-step outputs and the final state.

  Args:
    enc_units: number of GRU units (also the width of the hidden state).
    batch_sz: batch size used to build the zero initial state.
    vocab_size: number of rows in the embedding table. Defaults to 100000,
      the `vs` value the BPEmb models in this notebook are loaded with.
    embedding_dim: width of the learned embedding. Defaults to 256.
  """

  def __init__(self, enc_units, batch_sz, vocab_size=100000, embedding_dim=256):
    super(Encoder_en_de, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    """Embed `x` and run the GRU from `hidden`; returns (outputs, final_state)."""
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    """Return an all-zero state of shape (batch_sz, enc_units)."""
    return tf.zeros((self.batch_sz, self.enc_units))

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: score = V(tanh(W1(values) + W2(query))).

  Args:
    units: width of the two projection layers `W1` and `W2`.
  """

  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Return (context_vector, attention_weights) for `query` over `values`."""
    # Insert an axis at position 1 so the query broadcasts against `values`.
    query_expanded = tf.expand_dims(query, 1)

    projected = self.W1(values) + self.W2(query_expanded)
    score = self.V(tf.nn.tanh(projected))

    # Normalise the scores along axis 1 and use them to weight `values`.
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)

    return context_vector, attention_weights

In [None]:
class Decoder_en(tf.keras.Model):
  """Attention GRU decoder that emits English BPE logits.

  Args:
    dec_units: number of GRU units; also the width of the attention layers.
    batch_sz: batch size (stored on the instance only).
    vocab_size: size of the embedding table and of the output projection.
      Defaults to 100000, the `vs` value the BPEmb models are loaded with.
    embedding_dim: width of the learned embedding. Defaults to 256.
  """

  def __init__(self,  dec_units, batch_sz, vocab_size=100000, embedding_dim=256):
    super(Decoder_en, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    """Decode one step; returns (logits, new_state, attention_weights)."""
    context_vector, attention_weights = self.attention(hidden, enc_output)
    x = self.embedding(x)

    # Prepend the context vector to the embedded input along the last axis.
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # NOTE(review): the GRU is called without `initial_state`, so `hidden`
    # only feeds the attention query above — confirm this is intended.
    output, state = self.gru(x)
    # Collapse the leading axes so `fc` sees rows of width output.shape[2].
    output = tf.reshape(output, (-1, output.shape[2]))

    x = self.fc(output)

    return x, state, attention_weights

In [None]:
class Decoder_de(tf.keras.Model):
  """Attention GRU decoder that emits German BPE logits.

  Args:
    dec_units: number of GRU units; also the width of the attention layers.
    batch_sz: batch size (stored on the instance only).
    vocab_size: size of the embedding table and of the output projection.
      Defaults to 100000, the `vs` value the BPEmb models are loaded with.
    embedding_dim: width of the learned embedding. Defaults to 256.
  """

  def __init__(self,  dec_units, batch_sz, vocab_size=100000, embedding_dim=256):
    super(Decoder_de, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    """Decode one step; returns (logits, new_state, attention_weights)."""
    context_vector, attention_weights = self.attention(hidden, enc_output)
    x = self.embedding(x)
    # Prepend the context vector to the embedded input along the last axis.
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    # NOTE(review): the GRU is called without `initial_state`, so `hidden`
    # only feeds the attention query above — confirm this is intended.
    output, state = self.gru(x)
    # Collapse the leading axes so `fc` sees rows of width output.shape[2].
    output = tf.reshape(output, (-1, output.shape[2]))

    x = self.fc(output)

    return x, state, attention_weights

In [None]:
d_model = 128


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warm-up learning-rate schedule.

  lr(step) = rsqrt(d_model) * min(rsqrt(step), step * warmup_steps ** -1.5)

  Args:
    d_model: scaling constant; stored as a float32 tensor.
    warmup_steps: step count used in the linear warm-up term.
  """

  def __init__(self, d_model, warmup_steps=6000):
    super(CustomSchedule, self).__init__()

    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    # `rsqrt` is only defined for floating-point inputs, so an integer step
    # counter has to be cast before use.
    step = tf.cast(step, tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [None]:
# Adam with a fixed learning rate of 0.003 (no learning-rate schedule is passed in).
# This single optimizer instance is used by both train steps and stored in both checkpoints.
optimizer = tf.keras.optimizers.Adam(0.003, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)
# reduction='none' keeps one loss value per example so loss_function can mask them.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy of `pred` against `real`, zeroed where `real == 0`.

  The mean is taken over every position, including the zeroed ones.
  """
  per_example = loss_object(real, pred)
  keep = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)),
                 dtype=per_example.dtype)
  return tf.reduce_mean(per_example * keep)

In [None]:
# Corpus locations inside the mounted Google Drive folder.
path_to_file = "drive/MyDrive/Colab Notebooks/cmn-clean.txt"
path_to_file_de = "drive/MyDrive/Colab Notebooks/deu.txt"
path_to_singleCorpus = "drive/MyDrive/Colab Notebooks/single_corpus_zh.txt"
path_to_newCorpus = "drive/MyDrive/Colab Notebooks/new_corpus.txt"
# Mount Google Drive at ./drive so the relative paths above resolve (Colab only).
from google.colab import drive
drive.mount('drive')
# NOTE(review): this handle is never read or closed in the cells visible here;
# create_dataset opens the file by path instead — confirm it is still needed.
input_file = open(path_to_file,"r",encoding="utf-8")

Mounted at drive


In [None]:
# OpenCC converter built with the 't2s' config; preprocess_sentence runs every sentence through it.
cc = OpenCC('t2s')

In [None]:
def preprocess_sentence(w):
    """Normalise one sentence and wrap it in the start/end markers.

    The text is lower-cased, stripped, passed through the OpenCC converter
    `cc`, and every run of characters outside the allowed set is collapsed to
    a single space. The result is returned as '| <text> /'.
    """
    converted = cc.convert(w.lower().strip())
    cleaned = re.sub(r"[^\u4e00-\u9fa5\u0080-\uFFFF_a-zA-Z0-9?.!,:：？。，！']+", " ", converted)
    return '| ' + cleaned.strip() + ' /'

In [None]:
def create_dataset(path):
    """Read a tab-separated parallel corpus and return its columns.

    Each line is split on tabs; only the first two fields are kept and each is
    passed through `preprocess_sentence`.

    Args:
        path: path of a UTF-8 text file with one sentence pair per line.

    Returns:
        A `zip` iterator over the transposed pairs, i.e. one tuple per column.
    """
    # Use a context manager so the file handle is closed once the text is read.
    with io.open(path, encoding='UTF-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]]  for l in lines]
    return zip(*word_pairs)

In [None]:
# def max_length(tensor):
#     return max(len(t) for t in tensor)

In [None]:
def uniformLength_en_cn(list1,list2,maxlen,minlen):
  """Filter and pad an English/Chinese sentence pair list, in place.

  A pair is dropped when either side encodes to more than `maxlen` or fewer
  than `minlen` BPE ids. Each surviving sentence gets ' pad' appended once per
  missing id so that it reaches `maxlen` (this assumes every appended ' pad'
  encodes to exactly one id — TODO confirm for each language).

  Args:
    list1: English sentences (measured with `bpemb_en`); modified in place.
    list2: Chinese sentences (measured with `bpemb_zh`); modified in place.
    maxlen: maximum, and padded, number of BPE ids per sentence.
    minlen: minimum number of BPE ids per sentence.

  Returns:
    `(list2, list1)` — note the swapped order, which callers rely on.

  Raises:
    ValueError: if the two lists differ in length.
  """
  print(len(list1),len(list2))
  if len(list1) != len(list2):
    raise ValueError('list1 and list2 must contain the same number of sentences')

  kept1, kept2 = [], []
  # Single pass: encode each sentence once, instead of deleting from the
  # middle of the lists and restarting the scan after every removal.
  for sent1, sent2 in zip(list1, list2):
    n1 = len(bpemb_en.encode_ids(sent1))
    n2 = len(bpemb_zh.encode_ids(sent2))
    if minlen <= n1 <= maxlen and minlen <= n2 <= maxlen:
      kept1.append(sent1 + ' pad'*(maxlen-n1))
      kept2.append(sent2 + ' pad'*(maxlen-n2))

  # Slice-assign so the caller's list objects are updated, as before.
  list1[:] = kept1
  list2[:] = kept2
  return list2,list1

In [None]:
def cutDataset_en_cn(targ_lang,inp_lang,maxlen,minlen):
    """Materialise both sequences as lists and length-filter/pad them.

    `uniformLength_en_cn` returns its two lists in swapped order; they are
    swapped back here, so the result is in the same order as the arguments.
    """
    swapped_second, swapped_first = uniformLength_en_cn(
        list(targ_lang), list(inp_lang), maxlen, minlen)
    return swapped_first, swapped_second

In [None]:
def uniformLength_en_de(list1,list2,maxlen,minlen):
  """Filter and pad an English/German sentence pair list, in place.

  A pair is dropped when either side encodes to more than `maxlen` or fewer
  than `minlen` BPE ids. Each surviving sentence gets ' pad' appended once per
  missing id so that it reaches `maxlen` (this assumes every appended ' pad'
  encodes to exactly one id — TODO confirm for each language).

  Args:
    list1: English sentences (measured with `bpemb_en`); modified in place.
    list2: German sentences (measured with `bpemb_de`); modified in place.
    maxlen: maximum, and padded, number of BPE ids per sentence.
    minlen: minimum number of BPE ids per sentence.

  Returns:
    `(list2, list1)` — note the swapped order, which callers rely on.

  Raises:
    ValueError: if the two lists differ in length.
  """
  print(len(list1),len(list2))
  if len(list1) != len(list2):
    raise ValueError('list1 and list2 must contain the same number of sentences')

  kept1, kept2 = [], []
  # Single pass: encode each sentence once, instead of deleting from the
  # middle of the lists and restarting the scan after every removal.
  for sent1, sent2 in zip(list1, list2):
    n1 = len(bpemb_en.encode_ids(sent1))
    n2 = len(bpemb_de.encode_ids(sent2))
    if minlen <= n1 <= maxlen and minlen <= n2 <= maxlen:
      kept1.append(sent1 + ' pad'*(maxlen-n1))
      kept2.append(sent2 + ' pad'*(maxlen-n2))

  # Slice-assign so the caller's list objects are updated, as before.
  list1[:] = kept1
  list2[:] = kept2
  return list2,list1

In [None]:
def cutDataset_en_de(targ_lang,inp_lang,maxlen,minlen):
    """Materialise both sequences as lists and length-filter/pad them.

    `uniformLength_en_de` returns its two lists in swapped order; they are
    swapped back here, so the result is in the same order as the arguments.
    """
    swapped_second, swapped_first = uniformLength_en_de(
        list(targ_lang), list(inp_lang), maxlen, minlen)
    return swapped_first, swapped_second

In [None]:
# Load the English/Chinese corpus and keep pairs of 1..15 BPE ids, padded to 15.
en_cn, cn = create_dataset(path_to_file)
en_cn, cn =cutDataset_en_cn(en_cn, cn,15,1)
# Load the English/German corpus and keep pairs of 5..15 BPE ids, padded to 15.
en_de, de = create_dataset(path_to_file_de)
en_de, de =cutDataset_en_de(en_de, de,15,5)

26828 26828
240828 240828


In [None]:
# Encode every padded Chinese sentence to BPE ids and stack them into a
# (len(cn), 15) matrix. Building the array once avoids the quadratic cost of
# calling np.append on a growing array for each sentence.
cn_enc = np.array([bpemb_zh.encode_ids(sentence) for sentence in cn]).reshape(len(cn), 15)


In [None]:
# Encode every padded English sentence (en-zh corpus) to BPE ids and stack
# them into a (len(en_cn), 15) matrix. Building the array once avoids the
# quadratic cost of calling np.append on a growing array for each sentence.
en_enc = np.array([bpemb_en.encode_ids(sentence) for sentence in en_cn]).reshape(len(en_cn), 15)

In [None]:
# Hold out 10% of the en-zh pairs for validation. The split is seeded so the
# same rows land in the validation set on every run; without a seed, restoring
# a checkpoint after a kernel restart would score the model on sentences it
# may have been trained on.
en_cn_tensor_train, en_cn_tensor_val, cn_tensor_train, cn_tensor_val = train_test_split(
    en_enc, cn_enc, test_size=0.1, random_state=42)
print(len(en_cn_tensor_train), len(cn_tensor_train), len(en_cn_tensor_val), len(cn_tensor_val))

22995 22995 2555 2555


In [None]:
# Hyper-parameters for the Chinese->English run.
BUFFER_SIZE = len(cn_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(cn_tensor_train)//BATCH_SIZE
# NOTE(review): `embedding_dim` is not read by any cell visible here — confirm it is still needed.
embedding_dim = 256
units = 1024

# (source, target) = (Chinese ids, English ids); shuffled over the whole training set,
# then batched with the incomplete final batch dropped.
dataset_cn2en = tf.data.Dataset.from_tensor_slices((cn_tensor_train, en_cn_tensor_train)).shuffle(BUFFER_SIZE)
dataset_cn2en = dataset_cn2en.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Build the Chinese->English encoder and decoder with `units` GRU units each.
encoder_bpe_cn = Encoder_cn(units, BATCH_SIZE)
# NOTE(review): this standalone attention layer is not referenced by any cell visible
# here (the decoder builds its own) — confirm it is still needed.
attention_layer_bpe_cn = BahdanauAttention(10)
decoder_bpe_en = Decoder_en(units, BATCH_SIZE)

In [None]:
# Checkpoint object tracking the optimizer plus the Chinese->English encoder and decoder.
# `checkpoint_dir` / `checkpoint_prefix` are reassigned later for the English->German model.
checkpoint_dir = './training_checkpoints_bpe_cn2en'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint_bpe_cn2en = tf.train.Checkpoint(optimizer=optimizer, encoder_bpe_cn=encoder_bpe_cn, decoder_bpe_en=decoder_bpe_en)

In [None]:
@tf.function
def train_step_bpe_zh_en(inp, targ, enc_hidden):
  """Run one teacher-forced training step of the Chinese->English model.

  Args:
    inp: batch of source ids fed to `encoder_bpe_cn`.
    targ: batch of target ids; column t is the label at step t and the
      decoder input for the following step.
    enc_hidden: initial encoder GRU state.

  Returns:
    The summed per-step loss divided by `targ.shape[1]`.
  """
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder_bpe_cn(inp, enc_hidden)
    # The decoder starts from the encoder's final state.
    dec_hidden = enc_hidden

    # First decoder input: the id list of "|" repeated BATCH_SIZE times
    # (assumes "|" encodes to a single id — TODO confirm).
    dec_input = tf.expand_dims(bpemb_en.encode_ids("|") * BATCH_SIZE, 1)

    for t in range(1, targ.shape[1]):

      predictions, dec_hidden, _ = decoder_bpe_en(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      # Teacher forcing: the ground-truth token becomes the next decoder input.
      dec_input = tf.expand_dims(targ[:, t], 1)

  # Reported loss is averaged over the target length; gradients below use the raw sum.
  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder_bpe_cn.trainable_variables + decoder_bpe_en.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [None]:
# Train the Chinese->English model for EPOCHS passes over dataset_cn2en.
EPOCHS = 25

for epoch in range(EPOCHS):
  start = time.time()

  # The same all-zero encoder state is passed to every batch of the epoch.
  enc_hidden = encoder_bpe_cn.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset_cn2en.take(steps_per_epoch)):
    batch_loss = train_step_bpe_zh_en(inp, targ, enc_hidden)
    total_loss += batch_loss

    # Progress line every 100 batches.
    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
  # Save a checkpoint after every second epoch.
  if (epoch + 1) % 2 == 0:
    checkpoint_bpe_cn2en.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 10.7454
Epoch 1 Batch 100 Loss 3.7534
Epoch 1 Batch 200 Loss 3.2391
Epoch 1 Batch 300 Loss 3.0993
Epoch 1 Loss 3.5655
Time taken for 1 epoch 175.94539070129395 sec

Epoch 2 Batch 0 Loss 2.9395
Epoch 2 Batch 100 Loss 2.9454
Epoch 2 Batch 200 Loss 2.2026
Epoch 2 Batch 300 Loss 2.3377
Epoch 2 Loss 2.4857
Time taken for 1 epoch 159.26229667663574 sec

Epoch 3 Batch 0 Loss 1.9486
Epoch 3 Batch 100 Loss 1.9484
Epoch 3 Batch 200 Loss 1.9432
Epoch 3 Batch 300 Loss 1.9120
Epoch 3 Loss 1.8914
Time taken for 1 epoch 152.03226351737976 sec

Epoch 4 Batch 0 Loss 1.6481
Epoch 4 Batch 100 Loss 1.6310
Epoch 4 Batch 200 Loss 1.4776
Epoch 4 Batch 300 Loss 1.7743
Epoch 4 Loss 1.5887
Time taken for 1 epoch 159.1578299999237 sec

Epoch 5 Batch 0 Loss 1.2873
Epoch 5 Batch 100 Loss 1.3381
Epoch 5 Batch 200 Loss 1.2909
Epoch 5 Batch 300 Loss 1.2061
Epoch 5 Loss 1.3326
Time taken for 1 epoch 151.64581632614136 sec

Epoch 6 Batch 0 Loss 1.0657
Epoch 6 Batch 100 Loss 0.8329
Epoch 6 Batch 200

In [None]:
# Restore the most recent Chinese->English checkpoint found in `checkpoint_dir`.
# NOTE(review): `checkpoint_dir` is reassigned further down for the English->German model,
# so this cell depends on execution order.
checkpoint_bpe_cn2en.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.InitializationOnlyStatus at 0x7fd8a695a590>

In [None]:
def evaluate(sentence):
    """Greedily decode an English translation of a Chinese `sentence`.

    The sentence is preprocessed, padded with ' pad' up to 15 BPE ids when it
    is shorter, encoded, and decoded for at most 15 steps. Decoding starts
    from id 10386 and stops early once id 2781 is produced (presumably the
    ids of the '|' and '/' markers — TODO confirm).

    Returns:
        (result, sentence): the decoded pieces, each followed by a space, and
        the untouched input sentence.
    """
    prepared = preprocess_sentence(sentence)
    shortfall = 15 - len(bpemb_zh.encode_ids(prepared))
    if shortfall > 0:
      prepared = prepared + ' pad'*shortfall
    inputs = tf.convert_to_tensor([bpemb_zh.encode_ids(prepared)])

    # Encode from an all-zero state; the decoder starts from the encoder's final state.
    enc_out, dec_hidden = encoder_bpe_cn(inputs, [tf.zeros((1, units))])
    dec_input = tf.expand_dims([10386] , 0)

    pieces = []
    for _ in range(15):
        predictions, dec_hidden, _weights = decoder_bpe_en(dec_input, dec_hidden, enc_out)

        predicted_id = tf.argmax(predictions[0]).numpy()
        pieces.append(bpemb_en.decode_ids([int(predicted_id)]) + ' ')

        if predicted_id == 2781:
            break
        # Feed the prediction back in as the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return ''.join(pieces), sentence

In [None]:
def translate(sentence):
    """Return only the decoded text produced by `evaluate` for `sentence`."""
    decoded, _ = evaluate(sentence)
    return decoded

In [None]:
# Sample Chinese input ("I am going home tomorrow morning.").
sen = (u'我明天早上回家。')

In [None]:
# Display the model's English output for the sample sentence.
translate(sen)

In [None]:
def model (w,matrix_source):
  """Linear map: return `matrix_source @ w`."""
  projected = matrix_source @ w
  return projected

In [None]:
def cost_function(w,matrix_source,matrix_target):
    """Return 0.5/n times the summed squared residual of the linear map, with n fixed at 64."""
    n = 64
    residual = matrix_target - matrix_source@w
    return 0.5/n * np.square(residual).sum()

In [None]:
def optimize(w,matrix_source,matrix_target, n=64, alpha=5*1e-2):
    """Take one gradient-descent step on `cost_function` and return the new `w`.

    The gradient of 0.5/n * sum((matrix_target - matrix_source @ w) ** 2) with
    respect to `w` is (1/n) * matrix_source.T @ (matrix_source @ w - matrix_target).
    The earlier code summed the element-wise product down to a single scalar,
    so every entry of `w` was shifted by the same amount.

    Args:
        w: current weights (vector or matrix).
        matrix_source: input vector or matrix.
        matrix_target: target with the same shape as `matrix_source @ w`.
        n: normalisation constant of the cost (default 64).
        alpha: learning rate (default 0.05).

    Returns:
        The updated weights, with the same shape as `w`.
    """
    y_hat = model(w,matrix_source)
    # Promote a single sample to a one-row batch so one formula covers both cases.
    source_2d = np.atleast_2d(matrix_source)
    residual = np.reshape(y_hat - matrix_target, (source_2d.shape[0], -1))
    grad = (1.0/n) * (source_2d.T @ residual)
    return w - alpha * np.reshape(grad, np.shape(w))

In [None]:
def iterate(w,matrix_source,matrix_target,times):
    """Apply `optimize` `times` times, print the final weights and cost, return the weights.

    Args:
        w: initial weights.
        matrix_source: input passed to `optimize` / `cost_function`.
        matrix_target: target passed to `optimize` / `cost_function`.
        times: number of gradient steps.

    Returns:
        The weights after the last step.
    """
    for _ in range(times):
        w = optimize(w,matrix_source,matrix_target)

    # The cost is computed directly from `w`; no separate forward pass is needed.
    cost = cost_function(w,matrix_source,matrix_target)
    print(w,cost)

    return w

In [None]:
# Refine `w` with 10000 gradient steps per (Chinese, English) context-vector pair.
# NOTE(review): `w`, `context_vector_cn` and `context_vector_en_cn` are not defined in any
# cell visible here — this cell relies on leftover kernel state.
# NOTE(review): `i` is an element yielded by iterating `context_vector_cn`, yet it is used
# as an index below; `range(len(...))` may have been intended — confirm.
for i in context_vector_cn:
  w = iterate(w,context_vector_cn.numpy()[i],context_vector_en_cn.numpy()[i],10000)

[[0.00359747 0.02975853 0.07078533 ... 0.02138552 0.08781181 0.04771207]
 [0.03562495 0.04025518 0.0523388  ... 0.02231064 0.06145953 0.0107934 ]
 [0.00963287 0.0963925  0.02958895 ... 0.02512286 0.01474058 0.02905766]
 ...
 [0.03495453 0.0611114  0.00345175 ... 0.02899886 0.03915337 0.05887099]
 [0.05802387 0.06030348 0.02565435 ... 0.07376109 0.0143155  0.00013901]
 [0.01526998 0.03803491 0.00755802 ... 0.01191425 0.06561932 0.02601225]] 0.05662886224731087


# Translate English to German

In [None]:
# Reload and re-filter both corpora with the same arguments as the earlier loading cell
# (1..15 ids for en-zh, 5..15 ids for en-de). Only `en_de` / `de` are used by the
# English->German cells that follow.
en_cn, cn = create_dataset(path_to_file)
en_cn, cn =cutDataset_en_cn(en_cn, cn,15,1)
en_de, de = create_dataset(path_to_file_de)
en_de, de =cutDataset_en_de(en_de, de,15,5)

In [None]:
# Encode the first 120000 padded German sentences to BPE ids and stack them
# into a (120000, 15) matrix. Building the array once avoids the quadratic
# cost of calling np.append on a growing array for each sentence.
num_de_sentences = 120000
de_enc = np.array(
    [bpemb_de.encode_ids(de[row]) for row in range(num_de_sentences)]
).reshape(num_de_sentences, 15)


In [None]:
# Encode the first 120000 padded English sentences (en-de corpus) to BPE ids
# and stack them into a (120000, 15) matrix. Building the array once avoids
# the quadratic cost of calling np.append on a growing array for each sentence.
num_en_de_sentences = 120000
en_de_enc = np.array(
    [bpemb_en.encode_ids(en_de[row]) for row in range(num_en_de_sentences)]
).reshape(num_en_de_sentences, 15)

In [None]:
# Hold out 10% of the en-de pairs for validation. The four arrays printed
# below were never assigned on a fresh kernel because the split was commented
# out (and referred to the en-zh arrays). The split is seeded so the same rows
# land in the validation set on every run.
en_de_tensor_train, en_de_tensor_val, de_tensor_train, de_tensor_val = train_test_split(
    en_de_enc, de_enc, test_size=0.1, random_state=42)
print(len(en_de_tensor_train), len(de_tensor_train), len(en_de_tensor_val), len(de_tensor_val))

In [None]:
# Hyper-parameters for the English->German run; these overwrite the values set for
# the Chinese->English run earlier in the notebook.
BUFFER_SIZE = len(en_de_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(en_de_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024


In [None]:
# (source, target) = (English ids, German ids); shuffled over the whole training set,
# then batched with the incomplete final batch dropped.
dataset_en2de = tf.data.Dataset.from_tensor_slices((en_de_tensor_train, de_tensor_train)).shuffle(BUFFER_SIZE)
dataset_en2de = dataset_en2de.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Build the English->German encoder and decoder with `units` GRU units each.
encoder_bpe_en_de = Encoder_en_de(units, BATCH_SIZE)
# NOTE(review): this rebinds the standalone attention layer created for the
# Chinese->English model; no cell visible here uses it — confirm it is still needed.
attention_layer_bpe_cn = BahdanauAttention(10)
decoder_bpe_de = Decoder_de(units, BATCH_SIZE)

In [None]:
# Checkpoint object for the English->German model. This rebinds `checkpoint_dir` and
# `checkpoint_prefix`, which previously pointed at the Chinese->English directory.
# NOTE(review): the same `optimizer` instance is tracked here and in the
# Chinese->English checkpoint — confirm sharing it across both models is intended.
checkpoint_dir = './training_checkpoint_bpe_en2de'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint_bpe_en2de = tf.train.Checkpoint(optimizer=optimizer, encoder_bpe_en_de=encoder_bpe_en_de, decoder_bpe_de=decoder_bpe_de)

In [None]:
# Display the id list that "|" encodes to with the German model.
# NOTE(review): presumably the source of the hard-coded start id in evaluate_en2de — confirm.
bpemb_de.encode_ids("|")

In [None]:
@tf.function
def train_step_bpe_en_de(inp, targ, enc_hidden):
  """Run one teacher-forced training step of the English->German model.

  Args:
    inp: batch of source ids fed to `encoder_bpe_en_de`.
    targ: batch of target ids; column t is the label at step t and the
      decoder input for the following step.
    enc_hidden: initial encoder GRU state.

  Returns:
    The summed per-step loss divided by `targ.shape[1]`.
  """
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder_bpe_en_de(inp, enc_hidden)
    # The decoder starts from the encoder's final state.
    dec_hidden = enc_hidden

    # First decoder input: the id list of "|" repeated BATCH_SIZE times
    # (assumes "|" encodes to a single id — TODO confirm).
    dec_input = tf.expand_dims(bpemb_de.encode_ids("|") * BATCH_SIZE, 1)

    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder_bpe_de(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)

      # Teacher forcing: the ground-truth token becomes the next decoder input.
      dec_input = tf.expand_dims(targ[:, t], 1)

  # Reported loss is averaged over the target length; gradients below use the raw sum.
  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder_bpe_en_de.trainable_variables + decoder_bpe_de.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [None]:
# Train the English->German model for EPOCHS passes over dataset_en2de.
EPOCHS = 25

for epoch in range(EPOCHS):
  start = time.time()

  # The same all-zero encoder state is passed to every batch of the epoch.
  enc_hidden = encoder_bpe_en_de.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset_en2de.take(steps_per_epoch)):
    batch_loss = train_step_bpe_en_de(inp, targ, enc_hidden)
    total_loss += batch_loss

    # Progress line every 100 batches.
    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
  # Save a checkpoint after every second epoch.
  if (epoch + 1) % 2 == 0:
    checkpoint_bpe_en2de.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
# Restore the most recent English->German checkpoint found in `checkpoint_dir`.
checkpoint_bpe_en2de.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
def evaluate_en2de(sentence):
    """Greedily decode a German translation of an English `sentence`.

    The sentence is preprocessed, padded with ' pad' up to 15 BPE ids when it
    is shorter, encoded, and decoded for at most 15 steps. Decoding starts
    from id 7641 and stops early once id 1914 is produced (presumably the
    ids of the '|' and '/' markers — TODO confirm).

    Returns:
        (result, sentence): the decoded pieces, each followed by a space, and
        the untouched input sentence.
    """
    prepared = preprocess_sentence(sentence)
    shortfall = 15 - len(bpemb_en.encode_ids(prepared))
    if shortfall > 0:
      prepared = prepared + ' pad'*shortfall
    inputs = tf.convert_to_tensor([bpemb_en.encode_ids(prepared)])

    # Encode from an all-zero state; the decoder starts from the encoder's final state.
    enc_out, dec_hidden = encoder_bpe_en_de(inputs, [tf.zeros((1, units))])
    dec_input = tf.expand_dims([7641] , 0)

    pieces = []
    for _ in range(15):
        predictions, dec_hidden, _weights = decoder_bpe_de(dec_input, dec_hidden, enc_out)

        predicted_id = tf.argmax(predictions[0]).numpy()
        pieces.append(bpemb_de.decode_ids([int(predicted_id)]) + ' ')

        if predicted_id == 1914:
            break

        # Feed the prediction back in as the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return ''.join(pieces), sentence

In [None]:
def translate_en2de(sentence):
    """Return only the decoded text produced by `evaluate_en2de` for `sentence`."""
    decoded, _ = evaluate_en2de(sentence)
    return decoded

In [None]:
def cleanSentence(sentence):
  """Preprocess `sentence`, then strip the '|' / '/' markers and 'pad' tokens.

  The padding pattern is anchored on word boundaries so that only standalone
  'pad' tokens are removed; an unanchored 'pad' also cut the substring out of
  real words (for example 'padding' became 'ding'). Patterns are raw strings
  because '\|' in a normal string literal is an invalid escape sequence.

  Returns:
    The cleaned sentence with leading and trailing whitespace removed.
  """
  sentence = preprocess_sentence(sentence)
  # Same removal order as before: start marker, end marker, then padding.
  for pattern in (r'\|', r'/', r'\bpad\b'):
    sentence = re.sub(pattern, '', sentence).strip()
  return sentence


In [None]:
#BLEU score of English German Translation
from nltk.translate.bleu_score import sentence_bleu

# Compile the marker patterns once, outside the loop. The raw string avoids the
# invalid '\|' escape sequence of the earlier non-raw literal.
strinfo1 = re.compile(r'\|')
strinfo2 = re.compile('/')
strinfo3 = re.compile('pad')

# Validation rows scored; the average below is taken over exactly this range.
eval_start, eval_stop = 100, 200

score = 0
for i in range(eval_start, eval_stop):
  # Decode the English validation row back to text and strip markers/padding.
  sentence = bpemb_en.decode_ids(en_de_tensor_val[i])
  print(sentence)
  for marker in (strinfo1, strinfo2, strinfo3):
    sentence = marker.sub('', sentence).strip()

  # Hypothesis: model output with the end marker removed, split into tokens.
  sentence_de = translate_en2de(sentence)
  sentence_de = strinfo2.sub('', sentence_de).strip()
  result = sentence_de.split()

  # Reference: the decoded German validation row, cleaned the same way.
  sen_val = bpemb_de.decode_ids(de_tensor_val[i])
  reference = [cleanSentence(sen_val).split()]
  score = score + sentence_bleu(reference, result)
print(score / (eval_stop - eval_start))

In [None]:
#BLEU score of Chinese English Translation
# Imported here as well so this cell does not depend on the English-German BLEU
# cell having been run first.
from nltk.translate.bleu_score import sentence_bleu

# Compile the marker patterns once, outside the loop. The raw string avoids the
# invalid '\|' escape sequence of the earlier non-raw literal.
strinfo1 = re.compile(r'\|')
strinfo2 = re.compile('/')
strinfo3 = re.compile('pad')

# Validation rows scored; the average below is taken over exactly this range.
eval_start, eval_stop = 100, 300

score = 0
for i in range(eval_start, eval_stop):
  # Decode the Chinese validation row back to text and strip markers/padding.
  sentence = bpemb_zh.decode_ids(cn_tensor_val[i])
  for marker in (strinfo1, strinfo2, strinfo3):
    sentence = marker.sub('', sentence).strip()

  # Hypothesis: model output with the end marker removed, split into tokens.
  sentence_cn2en = translate(sentence)
  sentence_cn2en = strinfo2.sub('', sentence_cn2en).strip()
  result = sentence_cn2en.split()

  # Reference: the decoded English validation row, cleaned the same way.
  sen_val = bpemb_en.decode_ids(en_cn_tensor_val[i])
  reference = [cleanSentence(sen_val).split()]
  score = score + sentence_bleu(reference, result)
print(score / (eval_stop - eval_start))

In [None]:
# Build a single German reference from validation row `i`.
# NOTE(review): `i` is whatever value the last executed loop left behind, so this
# cell depends on execution order.
sen_val = bpemb_de.decode_ids(de_tensor_val[i])
reference = [cleanSentence(sen_val).split()]

In [None]:
# Score one hypothesis against one reference.
# NOTE(review): `result` is left over from an earlier BLEU loop, so it may not belong
# to the same sentence as `reference` — confirm.
from nltk.translate.bleu_score import sentence_bleu
score = sentence_bleu(reference, result)
print(score)

In [None]:
# Monolingual test corpora (German, English, Chinese) inside the mounted Drive folder.
path_mono_de_test = "drive/MyDrive/Colab Notebooks/single_corpus_de_test.txt"
path_mono_en_test = "drive/MyDrive/Colab Notebooks/single_corpus_en_test.txt"
path_mono_zh_test = "drive/MyDrive/Colab Notebooks/single_corpus_zh_test.txt"
# Drive is mounted again here, repeating the mount done earlier in the notebook.
from google.colab import drive
drive.mount('drive')
# NOTE(review): these handles are never read or closed in the cells visible here;
# openFile reads the same files by path — confirm they are still needed.
file_mono_de_test = open(path_mono_de_test,"r",encoding="utf-8")
file_mono_en_test = open(path_mono_en_test,"r",encoding="utf-8")
file_mono_zh_test = open(path_mono_zh_test,"r",encoding="utf-8")

In [None]:
# Monolingual corpora (German, English, Chinese) inside the mounted Drive folder.
path_mono_de = "drive/MyDrive/Colab Notebooks/single_corpus_de.txt"
path_mono_en = "drive/MyDrive/Colab Notebooks/single_corpus_en.txt"
path_mono_zh = "drive/MyDrive/Colab Notebooks/single_corpus_zh.txt"
# Drive is mounted again here, repeating the mount done earlier in the notebook.
from google.colab import drive
drive.mount('drive')
# NOTE(review): these handles are never read or closed in the cells visible here;
# openFile reads the same files by path — confirm they are still needed.
file_mono_de = open(path_mono_de,"r",encoding="utf-8")
file_mono_en = open(path_mono_en,"r",encoding="utf-8")
file_mono_zh = open(path_mono_zh,"r",encoding="utf-8")

In [None]:
def openFile(path):
  """Read a UTF-8 text file and return its lines.

  Leading and trailing whitespace of the whole file is stripped before the
  text is split on newlines; the lines themselves are returned unprocessed.

  Args:
    path: path of the file to read.

  Returns:
    A list of strings, one per line.
  """
  # Use a context manager so the file handle is closed once the text is read.
  with io.open(path, encoding='UTF-8') as text_file:
    return text_file.read().strip().split('\n')

In [None]:
# Read the three monolingual corpora as lists of raw (unpreprocessed) lines.
mono_de = openFile(path_mono_de)
mono_en = openFile(path_mono_en)
mono_zh = openFile(path_mono_zh)

In [None]:
# Read the three monolingual test corpora as lists of raw (unpreprocessed) lines.
mono_de_test = openFile(path_mono_de_test)
mono_en_test = openFile(path_mono_en_test)
mono_zh_test = openFile(path_mono_zh_test)

In [None]:
# Display the English->German model's output for an ad-hoc English sentence.
translate_en2de(u'i am my whole on my section.')

In [None]:
#BLEU score of Chinese-German Translation
# Averages sentence-level BLEU over the first 100 lines of the monolingual test files.
# NOTE(review): the hypothesis comes from `translate`, which elsewhere in this notebook
# is the Chinese->English model, while the reference is taken from the German test
# file; a `translate_en2de` step on the intermediate English may be missing — confirm.
score = 0
for i in range(100):
  strinfo2 = re.compile('/')
  # Hypothesis: model output with the '/' end marker removed, split into tokens.
  sentence_cn2en = translate(mono_zh_test[i])
  sentence_cn2en = strinfo2.sub('', sentence_cn2en).strip()
  result = sentence_cn2en.split()
  # Reference: German test line i. It is preprocessed here and again inside cleanSentence.
  sen_val = preprocess_sentence(mono_de_test[i]) 
  reference = [cleanSentence(sen_val).split()]
  score =score+ sentence_bleu(reference, result)
print(score/100)