In [None]:
pip install opencc



In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
from opencc import OpenCC
import unicodedata
import re
import numpy as np
import os
import io
import time
import tensorflow_datasets as tfds
cc = OpenCC('t2s')

In [None]:
#Define an Encoder 
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state
  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [None]:
#Define a BahdanauAttention
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # The shape of the hidden layer == (batch size, hidden layer size)
    # The shape of hidden_with_time_axis  == （batch size，1, hidden layer size）
    # calculate the score 
    hidden_with_time_axis = tf.expand_dims(query, 1)

    # The shape of the score == (batch size, maximum length, 1)
    # We get 1 on the last axis because we apply the score to self.V
    # Before applying self.V, the shape of the tensor is (batch size, maximum length, unit)
    score = self.V(tf.nn.tanh(
        self.W1(values) + self.W2(hidden_with_time_axis)))
    # The shape of attention_weights == (batch size, maximum length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)
    # The shape of context_vector == batch size, hidden layer size
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)
    return context_vector, attention_weights

In [None]:
#Define a Decoder
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)
    # using attention
    self.attention = BahdanauAttention(self.dec_units)
  def call(self, x, hidden, enc_output):
    context_vector, attention_weights = self.attention(hidden, enc_output)
    # Word embedding encoding on X
    x = self.embedding(x)
    # The shape of x after concatenation == （batch size，1，embedding dimensions + hidden layer size）
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    #Send the merged vector to GRU
    output, state = self.gru(x)
    # The shape of output == （batch size * 1，hidden layer size）
    output = tf.reshape(output, (-1, output.shape[2]))
    x = self.fc(output)
    return x, state, attention_weights

In [None]:
d_model = 128
# d_model = train_step_en_de(inp, targ, enc_hidden)
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=6000):
    super(CustomSchedule, self).__init__()
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)
    self.warmup_steps = warmup_steps
  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [None]:
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(0.003, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [None]:
path_to_file = "drive/MyDrive/Colab Notebooks/cmn-clean.txt"
path_to_file_de = "drive/MyDrive/Colab Notebooks/deu.txt"
path_to_singleCorpus = "drive/MyDrive/Colab Notebooks/single_corpus_zh.txt"
path_to_newCorpus = "drive/MyDrive/Colab Notebooks/new_corpus.txt"
from google.colab import drive
drive.mount('drive')
input_file = open(path_to_file,"r",encoding="utf-8")

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [None]:
cc = OpenCC('t2s')

In [None]:
#Data cleaning
def preprocess_sentence(w): 
    w = re.sub(r"([\u4e00-\u9fa5_?.!,:：？。，！])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    # Replace all characters with spaces except
    # (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^\u4e00-\u9fa5\u0080-\uFFFF_a-zA-Z0-9?.!,:：？。，！']+", " ", w)
    w = w.rstrip().strip()
    w = '<s> ' + w + ' <s/>'
    return w

In [None]:
def create_dataset(path):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]]  for l in lines]
    return zip(*word_pairs)

In [None]:
# Calculate the maximum length of Tensor
def max_length(tensor):
    return max(len(t) for t in tensor)

In [None]:
# Tokenize
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)

  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
  print (tensor),print(lang_tokenizer)
  return tensor, lang_tokenizer

In [None]:
# Set the length of the input sequence
def uniformLength(list1,list2,maxlen,minlen):
  print(len(list1),len(list2))
  length = len(list1)
  a= 0
  l =0
  while length != l:
    l = len(list1)
    for i in range(a,length):
      if (len(list1[i].strip().split())> maxlen or len(list1[i].strip().split())< minlen
          or len(list2[i].strip().split())> maxlen or len(list2[i].strip().split())< minlen):
        del list1[i]
        del list2[i]
        length = len(list2)
        a=i
        break
  return list1,list2

In [None]:
# Load the dataset
def load_dataset(path,maxlen,minlen):
    # Create cleaned input and output pairs
    targ_lang, inp_lang = create_dataset(path)
    targ_lang=list(targ_lang)
    inp_lang=list(inp_lang)
    targ_lang, inp_lang =uniformLength(targ_lang, inp_lang,maxlen,minlen)
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [None]:
de_tensor, en_de_tensor, de_lang, en_de_lang = load_dataset(path_to_file_de,15,5)
# Calculate the maximum length of the target tensor （max_length）
max_length_targ, max_length_inp = max_length(de_tensor), max_length(en_de_tensor)

240828 240828
[[    1   397    14 ...     0     0     0]
 [    1   748    14 ...     0     0     0]
 [    1   394   457 ...     0     0     0]
 ...
 [    1    19 34034 ...     2     0     0]
 [    1    32   254 ...  6169     3     2]
 [    1    14     8 ... 34041     3     2]]
<keras_preprocessing.text.Tokenizer object at 0x7fbea2bcfb10>
[[   1   13   17 ...    0    0    0]
 [   1   13   17 ...    0    0    0]
 [   1   49   35 ...    0    0    0]
 ...
 [   1   77 6544 ... 3609    3    2]
 [   1 4203 4144 ...    3    2    0]
 [   1  192   10 ...  293    3    2]]
<keras_preprocessing.text.Tokenizer object at 0x7fbea1123950>


In [None]:
# Split training set and validation set
en_de_tensor_train, en_de_tensor_val, de_tensor_train, de_tensor_val = train_test_split(en_de_tensor, de_tensor, test_size=0.7)

# 显示长度
print(len(en_de_tensor_train), len(de_tensor_train), len(en_de_tensor_val), len(de_tensor_val))

68523 68523 159890 159890


In [None]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

In [None]:
print ("Input test Language; index to word mapping")
convert(en_de_lang, en_de_tensor_train[0])
print ()
print ("Target train Language; index to word mapping")
convert(de_lang, de_tensor_train[0])

Input test Language; index to word mapping
1 ----> <s>
6 ----> i
96 ----> got
197 ----> these
2894 ----> earrings
78 ----> from
25 ----> my
1341 ----> grandmother
3 ----> .
2 ----> <s/>

Target train Language; index to word mapping
1 ----> <s>
5 ----> ich
22 ----> habe
92 ----> diese
3601 ----> ohrringe
50 ----> von
252 ----> meiner
1394 ----> großmutter
314 ----> bekommen
3 ----> .
2 ----> <s/>


In [None]:
#Set model parameters
BUFFER_SIZE = len(en_de_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(en_de_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_en_de_size = len(en_de_lang.word_index)+1
vocab_de_size = len(de_lang.word_index)+1

In [None]:
dataset_en_de = tf.data.Dataset.from_tensor_slices((en_de_tensor_train, de_tensor_train)).shuffle(BUFFER_SIZE)
dataset_en_de = dataset_en_de.batch((BATCH_SIZE), drop_remainder=True)

In [None]:
encoder = Encoder(vocab_en_de_size, embedding_dim, units, BATCH_SIZE)
attention_layer = BahdanauAttention(10)
decoder = Decoder(vocab_de_size, embedding_dim, units, BATCH_SIZE)

In [None]:
#Save the model training results using checkpoints
checkpoint_dir = 'drive/MyDrive/Colab Notebooks/training_checkpoints_en_de'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,encoder=encoder,decoder=decoder)

In [None]:
# Model training
@tf.function
def train_step_en_de(inp, targ, enc_hidden):
  loss = 0
  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([de_lang.word_index['<s>']] * BATCH_SIZE, 1)
    for t in range(1, targ.shape[1]):
      #Using teaching enforcing - Use the target word as the next input
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      dec_input = tf.expand_dims(targ[:, t], 1)
  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return batch_loss

In [None]:
EPOCHS = 30
for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  for (batch, (inp, targ)) in enumerate(dataset_en_de.take(steps_per_epoch)):
    batch_loss = train_step_en_de(inp, targ, enc_hidden)
    total_loss += batch_loss
    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
  # store the checkpoint
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)
  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 5.6415
Epoch 1 Batch 100 Loss 2.7966
Epoch 1 Batch 200 Loss 2.5998
Epoch 1 Batch 300 Loss 2.3064
Epoch 1 Batch 400 Loss 2.0900
Epoch 1 Batch 500 Loss 2.4119
Epoch 1 Batch 600 Loss 2.1364
Epoch 1 Batch 700 Loss 2.2787
Epoch 1 Batch 800 Loss 2.1781
Epoch 1 Batch 900 Loss 2.2127
Epoch 1 Batch 1000 Loss 2.0359
Epoch 1 Loss 2.3381
Time taken for 1 epoch 199.91803431510925 sec

Epoch 2 Batch 0 Loss 2.0389
Epoch 2 Batch 100 Loss 1.9963
Epoch 2 Batch 200 Loss 2.0377
Epoch 2 Batch 300 Loss 2.0651
Epoch 2 Batch 400 Loss 1.9092
Epoch 2 Batch 500 Loss 2.1306
Epoch 2 Batch 600 Loss 2.1743
Epoch 2 Batch 700 Loss 1.7322
Epoch 2 Batch 800 Loss 1.8333
Epoch 2 Batch 900 Loss 1.7873
Epoch 2 Batch 1000 Loss 1.8747
Epoch 2 Loss 1.9012
Time taken for 1 epoch 178.14481568336487 sec

Epoch 3 Batch 0 Loss 1.3213
Epoch 3 Batch 100 Loss 1.6131
Epoch 3 Batch 200 Loss 1.4259
Epoch 3 Batch 300 Loss 1.4720
Epoch 3 Batch 400 Loss 1.4512
Epoch 3 Batch 500 Loss 1.3531
Epoch 3 Batch 600 Loss 1.3211


In [None]:
# Evaluate the trained model
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)
    strinfo = re.compile('<s/>')
    sentence = strinfo.sub('', sentence).strip()
    strinfo = re.compile('<s>')
    sentence = strinfo.sub('', sentence).strip()
    string = ''
    for i in sentence.split(' '):
      try:
        en_de_lang.word_index[i]
        string=string+i+' '
      except Exception:
        string=string+' un '
    sentence = string 
    sentence = preprocess_sentence(sentence)
    inputs = [en_de_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([de_lang.word_index['<s>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += de_lang.index_word[predicted_id] + ' '
        if de_lang.index_word[predicted_id] == '<s/>':
            return result, sentence, attention_plot
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot

In [None]:
#Translate the sentence
def translate(sentence):
    result, sentence,_ = evaluate(sentence)
    return result

In [None]:
translate(u'i want to have a meeting with you.')

tf.Tensor([[  1   6  38   7  20  10 378  37   5   3   2   0   0   0   0]], shape=(1, 15), dtype=int32)
<class 'tensorflow.python.framework.ops.EagerTensor'>


'ich möchte mit dir ein gespräch mit dir . <s/> '

In [None]:
def merge(lang,tensor):
  s=''
  for t in tensor:
    if t!=0:
      s=s+' '+lang.index_word[t]
  strinfo = re.compile('<s>')
  s = strinfo.sub('', s)
  strinfo = re.compile('<s/>')
  s = strinfo.sub('', s)
  return s

In [None]:
#BLEU score of English German Translation
score = 0
from nltk.translate.bleu_score import sentence_bleu
for i in range(110):

  sentence = cleanSentence(mono_en[i])
  # sentence = merge(en_de_lang,sentence).strip()
  # print(sentence)
  strinfo = re.compile('<s/>')
  sentence_de = strinfo.sub('', sentence)
  sentence_de = translate(sentence)
  result = strinfo.sub('', sentence_de)
  # print(result)
  result = result.split()
  reference = [cleanSentence(mono_de[i]).strip().split()]

  score =score+ sentence_bleu(reference, result)
print(score/110)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.5374300329920739


In [None]:
path_mono_de = "drive/MyDrive/Colab Notebooks/single_corpus_de.txt"
path_mono_en = "drive/MyDrive/Colab Notebooks/single_corpus_en.txt"
path_mono_zh = "drive/MyDrive/Colab Notebooks/single_corpus_zh.txt"
from google.colab import drive
drive.mount('drive')
file_mono_de = open(path_mono_de,"r",encoding="utf-8")
file_mono_en = open(path_mono_en,"r",encoding="utf-8")
file_mono_zh = open(path_mono_zh,"r",encoding="utf-8")

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [None]:
def openFile(path):
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
  # for line in lines:
  #   line = preprocess_sentence(line)

    # word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]]  for l in lines]
    # print(word_pairs[200])

  return lines

In [None]:
mono_de = openFile(path_mono_de)
mono_en = openFile(path_mono_en)
mono_zh = openFile(path_mono_zh)

In [None]:
def cleanSentence(sentence):
  sentence = preprocess_sentence(sentence)
  strinfo1 = re.compile('<s>')
  sentence = strinfo1.sub('', sentence).strip()
  strinfo2 = re.compile('<s/>')
  sentence = strinfo2.sub('', sentence).strip()
  return sentence 

In [None]:
cleanSentence(mono_de[0])

'durch weitere experimente fanden die wissenschaftler das heraus .'

# Translate chinese to english

In [None]:
# input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file)
cn_tensor, en_cn_tensor, cn_lang, en_cn_lang = load_dataset(path_to_file,15,1)
# target_tensor, input_tensor,  inp_lang, targ_lang = load_dataset(path_to_file)
max_length_targ, max_length_inp = max_length(cn_tensor), max_length(en_cn_tensor)

26828 26828
[[   1 2755    3 ...    0    0    0]
 [   1 1987    3 ...    0    0    0]
 [   1    6   31 ...    0    0    0]
 ...
 [   1  517   13 ...    0    0    0]
 [   1  732  444 ... 1581    3    2]
 [   1  517  602 ...    2    0    0]]
<keras_preprocessing.text.Tokenizer object at 0x7fbe3463a350>
[[   1 3814    3 ...    0    0    0]
 [   1 1289    3 ...    0    0    0]
 [   1 1289    3 ...    0    0    0]
 ...
 [   1    4  293 ...    0    0    0]
 [   1    4 1777 ...    0    0    0]
 [   1    4 1162 ...    0    0    0]]
<keras_preprocessing.text.Tokenizer object at 0x7fbe34619710>


In [None]:
cn_tensor_train, cn_tensor_val, en_cn_tensor_train, en_cn_tensor_val = train_test_split(cn_tensor, en_cn_tensor, test_size=0.1)

In [None]:
BUFFER_SIZE_CN = len(cn_tensor_train)
steps_per_epoch_cn_en = len(cn_tensor_train)//BATCH_SIZE
vocab_cn_size = len(cn_lang.word_index)+1
vocab_en_cn_size = len(en_cn_lang.word_index)+1

dataset_cn_en = tf.data.Dataset.from_tensor_slices((cn_tensor_train, en_cn_tensor_train)).shuffle(BUFFER_SIZE_CN)
dataset_cn_en = dataset_cn_en.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
encoder_cn = Encoder(vocab_cn_size, embedding_dim, units, BATCH_SIZE)
attention_layer_cn = BahdanauAttention(10)
decoder_cn = Decoder(vocab_en_cn_size, embedding_dim, units, BATCH_SIZE)

In [None]:
checkpoint_dir = 'drive/MyDrive/Colab Notebooks/training_checkpoints_zh_en'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint_cn = tf.train.Checkpoint(optimizer=optimizer,encoder_cn=encoder_cn,decoder_cn=decoder_cn)

In [None]:
@tf.function
def train_step_zh_en(inp, targ, enc_hidden):
  loss = 0
  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder_cn(inp, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([en_cn_lang.word_index['<s>']] * BATCH_SIZE, 1)
    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder_cn(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      dec_input = tf.expand_dims(targ[:, t], 1)
  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder_cn.trainable_variables + decoder_cn.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return batch_loss

In [None]:
EPOCHS = 30
for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder_cn.initialize_hidden_state()
  total_loss = 0
  for (batch, (inp, targ)) in enumerate(dataset_cn_en.take(steps_per_epoch)):
    batch_loss = train_step_zh_en(inp, targ, enc_hidden)
    total_loss += batch_loss
    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
  if (epoch + 1) % 2 == 0:
    checkpoint_cn.save(file_prefix = checkpoint_prefix)
  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.4476
Epoch 1 Batch 100 Loss 2.3627
Epoch 1 Batch 200 Loss 2.3417
Epoch 1 Batch 300 Loss 2.2067
Epoch 1 Loss 0.7347
Time taken for 1 epoch 44.68572378158569 sec

Epoch 2 Batch 0 Loss 1.9803
Epoch 2 Batch 100 Loss 1.8171
Epoch 2 Batch 200 Loss 1.7916
Epoch 2 Batch 300 Loss 1.8280
Epoch 2 Loss 0.5708
Time taken for 1 epoch 27.40431547164917 sec

Epoch 3 Batch 0 Loss 1.5292
Epoch 3 Batch 100 Loss 1.6269
Epoch 3 Batch 200 Loss 1.6787
Epoch 3 Batch 300 Loss 1.7141
Epoch 3 Loss 0.5014
Time taken for 1 epoch 25.514168977737427 sec

Epoch 4 Batch 0 Loss 1.5105
Epoch 4 Batch 100 Loss 1.5352
Epoch 4 Batch 200 Loss 1.4642
Epoch 4 Batch 300 Loss 1.3486
Epoch 4 Loss 0.4359
Time taken for 1 epoch 26.194139003753662 sec

Epoch 5 Batch 0 Loss 1.1443
Epoch 5 Batch 100 Loss 1.2415
Epoch 5 Batch 200 Loss 1.2172
Epoch 5 Batch 300 Loss 1.3134
Epoch 5 Loss 0.3726
Time taken for 1 epoch 25.45593285560608 sec

Epoch 6 Batch 0 Loss 1.0048
Epoch 6 Batch 100 Loss 1.0998
Epoch 6 Batch 200 Lo

In [None]:
def evaluate_cn_en(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    string = ''
    for i in sentence.split(' '):
      try:
        cn_lang.word_index[i]
        string=string+i+' '
        # print (string)
      except Exception:
        string=string+' OOV'
        print (string)
    sentence = string 
    print(sentence)
    sentence = preprocess_sentence(sentence)
    inputs = [cn_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder_cn(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([en_cn_lang.word_index['<s>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder_cn(dec_input, dec_hidden, enc_out)
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += en_cn_lang.index_word[predicted_id] + ' '
        if en_cn_lang.index_word[predicted_id] == '<s/>':
            return result, sentence, attention_plot
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot

In [None]:
def translate_cn2en(sentence):
    result, sentence,_ = evaluate_cn_en(sentence)
    return result

In [None]:
print(translate_cn2en(u"我喜欢你。"))

In [None]:
#BLEU score of Chinese English Translation
score = 0
from nltk.translate.bleu_score import sentence_bleu
for i in range(100,200):

  # sentence = cleanSentence(mono_en[i])
  sentence = merge(cn_lang,cn_tensor_val[i]).strip()
  # print(sentence)
  strinfo = re.compile('<s/>')
  sentence_cn = strinfo.sub('', sentence)
  sentence_en = translate_cn2en(sentence_cn)
  result = strinfo.sub('', sentence_en)
  # print(result)
  result = result.split()
  reference = [merge(en_cn_lang,en_cn_tensor_val[i]).strip().split()]

  score =score+ sentence_bleu(reference, result)
print(score/100)

Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.5687710974378809


In [None]:
#BLEU score of Chinese English Translation
score = 0
from nltk.translate.bleu_score import sentence_bleu
for i in range(110):

  sentence = cleanSentence(mono_zh[i])
  # sentence = merge(en_de_lang,sentence).strip()
  # print(sentence)
  strinfo = re.compile('<s/>')
  sentence_de = strinfo.sub('', sentence)
  sentence_de = translate_cn2en(sentence)
  result = strinfo.sub('', sentence_de)
  # print(result)
  result = result.split()
  reference = [cleanSentence(mono_en[i]).strip().split()]

  score =score+ sentence_bleu(reference, result)
print(score/110)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.49547567249833757


In [None]:
translate_cn2en('我 在 我 的 实 验 室 裏 。')

我 在 我 的 实 验 室 裏 。 


"i'm talking in my kitchen . <s/> "

In [None]:
translate(u'i chicken you like like .')

'ich möchte ihnen wohl so wie du magst . <s/> '

In [None]:
# Compute the mapping matrix
def model (w,matrix_source):
  return matrix_source@w

In [None]:
def cost_function(w,matrix_source,matrix_target):
    n = 64
    return 0.5/n * (np.square(matrix_target-matrix_source@w)).sum()

In [None]:
def optimize(w,matrix_source,matrix_target):
    n = 64
    alpha = 5*1e-2
    y_hat = model(w,matrix_source)
    da = (1.0/n) * ((y_hat-matrix_target)*matrix_source).sum()
    w = w - alpha*da
    return w

In [None]:
def iterate(w,matrix_source,matrix_target,times):
    for i in range(times):
        w = optimize(w,matrix_source,matrix_target)

    y_hat=model(w,matrix_source)
    cost = cost_function(w,matrix_source,matrix_target)
    print(w,cost)

    return w

In [None]:
for i in context_vector_cn:
  w = iterate(w,context_vector_cn.numpy()[i],context_vector_en_cn.numpy()[i],10000)

[[0.00359747 0.02975853 0.07078533 ... 0.02138552 0.08781181 0.04771207]
 [0.03562495 0.04025518 0.0523388  ... 0.02231064 0.06145953 0.0107934 ]
 [0.00963287 0.0963925  0.02958895 ... 0.02512286 0.01474058 0.02905766]
 ...
 [0.03495453 0.0611114  0.00345175 ... 0.02899886 0.03915337 0.05887099]
 [0.05802387 0.06030348 0.02565435 ... 0.07376109 0.0143155  0.00013901]
 [0.01526998 0.03803491 0.00755802 ... 0.01191425 0.06561932 0.02601225]] 0.05662886224731087


In [None]:
def evaluate_tsf_cn_en(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    string = ''
    for i in sentence.split(' '):
      try:
        cn_lang.word_index[i]
        string=string+i+' '
        # print (string)
      except Exception:
        string=string+' OOV'
        print (string)
    sentence = string 
    print(sentence)
    sentence = preprocess_sentence(sentence)
    # print(sentence)
    inputs = [cn_lang.word_index[i] for i in sentence.split(' ')]
    # print(inputs)
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out_cn, enc_hidden = encoder_cn(inputs, hidden)
    enc_out_cn_tsf = enc_out_cn@w
    enc_hidden = enc_out_cn_tsf[-1] 
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([en_cn_lang.word_index['<s>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder_cn(dec_input, dec_hidden, enc_out)
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += en_cn_lang.index_word[predicted_id] + ' '
        if en_cn_lang.index_word[predicted_id] == '<s/>':
            return result, sentence, attention_plot
        # 预测的 ID 被输送回模型
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot

In [None]:
def translate_cn2de(sentence):
  s = translate_cn2en(sentence)
  strinfo = re.compile('<s/>')
  sentence_en = strinfo.sub('', s).strip()
  print(sentence_en)
  result = translate(sentence_en)
  
  # print (result)
  strinfo = re.compile('<s/>')
  result = strinfo.sub('', result).strip()
  return result
