# Libraries

In [1]:
!nvidia-smi

Fri Sep  8 17:53:49 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 12.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:1B:00.0 Off |                    0 |
| N/A   30C    P0    44W / 300W |      0MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import regex
import re
import random
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

KeyboardInterrupt: 

In [None]:
# parameters
hidden_size = 512
batch_size  = 80
epochs      = 100
MAX_LENGTH = 200
# load
LOAD_MODEL = False
SOURCE_LANGUAGE = "中文"
TARGET_LANGUAGE = "台文"
WITHOUT_聖經 = True
if WITHOUT_聖經:
    model_path = f"model/Scratch_model/{SOURCE_LANGUAGE}翻{TARGET_LANGUAGE}_無聖經_Embed{hidden_size}_batch{batch_size}_epoch{epochs}.pt"
else:
    model_path = f"model/Scratch_model/{SOURCE_LANGUAGE}翻{TARGET_LANGUAGE}_有聖經_Embed{hidden_size}_batch{batch_size}_epoch{epochs}.pt"

# Load Data

In [None]:
# %pip install KeSi
from kesi import Ku
def lomaji2POJ(lomaji)->str:
    '''
    轉白話字
    '''
    ji_ls = re.split(r' |\xa0|\u3000', str(lomaji))
    trans_ji = ' '.join([Ku(ji).POJ().hanlo for ji in ji_ls])
    return trans_ji
def lomaji2KIP(lomaji)->str:
    '''
    轉羅馬字
    '''
    ji_ls = re.split(r' |\xa0|\u3000', str(lomaji))
    trans_ji = ' '.join([Ku(ji).KIP().hanlo for ji in ji_ls])
    return trans_ji

def fill_na(row):
    if pd.isnull(row['台羅']):
        row['台羅'] = lomaji2KIP(row['白話字'])
    if pd.isnull(row['白話字']):
        row['白話字'] = lomaji2POJ(row['台羅'])
    return row

# load data
df1 = pd.read_csv("data/moedict_平行_人工調整.csv")
df2 = pd.read_csv("../平行語料/聖經平行語料_p_final.csv")
df3 = pd.read_csv("../平行語料/TAT_p.csv")
if WITHOUT_聖經:
    df = pd.concat([df1, df3], axis=0).drop_duplicates()
else:
    df = pd.concat([df1, df2, df3], axis=0).drop_duplicates()
df = df[['中文', '台文', '台羅', '白話字']]
# 轉白話字或漢羅
df = df.apply(fill_na, axis=1).dropna()
df = df[df[SOURCE_LANGUAGE].str.len() <= MAX_LENGTH]
df = df[df[TARGET_LANGUAGE].str.len() <= MAX_LENGTH]
print(f"Number of traning data: {df.shape[0]}")

Number of traning data: 16873


# Data Preprocess

In [None]:
PAD_token = 0
SOS_token = 1
EOS_token = 2
UNK_token = 3

def split_POJ_sentence(sentence):
    pattern = r'(?=-)|(?<=\p{Punct})|(?=\p{Punct})|(?<=\s)|(?=\s)'
    final_result = []
    result = regex.split(pattern, sentence)
    for i, word in enumerate(result):
        if i > 0 and result[i-1] == '-':
            final_result[-1] += word
        else:
            final_result.append(word)
    return [word for word in final_result if word]

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {"[PAD]": 0,
                           "[SOS]": 1, 
                           "[EOS]": 2, 
                           "[UNK]": 3}
        self.word2count = {}
        self.index2word = {0: "[PAD]", 
                           1: "[SOS]", 
                           2: "[EOS]", 
                           3: "[UNK]"}
        self.n_words = 4  # Count SOS and EOS

    def add_Sentence_Word(self, sentence):
        if self.name == "中文" or self.name == "台文":
            for _ in sentence:
                self.addWord(_)
        else:
            split_sen = split_POJ_sentence(sentence)
            for _ in split_sen:
                self.addWord(_)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def add_Vocab(self, vocab_path):
        with open(vocab_path, 'r', encoding='utf-8') as file:
            vocab_list = [vocab.rstrip() for vocab in file.readlines()]
        for _ in vocab_list[5:]:
            if _ not in self.word2index:
                self.word2index[_] = self.n_words
                self.word2count[_] = 1
                self.index2word[self.n_words] = _
                self.n_words += 1
            else:
                self.word2count[_] += 1

In [None]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

def load_Langs(lang1, lang2, df, reverse=False):
    print("Reading DataFrame...")
    # Split every line into pairs and normalize
    pairs = []
    for _ in range(df.shape[0]):
        if lang1 == "中文" or lang1 == "台文":
            #pair1 = unicodeToAscii(df[lang1].iloc[_])
            pair1 = df[lang1].iloc[_]
        else: 
            pair1 = normalizeString(df[lang1].iloc[_])
        if lang2 == "中文" or lang2 == "台文":
            #pair2 = unicodeToAscii(df[lang2].iloc[_])
            pair2 = df[lang2].iloc[_]
        else: 
            pair2 = normalizeString(df[lang2].iloc[_])
        pairs.append((pair1, pair2))
    # Reverse pairs, make Lang instances
    
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

def prepareData(lang1, lang2, df, reverse=False):
    input_lang, output_lang, pairs = load_Langs(lang1, lang2, df, reverse)
    print("Read %s sentence pairs" % len(pairs))
    print("Adding vocab or words...")
    # add vocab
    for pair in pairs:
        input_lang.add_Sentence_Word(pair[0])
        output_lang.add_Sentence_Word(pair[1])

    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

# sample
input_lang, output_lang, pairs = prepareData(SOURCE_LANGUAGE, TARGET_LANGUAGE, df, False)
print(f"Input: {input_lang.name}\nOutput: {output_lang.name}")
print(random.choice(pairs))

Reading DataFrame...


Read 16873 sentence pairs
Adding vocab or words...
Counted words:
中文 3414
台文 3407
Input: 中文
Output: 台文
('拜託他做事，他都做得很用心。', '拜託伊做代誌，伊攏做甲真入心。')


# Model

## Encoder

In [None]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

## Decoder

In [None]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

## Attention

In [None]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        embedded =  self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

## Model

In [None]:
class transformer_translator(nn.Module):
    def __init__(self, hidden_size, input_lang, output_lang):
        super(transformer_translator, self).__init__()
        self.encoder = EncoderRNN(input_lang.n_words, hidden_size)
        self.decoder = AttnDecoderRNN(hidden_size, output_lang.n_words)
    def forward(self, input_tensor, target_tensor = None):
        encoder_outputs, encoder_hidden = self.encoder(input_tensor)
        decoder_outputs, _, _ = self.decoder(encoder_outputs, encoder_hidden, target_tensor)
        return decoder_outputs

# Training Preprocess (Define and Prepare Data)

## Preparing Training Data

In [None]:
def indexesFromSentence(lang, sentence):
    '''
    "我是一個句子" --> [0,1,2,3,4,5]
    '''
    if lang.name =="中文" or lang.name =="台文":
        encode= []
        for word in sentence:
            if word in lang.word2index.keys():
                encode.append(lang.word2index[word])
            else:
                encode.append(lang.word2index["[UNK]"])
        return encode
    else:
        encode= []
        for word in split_POJ_sentence(sentence):
            if word in lang.word2index.keys():
                encode.append(lang.word2index[word])
            else:
                encode.append(lang.word2index["[UNK]"])
        return encode

def tensorFromSentence(lang, sentence):
    '''
    "我是一個句子" --> tensor([0,1,2,3,4,5])
    '''
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def tensorsFromPair(pair):
    '''
    ("我是一個句子", "第二句") --> (tensor([0,1,2,3,4,5]), tensor([6,7,4]))
    '''
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

def get_dataloader(batch_size):
    input_lang, output_lang, pairs = prepareData(SOURCE_LANGUAGE, TARGET_LANGUAGE, df, False)
    print(f"Input: {input_lang.name}\nOutput: {output_lang.name}")
    print(random.choice(pairs))
    
    n = len(pairs)
    # [PAD] here
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    return input_lang, output_lang, train_dataloader

input_lang, output_lang, train_dataloader = get_dataloader(batch_size)

Reading DataFrame...
Read 16873 sentence pairs
Adding vocab or words...


Counted words:
中文 3414
台文 3407
Input: 中文
Output: 台文
('煙往上衝', '煙蓬蓬衝')


OutOfMemoryError: CUDA out of memory. Tried to allocate 26.00 MiB (GPU 0; 31.75 GiB total capacity; 5.64 GiB already allocated; 18.75 MiB free; 5.71 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
next(iter(train_dataloader))[0][0]

tensor([  99,   32, 1721, 1722,  685,  101, 2977,  175,   41,    2,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   

## Plotting results

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

## Define Training

In [None]:
def train_epoch(dataloader, model, optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        optimizer.zero_grad()
        input_tensor, target_tensor = data
        decoder_outputs = model(input_tensor, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)

In [None]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [None]:
def evaluate_Sentence(model, sentence, input_lang, output_lang):
    # beam search
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        decoder_outputs = model(input_tensor)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('') # "EOS"
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words

In [None]:
def train(train_dataloader, model, n_epochs, learning_rate=0.001, 
          print_every=100, plot_every=100):
    print("Start training:   ")
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total  = 0  # Reset every plot_every
    best_loss = float("inf")
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_token) # ignore PAD
    test_pair = random.choice(pairs)

    for epoch in range(1, n_epochs + 1):
        print(f'Epoch: {epoch}')
        loss = train_epoch(train_dataloader, model, optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        print(f'loss: {loss}, best loss: {best_loss}')
        if loss < best_loss:
            best_loss = loss
            # save model
            torch.save(model.state_dict(), model_path) #"台翻中"
            print(f"model is save at\n{model_path}")
            
        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg))
            print("Testing a translation sample: ")
            print(f'Source Sentence ({SOURCE_LANGUAGE:3s}): ', test_pair[0])
            print(f'Target Sentence ({TARGET_LANGUAGE:3s}): ', test_pair[1])
            output_words = evaluate_Sentence(model, test_pair[0], input_lang, output_lang)
            print('Translation Sentence:   ', ''.join(output_words))
            
        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

## Define Evaluation

In [None]:
def evaluateRandomly(model, n=5):
    for i in range(n):
        pair = random.choice(pairs)
        print('Source:\n', pair[0])
        print('Target:\n', pair[1])
        output_words = evaluate_Sentence(model, pair[0], input_lang, output_lang)
        output_sentence = ''.join(output_words)
        print('Translation:\n', output_sentence)
        print('')

# Training and Evaluating

## Training

In [None]:
model = transformer_translator(hidden_size, input_lang, output_lang).to(device)

In [None]:
if LOAD_MODEL:
    # Load  MOdel
    model.load_state_dict(torch.load(model_path))
else:
    print(f"train model with parameters: embedding dimension:{hidden_size} batch size:{batch_size} epoches:{epochs}")
    train(train_dataloader, model, epochs, print_every=5, plot_every=5)

train model with parameters: embedding dimension:512 batch size:70 epoches:100
Start training:   
Epoch: 1


OutOfMemoryError: CUDA out of memory. Tried to allocate 28.00 MiB (GPU 0; 31.75 GiB total capacity; 5.64 GiB already allocated; 18.75 MiB free; 5.71 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## Evaluating

In [None]:
model.eval()
evaluateRandomly(model)

Source:
 這是挪亞的三個兒子，他們的後裔散佈全地。
Target:
 此三個是挪亞的子；𪜶的後代分散去全世界。
Translation:
 挪亞全三分裂全地。

Source:
 這個東西被人冒名領走了。
Target:
 這項物件予人食名領去矣。


Translation:
 彼蕊物件由矣。

Source:
 兩根柱子和柱子頂上兩個如碗的柱頂，以及蓋着如碗柱頂的兩個網子；
Target:
 伊所造的有：
兩支柱、柱頂碗形的柱斗、蓋碗形柱斗的網仔；
Translation:
 托廟妝飾柱頂的柱斗；柱斗頂妝飾柱斗造兩綴銅碗牆頂碗牆角暈製造妝飾鍊鍊雕刻鍊𫞼兩圈妝飾圈套芳油雕造斗𫞼造兩斗柱斗邊碗牆造兩斗斗柱斗的網圈套妝飾柱斗的網仔斗造斗斗角暈幔造妝飾牙斗造妝飾牙籠造兩籠妝飾汗。

Source:
 要娶妻生兒養女，為你們的兒子娶妻，使你們的女兒嫁人，生兒養女。你們要在那裏生養眾多，不可減少。
Target:
 娶某生子，互恁的後生查某子結婚，養育子女。恁著佇遐生湠，毋通互恁的人口減少。
Translation:
 娶某子養飼女子替翁某囝作某，生子兒女。恁著佇遐生湠真多，毋通減少。

Source:
 於是耶穌說：「婦人，你的信心好大呀！照你所要的，給你成全吧！」她的女兒就在那時候好起來了。
Target:
 後來，耶穌應講：「婦仁人，你的信心真大。互你的心願實現！」伊的查某子佇彼陣就得著醫好。
Translation:
 然婦婦仁人，你的信心愈赴功效！」伊的查某子𫢶佇彼時陣。

Source:
 你看，他還公開講道，他們也不對他說甚麼。難道官長真的認為這是基督嗎？
Target:
 你看，伊公開咧講話，𪜶嘛無講什麼給伊反對。豈講諸個官長真正知此個人是基督？
Translation:
 開別句話歪喙，𪜶嘛𣍐給伊赦免什麼。

Source:
 所羅門王用黎巴嫩木，
為自己製造車子。
Target:
 所羅門王用黎巴嫩的柴，
為家己做一頂轎。
Translation:
 黎巴嫩林，
掘池臟持樓梯。

Source:
 於是末底改照以斯帖一切所吩咐的去做。
Target:
 末底改就照以斯帖的吩咐去做。
Translation:
 再換以斯帖之楗

Source:
 於是，約書亞派遣他們前去。他們行軍到埋伏的地方，伏在伯特利和艾城的中間，就是艾城的西邊。這夜，約書亞在士兵中間過夜。
Target:
 約書亞派𪜶去，𪜶就去埋伏佇艾城的西旁，伯特利及艾城的中間。彼暝，約書亞及戰士做夥佇軍營歇睏。
Translation:
 約瑟派偵探敘利亞畢閃避輪；𪜶𤆬伏伏兵閃避埋伏閃避掃羅珥滅亡。這暝暝時，約書亞佇士革夫中間。

Source

# Visualizing Attention

In [None]:
def evaluateAndShowAttention(input_sentence):
    output_words = evaluate_Sentence(model, input_sentence, input_lang, output_lang)
    print('input  =', input_sentence)
    print('output =', ''.join(output_words))

evaluateAndShowAttention(pairs[1][0])
evaluateAndShowAttention(pairs[2][0])
evaluateAndShowAttention(pairs[3][0])

input = 蔡崇名目前還是十八個認養兒，還是國內、國外各半的乾爹，是雲嘉南區最多乾兒子的愛心爸爸。
output = 押沙龍眼副養飼脯浴料還予政府、加巾舍票車藤懸戀耍呢乾丸怪？


input = 家扶中心社工和認養兒，稱呼他是「聯合國爸爸」，代表他的愛心不分海內外，更加重要是持續至今，對他有著深深感恩和尊敬。
output = 扶賊腐心佮認養主張聯聯盟友」，代表達薛月予東方斗閣較闊蒂頭足今暝日逼耙謳咾輕鬆，閣共占深事惹天情卑輕鬆威。


  ax.set_xticklabels([''] + input_sentence.split(' ') +['<EOS>'], rotation=90)
  ax.set_yticklabels([''] + output_words)


input = 蔡崇名是現任新營市大宏里長，和新營公益慈善基金會董事長。
output = 十萬新館市林口疊車踅、社營圍困 美妙直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直直徑屠高速直直。
input = 難道要強化兩岸經貿交流，也必須透過降低台灣的民主自由程度，來達成與中國同樣極權統治的水準？
output = 恁化滅負責任泰爾兇省，嘛著想欲降低此塔、民爭執政統治國內成做游泳池？


# BLEU adn CHRF++

In [None]:
test_df = pd.read_csv("data/iCorpus_test_sample100.csv").drop_duplicates().dropna()
test_df = test_df[test_df[SOURCE_LANGUAGE].str.len() <= MAX_LENGTH]
test_df = test_df[test_df[TARGET_LANGUAGE].str.len() <= MAX_LENGTH]

In [None]:
import evaluate
def compute_batch_sentence_BLEU(translation_sentence_list:list, correct_sentence_list:list):
    bleu = evaluate.load("bleu")
    results = bleu.compute(predictions=translation_sentence_list, 
                           references=correct_sentence_list,
                           smooth=True)
    return results['bleu']

def compute_batch_sentence_CHRF_plus_plus(translation_sentence_list:list, correct_sentence_list:list):
    chrf = evaluate.load("chrf")
    results = chrf.compute(predictions=translation_sentence_list, 
                           references=correct_sentence_list,
                           word_order=2)
    return results['score']
    
def generate(input_sentence, for_bleu = False):
    output_words = evaluate_Sentence(model, input_sentence, input_lang, output_lang)
    if for_bleu:
        output_words = " ".join(output_words)
    else:
        output_words = "".join(output_words)
    return output_words

test_df['翻譯'+TARGET_LANGUAGE] = test_df[SOURCE_LANGUAGE].apply(generate)
if WITHOUT_聖經:
    test_df.to_csv(f"output/iCorpus_test_sample100_without_聖經_Source{SOURCE_LANGUAGE}_TARGET{TARGET_LANGUAGE}.csv", index = False)
else:
    test_df.to_csv(f"output/iCorpus_test_sample100_Source{SOURCE_LANGUAGE}_TARGET{TARGET_LANGUAGE}.csv", index = False)
t = test_df['翻譯'+TARGET_LANGUAGE]
c = test_df[SOURCE_LANGUAGE]
t = [" ".join([word for word in sentence]) for sentence in t]
c = [[" ".join([word for word in sentence])] for sentence in c]

print(f"COmputing the BLEU Score ({SOURCE_LANGUAGE} to {TARGET_LANGUAGE}):")
bleu_score = compute_batch_sentence_BLEU(t, c)
chrf_score = compute_batch_sentence_CHRF_plus_plus(t, c) / 100
print(f"BLEU:   {bleu_score:.5f}\nCHRF++: {chrf_score:.5f}")

BLEU:   0.51159
CHRF++: 0.50190
