# Load and check libraries

In [1]:
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"]="python"
os.environ["CUDA_LAUNCH_BLOCKING"]="1"
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
import sentencepiece as spm

In [2]:
#%ip install accelerate -U
#%pip install -U transformers
import torch
import pandas as pd
import regex
import re
import numpy as np
from transformers import (
    EarlyStoppingCallback,
    BartForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq)
from itertools import zip_longest
from kesi import Ku
from StarCC import PresetConversion
from datasets import Dataset
print(torch.__version__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

2.0.1
cuda


In [3]:
LANGUAGE = "台文"

# Load Data and Preprocess

In [4]:
# load data
df2 = pd.read_csv("data/moedict_平行_人工調整.csv")
df3 = pd.read_csv("../平行語料/聖經平行語料_p_final.csv")
df5 = pd.read_csv("../平行語料/TAT_p.csv")
df = pd.concat([df2, df5], axis=0).drop_duplicates()
df = df[['中文', '台文', '台羅', '白話字']]
print(df.shape[0])
df.head()

16873


Unnamed: 0,中文,台文,台羅,白話字
0,人家說當官如果清廉，生活就會清苦,人講做官若清廉，食飯著攪鹽。,"Lâng kóng tsò-kuann nā tshing-liâm, tsia̍h-pn̄...","Lâng kóng chò-koaⁿ nā chheng-liâm, chia̍h-pn̄g..."
1,頭髮留那麼長還不剪，莫非是想把錢省下來買花生糖吃。,頭毛留長長，儉錢食塗豆糖。,"Thâu-mn̂g lâu tn̂g-tn̂g, khiām-tsînn tsia̍h th...","Thâu-mn̂g lâu tn̂g-tn̂g, khiām-chîⁿ chia̍h thô..."
2,這項買賣，一邊要，一邊不要。,這項買賣，一頭欲，一頭毋。,"Tsit hāng bé-bē, tsi̍t thâu beh, tsi̍t thâu m̄.","Chit hāng bé-bē, chi̍t thâu beh, chi̍t thâu m̄."
3,大事化小，小事化無。,大事化小事，小事化無事。,"Tuā-sū huà sió-sū, sió-sū huà bô sū.","Tōa-sū hòa sió-sū, sió-sū hòa bô sū."
4,北部、南部都很知名。,頂港有名聲，下港上出名。,"Tíng-káng ū miâ-siann, ē-káng siōng tshut-miâ.","Téng-káng ū miâ-siaⁿ, ē-káng siōng chhut-miâ."


In [5]:
# 轉白話字或漢羅
#%pip install KeSi
def lomaji2POJ(lomaji)->str:
    '''
    轉白話字
    '''
    ji_ls = re.split(r' |\xa0|\u3000', str(lomaji))
    trans_ji = ' '.join([Ku(ji).POJ().hanlo for ji in ji_ls])
    return trans_ji
def lomaji2KIP(lomaji)->str:
    '''
    轉羅馬字
    '''
    ji_ls = re.split(r' |\xa0|\u3000', str(lomaji))
    trans_ji = ' '.join([Ku(ji).KIP().hanlo for ji in ji_ls])
    return trans_ji

def fill_na(row):
    if pd.isnull(row['台羅']):
        row['台羅'] = lomaji2KIP(row['白話字'])
    if pd.isnull(row['白話字']):
        row['白話字'] = lomaji2POJ(row['台羅'])
    return row

df = df.apply(fill_na, axis=1).dropna()
df.shape


(16873, 4)

In [6]:
# concat the sentence
中文 = []
台文 = []
台羅 = []
白話字 = []
conti_suffixes = ["，", "、", "；"]
end_suffixes = ["。","？","！","」","）",")","!","?"]
ch,tai,tailo,poj = "","","",""
for i in range(len(df)):
    sen = df['中文'].iloc[i]
    if any(sen.rstrip().endswith(suffix) for suffix in conti_suffixes):
        ch   +=df['中文'].iloc[i].rstrip()
        tai  +=df['台文'].iloc[i].rstrip()
        tailo+=df['台羅'].iloc[i].rstrip()
        poj  +=df['白話字'].iloc[i].rstrip()
    if any(sen.rstrip().endswith(suffix) for suffix in end_suffixes):
        ch   +=df['中文'].iloc[i].rstrip()
        tai  +=df['台文'].iloc[i].rstrip()
        tailo+=df['台羅'].iloc[i].rstrip()
        poj  +=df['白話字'].iloc[i].rstrip()
        中文.append(ch)
        台文.append(tai)
        台羅.append(tailo)
        白話字.append(poj)
        ch,tai,tailo,poj = "","","",""
    if not any(sen.rstrip().endswith(suffix) for suffix in conti_suffixes+end_suffixes):
        中文.append(df['中文'].iloc[i].rstrip())
        台文.append(df['台文'].iloc[i].rstrip())
        台羅.append(df['台羅'].iloc[i].rstrip())
        白話字.append(df['白話字'].iloc[i].rstrip())
df =pd.DataFrame({
    "中文":中文,
    "台文":台文,
    "台羅":台羅,
    "白話字":白話字
})
from sklearn.utils import shuffle
df = shuffle(df, random_state=46)
df.shape

(15925, 4)

In [7]:
convert_tw2cn = PresetConversion(src='tw', dst='cn', with_phrase=False)

def segmentation(例句, 例句標音):
    '''
    ```python
    >>> 例句 = '塗跤一半擺仔無拭，袂偌垃圾啦！'
    >>> 例句標音 = 'Thôo-kha tsi̍t-puànn-pái-á bô tshit, bē guā lah-sap--lah!'
    >>> segmentation(例句, 例句標音)
    '塗跤|一半擺仔|無|拭|，|袂|偌|垃圾啦|！'
    ```
    '''
    s = re.sub('--','-',例句標音)
    s = re.sub(',|，',', ',例句標音)
    #s = re.sub('',', ',例句標音)
    #s = re.sub('「|」', ' inn ',例句標音)
    parts = regex.split(r' |(?=-)|(?=\p{Punct})', s)
    pattern = regex.compile(r'^[a-zA-Z]')

    split_sentence = [i for i in 例句]
    #合併英文名字
    sentence = []
    english_word = ""
    for character in split_sentence:
        if ord(character) < 128 and character != " ":
            english_word += character
        else:
            if len(english_word)!=0:
                sentence.append(english_word)
                english_word = ""
            sentence.append(character)

    res = []
    for (ch, part) in zip_longest(sentence, parts):
        if ch is None:
            break
        if part is None:  # Handle the case where there's no corresponding part for a character
            res.append(ch)
        elif not part.startswith('-'): # also detect ch is not start with English character
            res.append('/')
        res.append(ch)
    return ''.join(res[1:])

i = 1351
print(df['台文'].iloc[i])
print(df['台羅'].iloc[i])
print(segmentation(df['台文'].iloc[i], df['台羅'].iloc[i]))
j = 2902
print(df['台文'].iloc[j])
print(df['台羅'].iloc[j])
segmentation(df['台文'].iloc[j], df['台羅'].iloc[j])

遮的錢是我該當出的份額。
Tsia-ê tsînn sī guá kai-tong tshut ê hūn-gia̍h.
遮的/錢/是/我/該當/出/的/份額/。
我做甲流汗，猶閣予人嫌甲流瀾，有夠慼心啦！
Guá tsò kah lâu-kuānn, iáu-koh hōo lâng hiâm kah lâu-nuā, ū-kàu tsheh-sim--lah!


'我/做/甲/流汗/，/猶/閣予/人/嫌/甲/流/瀾，/有/夠/慼心/啦！'

In [8]:
華語 = []
台語 =[]
华语 = []
台语 = []
d = {}
for i in range(len(df)):
    例句標音 = df["台羅"].iloc[i].rstrip()
    #例句 = segmentation(df["台文"].iloc[i].rstrip(), 例句標音)
    例句 = df["台文"].iloc[i].rstrip()
    if 例句[-1] != 例句[-2] and (LANGUAGE == "台文" or LANGUAGE == "中文"):
        華語.append(df["中文"].iloc[i].rstrip())
        台語.append(例句)
    else:
        華語.append(df["中文"].iloc[i].rstrip())
        台語.append(例句)
for 華, 台 in zip(華語, 台語):
    华语.append(convert_tw2cn(華))
    台语.append(convert_tw2cn(台))

# Tokenzier

In [9]:
with open('data/vocab.txt', 'r', encoding='utf-8') as file:
    vocab_list = [vocab.rstrip() for vocab in file.readlines()]
print("number of vocab:{}".format(len(vocab_list)))

number of vocab:25643


In [10]:
from transformers import AddedToken, PreTrainedTokenizer
from typing import Union, Optional
from os import makedirs
from os.path import join
class CharBasedTokeniser(PreTrainedTokenizer):

    model_input_names = ['input_ids', 'attention_mask']

    def __init__(self, vocab: Union[list[str], str]) -> None:
        if isinstance(vocab, str):
            with open(vocab, encoding='utf-8') as f:
                vocab = [line.rstrip('\n') for line in f]

        super().__init__(
            pad_token=AddedToken('[PAD]'),
            unk_token=AddedToken('[UNK]'),
            bos_token=AddedToken('[BOS]'),
            eos_token=AddedToken('[EOS]'),
            mask_token=AddedToken('[MSK]'),
        )

        assert vocab[0] == '[PAD]'
        assert vocab[1] == '[UNK]'
        assert vocab[2] == '[BOS]'
        assert vocab[3] == '[EOS]'
        assert vocab[4] == '[MSK]'

        self.special_tokens_encoder = {
            self.pad_token: 0,
            self.unk_token: 1,
            self.bos_token: 2,
            self.eos_token: 3,
            self.mask_token: 4,
        }
        self._num_special_tokens = len(self.special_tokens_encoder)
        self.special_tokens_decoder = {v: k for k, v in self.special_tokens_encoder.items()}

        self.id2ch = vocab
        self.ch2id = {c: i for i, c in enumerate(vocab)}

        assert self.pad_token_id == 0
        assert self.unk_token_id == 1
        assert self.bos_token_id == 2
        assert self.eos_token_id == 3
        assert self.mask_token_id == 4

    @property
    def vocab_size(self):
        return len(self.id2ch)

    def build_inputs_with_special_tokens(self, token_ids_0: list[int], token_ids_1) -> list[int]:
        assert token_ids_1 is None
        return [self.bos_token_id, *token_ids_0, self.eos_token_id]

    def _tokenize(self, text: str) -> list[str]:
        return list(text)

    def _convert_token_to_id(self, token):
        unk_token_id = self.ch2id[self.unk_token]
        return self.ch2id.get(token, unk_token_id)

    def _convert_id_to_token(self, index):
        return self.id2ch[index]

    def convert_tokens_to_string(self, tokens):
        while tokens and tokens[-1] == '[PAD]':
            tokens.pop()
        if tokens and tokens[0] == '[BOS]':
            tokens = tokens[1:]
        if tokens and tokens[-1] == '[EOS]':
            tokens.pop()
        return ''.join(tokens)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        filename = 'vocab.txt'
        if filename_prefix is not None:
            filename = filename_prefix + filename
        if save_directory is not None:
            makedirs(save_directory, exist_ok=True)
            filename = join(save_directory, filename)

        with open(filename, 'w', encoding='utf-8') as f:
            for w in self.id2ch:
                print(w, file=f)

        return (filename,)

In [11]:
def get_tokenizer(language:str, type:str, vocab_list = None):
    if type == "sentenepiece":
        Hokkien_sp_model_file = "sentencepiece/"+ language +"tokenizer.model"
        Hokkien_sp_model = spm.SentencePieceProcessor()
        Hokkien_sp_model.Load(Hokkien_sp_model_file)
        Hokkien_spm = sp_pb2_model.ModelProto()
        Hokkien_spm.ParseFromString(Hokkien_sp_model.serialized_model_proto())
        new_vocab = sorted([p.piece for p in Hokkien_spm.pieces[3:]], key=lambda x: len(x),  reverse=True)
        return len(new_vocab), Hokkien_sp_model
        
    elif type == "CharBasedTokeniser" and language != "台文":
        word_list = [re.sub(r"！|!|_|＿|\?|？|\.|。|;|:|\(|\)", "", word) for sen in df[language] for word in regex.split(r' |(?=-)|(?=\p{Punct})', sen)]
        word_list = [word.split('\n') for word in word_list]
        word_list = [i for word in word_list for i in word]
        word_list = [re.sub(r"-+|──", "-", word) for word in word_list]+["！","!","_"",","＿","?","？",".","。",";",":","(",")"]
        new_vocab = [i for i in set(word_list)-set(vocab_list)]
        vocab_list = vocab_list+sorted(new_vocab, key=lambda x: len(x),  reverse=True)
        tokenizer = CharBasedTokeniser(vocab = vocab_list)
        return tokenizer.vocab_size, tokenizer

    elif type == "LLamaTokenizer":
        from transformers import LlamaTokenizer
        tokenizer = LlamaTokenizer.from_pretrained("tokenizer/"+language+"/translator_tokenizer_hf")
        return tokenizer.vocab_size+3, tokenizer   # [BOS] [EOS] [PAD]
               
    else:
        # default
        tokenizer = CharBasedTokeniser(vocab = vocab_list)
        return tokenizer.vocab_size, tokenizer


In [12]:
if LANGUAGE == "台文":
    vocab_length, tokenizer = get_tokenizer(LANGUAGE, "CharBasedTokeniser", vocab_list = vocab_list) 
    print("number of vocab before tokenize:{}".format(tokenizer.vocab_size))
elif LANGUAGE == "台羅":
    vocab_length, tokenizer = get_tokenizer(LANGUAGE, "CharBasedTokeniser", vocab_list = vocab_list)
    #print("number of vocab before tokenize:{}".format(tokenizer.vocab_size))
elif LANGUAGE == "白話字":
    vocab_length, tokenizer = get_tokenizer(LANGUAGE, "LLamaTokenizer", vocab_list = vocab_list)
    #print("number of vocab before tokenize: {}".format(tokenizer.vocab_size))


number of vocab before tokenize:25643


In [13]:
print(df["台文"].iloc[0])
print(tokenizer.tokenize(df["台文"].iloc[0]))

火車敗馬去，死傷真濟人。
['火', '車', '敗', '馬', '去', '，', '死', '傷', '真', '濟', '人', '。']


# Dataset and Data Loader

In [14]:
import math
# train test split
len_dataset = len(華語)
len_train = math.floor(len_dataset * 0.8)
len_dev = math.floor(len_dataset * 0.1)
len_test = len_dataset - len_train - len_dev

# adjust the model max length and tokenzier padding max length
max_len_hokkien = max(len(tokenizer.tokenize(l)) for l in 台语) + 2 
min_len_hokkien = min(len(tokenizer.tokenize(l)) for l in 台语) + 2 

if LANGUAGE == "台文":
    max_len_mandarin = max(len(tokenizer.tokenize(l)) for l in 华语) + 2 # 2: [BOS] and [EOS]
    min_len_mandarin = min(len(tokenizer.tokenize(l)) for l in 华语) + 2 # 2: [BOS] and [EOS]
    max_length = min(max(max_len_mandarin, max_len_hokkien), 200)
    min_length = min(min_len_mandarin, min_len_hokkien)
elif LANGUAGE == "台羅":
    max_len_kip = max(len(tokenizer.tokenize(l)) for l in df['台羅']) + 2 
    min_len_kip = min(len(tokenizer.tokenize(l)) for l in df['台羅']) + 2 
    max_length = min(max(min_len_hokkien, max_len_kip), 200)
    min_length = min(max_len_hokkien, min_len_kip)
elif LANGUAGE == "白話字":
    max_len_poj = max(len(tokenizer.tokenize(l)) for l in df['白話字']) + 2 
    min_len_poj = min(len(tokenizer.tokenize(l)) for l in df['白話字']) + 2 
    max_length = min(max(max_len_hokkien, max_len_poj), 200)
    min_length = min(max_len_hokkien, min_len_poj)

print(max_length)
print(min_length)


200
3


In [15]:
if LANGUAGE == "台文":
    source_language = 華語
    target_language = 台語
elif LANGUAGE == "台羅":
    assert len(华语) == len(df['台羅'])
    source_language = 台语
    target_language = [re.sub(r"-+|──", "-", word) for word in df['台羅']]
elif LANGUAGE == "白話字":
    assert len(华语) == len(df['白話字'])
    source_language = 台语
    target_language = [re.sub(r"-+|──", "-", word) for word in df['白話字']]

train_data_txt = Dataset.from_dict({
    "source language": source_language[:len_train],
    "target language": target_language[:len_train]
})
validation_data_txt = Dataset.from_dict({
    "source language": source_language[len_train:],
    "target language": target_language[len_train:]
})

In [16]:
def batch_tokenize_preprocess(batch, tokenizer, max_length):
    source, target = batch['source language'], batch['target language']
    
    source_tokenized = tokenizer(
        source, padding="max_length", truncation=True, max_length=max_length
    )
    target_tokenized = tokenizer(
        target, padding="max_length", truncation=True, max_length=max_length
    )

    batch = {k: v for k, v in source_tokenized.items()}
    # Ignore padding in the loss
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in l]
        for l in target_tokenized["input_ids"]
    ]
    return batch

train_data = train_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, max_length
    ),
    batched=True,
    remove_columns=train_data_txt.column_names,
)

validation_data = validation_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, max_length
    ),
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

train_data

Map:   0%|          | 0/12740 [00:00<?, ? examples/s]

Map:   0%|          | 0/3185 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 12740
})

# Model

In [17]:
# special tokens map to model config
# for tokenizer
model = BartForConditionalGeneration.from_pretrained("fnlp/bart-base-chinese")
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.mask_token_id = tokenizer.mask_token_id
model.config.unk_token_id = tokenizer.unk_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.forced_eos_token_id = tokenizer.eos_token_id
print("BOS: ", model.config.bos_token_id)
print("EOS: ",model.config.eos_token_id)
print("PAD: ",model.config.pad_token_id)
print("MASK: ",model.config.mask_token_id)
print("UNK: ",model.config.unk_token_id)
print("Decoder Start Token ID: ",model.config.decoder_start_token_id)
print("Forced EOS Token ID: ",model.config.forced_eos_token_id)
# error here!!!!
model.resize_token_embeddings(vocab_length)
# for decode max_length
model.config.task_specific_params = {"translation": {
  "length_penalty": 1.0,
  "max_length": max_length,   # Adjust based on the expected translation length
  "min_length": min_length,   # Adjust based on the expected translation length
  "num_beams": 4              # Experiment with this value
}}
model.config.max_length = max_length
#model.config.max_length = 200
print("Decode max_length: ",model.config.max_length)

BOS:  2
EOS:  3
PAD:  0
MASK:  4
UNK:  1
Decoder Start Token ID:  2
Forced EOS Token ID:  3
Decode max_length:  200


# Training

In [18]:
import evaluate
bleu = evaluate.load("bleu")
chrf = evaluate.load("chrf")
#evaluate.list_evaluation_modules()

In [19]:
def test_metric():
    decoded_preds = ['好機會']
    decoded_labels = ['好字運']
    c = chrf.compute(predictions=["".join(tokenizer.tokenize(pred)) for pred in decoded_preds], references=["".join(tokenizer.tokenize(label)) for label in decoded_labels], word_order=2)
    b = bleu.compute(predictions=[" ".join(tokenizer.tokenize(pred)) for pred in decoded_preds], references=[" ".join(tokenizer.tokenize(label)) for label in decoded_labels], smooth=True)
    c.update(b)
    del c['precisions']
    c['score'] = c['score'] / 100
    return {
        "CHRF++": c['score'],
        "BLEU Score": c['bleu']
    }
#test_metric()

In [20]:
convert_cn2tw = PresetConversion(src='cn', dst='tw', with_phrase=False)

def process_text_evaluate(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    return preds, labels

def compute_bleu(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = process_text_evaluate(decoded_preds, decoded_labels)
    # Compute CHRF++ scores with (word_order=2, lowercase=True)
    chrf_results = chrf.compute(predictions=["".join(tokenizer.tokenize(pred)) for pred in decoded_preds], references=["".join(tokenizer.tokenize(label)) for label in decoded_labels], word_order=2)
    # Compute BLEU scores
    bleu_results = bleu.compute(predictions=[" ".join(tokenizer.tokenize(pred)) for pred in decoded_preds], references=[" ".join(tokenizer.tokenize(label)) for label in decoded_labels], smooth=True)
    chrf_results.update(bleu_results)
    del chrf_results['precisions']
    chrf_results['score'] = chrf_results['score'] / 100
    return {
        "CHRF++": chrf_results['score'],
        "BLEU Score": chrf_results['bleu']
    }

Building prefix dict from the default dictionary ...


Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.520 seconds.
Prefix dict has been built successfully.


In [21]:
from datetime import datetime
formatted_datetime = (str(datetime.now().year)+"_"+
                      str(datetime.now().month)+"_"+
                      str(datetime.now().day)+"_"+
                      str(datetime.now().hour)+"_"+
                      str(datetime.now().minute)+"_maxLength"+
                      str(model.config.max_length))
                      
if LANGUAGE == "台文":
    output_dir = "model/台文/" + formatted_datetime
elif LANGUAGE == "台羅":
    output_dir = "model/台羅/" + formatted_datetime
elif LANGUAGE == "白話字":
    output_dir = "model/白話字/" + formatted_datetime
print(f"model is save at: {output_dir}")
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=30,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=40, 
    per_device_eval_batch_size=40,
    learning_rate=3e-04,
    warmup_steps=2000,
    weight_decay=0.05,
    label_smoothing_factor=0.1,
    predict_with_generate=True,
    generation_max_length = model.config.max_length, # max length for best bleu score
    save_total_limit=2,
    save_strategy='epoch',
    evaluation_strategy="epoch", # Specify the evaluation strategy based on steps
    load_best_model_at_end = True # this will let the model save the best checkpoint
    # no_cuda = True    
)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_bleu,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)]
)

model is save at: model/台文/2023_9_11_11_57_maxLength200


In [22]:
WANDB_INTEGRATION = True
if WANDB_INTEGRATION:
    import wandb
    wandb.login(key="f65de74e8b17eee687fc6b6a91dc155659e02a5d")
from datetime import datetime

if WANDB_INTEGRATION:
    wandb_run = wandb.init(
        project="bart_wiki_lingua",
        config={
            "per_device_train_batch_size": training_args.per_device_train_batch_size,
            "learning_rate": training_args.learning_rate
        },
    )

    now = datetime.now()
    current_time = now.strftime("%H%M%S")
    wandb_run.name = "run_" + "Chinese-Hokkien" + "_" + current_time

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjoeyliang[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/u5110390/.netrc


In [23]:
model.config

BartConfig {
  "_name_or_path": "fnlp/bart-base-chinese",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 2,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 3,
  "forced_eos_token_id": 3,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "mask_token_id": 4,
  "max_length": 200,
  "max_p

In [24]:
!nvidia-smi

Mon Sep 11 11:57:39 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 12.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:1C:00.0 Off |                    0 |
| N/A   29C    P0    57W / 300W |    999MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [25]:
prediction = trainer.predict(validation_data)
print(prediction.metrics)
print(prediction.predictions.shape)



Exception ignored in: <function _xla_gc_callback at 0x7fab0a772480>
Traceback (most recent call last):
  File "/home/u5110390/.local/lib/python3.11/site-packages/jax/_src/lib/__init__.py", line 98, in _xla_gc_callback
    def _xla_gc_callback(*args):
    
KeyboardInterrupt: 


KeyboardInterrupt: 

In [None]:
# start training
trainer.train()



Epoch,Training Loss,Validation Loss


In [None]:
prediction = trainer.predict(validation_data)
print(prediction.metrics)
print(prediction.predictions.shape)

{'eval_loss': 1.403465986251831,
 'eval_bleu': 0.13463581588592002,
 'eval_precisions': [0.9842016075820287,
  0.9729695034379839,
  0.964005314651528,
  0.9558593021086024],
 'eval_brevity_penalty': 0.13891413282668094,
 'eval_length_ratio': 0.3362588654717452,
 'eval_translation_length': 133368,
 'eval_reference_length': 396623,
 'eval_runtime': 437.5751,
 'eval_samples_per_second': 19.953,
 'eval_steps_per_second': 0.999,
 'epoch': 30.0}

In [None]:
if WANDB_INTEGRATION:
    wandb_run.finish()

VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/bleu,▁▃▇█████████████████████████████████████
eval/brevity_penalty,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/length_ratio,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/reference_length,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,█▂▂▁▁▁▁▂▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▂▂
eval/samples_per_second,▁▇▇▇▇▇█▇▇██████▇███████▇███████▇██████▇▇
eval/steps_per_second,▁▇▇▇▇▇█▇▇██████▇███████▇███████▇██████▇▇
eval/translation_length,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/bleu,0.13464
eval/brevity_penalty,0.13891
eval/length_ratio,0.33626
eval/loss,1.40347
eval/reference_length,396623.0
eval/runtime,437.5751
eval/samples_per_second,19.953
eval/steps_per_second,0.999
eval/translation_length,133368.0
train/epoch,30.0


# Generate translation

In [None]:
def generate_summary(test_samples, model):
    inputs = tokenizer(
        test_samples["source language"][0:50],
        padding="max_length",
        truncation=True,
        max_length=max_len_mandarin,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str
    
def translate_sentence(sentence:str, model)->str:
    sen_dataset = Dataset.from_dict({
                                    "source language": [sentence]
                                })
    _, translation  = generate_summary(sen_dataset, model)
    return _, translation

In [None]:
tokenizer.decode([    2,     2, 19063, 13043,  9448,  5006,  9026, 16474, 13280,  4863,
8576,     1, 23709, 11757, 20011,     1,     1,  3322,     3])

'[BOS]警然我佮恁老爸仝年[UNK]，毋过[UNK][UNK]。'

In [None]:
model_path = "model/台文/results_2023-08-15-11-44-02-843691/checkpoint-9500"
model = BartForConditionalGeneration.from_pretrained(model_path)
sentences = [
    "雖然我和你爸爸同年齡，但是論輩份不論年紀，咱們算同輩，叫我哥哥就好。",
    "散會時，我們準備了三、四百塊鳳梨酥，在大門口送給客人。",
    "他活著的時候做了很多好事，死後一定會上天堂。",
    "傷口在上藥的時候有刺痛感。",
    "他好幾餐沒吃，餓過頭，沒體力竟暈倒了",
    "人不舒服，一直感到身體熱起來。",
    "民主黨在參眾兩院選舉也是贏家，在已開出的票中，民主黨在一百席的參院雖未跨過六十席門檻，但已取得五十六席，在眾院的席次也增為二百五十二席。"
]
for sentence in sentences :
    translation = translate_sentence(sentence, model)
    print(translation)

(tensor([[    2,     2, 19063, 13043,  9448,  5006,  9026, 16474, 13280,  4863,
          8576,     1, 23709, 11757, 20011,     1,     1,  3322,     3]]), ['警然我佮恁老爸仝年，毋过。'])
(tensor([[    2,     2, 10226,     1,     1,     3]]), ['散'])
(tensor([[    2,     2,  4907, 21919, 17529, 14310,     1,  5169,  5231, 14454,
         12136,  7398,  4868, 23709, 11685, 16259,  4646,  7904,     1,     3]]), ['伊食著的候做真济好代，死缝一定'])
(tensor([[    2,     2,     1,  4854,  6065,  4656,     1, 14310,     1,  4854,
         10627,  5675, 14133,  9238,  3322,     3]]), ['仔口上的仔有刺疼感。'])
(tensor([[    2,     2,  4907,  7398, 14275, 21919, 23709,     1,  5759,  6492,
             3]]), ['伊好癖食，力喔'])
(tensor([[    2,     2,  4828, 10343, 13284,  8940, 23709, 14411, 14411, 14029,
         11710,  3322,     3]]), ['人无爽快，直直畏殕。'])
(tensor([[    2,     2, 11825,  7329, 11773,     1, 23709, 11757,  5202,  4741,
         10421,     1,  7931,  3322,     3]]), ['民头毛，毋值也是家。'])
