In [1]:
import sys

# Make the local "tformer" folder importable (the path is relative to the
# notebook's working directory). The membership check keeps this cell
# idempotent: re-running it does not append a duplicate sys.path entry.
if "tformer" not in sys.path:
    sys.path.append("tformer") # add tformer folder

# presumably `train` is resolved through the sys.path entry added above — TODO confirm
from train import train_model, test_model
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings — confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
def get_config():
    """Build the settings dict passed to ``train_model`` / ``test_model``.

    A new dict is created on every call, so callers may mutate the result
    without affecting later calls.

    NOTE(review): the training run recorded further down in this notebook
    reports a maximum source length of 2400 tokens and then aborts with
    "ValueError: Sentence is too long" while "seq" is 350 — presumably the
    two are related; confirm against the dataset code before re-running.

    Returns:
        dict: hyper-parameters, dataset identifiers and output locations.
    """
    settings = dict(
        batch_size=64,   # For A6000(48G)
        num_epochs=21,
        lr=10**-4,
        seq=350,
        d_model=512,
        datasource='lemon-mint/korean_english_parallel_wiki_augmented_v1',
        # Column names read from each dataset row for source / target text.
        lang_src_fieldname="english",
        lang_tgt_fieldname="korean",
        model_folder="weights",
        model_basename="tmodel_",
        preload="latest",
        # "{0}" is filled in via str.format when the tokenizer path is built.
        tokenizer_file="tokenizer_{0}.json",
        experiment_name="runs/tmodel",
    )
    return settings

# def get_config():
#     return {
#         "batch_size": 64,
#         "num_epochs": 21,
#         "lr": 10**-4,
#         "seq": 350,
#         "d_model": 512,
#         "datasource": 'opus_books',
#         "lang_src": "en",
#         "lang_tgt": "it",
#         "model_folder": "weights",
#         "model_basename": "tmodel_",
#         "preload": "latest",
#         "tokenizer_file": "tokenizer_{0}.json",
#         "experiment_name": "runs/tmodel"
#     }


# Materialise the settings once; the training and evaluation cells below
# receive this same dict.
config = get_config()

### Training

In [3]:
# Launch training with the config built above.
# NOTE(review): the output recorded below stops at epoch 00 with
# "ValueError: Sentence is too long" after reporting a max source length of
# 2400 tokens, while config["seq"] is 350 — presumably over-length pairs have
# to be filtered/truncated or "seq" raised; confirm in the dataset code.
train_model(config)

Using device: cuda
Device name: NVIDIA RTX A6000
Device memory: 47.526 GB
Max length of source sentence: 2400
Max length of target sentence: 1763
No model to preload, starting from scratch


Processing Epoch 00:   0%|          | 0/7077 [00:00<?, ?it/s]


ValueError: Sentence is too long

### Evaluating

In [None]:
# Evaluate using the same config dict that was passed to train_model.
# NOTE(review): this cell has no recorded output (In [None]) — it has not
# been executed in the saved notebook.
test_model(config)

### [TEST] EN-KR

In [None]:
from torch.utils.data import Dataset, DataLoader, random_split

# Huggingface datasets and tokenizers
from datasets import load_dataset
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

def get_config():
    """Build the settings dict used by the EN-KR test cell.

    This redefines ``get_config`` and therefore shadows the definition from
    the earlier cell of this notebook. Compared with that one it additionally
    carries the short codes "lang_src" / "lang_tgt" next to the
    "*_fieldname" column names.

    NOTE(review): the ISO 639-1 code for Korean is "ko"; "kr" is kept
    unchanged here — confirm nothing downstream keys on it.

    Returns:
        dict: a freshly created dict on every call.
    """
    settings = dict(
        batch_size=64,   # For A6000(48G)
        num_epochs=21,
        lr=10**-4,
        seq=350,
        d_model=512,
        datasource='lemon-mint/korean_english_parallel_wiki_augmented_v1',
        lang_src="en",
        lang_src_fieldname="english",
        lang_tgt="kr",
        lang_tgt_fieldname="korean",
        model_folder="weights",
        model_basename="tmodel_",
        preload="latest",
        # "{0}" is filled in via str.format when the tokenizer path is built.
        tokenizer_file="tokenizer_{0}.json",
        experiment_name="runs/tmodel",
    )
    return settings

# Rebind `config` to the dict from the get_config defined just above in this
# cell (it replaces the value assigned in the earlier configuration cell).
config = get_config()

def get_all_sentences_lemonmint(ds, lang):
    """Lazily yield ``row[lang]`` for every row of ``ds``.

    Args:
        ds: iterable of rows that can be indexed by ``lang``.
        lang: key selecting the sentence inside each row.

    Yields:
        The value stored under ``lang``, in iteration order of ``ds``.
    """
    yield from (row[lang] for row in ds)

def get_all_sentences_for_opusbooks(ds, lang):
    """Lazily yield ``row['translation'][lang]`` for every row of ``ds``.

    Args:
        ds: iterable of rows holding a 'translation' mapping.
        lang: key selecting the sentence inside that mapping.

    Yields:
        The value stored under ``row['translation'][lang]``, in iteration
        order of ``ds``.
    """
    yield from (row['translation'][lang] for row in ds)

def get_or_build_tokenizer(config, ds, lang, get_all_sentences):
    """Return the tokenizer for ``lang``, training and saving it if absent.

    The file location is ``config['tokenizer_file']`` formatted with ``lang``.
    When that file exists it is loaded as-is; otherwise a word-level
    tokenizer is trained on ``get_all_sentences(ds, lang)`` and written there.

    Args:
        config: mapping providing the 'tokenizer_file' format string.
        ds: dataset handed to ``get_all_sentences``.
        lang: value substituted into the file name and passed to
            ``get_all_sentences``.
        get_all_sentences: callable ``(ds, lang) -> iterator`` of sentences.

    Returns:
        The loaded or newly trained ``Tokenizer``.
    """
    tokenizer_file = Path(config['tokenizer_file'].format(lang))

    # Fast path: reuse the tokenizer saved by an earlier run.
    if tokenizer_file.exists():
        return Tokenizer.from_file(str(tokenizer_file))

    # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
    built = Tokenizer(WordLevel(unk_token="[UNK]"))
    built.pre_tokenizer = Whitespace()
    special_tokens = ["[UNK]", "[PAD]", "[SOS]", "[EOS]"]
    word_trainer = WordLevelTrainer(special_tokens=special_tokens, min_frequency=2)
    built.train_from_iterator(get_all_sentences(ds, lang), trainer=word_trainer)
    built.save(str(tokenizer_file))
    return built

# It only has the train split, so we divide it ourselves
# Remember which datasource layout is in use so that the sentence helper, the
# dataset wrapper and the length scan below all agree with each other.
is_lemonmint = config['datasource'] == 'lemon-mint/korean_english_parallel_wiki_augmented_v1'
if is_lemonmint:
    ds_raw = load_dataset(config['datasource'], split='train')
    # Rows are indexed directly by the column names from the config.
    lang_src = config['lang_src_fieldname']
    lang_tgt = config['lang_tgt_fieldname']
    func_get_all_sentences = get_all_sentences_lemonmint
else:
    ds_raw = load_dataset(config['datasource'], f"{config['lang_src']}-{config['lang_tgt']}", split='train')
    # Rows keep both sentences under item['translation'][<lang code>].
    lang_src = config['lang_src']
    lang_tgt = config['lang_tgt']
    func_get_all_sentences = get_all_sentences_for_opusbooks

# Build tokenizers
tokenizer_src = get_or_build_tokenizer(config, ds_raw, lang_src, func_get_all_sentences)
tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, lang_tgt, func_get_all_sentences)

# Keep 90% for training, 10% for validation
# NOTE(review): random_split is called without a seeded generator, so the
# split differs between kernel sessions.
train_ds_size = int(0.9 * len(ds_raw))
val_ds_size = len(ds_raw) - train_ds_size
train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

from dataset import TranslationDatasetForOpusbooks, TranslationDatasetForLemonmint
# Use the wrapper that matches the datasource chosen above instead of always
# using the lemon-mint one.
# assumes both wrappers take the same constructor arguments — TODO confirm in dataset.py
dataset_cls = TranslationDatasetForLemonmint if is_lemonmint else TranslationDatasetForOpusbooks
train_ds = dataset_cls(train_ds_raw, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, config['seq'])
val_ds = dataset_cls(val_ds_raw, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, config['seq'])


# Find the maximum length of each sentence in the source and target sentence
# NOTE(review): the training output recorded earlier in this notebook shows
# 2400 / 1763 here followed by "ValueError: Sentence is too long" with
# config['seq'] == 350 — presumably lengths above 'seq' are rejected; confirm.
max_len_src = 0
max_len_tgt = 0

for item in ds_raw:
    # Read the sentence pair using the same layout the sentence helpers use:
    # flat columns for lemon-mint, nested under 'translation' otherwise.
    pair = item if is_lemonmint else item['translation']
    src_ids = tokenizer_src.encode(pair[lang_src]).ids
    tgt_ids = tokenizer_tgt.encode(pair[lang_tgt]).ids
    max_len_src = max(max_len_src, len(src_ids))
    max_len_tgt = max(max_len_tgt, len(tgt_ids))

print(f'Max length of source sentence: {max_len_src}')
print(f'Max length of target sentence: {max_len_tgt}')


# Training batches use the configured batch size; validation yields one
# example at a time.
train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

