In [2]:
import sys
# Make the local "tformer" folder importable; the cells below do
# `from train import ...` (presumably `train` lives in that folder — confirm).
# The entry is relative, so it is resolved against the kernel's working directory.
# Guarded so that re-running this cell does not keep appending duplicates.
if "tformer" not in sys.path:
    sys.path.append("tformer") # add tformer folder

### Training

In [None]:
from train import train_model
import warnings

# Blanket filter: no category or module is given, so every warning raised after
# this point is suppressed for the rest of the kernel session.
# NOTE(review): this also hides deprecation/numerical warnings from the training
# code — consider narrowing it once the noisy source is known.
warnings.filterwarnings("ignore")

def get_config():
    """Return the settings dict handed to ``train_model`` in this cell.

    A new dict is created on every call, so the caller can modify the result
    without affecting later calls.
    """
    cfg = dict(
        batch_size=64,
        num_epochs=21,
        lr=10**-4,
        # The recorded run reports max sentence lengths of 309 / 274 tokens,
        # so 350 leaves headroom (presumably the model's sequence length —
        # confirm in train.py).
        seq=350,
        d_model=512,
        datasource='opus_books',
        lang_src="en",
        lang_tgt="it",
        model_folder="weights",
        model_basename="tmodel_",
        # The recorded run resumed from tmodel_19.pt with this value.
        preload="latest",
        # "{0}" is a placeholder (elsewhere in this notebook it is filled
        # with a language code via str.format).
        tokenizer_file="tokenizer_{0}.json",
        experiment_name="runs/tmodel",
    )
    return cfg

# Build the settings once and keep them in the notebook-level `config`,
# then start training with them.
# In the recorded output below, the run preloaded tmodel_19.pt and processed
# epoch 20, i.e. it resumed from an existing checkpoint rather than starting
# from scratch.
config = get_config()
train_model(config)

Using device: cuda
Device name: NVIDIA RTX A6000
Device memory: 47.988 GB
Max length of source sentence: 309
Max length of target sentence: 274
Preloading model opus_books_weights\tmodel_19.pt


Processing Epoch 20: 100%|██████████| 455/455 [06:27<00:00,  1.18it/s, loss=3.076]


--------------------------------------------------------------------------------
    SOURCE: "We will wait a few minutes, Jane, till you are more composed."
    TARGET: — Jane, aspettiamo che siate più calma.
 PREDICTED: — qualche minuto , Jane ; fino a quanto siete più calma .
--------------------------------------------------------------------------------
    SOURCE: You are both to blame: You more in external matters and he more in essential ones.'
    TARGET: Avete torto tutti e due. Tu hai torto in un modo più formale, lui in un modo più sostanziale.
 PREDICTED: Voi non siete colpevole ; voi siete più superficiale e più .
--------------------------------------------------------------------------------


### Evaluating

In [4]:
from train import test_model
import warnings

# Blanket filter: with no category or module given, all warnings raised from
# here on are suppressed for the rest of the kernel session.
# NOTE(review): consider restricting this to the specific warning that is noisy.
warnings.filterwarnings("ignore")

def get_config():
    """Return the settings dict handed to ``test_model`` in this cell.

    The values are grouped below only for readability; the returned dict is
    flat and is rebuilt on every call.
    """
    # Optimisation / model-size settings.
    training = {
        "batch_size": 64,
        "num_epochs": 21,
        "lr": 10**-4,
        "seq": 350,
        "d_model": 512,
    }
    # Which corpus and language pair to use.
    data = {
        "datasource": 'opus_books',
        "lang_src": "en",
        "lang_tgt": "it",
    }
    # Names used for checkpoints, tokenizer files and the experiment log.
    # With preload "latest", the recorded run loaded tmodel_20.pt.
    artifacts = {
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": "latest",
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel",
    }
    return {**training, **data, **artifacts}

# Build the settings and run the evaluation entry point with them.
# In the recorded output below, this loaded tmodel_20.pt and printed
# SOURCE / TARGET text for a sample.
config = get_config()
test_model(config)

Using device: cuda
Device name: NVIDIA RTX A6000
Device memory: 47.988 GB
Max length of source sentence: 309
Max length of target sentence: 274
Preloading model opus_books_weights\tmodel_20.pt
--------------------------------------------------------------------------------
    SOURCE: And do not let any one impugn this statement with the trite proverb that "He who builds on the people, builds on the mud," for this is true when a private citizen makes a foundation there, and persuades himself that the people will free him when he is oppressed by his enemies or by the magistrates; wherein he would find himself very often deceived, as happened to the Gracchi in Rome and to Messer Giorgio Scali in Florence.
    TARGET: E non sia alcuno che repugni a questa mia opinione con quello proverbio trito, che chi fonda in sul populo, fonda in sul fango: perché quello è vero, quando uno cittadino privato vi fa su fondamento, e dassi ad intendere che il populo lo liberi, quando fussi oppresso da' nim

### EN-KR

In [None]:
from pathlib import Path

# Huggingface datasets and tokenizers
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer

# PyTorch data utilities used further down in this cell
import torch
from torch.utils.data import DataLoader, random_split

def get_config():
    """Return the settings dict for the English–Korean experiment.

    Same keys as the EN→IT configuration used earlier in the notebook, but
    pointing at a Korean–English parallel corpus. A new dict is returned on
    every call.
    """
    return {
        "batch_size": 64,   # For A6000(48G)
        "num_epochs": 21,
        "lr": 10**-4,
        "seq": 350,
        "d_model": 512,
        "datasource": 'lemon-mint/korean_english_parallel_wiki_augmented_v1',
        "lang_src": "en",
        # Was "it" (left over from the EN→IT config). With a Korean target that
        # would resolve the tokenizer file to tokenizer_it.json, i.e. the
        # Italian tokenizer's path. "ko" is the ISO 639-1 code for Korean.
        # NOTE(review): the rest of this cell uses these codes as the dataset
        # configuration name ("<src>-<tgt>") and as keys into
        # item['translation'] — confirm both against this dataset's actual
        # configurations and column names.
        "lang_tgt": "ko",
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": "latest",
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel"
    }

# Notebook-level settings; the dataset loading, tokenizer building and
# DataLoader construction below all read from this dict.
config = get_config()

def _iter_sentences(ds, lang):
    """Yield the ``lang`` side of every row in ``ds``.

    Assumes each row has the layout ``{'translation': {<lang code>: <text>}}``,
    which is how rows are indexed elsewhere in this notebook — TODO confirm
    for datasets other than opus_books.
    """
    for row in ds:
        yield row['translation'][lang]


def get_or_build_tokenizer(config, ds, lang):
    """Return the word-level tokenizer for ``lang``, training it first if needed.

    The tokenizer is cached on disk at ``config['tokenizer_file'].format(lang)``.
    If that file exists it is loaded as-is; otherwise a new tokenizer is trained
    on the ``lang`` sentences of ``ds`` and saved to that path.

    Args:
        config: Settings dict; only ``config['tokenizer_file']`` is read here.
        ds: Iterable of dataset rows (only iterated when training is needed).
        lang: Language code, used both for the file name and to select the text.

    Returns:
        A ``tokenizers.Tokenizer`` instance.

    NOTE(review): the cache path depends only on ``lang``, not on the dataset,
    so a tokenizer file left behind by a different corpus with the same
    language code will be reused silently.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Fast path: a previously saved tokenizer is reused without touching `ds`.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    # min_frequency=2 is passed to the trainer (presumably a word must occur at
    # least twice to enter the vocabulary — see the tokenizers documentation).
    trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
    # The sentence iterator is defined next to this function so the cell does
    # not depend on a helper that is neither defined nor imported in the
    # visible notebook.
    tokenizer.train_from_iterator(_iter_sentences(ds, lang), trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

# Fetch the raw corpus. Only the 'train' split is requested; the
# train/validation division is done by hand further down.
if config['lang_src'] != "" and config['lang_tgt'] != "":
    # Both language codes are set: select the "<src>-<tgt>" dataset configuration.
    # NOTE(review): confirm the configured datasource actually exposes a
    # configuration with that name.
    ds_raw = load_dataset(
        config['datasource'],
        f"{config['lang_src']}-{config['lang_tgt']}",
        split='train',
    )
else:
    # At least one code is empty: load without naming a configuration.
    ds_raw = load_dataset(config['datasource'], split='train')

# One tokenizer per language, loaded from disk or trained on the raw corpus.
tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config['lang_tgt'])

# Keep 90% for training, 10% for validation.
# The split is drawn from a dedicated, seeded generator so that re-running the
# notebook yields the same train/validation partition (the global torch RNG is
# left untouched).
SPLIT_SEED = 42
train_ds_size = int(0.9 * len(ds_raw))
val_ds_size = len(ds_raw) - train_ds_size  # remainder, so the two sizes always sum to len(ds_raw)
train_ds_raw, val_ds_raw = random_split(
    ds_raw,
    [train_ds_size, val_ds_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Wrap both raw subsets with the same tokenizers, language codes and sequence length.
# NOTE(review): TranslationDataset is not imported anywhere in the visible
# notebook — presumably it comes from the tformer folder; confirm and import it.
train_ds = TranslationDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq'])
val_ds = TranslationDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq'])

# Scan the whole raw corpus and record the longest tokenized sentence on each
# side (measured in token ids produced by the respective tokenizer).
max_len_src, max_len_tgt = 0, 0

for item in ds_raw:
    pair = item['translation']
    n_src = len(tokenizer_src.encode(pair[config['lang_src']]).ids)
    n_tgt = len(tokenizer_tgt.encode(pair[config['lang_tgt']]).ids)
    if n_src > max_len_src:
        max_len_src = n_src
    if n_tgt > max_len_tgt:
        max_len_tgt = n_tgt

print(f'Max length of source sentence: {max_len_src}')
print(f'Max length of target sentence: {max_len_tgt}')


# Training batches use the configured batch size and are shuffled.
train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
# Validation is served one example at a time (batch_size=1), also shuffled.
val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)



README.md:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/285M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/285M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/503245 [00:00<?, ? examples/s]