<a href="https://colab.research.google.com/github/jessicangu/english-french-nmtbaseline/blob/main/fr_en_nmt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import os
import random
from collections import Counter
import sentencepiece as spm

drive.mount('/content/drive', force_remount=True)

!mkdir -p /content/drive/MyDrive/nmt_project/
!mkdir -p /content/drive/MyDrive/nmt_project/checkpoints
!mkdir -p /content/drive/MyDrive/nmt_project/data
!mkdir -p /content/drive/MyDrive/nmt_project/models
!mkdir -p /content/drive/MyDrive/nmt_project/vocab

!mkdir -p /content/data
!mkdir -p /content/data/europarl
!mkdir -p /content/data/processed
!mkdir -p /content/models

def step_completed(checkpoint_file):
    """Tell whether the marker file for a processing step exists on Drive.

    Args:
        checkpoint_file: Marker path relative to
            /content/drive/MyDrive/nmt_project/ (for example
            'checkpoints/bpe_completed').

    Returns:
        True if that path exists, False otherwise.
    """
    marker_path = f'/content/drive/MyDrive/nmt_project/{checkpoint_file}'
    return os.path.exists(marker_path)

def mark_step_completed(checkpoint_file):
    """Mark a processing step as completed by creating its marker file.

    The marker is an empty file at
    /content/drive/MyDrive/nmt_project/<checkpoint_file>; missing parent
    directories are created first. This is done with the standard library
    rather than shell `mkdir`/`touch`, so marker names containing spaces or
    shell metacharacters are handled and failures are reported.

    Args:
        checkpoint_file: Marker path relative to the project folder on Drive.

    Returns:
        None. If the marker cannot be written, a warning is printed and the
        notebook continues; the step is then simply not marked as done.
    """
    marker_path = f'/content/drive/MyDrive/nmt_project/{checkpoint_file}'
    checkpoint_dir = os.path.dirname(marker_path)
    try:
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)
        # Same effect as `touch`: create the file if it is missing (append mode
        # never truncates an existing one) and refresh its timestamps.
        with open(marker_path, 'a'):
            os.utime(marker_path, None)
    except OSError as err:
        print(f"Warning: could not write checkpoint marker {marker_path}: {err}")

if not step_completed('checkpoints/dependencies_installed'):
    print("Installing dependencies...")
    !pip install OpenNMT-py scikit-learn sacrebleu mosestokenizer sentencepiece

    if not os.path.exists('/content/OpenNMT-py'):
        !git clone https://github.com/OpenNMT/OpenNMT-py
    else:
        print("OpenNMT-py directory already exists. Skipping clone.")

    mark_step_completed('checkpoints/dependencies_installed')
else:
    print("Dependencies already installed.")

    if not os.path.exists('/content/OpenNMT-py'):
        !git clone https://github.com/OpenNMT/OpenNMT-py

Mounted at /content/drive
Dependencies already installed.


In [2]:
def upload_or_load_data():
    """Upload files if needed or load from Google Drive if available."""

    drive_files_exist = False
    if step_completed('checkpoints/data_uploaded'):
        print("Checking files in Google Drive...")

        !ls -la /content/drive/MyDrive/nmt_project/data/europarl/europarl-v7.fr-en.*

        import os
        en_size = os.path.getsize('/content/drive/MyDrive/nmt_project/data/europarl/europarl-v7.fr-en.en') if os.path.exists('/content/drive/MyDrive/nmt_project/data/europarl/europarl-v7.fr-en.en') else 0
        fr_size = os.path.getsize('/content/drive/MyDrive/nmt_project/data/europarl/europarl-v7.fr-en.fr') if os.path.exists('/content/drive/MyDrive/nmt_project/data/europarl/europarl-v7.fr-en.fr') else 0

        if en_size > 0 and fr_size > 0:
            print("Valid files found in Drive, copying to local environment...")
            !cp -r /content/drive/MyDrive/nmt_project/data/europarl/* /content/data/europarl/
            drive_files_exist = True
        else:
            print("Files in Drive are empty or missing, need to upload...")

    if not drive_files_exist:
        print("Please upload your small Europarl files (5000 lines)...")
        from google.colab import files

        print("Select your small English Europarl file (5000 lines):")
        uploaded_en = files.upload()

        print("Select your small French Europarl file (5000 lines):")
        uploaded_fr = files.upload()

        for filename in uploaded_en.keys():
            !mv "{filename}" /content/data/europarl/europarl-v7.fr-en.en

        for filename in uploaded_fr.keys():
            !mv "{filename}" /content/data/europarl/europarl-v7.fr-en.fr

        !mkdir -p /content/drive/MyDrive/nmt_project/data/europarl
        !cp /content/data/europarl/europarl-v7.fr-en.en /content/drive/MyDrive/nmt_project/data/europarl/
        !cp /content/data/europarl/europarl-v7.fr-en.fr /content/drive/MyDrive/nmt_project/data/europarl/

        mark_step_completed('checkpoints/data_uploaded')

    !ls -la /content/data/europarl/
    !head -n 3 /content/data/europarl/europarl-v7.fr-en.fr
    !head -n 3 /content/data/europarl/europarl-v7.fr-en.en
    !wc -l /content/data/europarl/europarl-v7.fr-en.fr /content/data/europarl/europarl-v7.fr-en.en

    return True

upload_or_load_data()

Checking files in Google Drive...
-rw------- 1 root root 767365 Mar 13 14:57 /content/drive/MyDrive/nmt_project/data/europarl/europarl-v7.fr-en.en
-rw------- 1 root root 869492 Mar 13 14:57 /content/drive/MyDrive/nmt_project/data/europarl/europarl-v7.fr-en.fr
Valid files found in Drive, copying to local environment...
total 1612
drwxr-xr-x 2 root root   4096 Mar 13 20:12 .
drwxr-xr-x 4 root root   4096 Mar 13 20:12 ..
-rw------- 1 root root 767365 Mar 13 20:14 europarl-v7.fr-en.en
-rw------- 1 root root 869492 Mar 13 20:14 europarl-v7.fr-en.fr
Reprise de la session
Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances.
Comme vous avez pu le constater, le grand "bogue de l'an 2000" ne s'est pas produit. En revanche, les citoyens d'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles.
Resumption o

True

In [3]:
if not step_completed('checkpoints/tokenization_completed'):
    print("Performing tokenization...")

    from mosestokenizer import MosesTokenizer

    with MosesTokenizer('fr') as tokenize_fr:
        with open('/content/data/europarl/europarl-v7.fr-en.fr', 'r', encoding='utf-8') as f_in, \
             open('/content/data/europarl-v7.fr-en.tok.fr', 'w', encoding='utf-8') as f_out:
            for line in f_in:
                tokens = tokenize_fr(line.strip())
                f_out.write(' '.join(tokens) + '\n')

    with MosesTokenizer('en') as tokenize_en:
        with open('/content/data/europarl/europarl-v7.fr-en.en', 'r', encoding='utf-8') as f_in, \
             open('/content/data/europarl-v7.fr-en.tok.en', 'w', encoding='utf-8') as f_out:
            for line in f_in:
                tokens = tokenize_en(line.strip())
                f_out.write(' '.join(tokens) + '\n')

    !cp /content/data/europarl-v7.fr-en.tok.fr /content/drive/MyDrive/nmt_project/data/
    !cp /content/data/europarl-v7.fr-en.tok.en /content/drive/MyDrive/nmt_project/data/

    mark_step_completed('checkpoints/tokenization_completed')
else:
    print("Loading tokenized data from Drive...")
    !cp /content/drive/MyDrive/nmt_project/data/europarl-v7.fr-en.tok.fr /content/data/
    !cp /content/drive/MyDrive/nmt_project/data/europarl-v7.fr-en.tok.en /content/data/

!head -n 3 /content/data/europarl-v7.fr-en.tok.fr
!head -n 3 /content/data/europarl-v7.fr-en.tok.en
!wc -l /content/data/europarl-v7.fr-en.tok.fr /content/data/europarl-v7.fr-en.tok.en

Loading tokenized data from Drive...
Reprise de la session
Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances .
Comme vous avez pu le constater , le grand &quot; bogue de l&apos; an 2000 &quot; ne s&apos; est pas produit . En revanche , les citoyens d&apos; un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles .
Resumption of the session
I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999 , and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period .
Although , as you will have seen , the dreaded &apos; millennium bug &apos; failed to materialise , still the people in a number of countries suffered a series of natural disasters that truly were dreadful .
   5000 /content/data/europarl-v7.fr-en.tok.fr

In [4]:
if not step_completed('checkpoints/cleaning_completed'):
    print("Cleaning parallel data...")

    def clean_parallel_data(src_file, tgt_file, max_len=100, max_diff=50, max_pairs=5000):
        with open(src_file, 'r', encoding='utf-8') as f_src, open(tgt_file, 'r', encoding='utf-8') as f_tgt:
            src_lines = f_src.readlines()
            tgt_lines = f_tgt.readlines()

        cleaned_pairs = []
        for src, tgt in zip(src_lines, tgt_lines):
            src = src.strip()
            tgt = tgt.strip()
            src_len = len(src.split())
            tgt_len = len(tgt.split())

            if src_len <= max_len and tgt_len <= max_len and abs(src_len - tgt_len) <= max_diff:
                cleaned_pairs.append((src, tgt))

        print(f"Original pairs: {len(src_lines)}, Cleaned pairs: {len(cleaned_pairs)}")
        return cleaned_pairs

    clean_pairs = clean_parallel_data('/content/data/europarl-v7.fr-en.tok.fr', '/content/data/europarl-v7.fr-en.tok.en')

    with open('/content/data/cleaned.fr', 'w', encoding='utf-8') as f_src, \
         open('/content/data/cleaned.en', 'w', encoding='utf-8') as f_tgt:
        for src, tgt in clean_pairs:
            f_src.write(src + '\n')
            f_tgt.write(tgt + '\n')

    !cp /content/data/cleaned.fr /content/drive/MyDrive/nmt_project/data/
    !cp /content/data/cleaned.en /content/drive/MyDrive/nmt_project/data/

    mark_step_completed('checkpoints/cleaning_completed')
else:
    print("Loading cleaned data from Drive...")
    !cp /content/drive/MyDrive/nmt_project/data/cleaned.fr /content/data/
    !cp /content/drive/MyDrive/nmt_project/data/cleaned.en /content/data/

!head -n 3 /content/data/cleaned.fr
!head -n 3 /content/data/cleaned.en
!wc -l /content/data/cleaned.fr /content/data/cleaned.en

Loading cleaned data from Drive...
Reprise de la session
Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances .
Comme vous avez pu le constater , le grand &quot; bogue de l&apos; an 2000 &quot; ne s&apos; est pas produit . En revanche , les citoyens d&apos; un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles .
Resumption of the session
I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999 , and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period .
Although , as you will have seen , the dreaded &apos; millennium bug &apos; failed to materialise , still the people in a number of countries suffered a series of natural disasters that truly were dreadful .
   4959 /content/data/cleaned.fr
   4959 /conten

In [5]:
!pip install -r OpenNMT-py/requirements.opt.txt
!pip install OpenNMT-py scikit-learn sacrebleu
!pip install sacrebleu mosestokenizer sentencepiece

Collecting OpenNMT-py
  Using cached OpenNMT_py-3.5.1-py3-none-any.whl.metadata (8.8 kB)
Collecting sacrebleu
  Using cached sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting torch<2.3,>=2.1 (from OpenNMT-py)
  Using cached torch-2.2.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting configargparse (from OpenNMT-py)
  Using cached ConfigArgParse-1.7-py3-none-any.whl.metadata (23 kB)
Collecting ctranslate2<5,>=4 (from OpenNMT-py)
  Using cached ctranslate2-4.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting waitress (from OpenNMT-py)
  Using cached waitress-3.0.2-py3-none-any.whl.metadata (5.8 kB)
Collecting pyonmttok<2,>=1.37 (from OpenNMT-py)
  Using cached pyonmttok-1.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting pyahocorasick (from OpenNMT-py)
  Using cached pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting fasttext-w

In [6]:
if not step_completed('checkpoints/bpe_completed'):
    print("Training and applying BPE...")

    spm.SentencePieceTrainer.train(
        input='/content/data/cleaned.fr,/content/data/cleaned.en',
        model_prefix='/content/data/bpe',
        vocab_size=8000,
        character_coverage=0.9995,
        model_type='bpe',
        input_sentence_size=5000,
        shuffle_input_sentence=True
    )

    sp = spm.SentencePieceProcessor()
    sp.load('/content/data/bpe.model')

    with open('/content/data/cleaned.fr', 'r', encoding='utf-8') as f_in, \
         open('/content/data/bpe.fr', 'w', encoding='utf-8') as f_out:
        for line in f_in:
            pieces = sp.encode_as_pieces(line.strip())
            f_out.write(' '.join(pieces) + '\n')

    with open('/content/data/cleaned.en', 'r', encoding='utf-8') as f_in, \
         open('/content/data/bpe.en', 'w', encoding='utf-8') as f_out:
        for line in f_in:
            pieces = sp.encode_as_pieces(line.strip())
            f_out.write(' '.join(pieces) + '\n')

    !cp /content/data/bpe.model /content/drive/MyDrive/nmt_project/models/
    !cp /content/data/bpe.vocab /content/drive/MyDrive/nmt_project/models/
    !cp /content/data/bpe.fr /content/drive/MyDrive/nmt_project/data/
    !cp /content/data/bpe.en /content/drive/MyDrive/nmt_project/data/

    mark_step_completed('checkpoints/bpe_completed')
else:
    print("Loading BPE data from Drive...")
    !cp /content/drive/MyDrive/nmt_project/models/bpe.model /content/data/ 2>/dev/null || echo "BPE model not found in Drive"
    !cp /content/drive/MyDrive/nmt_project/models/bpe.vocab /content/data/ 2>/dev/null || echo "BPE vocab not found in Drive"
    !cp /content/drive/MyDrive/nmt_project/data/bpe.fr /content/data/ 2>/dev/null || echo "BPE French file not found in Drive"
    !cp /content/drive/MyDrive/nmt_project/data/bpe.en /content/data/ 2>/dev/null || echo "BPE English file not found in Drive"

!head -n 3 /content/data/bpe.fr 2>/dev/null || echo "BPE French file not available"
!head -n 3 /content/data/bpe.en 2>/dev/null || echo "BPE English file not available"
!wc -l /content/data/bpe.fr /content/data/bpe.en 2>/dev/null || echo "BPE files not available for line count"

Loading BPE data from Drive...
▁Re pr ise ▁de ▁la ▁session
▁Je ▁décl are ▁reprise ▁la ▁session ▁du ▁Parlement ▁européen ▁qui ▁avait ▁été ▁inter r omp ue ▁le ▁v endre di ▁17 ▁décembre ▁dernier ▁et ▁je ▁vous ▁renouvel le ▁tous ▁mes ▁v ux ▁en ▁esp érant ▁que ▁vous ▁avez ▁passé ▁de ▁bonnes ▁vac ances ▁.
▁Comme ▁vous ▁avez ▁pu ▁le ▁constater ▁, ▁le ▁grand ▁& quot ; ▁b ogue ▁de ▁l & apos ; ▁an ▁2000 ▁& quot ; ▁ne ▁s & apos ; ▁est ▁pas ▁produit ▁. ▁En ▁rev anche ▁, ▁les ▁citoyens ▁d & apos ; ▁un ▁certain ▁nombre ▁de ▁nos ▁pays ▁ont ▁été ▁victimes ▁de ▁catastrophes ▁naturelles ▁qui ▁ont ▁vraiment ▁été ▁ter rib les ▁.
▁Res um ption ▁of ▁the ▁session
▁I ▁decl are ▁resum ed ▁the ▁session ▁of ▁the ▁European ▁Parliament ▁ad j our ned ▁on ▁Fr id ay ▁17 ▁December ▁1999 ▁, ▁and ▁I ▁would ▁like ▁once ▁again ▁to ▁wish ▁you ▁a ▁happy ▁new ▁year ▁in ▁the ▁hope ▁that ▁you ▁enj oyed ▁a ▁ple as ant ▁f es tive ▁period ▁.
▁Although ▁, ▁as ▁you ▁will ▁have ▁seen ▁, ▁the ▁dre ad ed ▁& apos ; ▁millennium ▁b ug ▁&

In [7]:
if not step_completed('checkpoints/split_completed'):
    print("Splitting data into train, validation, and test sets...")

    def split_data(src_file, tgt_file, train_ratio=0.7, val_ratio=0.1, test_ratio=0.2, seed=42):
        with open(src_file, 'r', encoding='utf-8') as f_src, open(tgt_file, 'r', encoding='utf-8') as f_tgt:
            src_lines = f_src.readlines()
            tgt_lines = f_tgt.readlines()

        combined = list(zip(src_lines, tgt_lines))
        random.seed(seed)
        random.shuffle(combined)
        src_lines, tgt_lines = zip(*combined)

        total_size = len(src_lines)
        train_size = int(total_size * train_ratio)
        val_size = int(total_size * val_ratio)

        os.makedirs('/content/data/processed', exist_ok=True)

        with open('/content/data/processed/src-train.txt', 'w', encoding='utf-8') as f:
            f.writelines(src_lines[:train_size])
        with open('/content/data/processed/tgt-train.txt', 'w', encoding='utf-8') as f:
            f.writelines(tgt_lines[:train_size])

        with open('/content/data/processed/src-val.txt', 'w', encoding='utf-8') as f:
            f.writelines(src_lines[train_size:train_size+val_size])
        with open('/content/data/processed/tgt-val.txt', 'w', encoding='utf-8') as f:
            f.writelines(tgt_lines[train_size:train_size+val_size])

        with open('/content/data/processed/src-test.txt', 'w', encoding='utf-8') as f:
            f.writelines(src_lines[train_size+val_size:])
        with open('/content/data/processed/tgt-test.txt', 'w', encoding='utf-8') as f:
            f.writelines(tgt_lines[train_size+val_size:])

        print(f"Data split complete:")
        print(f"  Train: {train_size} examples ({train_ratio*100:.0f}%)")
        print(f"  Validation: {val_size} examples ({val_ratio*100:.0f}%)")
        print(f"  Test: {len(src_lines) - train_size - val_size} examples ({test_ratio*100:.0f}%)")

    split_data('/content/data/bpe.fr', '/content/data/bpe.en')

    !cp -r /content/data/processed /content/drive/MyDrive/nmt_project/

    mark_step_completed('checkpoints/split_completed')
else:
    print("Loading split data from Drive...")
    !mkdir -p /content/data/processed
    !cp -r /content/drive/MyDrive/nmt_project/processed/* /content/data/processed/ 2>/dev/null || echo "Split data not found in Drive"

!wc -l /content/data/processed/src-train.txt /content/data/processed/tgt-train.txt 2>/dev/null || echo "Train files not available for line count"
!wc -l /content/data/processed/src-val.txt /content/data/processed/tgt-val.txt 2>/dev/null || echo "Validation files not available for line count"
!wc -l /content/data/processed/src-test.txt /content/data/processed/tgt-test.txt 2>/dev/null || echo "Test files not available for line count"

Loading split data from Drive...
   3471 /content/data/processed/src-train.txt
   3471 /content/data/processed/tgt-train.txt
   6942 total
   495 /content/data/processed/src-val.txt
   495 /content/data/processed/tgt-val.txt
   990 total
   993 /content/data/processed/src-test.txt
   993 /content/data/processed/tgt-test.txt
  1986 total


In [8]:
if not step_completed('checkpoints/vocab_created'):
    print("Creating vocabulary files...")

    src_vocab = Counter()
    with open('/content/data/processed/src-train.txt', 'r', encoding='utf-8') as f:
        for line in f:
            for word in line.strip().split():
                src_vocab[word] += 1

    tgt_vocab = Counter()
    with open('/content/data/processed/tgt-train.txt', 'r', encoding='utf-8') as f:
        for line in f:
            for word in line.strip().split():
                tgt_vocab[word] += 1

    with open('/content/data/onmt_vocab.vocab.src', 'w', encoding='utf-8') as f:
        f.write("<blank> 1\n<unk> 1\n<s> 1\n</s> 1\n")
        for word, count in src_vocab.most_common(8000):
            f.write(f"{word} {count}\n")

    with open('/content/data/onmt_vocab.vocab.tgt', 'w', encoding='utf-8') as f:
        f.write("<blank> 1\n<unk> 1\n<s> 1\n</s> 1\n")
        for word, count in tgt_vocab.most_common(8000):
            f.write(f"{word} {count}\n")

    !cp /content/data/onmt_vocab.vocab.src /content/drive/MyDrive/nmt_project/vocab/
    !cp /content/data/onmt_vocab.vocab.tgt /content/drive/MyDrive/nmt_project/vocab/

    mark_step_completed('checkpoints/vocab_created')
else:
    print("Loading vocabulary files from Drive...")
    !cp /content/drive/MyDrive/nmt_project/vocab/onmt_vocab.vocab.src /content/data/ 2>/dev/null || echo "Source vocabulary not found in Drive"
    !cp /content/drive/MyDrive/nmt_project/vocab/onmt_vocab.vocab.tgt /content/data/ 2>/dev/null || echo "Target vocabulary not found in Drive"

print("Vocabulary files created successfully!")

Loading vocabulary files from Drive...
Vocabulary files created successfully!


In [12]:
# Sanity check: list the train/val/test split files in the processed directory.
!ls /content/data/processed/

src-test.txt  src-train.txt  src-val.txt  tgt-test.txt	tgt-train.txt  tgt-val.txt


In [15]:
# OpenNMT-py configuration for the French -> English model: src-* files are the
# source side, tgt-* files the target side.
train_config_yaml = """
data:
    corpus_1:
        path_src: /content/data/processed/src-train.txt
        path_tgt: /content/data/processed/tgt-train.txt
        transforms: [filtertoolong]
        weight: 1
    valid:
        path_src: /content/data/processed/src-val.txt
        path_tgt: /content/data/processed/tgt-val.txt
        transforms: [filtertoolong]

# vocabulary files
src_vocab: /content/data/onmt_vocab.vocab.src
tgt_vocab: /content/data/onmt_vocab.vocab.tgt

# general opts
save_model: /content/models/fr_en_model
save_checkpoint_steps: 500
keep_checkpoint: 3
seed: 42
train_steps: 500
valid_steps: 500
report_every: 250

# filter options
src_seq_length: 100
tgt_seq_length: 100

# model parameters
model_dtype: "fp32"
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 4
dec_layers: 4
heads: 8
hidden_size: 256
word_vec_size: 256
transformer_ff: 1024
dropout: 0.1

# optimization
batch_size: 2048
batch_type: tokens
normalization: tokens
accum_count: 2
optim: adam
adam_beta2: 0.998
decay_method: noam
warmup_steps: 1000
learning_rate: 2
max_grad_norm: 0
param_init: 0
param_init_glorot: true
label_smoothing: 0.1

# GPU settings
world_size: 1
gpu_ranks: [0]
"""

with open('/content/train_config.yml', 'w') as config_file:
    config_file.write(train_config_yaml)


In [45]:
print("Training the French to English model...")

# Make sure the local models directory exists before training.
!mkdir -p /content/models

# Run train.py from the cloned OpenNMT-py checkout with the config at
# /content/train_config.yml.
!cd /content/OpenNMT-py && python train.py -config /content/train_config.yml

# Configuration for the reverse direction: tgt-* files are used as source and
# src-* files as target, with the two vocabularies swapped accordingly.
# NOTE: a later cell writes /content/train_reverse_config.yml again with
# train_steps: 5000, replacing this 500-step version.
reverse_config_yaml = """
data:
    corpus_1:
        path_src: /content/data/processed/tgt-train.txt
        path_tgt: /content/data/processed/src-train.txt
        transforms: [filtertoolong]
        weight: 1
    valid:
        path_src: /content/data/processed/tgt-val.txt
        path_tgt: /content/data/processed/src-val.txt
        transforms: [filtertoolong]

src_vocab: /content/data/onmt_vocab.vocab.tgt
tgt_vocab: /content/data/onmt_vocab.vocab.src

save_model: /content/models/en_fr_model
save_checkpoint_steps: 500
keep_checkpoint: 3

seed: 42
train_steps: 500  # Reduced to 500 steps
valid_steps: 500
report_every: 250

src_seq_length: 100
tgt_seq_length: 100

model_dtype: "fp32"
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 4
dec_layers: 4
heads: 8
hidden_size: 256
word_vec_size: 256
transformer_ff: 1024
dropout: 0.1

batch_size: 2048
batch_type: tokens
normalization: tokens
accum_count: 2
optim: adam
adam_beta2: 0.998
decay_method: noam
warmup_steps: 1000
learning_rate: 2
max_grad_norm: 0
param_init: 0
param_init_glorot: true
label_smoothing: 0.1

world_size: 1
gpu_ranks: [0]
"""

with open('/content/train_reverse_config.yml', 'w') as config_file:
    config_file.write(reverse_config_yaml)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
			* corpus_1: 720
[2025-03-13 19:18:20,559 INFO] Weighted corpora loaded so far:
			* corpus_1: 740
[2025-03-13 19:18:20,578 INFO] Weighted corpora loaded so far:
			* corpus_1: 721
[2025-03-13 19:18:20,636 INFO] Weighted corpora loaded so far:
			* corpus_1: 741
[2025-03-13 19:18:20,660 INFO] Weighted corpora loaded so far:
			* corpus_1: 722
[2025-03-13 19:18:20,718 INFO] Weighted corpora loaded so far:
			* corpus_1: 742
[2025-03-13 19:18:20,736 INFO] Weighted corpora loaded so far:
			* corpus_1: 723
[2025-03-13 19:18:20,794 INFO] Weighted corpora loaded so far:
			* corpus_1: 743
[2025-03-13 19:18:20,812 INFO] Weighted corpora loaded so far:
			* corpus_1: 724
[2025-03-13 19:18:20,871 INFO] Weighted corpora loaded so far:
			* corpus_1: 744
[2025-03-13 19:18:20,887 INFO] Weighted corpora loaded so far:
			* corpus_1: 725
[2025-03-13 19:18:20,948 INFO] Weighted corpora loaded so far:
			* corpus_1: 745
[2025-03-13 19

In [46]:
# Reverse-direction configuration (tgt-* as source, src-* as target) with
# train_steps: 5000; this is the file the reverse training run reads.
reverse_config_yaml = """# Data and vocabulary (reversed)
data:
    corpus_1:
        path_src: /content/data/processed/tgt-train.txt
        path_tgt: /content/data/processed/src-train.txt
        transforms: [filtertoolong]
        weight: 1
    valid:
        path_src: /content/data/processed/tgt-val.txt
        path_tgt: /content/data/processed/src-val.txt
        transforms: [filtertoolong]

# vocabulary files (reversed)
src_vocab: /content/data/onmt_vocab.vocab.tgt
tgt_vocab: /content/data/onmt_vocab.vocab.src

save_model: /content/models/en_fr_model
save_checkpoint_steps: 500
keep_checkpoint: 3
seed: 42
train_steps: 5000
valid_steps: 500
report_every: 250

src_seq_length: 100
tgt_seq_length: 100

model_dtype: "fp32"
encoder_type: transformer
decoder_type: transformer
position_encoding: true
enc_layers: 4
dec_layers: 4
heads: 8
hidden_size: 256
word_vec_size: 256
transformer_ff: 1024
dropout: 0.1

batch_size: 2048
batch_type: tokens
normalization: tokens
accum_count: 2
optim: adam
adam_beta2: 0.998
decay_method: noam
warmup_steps: 1000
learning_rate: 2
max_grad_norm: 0
param_init: 0
param_init_glorot: true
label_smoothing: 0.1

world_size: 1
gpu_ranks: [0]
"""

with open('/content/train_reverse_config.yml', 'w') as config_file:
    config_file.write(reverse_config_yaml)


In [None]:
# Train the reverse-direction model using /content/train_reverse_config.yml
# (save_model there is /content/models/en_fr_model).
!cd /content/OpenNMT-py && python train.py -config /content/train_reverse_config.yml

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2025-03-13 19:35:31,659 INFO] Weighted corpora loaded so far:
			* corpus_1: 1179
[2025-03-13 19:35:31,732 INFO] Weighted corpora loaded so far:
			* corpus_1: 1180
[2025-03-13 19:35:31,800 INFO] Weighted corpora loaded so far:
			* corpus_1: 1181
[2025-03-13 19:35:31,869 INFO] Weighted corpora loaded so far:
			* corpus_1: 1182
[2025-03-13 19:35:31,936 INFO] Weighted corpora loaded so far:
			* corpus_1: 1183
[2025-03-13 19:35:32,006 INFO] Weighted corpora loaded so far:
			* corpus_1: 1184
[2025-03-13 19:35:32,074 INFO] Weighted corpora loaded so far:
			* corpus_1: 1185
[2025-03-13 19:35:32,141 INFO] Weighted corpora loaded so far:
			* corpus_1: 1186
[2025-03-13 19:35:32,209 INFO] Weighted corpora loaded so far:
			* corpus_1: 1187
[2025-03-13 19:35:32,263 INFO] Weighted corpora loaded so far:
			* corpus_1: 1192
[2025-03-13 19:35:32,282 INFO] Weighted corpora loaded so far:
			* corpus_1: 1188
[2025-03-13 19:35:32,3

In [None]:
!cd /content/OpenNMT-py && python translate.py -model /content/models/fr_en_model_step_500.pt -src /content/data/processed/tgt-test.txt -output /content/data/translated_fr_en.txt -gpu 0
