In [None]:
!pip install datasets
!pip install joeynmt==2.3.0
!pip install sacrebleu
!pip install evaluate



In [None]:
import re
import torch
from datasets import load_dataset, DatasetDict, Translation

In [None]:
# Load the Dyula-French parallel corpus by its Hugging Face repository id.
repo_name = "data354/Koumankan_mt_dyu_fr"
dataset = load_dataset(repo_name)
# Display the splits, their features and row counts.
dataset

Downloading data:   0%|          | 0.00/530k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/102k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8065 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1471 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1393 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'translation'],
        num_rows: 8065
    })
    validation: Dataset({
        features: ['ID', 'translation'],
        num_rows: 1471
    })
    test: Dataset({
        features: ['ID', 'translation'],
        num_rows: 1393
    })
})

In [None]:
src_lang = 'dyu'
trg_lang = 'fr'
# Character class of punctuation (plus newline) stripped from both languages.
# Written as a raw string so that the regex escapes reach `re` untouched: in a
# normal string literal `\(`, `\)`, `\[` and `\]` are invalid escape sequences
# and trigger a warning on current Python versions. The hyphen is escaped
# explicitly instead of relying on the accidental `,-.` range, and the
# duplicated `.` is dropped; the set of matched characters is unchanged.
# Apostrophes (' and ’) are deliberately NOT part of the class.
chars_to_remove_regex = r'[!"&(),\-./:;=?+\n\[\]]'


def remove_special_characters(text):
    """Normalise one sentence for training.

    Removes every character matched by ``chars_to_remove_regex``, lowercases
    the remainder and strips leading/trailing whitespace.

    :param text: raw sentence (str)
    :return: cleaned sentence (str); may be empty
    """
    text = re.sub(chars_to_remove_regex, '', text)
    return text.lower().strip()

def clean_text(batch):
    """Clean the source and target side of one example in place.

    :param batch: example with a ``'translation'`` dict keyed by language code
    :return: the same ``batch`` object, with both sentences normalised by
        ``remove_special_characters``
    """
    pair = batch['translation']
    # Source side first, then target side.
    for lang in (src_lang, trg_lang):
        pair[lang] = remove_special_characters(pair[lang])
    return batch


# Run clean_text over every example of every split, then display the result.
dataset = dataset.map(clean_text)
dataset

Map:   0%|          | 0/8065 [00:00<?, ? examples/s]

Map:   0%|          | 0/1471 [00:00<?, ? examples/s]

Map:   0%|          | 0/1393 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'translation'],
        num_rows: 8065
    })
    validation: Dataset({
        features: ['ID', 'translation'],
        num_rows: 1471
    })
    test: Dataset({
        features: ['ID', 'translation'],
        num_rows: 1393
    })
})

In [None]:
dataset["validation"]["translation"][:3]

[{'dyu': 'i tɔgɔ bi cogodɔ', 'fr': 'tu portes un nom de fantaisie'},
 {'dyu': 'puɛn saba fɔlɔ', 'fr': 'trois points d’avance'},
 {'dyu': 'tile bena', 'fr': 'le soleil s’est couché'}]

In [None]:
# Persist the cleaned splits to disk. The JoeyNMT config written later in this
# notebook points its train/dev/test entries at this same directory.
data_dir = "../data/dyu_fr"
dataset.save_to_disk(data_dir)


Saving the dataset (0/1 shards):   0%|          | 0/8065 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1471 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1393 [00:00<?, ? examples/s]

### Vocabulary

In [None]:
from pathlib import Path

# Directory referenced by the config's `model_dir` entry and read back by the
# later cells that patch and copy the trained model's files.
model_dir = "../saved_model/dyu_fr"

# Data/tokenizer part of the JoeyNMT config. The "{data_dir}" and "{model_dir}"
# placeholders are filled in by str.format at the end of the literal.
# Source and target deliberately share one vocab.txt and one sp.model
# (build_vocab.py is invoked with --joint further down).
# The testing/training/model sections are appended in the "Model Training" cell.
config = """
name: "dyu_fr_transformer-sp"
joeynmt_version: "2.3.0"
model_dir: "{model_dir}"
use_cuda: True # False for CPU training
fp16: True

data:
    train: "{data_dir}"
    dev: "{data_dir}"
    test: "{data_dir}"
    dataset_type: "huggingface"
    dataset_cfg:
        name: "dyu-fr"
    sample_dev_subset: 1460
    src:
        lang: "dyu"
        max_length: 100
        lowercase: False
        normalize: False
        level: "bpe"
        voc_limit: 2000
        voc_min_freq: 1
        voc_file: "{data_dir}/vocab.txt"
        tokenizer_type: "sentencepiece"
        tokenizer_cfg:
            model_file: "{data_dir}/sp.model"
    trg:
        lang: "fr"
        max_length: 100
        lowercase: False
        normalize: False
        level: "bpe"
        voc_limit: 2000
        voc_min_freq: 1
        voc_file: "{data_dir}/vocab.txt"
        tokenizer_type: "sentencepiece"
        tokenizer_cfg:
            model_file: "{data_dir}/sp.model"
    special_symbols:
        unk_token: "<unk>"
        unk_id: 0
        pad_token: "<pad>"
        pad_id: 1
        bos_token: "<s>"
        bos_id: 2
        eos_token: "</s>"
        eos_id: 3

""".format(data_dir=data_dir, model_dir=model_dir)
# Written to disk now because the vocabulary-building step reads
# {data_dir}/config.yaml.
with (Path(data_dir) / "config.yaml").open('w') as f:
    f.write(config)

In [None]:
!wget https://raw.githubusercontent.com/joeynmt/joeynmt/v2.3/scripts/build_vocab.py
! sudo chmod 777 build_vocab.py

--2024-08-13 18:12:20--  https://raw.githubusercontent.com/joeynmt/joeynmt/v2.3/scripts/build_vocab.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13170 (13K) [text/plain]
Saving to: ‘build_vocab.py’


2024-08-13 18:12:20 (18.9 MB/s) - ‘build_vocab.py’ saved [13170/13170]



In [None]:
!python build_vocab.py {data_dir}/config.yaml --joint

2024-08-13 18:12:24.287096: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-13 18:12:24.601092: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-13 18:12:24.694871: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-13 18:12:24.968244: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Dropping NaN...: 100% 8065/8065 [00:00<00:00, 113733.

In [None]:
!head -10 {data_dir}/vocab.txt

<unk>
<pad>
<s>
</s>
s
▁a
▁ka
a
'
▁


### Model Training

In [None]:
# Testing / training / model sections of the JoeyNMT config. "{model_dir}" is
# filled in by str.format at the end of the literal (it only appears in the
# commented-out load_model lines).
training_sections = """
testing:
    #load_model: "{model_dir}/best.ckpt"
    n_best: 1
    beam_size: 10
    beam_alpha: 1.0
    batch_size: 1024
    batch_type: "token"
    max_output_length: 100
    eval_metrics: ["bleu"]
    #return_prob: "hyp"
    #return_attention: False
    sacrebleu_cfg:
        tokenize: "13a"

training:
    #load_model: "{model_dir}/latest.ckpt"
    #reset_best_ckpt: False
    #reset_scheduler: False
    #reset_optimizer: False
    #reset_iter_state: False
    random_seed: 42
    optimizer: "adamw"
    normalization: "tokens"
    adam_betas: [0.9, 0.98]
    scheduling: "plateau"
    learning_rate_warmup: 100
    learning_rate: 0.0003
    learning_rate_min: 0.00000001
    weight_decay: 0.0
    label_smoothing: 0.1
    loss: "crossentropy"
    batch_size: 128
    batch_type: "token"
    batch_multiplier: 4
    early_stopping_metric: "bleu"
    epochs: 316
    validation_freq: 2000
    logging_freq: 2000
    overwrite: True
    shuffle: True
    print_valid_sents: [0, 1, 2, 3]
    keep_best_ckpts: 3

model:
    initializer: "xavier_uniform"
    bias_initializer: "zeros"
    init_gain: 1.0
    embed_initializer: "xavier_uniform"
    embed_init_gain: 1.0
    tied_embeddings: True
    tied_softmax: True
    encoder:
        type: "transformer"
        num_layers: 2
        num_heads: 8
        embeddings:
            embedding_dim: 384
            scale: True
            dropout: 0.1
        # typically ff_size = 4 x hidden_size
        hidden_size: 384
        ff_size: 1536
        dropout: 0.1
        layer_norm: "pre"
        activation: "gelu"
    decoder:
        type: "transformer"
        num_layers: 2
        num_heads: 8
        embeddings:
            embedding_dim: 384
            scale: True
            dropout: 0.1
        # typically ff_size = 4 x hidden_size
        hidden_size: 384
        ff_size: 1536
        dropout: 0.1
        layer_norm: "pre"
        activation: "gelu"

""".format(model_dir=model_dir)

# A plain `config += ...` appends the sections again every time the cell is
# re-run, producing duplicate top-level `testing:`/`training:`/`model:` keys.
# Cutting `config` back to everything before the first "testing:" line makes
# the cell idempotent; on the first run nothing is cut, so the resulting text
# is the same as a single append.
config = config.partition("\ntesting:\n")[0] + training_sections
with (Path(data_dir) / "config.yaml").open('w') as f:
    f.write(config)


### Run Training

In [None]:
%%time
!python -m joeynmt train {data_dir}/config.yaml --skip-test

2024-08-13 18:12:34.962745: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-13 18:12:34.995076: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-13 18:12:35.004758: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-13 18:12:35.026852: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-13 18:12:38,572 - INFO - root - Hello! This i

Invalid scheduler. Valid options: 'plateau', 'decaying', 'exponential', 'noam', 'warmupexponentialdecay', 'warmupinversesquareroot'.


Plateau - 1.99
warmupinversesquareroot - 1.00

In [None]:
# Point the config stored in model_dir at the best checkpoint and at the
# tokenizer/vocabulary files located in model_dir.
model_config_path = Path(model_dir) / "config.yaml"
with model_config_path.open('r') as f:
    config = f.read()

# (old text, new text) pairs, applied in this order.
substitutions = [
    # enable loading of the best checkpoint
    (f'#load_model: "{model_dir}/best.ckpt"',
     f'load_model: "{model_dir}/best.ckpt"'),
    # SentencePiece model: data_dir -> model_dir
    (f'model_file: "{data_dir}/sp.model"',
     f'model_file: "{model_dir}/sp.model"'),
    # vocabulary file: data_dir -> model_dir
    (f'voc_file: "{data_dir}/vocab.txt"',
     f'voc_file: "{model_dir}/vocab.txt"'),
]

resume_config = config
for old_text, new_text in substitutions:
    resume_config = resume_config.replace(old_text, new_text)

with model_config_path.open('w') as f:
    f.write(resume_config)

In [None]:
# vocab.txt was built under data_dir; the config patched in the previous cell
# expects it at {model_dir}/vocab.txt, so copy it there.
!cp {data_dir}/vocab.txt  {model_dir}
# Back up the whole model directory to Google Drive.
# NOTE(review): the recorded output shows this command failing with
# "No such file or directory" — presumably Drive was not mounted; confirm and
# mount it before running this line.
!cp -R {model_dir} /content/drive/MyDrive/mt-dyu-fr

cp: cannot create directory '/content/drive/MyDrive/mt-dyu-fr': No such file or directory


In [None]:
import os
import shutil
from pathlib import Path

# Directory that receives only the files needed for inference/upload.
lean_dir = Path('/content/lean_model')
os.makedirs(lean_dir, exist_ok=True)

# (source, destination) pairs. Sources are derived from `model_dir` rather
# than from a hard-coded "/content/../saved_model/dyu_fr" literal, which only
# pointed at the same place when the working directory happened to be /content.
files_to_copy = [
    (str(Path(model_dir) / name), str(lean_dir / name))
    for name in ("best.ckpt", "config.yaml", "sp.model", "vocab.txt")
]

for src, dst in files_to_copy:
    shutil.copy(src, dst)


### Upload Trained Model to HuggingFace

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Remember to run `huggingface-cli login` before you run the code below
import os
from pathlib import Path

import joeynmt
import torch
from huggingface_hub import HfApi

api = HfApi()

In [None]:
# load the model
import torch
from joeynmt.config import load_config, parse_global_args
from joeynmt.prediction import predict, prepare


class JoeyNMTModel:
    """
    Thin wrapper that loads a JoeyNMT model for sentence-level inference.

    :param config_path: Path to YAML config file
    :param n_best: return this many hypotheses, <= beam (currently only 1)
    """
    def __init__(self, config_path: str, n_best: int = 1):
        # Fixed seed so repeated runs start from the same torch RNG state.
        seed = 42
        torch.manual_seed(seed)
        cfg = load_config(config_path)
        args = parse_global_args(cfg, rank=0, mode="translate")
        # Override the config's n_best with the value requested by the caller.
        self.args = args._replace(test=args.test._replace(n_best=n_best))
        # build model
        self.model, _, _, self.test_data = prepare(self.args, rank=0, mode="translate")

    def _translate_data(self):
        """
        Run prediction on whatever is currently held in ``self.test_data``.

        :return: tuple (hypotheses, trg_tokens, trg_scores) as returned by
            ``joeynmt.prediction.predict``
        """
        _, _, hypotheses, trg_tokens, trg_scores, _ = predict(
            model=self.model,
            data=self.test_data,
            compute_loss=False,
            device=self.args.device,
            rank=0,
            n_gpu=self.args.n_gpu,
            normalization="none",
            num_workers=self.args.num_workers,
            args=self.args.test,
            autocast=self.args.autocast,
        )
        return hypotheses, trg_tokens, trg_scores

    def translate(self, sentence) -> list:
        """
        Translate the given sentence.

        :param sentence: Sentence to be translated
        :return:
        - translations: (list of str) possible translations of the sentence.
        :raises AssertionError: if the number of hypotheses does not equal
            ``len(self.test_data) * n_best``
        """
        self.test_data.set_item(sentence.strip())
        try:
            translations, _, _ = self._translate_data()
            assert len(translations) == len(self.test_data) * self.args.test.n_best
        finally:
            # Always clear the queued sentence, even when decoding or the
            # sanity check fails; otherwise the stale item would still be in
            # the cache when the next sentence is added.
            self.test_data.reset_cache()
        return translations
config_path = "/content/lean_model/config.yaml" # Change this to the path to your model config file
model = JoeyNMTModel(config_path=config_path, n_best=1)

2024-08-13 21:36:32,694 - INFO - joeynmt.data - Building tokenizer...
2024-08-13 21:36:32,703 - INFO - joeynmt.tokenizers - dyu tokenizer: SentencePieceTokenizer(level=bpe, lowercase=False, normalize=False, filter_by_length=(-1, 100), pretokenizer=none, tokenizer=SentencePieceProcessor, nbest_size=5, alpha=0.0)
2024-08-13 21:36:32,704 - INFO - joeynmt.tokenizers - fr tokenizer: SentencePieceTokenizer(level=bpe, lowercase=False, normalize=False, filter_by_length=(-1, 100), pretokenizer=none, tokenizer=SentencePieceProcessor, nbest_size=5, alpha=0.0)
2024-08-13 21:36:32,705 - INFO - joeynmt.data - Building vocabulary...
2024-08-13 21:36:32,762 - INFO - joeynmt.data - Data loaded.
2024-08-13 21:36:32,763 - INFO - joeynmt.data - Train dataset: None
2024-08-13 21:36:32,765 - INFO - joeynmt.data - Valid dataset: None
2024-08-13 21:36:32,767 - INFO - joeynmt.data -  Test dataset: StreamDataset(split=test, len=0, src_lang="dyu", trg_lang="fr", has_trg=False, random_subset=-1, has_src_prompt=Fa

In [None]:
import pandas as pd
from tqdm import tqdm

# One row per validation example, one column per language code.
validation_data = dataset["validation"]
eval_df = pd.DataFrame(validation_data["translation"])

# Placeholder column, filled in row by row below.
eval_df['predicted'] = ""

# Translate each Dyula sentence and keep the first hypothesis.
for i, dyu_sentence in tqdm(eval_df['dyu'].items(), total=len(eval_df)):
    predicted = model.translate(sentence=dyu_sentence)
    eval_df.at[i, 'predicted'] = predicted[0]


  0%|          | 0/1471 [00:00<?, ?it/s]2024-08-13 21:36:33,386 - INFO - joeynmt.prediction - Predicting 1 example(s)... (Beam search with beam_size=10, beam_alpha=0.7, n_best=1, min_output_length=1, max_output_length=100, return_prob='none', generate_unk=True, repetition_penalty=-1, no_repeat_ngram_size=-1)
2024-08-13 21:36:33,437 - INFO - joeynmt.prediction - Generation took 0.0487[sec].
2024-08-13 21:36:33,438 - INFO - joeynmt.prediction - Predicting 1 example(s)... (Beam search with beam_size=10, beam_alpha=0.7, n_best=1, min_output_length=1, max_output_length=100, return_prob='none', generate_unk=True, repetition_penalty=-1, no_repeat_ngram_size=-1)
2024-08-13 21:36:33,599 - INFO - joeynmt.prediction - Generation took 0.1577[sec].
  0%|          | 2/1471 [00:00<02:38,  9.26it/s]2024-08-13 21:36:33,604 - INFO - joeynmt.prediction - Predicting 1 example(s)... (Beam search with beam_size=10, beam_alpha=0.7, n_best=1, min_output_length=1, max_output_length=100, return_prob='none', ge

In [None]:
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" SACREBLEU metric. """

import datasets
import sacrebleu as scb
import evaluate

_CITATION = """\
@inproceedings{post-2018-call,
    title = "A Call for Clarity in Reporting {BLEU} Scores",
    author = "Post, Matt",
    booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
    month = oct,
    year = "2018",
    address = "Belgium, Brussels",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/W18-6319",
    pages = "186--191",
}
"""

_DESCRIPTION = """\
SacreBLEU provides hassle-free computation of shareable, comparable, and reproducible BLEU scores.
Inspired by Rico Sennrich's `multi-bleu-detok.perl`, it produces the official WMT scores but works with plain text.
It also knows all the standard test sets and handles downloading, processing, and tokenization for you.

See the [README.md] file at https://github.com/mjpost/sacreBLEU for more information.
"""

_KWARGS_DESCRIPTION = """
Produces BLEU scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions (`list` of `str`): list of translations to score. Each translation should be tokenized into a list of tokens.
    references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length).
    smooth_method (`str`): The smoothing method to use, defaults to `'exp'`. Possible values are:
        - `'none'`: no smoothing
        - `'floor'`: increment zero counts
        - `'add-k'`: increment num/denom by k for n>1
        - `'exp'`: exponential decay
    smooth_value (`float`): The smoothing value. Only valid when `smooth_method='floor'` (in which case `smooth_value` defaults to `0.1`) or `smooth_method='add-k'` (in which case `smooth_value` defaults to `1`).
    tokenize (`str`): Tokenization method to use for BLEU. If not provided, defaults to `'zh'` for Chinese, `'ja-mecab'` for Japanese and `'13a'` (mteval) otherwise. Possible values are:
        - `'none'`: No tokenization.
        - `'zh'`: Chinese tokenization.
        - `'13a'`: mimics the `mteval-v13a` script from Moses.
        - `'intl'`: International tokenization, mimics the `mteval-v14` script from Moses
        - `'char'`: Language-agnostic character-level tokenization.
        - `'ja-mecab'`: Japanese tokenization. Uses the [MeCab tokenizer](https://pypi.org/project/mecab-python3).
    lowercase (`bool`): If `True`, lowercases the input, enabling case-insensitivity. Defaults to `False`.
    force (`bool`): If `True`, insists that your tokenized input is actually detokenized. Defaults to `False`.
    use_effective_order (`bool`): If `True`, stops including n-gram orders for which precision is 0. This should be `True`, if sentence-level BLEU will be computed. Defaults to `False`.

Returns:
    'score': BLEU score,
    'counts': Counts,
    'totals': Totals,
    'precisions': Precisions,
    'bp': Brevity penalty,
    'sys_len': predictions length,
    'ref_len': reference length,

Examples:

    Example 1:
        >>> predictions = ["hello there general kenobi", "foo bar foobar"]
        >>> references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]]
        >>> sacrebleu = evaluate.load("sacrebleu")
        >>> results = sacrebleu.compute(predictions=predictions, references=references)
        >>> print(list(results.keys()))
        ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len']
        >>> print(round(results["score"], 1))
        100.0

    Example 2:
        >>> predictions = ["hello there general kenobi", "on our way to ankh morpork"]
        >>> references = [["hello there general kenobi", "hello there !"], ["goodbye ankh morpork", "ankh morpork"]]
        >>> sacrebleu = evaluate.load("sacrebleu")
        >>> results = sacrebleu.compute(predictions=predictions, references=references)
        >>> print(list(results.keys()))
        ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len']
        >>> print(round(results["score"], 1))
        39.8
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Sacrebleu(evaluate.Metric):
    # Metric wrapper around `sacrebleu.corpus_bleu`. The class docstring is
    # supplied by the decorator above, so none is written here.

    def _info(self):
        """Describe the metric: texts, accepted input schemas and links."""
        # Schema 1: several references per prediction.
        multi_reference_features = datasets.Features(
            {
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
            }
        )
        # Schema 2: exactly one reference string per prediction.
        single_reference_features = datasets.Features(
            {
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence"),
            }
        )
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            homepage="https://github.com/mjpost/sacreBLEU",
            inputs_description=_KWARGS_DESCRIPTION,
            features=[multi_reference_features, single_reference_features],
            codebase_urls=["https://github.com/mjpost/sacreBLEU"],
            reference_urls=[
                "https://github.com/mjpost/sacreBLEU",
                "https://en.wikipedia.org/wiki/BLEU",
                "https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213",
            ],
        )

    def _compute(
        self,
        predictions,
        references,
        smooth_method="exp",
        smooth_value=None,
        force=False,
        lowercase=False,
        tokenize=None,
        use_effective_order=False,
    ):
        """Compute corpus BLEU and return its statistics as a plain dict.

        :param predictions: list of hypothesis strings
        :param references: either a list of reference strings (one per
            prediction) or a list of lists of reference strings
        :raises ValueError: if predictions do not all have the same number
            of references
        :return: dict with keys score, counts, totals, precisions, bp,
            sys_len, ref_len
        """
        # Promote the single-reference form to the list-of-lists form.
        if isinstance(references[0], str):
            references = [[single_ref] for single_ref in references]

        n_refs = len(references[0])
        for ref_group in references:
            if len(ref_group) != n_refs:
                raise ValueError("Sacrebleu requires the same number of references for each prediction")

        # Transpose: the k-th stream holds the k-th reference of every prediction.
        reference_streams = [[ref_group[k] for ref_group in references] for k in range(n_refs)]

        bleu_kwargs = dict(
            smooth_method=smooth_method,
            smooth_value=smooth_value,
            force=force,
            lowercase=lowercase,
            use_effective_order=use_effective_order,
        )
        # Only forward `tokenize` when set, so sacrebleu's own default applies otherwise.
        if tokenize:
            bleu_kwargs["tokenize"] = tokenize

        output = scb.corpus_bleu(predictions, reference_streams, **bleu_kwargs)

        reported_fields = ("score", "counts", "totals", "precisions", "bp", "sys_len", "ref_len")
        return {field: getattr(output, field) for field in reported_fields}


import evaluate

def bleu_custom(df):
    """Score the ``predicted`` column of ``df`` against its ``fr`` column.

    Loads the ``sacrebleu`` metric through ``evaluate``, wraps every reference
    in a one-element list, prints the value stored under ``'score'`` and
    returns the complete result dict.

    :param df: DataFrame with string columns ``predicted`` and ``fr``
    :return: dict returned by the metric's ``compute``
    """
    metric = evaluate.load("sacrebleu")
    hypotheses = df['predicted'].tolist()
    # One reference per hypothesis, in the list-of-lists form.
    wrapped_references = [[target] for target in df['fr'].tolist()]
    results = metric.compute(predictions=hypotheses, references=wrapped_references)
    print(f"Overall mean BLEU score: {results['score']}")
    return results

# Assuming eval_df is already defined and populated with predicted translations
result = bleu_custom(eval_df)
print(result)


Overall mean BLEU score: 8.087556451584101
{'score': 8.087556451584101, 'counts': [1503, 581, 298, 149], 'totals': [6786, 5315, 3854, 2471], 'precisions': [22.148541114058354, 10.931326434619002, 7.732226258432797, 6.0299473897207605], 'bp': 0.7846180751413663, 'sys_len': 6786, 'ref_len': 8432}


6_86 - 7.616578295827822 - 21.059322033898304

Overall mean BLEU score: 3.6887774429328783
{'score': 3.6887774429328783, 'counts': [1040, 267, 127, 56], 'totals': [7438, 5967, 4506, 3125], 'precisions': [13.982253293896209]}


Overall mean BLEU score: 3.5513888698242932
{'score': 3.5513888698242932, 'counts': [1090, 290, 115, 46], 'totals': [7391, 5920, 4455, 3070], 'precisions': [14.747666080368015, 4.898648648648648, 2.5813692480359145, 1.498371335504886], 'bp': 0.868622221529913, 'sys_len': 7391, 'ref_len': 8432}


Overall mean BLEU score: 3.5473260597558802
{'score': 3.5473260597558802, 'counts': [1087, 277, 114, 48], 'totals': [7328, 5857, 4397, 3018], 'precisions': [14.833515283842795, 4.7293836435035, 2.592676825108028, 1.5904572564612327], 'bp': 0.8601443785123691, 'sys_len': 7328, 'ref_len': 8432}


Overall mean BLEU score: 5.540000158701423
{'score': 5.540000158701423, 'counts': [1248, 410, 193, 96], 'totals': [7201, 5730, 4270, 2883], 'precisions': [17.330926260241633, 7.155322862129145, 4.519906323185012, 3.3298647242455774], 'bp': 0.8428649972820359, 'sys_len': 7201, 'ref_len': 8432}



Overall mean BLEU score: 6.412369271671805
{'score': 6.412369271671805, 'counts': [1382, 471, 238, 115], 'totals': [7453, 5982, 4514, 3096], 'precisions': [18.54286864349926, 7.873620862587764, 5.272485600354453, 3.714470284237726], 'bp': 0.8769051013445209, 'sys_len': 7453, 'ref_len': 8432}

In [None]:

# Target repository on the Hugging Face Hub and the local directory whose
# files are uploaded to it.
HF_REPO_NAME = "Koleshjr/dyu-fr-joeynmt-316-epochs_2_layers_8heads_128_384_plateau_2000_7_95_21_48"
lean_model_dir = "/content/lean_model"

# Model card (README.md) for the Hub repository.
# Fixes relative to the earlier version of this card:
#   * the language metadata lists `dyu` (Dyula) instead of `en`: the model
#     translates Dyula to French and involves no English;
#   * the usage snippet loads `config.yaml` (the file that is actually part of
#     lean_model_dir) from the same directory passed to `snapshot_download`,
#     instead of a non-existent `config_local.yaml` in a different directory.
# The card is an f-string: {HF_REPO_NAME}, {joeynmt.__version__} and
# {torch.__version__} are substituted when this cell runs.
model_card = f"""---
language:
- dyu
- fr
- multilingual
tags:
- translation
- pytorch
model-index:
- name: koleshjr/dyu-fr-joeynmt
  results: []
---

# koleshjr/dyu-fr-joeynmt

An example of a machine translation model that translates Dyula to French using the [JoeyNMT framework](https://github.com/joeynmt/joeynmt).

The following example is based on [this Github repo](https://github.com/data354/koumakanMT-challenge) that was kindly created by [data354](https://data354.com/en/).

## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

More information needed

## Usage

### Load and use for inference

```python
import torch
from joeynmt.config import load_config, parse_global_args
from joeynmt.prediction import predict, prepare
from huggingface_hub import snapshot_download

# Download model
snapshot_download(
    repo_id="{HF_REPO_NAME}",
    local_dir="/path/to/save/locally"
)

# Define model interface
class JoeyNMTModel:
    '''
    JoeyNMTModel which load JoeyNMT model for inference.

    :param config_path: Path to YAML config file
    :param n_best: return this many hypotheses, <= beam (currently only 1)
    '''
    def __init__(self, config_path: str, n_best: int = 1):
        seed = 42
        torch.manual_seed(seed)
        cfg = load_config(config_path)
        args = parse_global_args(cfg, rank=0, mode="translate")
        self.args = args._replace(test=args.test._replace(n_best=n_best))
        # build model
        self.model, _, _, self.test_data = prepare(self.args, rank=0, mode="translate")

    def _translate_data(self):
        _, _, hypotheses, trg_tokens, trg_scores, _ = predict(
            model=self.model,
            data=self.test_data,
            compute_loss=False,
            device=self.args.device,
            rank=0,
            n_gpu=self.args.n_gpu,
            normalization="none",
            num_workers=self.args.num_workers,
            args=self.args.test,
            autocast=self.args.autocast,
        )
        return hypotheses, trg_tokens, trg_scores

    def translate(self, sentence) -> list:
        '''
        Translate the given sentence.

        :param sentence: Sentence to be translated
        :return:
        - translations: (list of str) possible translations of the sentence.
        '''
        self.test_data.set_item(sentence.strip())
        translations, _, _ = self._translate_data()
        assert len(translations) == len(self.test_data) * self.args.test.n_best
        self.test_data.reset_cache()
        return translations

# Load model
# NOTE: config.yaml stores the paths that were used at training time. Update its
# `load_model`, `model_file` and `voc_file` entries so that they point to the
# downloaded files before loading it.
config_path = "/path/to/save/locally/config.yaml" # Change this to the path to your model config file
model = JoeyNMTModel(config_path=config_path, n_best=1)

# Translate
model.translate(sentence="i tɔgɔ bi cogodɔ")
```

## Training procedure

### Training hyperparameters

More information needed

### Training results

More information needed

### Framework versions

- JoeyNMT {joeynmt.__version__}
- Torch {torch.__version__}

"""
# The card contains non-ASCII text (the Dyula example sentence), so the
# encoding is set explicitly instead of depending on the platform default.
with (Path(lean_model_dir) / "README.md").open('w', encoding="utf-8") as f:
    f.write(model_card)


# Collect every regular file directly inside lean_model_dir as a Path,
# in the order returned by os.listdir (sub-directories are skipped).
files = [
    Path(os.path.join(lean_model_dir, entry))
    for entry in os.listdir(lean_model_dir)
    if os.path.isfile(os.path.join(lean_model_dir, entry))
]

files

[PosixPath('/content/lean_model/best.ckpt'),
 PosixPath('/content/lean_model/sp.model'),
 PosixPath('/content/lean_model/config.yaml'),
 PosixPath('/content/lean_model/vocab.txt'),
 PosixPath('/content/lean_model/README.md')]

In [None]:

# Make sure the target repository exists before uploading: upload_file does
# not create it. `exist_ok=True` turns this into a no-op when the repository
# is already there, so the cell can be re-run safely.
api.create_repo(repo_id=HF_REPO_NAME, exist_ok=True)

# Upload each collected file to the repository root under its own file name.
for file_path in files:
    print(file_path.name)
    print(str(file_path))
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=file_path.name,
        repo_id=HF_REPO_NAME,
    )

best.ckpt
/content/lean_model/best.ckpt


best.ckpt:   0%|          | 0.00/124M [00:00<?, ?B/s]

sp.model
/content/lean_model/sp.model


sp.model:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.yaml
/content/lean_model/config.yaml
vocab.txt
/content/lean_model/vocab.txt
README.md
/content/lean_model/README.md
