In [1]:
%%capture
# ! pip install huggingface_hub
# ! pip install transformers
# ! pip install datasets
# ! pip install evaluate
# ! pip install sacrebleu
# ! pip install matplotlib
# ! pip install torch torchvision torchaudio
# ! pip install sacrebleu
# ! pip install nltk

In [2]:
# libraries to import
import re
import pandas as pd
import nltk
from nltk.corpus import wordnet
import random
from datasets import load_dataset
import evaluate
import numpy as np

from datasets import Dataset, DatasetDict
from transformers import DataCollatorForSeq2Seq
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, NllbTokenizerFast
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer

import torch
from tqdm.auto import tqdm, trange
from transformers.optimization import Adafactor, AdamW
from transformers import get_linear_schedule_with_warmup, get_constant_schedule_with_warmup

import gc

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
# connect to the Hugging Face Hub
! huggingface-cli login

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:
# load the "uvci/Koumankan_mt_dyu_fr" dataset and display its structure
ds = load_dataset("uvci/Koumankan_mt_dyu_fr")
ds

In [11]:
# Characters replaced by a space during text cleaning.
# Written as a raw string so the regex escapes are not parsed as (invalid)
# Python string escapes, and with the hyphen escaped so it is matched as a
# literal character rather than through an accidental `,-.` range.
CHARS_TO_REMOVE_REGEX = r'[!"&(),\-./:;=?+\n\[\]]'
# Keys of the two languages inside each `translation` record.
SRC_LANG = "dyu"
TRG_LANG = "fr"
# Output directory for the fine-tuned model and tokenizer.
DIR = 'machine-translation/mt_facebook_nllb_dyu_fr'

In [12]:
# materialise every split of the loaded dataset as its own pandas dataframe
train_df, test_df, val_df = (
    pd.DataFrame(ds[split_name])
    for split_name in ('train', 'test', 'validation')
)

In [13]:
# show one of them
train_df

Unnamed: 0,ID,translation
0,ID_18897661270129,"{'dyu': 'A bi ji min na', 'fr': 'Il boit de l’..."
1,ID_18479132727846,"{'dyu': 'A le dalakolontɛ lon bɛ.', 'fr': 'Il ..."
2,ID_18164131280307,"{'dyu': 'Mun? Fɛn dɔ.', 'fr': 'Quoi ? Quelque ..."
3,ID_18344573728152,"{'dyu': 'O bɛ bi bɔra fo Gubeta.', 'fr': 'Tous..."
4,ID_18127342282717,"{'dyu': 'A ale lo bi da bugɔ la!', 'fr': 'Ah !..."
...,...,...
8060,ID_17804695917936,"{'dyu': 'Alu a yi n ka yanmariya kɔnɔ.', 'fr':..."
8061,ID_19488957919412,"{'dyu': 'O fatura aksidan ni na.', 'fr': 'Ils ..."
8062,ID_18268594920535,"{'dyu': 'An! Ni tericɛ fariman ni.', 'fr': 'Ah..."
8063,ID_17525560921507,"{'dyu': 'Sen bi dougouma', 'fr': 'À bas les pa..."


In [14]:
# create another dataframe, which we will use to augment the training data
# The nested `translation` dicts are flattened into one column per language.
# The frame is built in a single call: concatenating one row per iteration
# copies the whole frame each time and is quadratic in the number of rows.
to_use = pd.DataFrame({
    'ID': train_df['ID'].tolist(),
    'dyu': [pair['dyu'] for pair in train_df['translation']],
    'fr': [pair['fr'] for pair in train_df['translation']],
}, columns=['ID', 'dyu', 'fr'])

to_use

Unnamed: 0,ID,dyu,fr
0,ID_18897661270129,A bi ji min na,Il boit de l’eau.
1,ID_18479132727846,A le dalakolontɛ lon bɛ.,Il se plaint toujours.
2,ID_18164131280307,Mun? Fɛn dɔ.,Quoi ? Quelque chose.
3,ID_18344573728152,O bɛ bi bɔra fo Gubeta.,Tous sortent excepté Gubetta.
4,ID_18127342282717,A ale lo bi da bugɔ la!,Ah ! c’est lui… il sonne…
...,...,...,...
8060,ID_17804695917936,Alu a yi n ka yanmariya kɔnɔ.,Allez… attendez mes ordres.
8061,ID_19488957919412,O fatura aksidan ni na.,Ils ont péri dans l'accident.
8062,ID_18268594920535,An! Ni tericɛ fariman ni.,Ah ! ce brave ami !
8063,ID_17525560921507,Sen bi dougouma,À bas les pattes !


In [15]:
# fetch the NLTK resources the synonym helpers below rely on
for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

# function to get the synonyms of a word
def get_synonyms(word):
    """Return the French WordNet synonyms of ``word`` in random order.

    Args:
        word (str): word looked up in WordNet with ``lang='fra'``.

    Returns:
        list[str]: unique lemma names found across all synsets of ``word``,
        shuffled with the ``random`` module. Empty when nothing is found.
    """
    candidates = set()
    for synset in wordnet.synsets(word, lang='fra'):
        for lemma in synset.lemmas('fra'):
            # Multi-word lemma names use underscores; turn them back into
            # spaces so they can be inserted into a sentence as-is.
            candidates.add(lemma.name().replace('_', ' '))

    # The word itself is not a useful replacement.
    candidates.discard(word)

    # Sort before shuffling: set iteration order depends on string hashing,
    # so shuffling the raw set would not be reproducible even with a seed.
    unique_synonyms = sorted(candidates)
    random.shuffle(unique_synonyms)
    return unique_synonyms

# function to randomly replace a word in a sentence
def replace_with_synonym(sentence):
    """Replace one randomly chosen word of ``sentence`` with a synonym.

    Args:
        sentence (str): text whose whitespace-separated words are candidates.

    Returns:
        str: the sentence with the chosen word swapped for a random synonym
        from ``get_synonyms``. The sentence is returned unchanged when it
        contains no word or when the chosen word has no synonym.
    """
    # Keep each word together with its position so that exactly the chosen
    # occurrence is replaced; a plain str.replace would hit the first matching
    # substring, which may sit inside an earlier, longer word.
    word_matches = list(re.finditer(r'\S+', sentence))
    if not word_matches:
        # Empty or whitespace-only input: nothing to pick from.
        return sentence

    chosen = random.choice(word_matches)
    synonyms = list(get_synonyms(chosen.group()))
    if not synonyms:
        return sentence

    synonym = random.choice(synonyms)
    return sentence[:chosen.start()] + synonym + sentence[chosen.end():]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
# create the augmented copy of the training rows (the originals stay in train_df)
# The synonym replacement is applied here, on a local copy of the French column,
# so this cell yields modified sentences without relying on to_use['fr'] having
# been rewritten by another cell beforehand.
augmented_fr = to_use['fr'].apply(replace_with_synonym)

# random.sample draws without replacement, so every new ID is unique
# (independent randint draws could repeat).
new_ids = random.sample(range(200000000, 1000000000), k=len(to_use))

# Assemble all rows at once rather than concatenating one row per iteration.
train_df_2 = pd.DataFrame({
    'ID': [f'ID_{number}' for number in new_ids],
    'translation': [
        {'dyu': dyu_text, 'fr': fr_text}
        for dyu_text, fr_text in zip(to_use['dyu'], augmented_fr)
    ],
}, columns=['ID', 'translation'])

train_df_2

Unnamed: 0,ID,translation
0,ID_354721039,"{'dyu': 'A bi ji min na', 'fr': 'Il boit de l’..."
1,ID_268618265,"{'dyu': 'A le dalakolontɛ lon bɛ.', 'fr': 'Il ..."
2,ID_337738635,"{'dyu': 'Mun? Fɛn dɔ.', 'fr': 'Quoi ? Quelque ..."
3,ID_764250322,"{'dyu': 'O bɛ bi bɔra fo Gubeta.', 'fr': 'Tous..."
4,ID_926913680,"{'dyu': 'A ale lo bi da bugɔ la!', 'fr': 'Ah !..."
...,...,...
8060,ID_692185500,"{'dyu': 'Alu a yi n ka yanmariya kɔnɔ.', 'fr':..."
8061,ID_258341877,"{'dyu': 'O fatura aksidan ni na.', 'fr': 'Ils ..."
8062,ID_562686744,"{'dyu': 'An! Ni tericɛ fariman ni.', 'fr': 'Ah..."
8063,ID_645802691,"{'dyu': 'Sen bi dougouma', 'fr': 'À bas les pa..."


In [16]:
# replace one randomly chosen word by a synonym in the French text of every row
# NOTE(review): this overwrites to_use['fr'] in place, so running the cell a
# second time applies another round of replacements on top of the first.
# NOTE(review): the execution counter of this cell (16) is lower than that of
# the cell above (19), which reads to_use — confirm the intended cell order.
to_use['fr'] = to_use['fr'].apply(replace_with_synonym)

In [20]:
# stack the rows of train_df_2 underneath those of train_df, renumbering the index
full_train_df = pd.concat(
    (train_df, train_df_2),
    axis='index',
    ignore_index=True,
)
full_train_df

Unnamed: 0,ID,translation
0,ID_18897661270129,"{'dyu': 'A bi ji min na', 'fr': 'Il boit de l’..."
1,ID_18479132727846,"{'dyu': 'A le dalakolontɛ lon bɛ.', 'fr': 'Il ..."
2,ID_18164131280307,"{'dyu': 'Mun? Fɛn dɔ.', 'fr': 'Quoi ? Quelque ..."
3,ID_18344573728152,"{'dyu': 'O bɛ bi bɔra fo Gubeta.', 'fr': 'Tous..."
4,ID_18127342282717,"{'dyu': 'A ale lo bi da bugɔ la!', 'fr': 'Ah !..."
...,...,...
16125,ID_692185500,"{'dyu': 'Alu a yi n ka yanmariya kɔnɔ.', 'fr':..."
16126,ID_258341877,"{'dyu': 'O fatura aksidan ni na.', 'fr': 'Ils ..."
16127,ID_562686744,"{'dyu': 'An! Ni tericɛ fariman ni.', 'fr': 'Ah..."
16128,ID_645802691,"{'dyu': 'Sen bi dougouma', 'fr': 'À bas les pa..."


In [27]:
# wrap the training, test and validation dataframes into Dataset objects
train_dataset, test_dataset, validation_dataset = (
    Dataset.from_pandas(frame)
    for frame in (full_train_df, test_df, val_df)
)


In [28]:
# Bundle the three splits under their split names
dataset = DatasetDict({
    split_name: split
    for split_name, split in (
        ('train', train_dataset),
        ('test', test_dataset),
        ('validation', validation_dataset),
    )
})

dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'translation'],
        num_rows: 16130
    })
    test: Dataset({
        features: ['ID', 'translation'],
        num_rows: 1393
    })
    validation: Dataset({
        features: ['ID', 'translation'],
        num_rows: 1471
    })
})

In [29]:
# cleaning the data (remove punctuation and lowercase everything)
def remove_special_characters(text):
    """Lowercase ``text``, blank out the characters matched by
    ``CHARS_TO_REMOVE_REGEX`` and trim surrounding whitespace.

    Each matched character is replaced by a single space, so runs of spaces
    can remain inside the returned string.
    """
    lowered = text.lower()
    without_punctuation = re.sub(CHARS_TO_REMOVE_REGEX, " ", lowered)
    return without_punctuation.strip()

def clean_text(batch):
    """Clean the source and target sentences of one example in place.

    Args:
        batch: example holding a ``"translation"`` mapping keyed by language.

    Returns:
        The same ``batch`` object, with both language entries cleaned by
        ``remove_special_characters``.
    """
    pair = batch["translation"]
    for lang in (SRC_LANG, TRG_LANG):
        pair[lang] = remove_special_characters(pair[lang])
    return batch

# apply the cleaning to every split (train, test and validation alike)
# NOTE(review): `dataset` is rebound to its cleaned version, so the uncleaned
# DatasetDict is no longer reachable under this name afterwards.
dataset = dataset.map(clean_text)

Map:   0%|          | 0/16130 [00:00<?, ? examples/s]

Map:   0%|          | 0/1393 [00:00<?, ? examples/s]

Map:   0%|          | 0/1471 [00:00<?, ? examples/s]

In [33]:
# Load model and tokenizer
# Both come from the same "facebook/nllb-200-distilled-600M" checkpoint; the
# tokenizer is configured with dyu_Latn as source and fra_Latn as target language.
tokenizer = NllbTokenizerFast.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="dyu_Latn", tgt_lang="fra_Latn"
)
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

In [34]:
# function to preprocess/tokenise the data
# Upper bound (in tokens) applied to both inputs and targets.
max_length = 256
def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Args:
        examples: batch whose ``"translation"`` entry is a list of mappings
            keyed by language (``SRC_LANG`` / ``TRG_LANG``).

    Returns:
        The tokenizer output for the source sentences, with the target
        sentences passed as ``text_target``; both truncated to ``max_length``.
    """
    # Use the shared language constants (as clean_text does) rather than
    # repeating the "dyu"/"fr" literals.
    inputs = [ex[SRC_LANG] for ex in examples["translation"]]
    targets = [ex[TRG_LANG] for ex in examples["translation"]]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

In [36]:
# apply the tokenization
# The columns to drop are read from the DatasetDict that is actually being
# mapped, not from the originally loaded `ds`, so the two cannot drift apart.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/16130 [00:00<?, ? examples/s]

Map:   0%|          | 0/1393 [00:00<?, ? examples/s]

Map:   0%|          | 0/1471 [00:00<?, ? examples/s]

In [37]:
# build the seq2seq collator, collate a small example batch and list its keys
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# rows 1 and 2 of the tokenized training split serve as the example
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

In [41]:
# define the evaluation metric: sacreBLEU, loaded through the `evaluate` library
metric = evaluate.load("sacrebleu")

In [42]:
# function to compute the previously selected metric
def compute_metrics(eval_preds):
    """Decode predictions and references, then score them with ``metric``.

    Args:
        eval_preds: ``(predictions, labels)`` pair. ``predictions`` may be a
            tuple, in which case its first element is used.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the ``"score"`` entry of
        the metric result.
    """
    preds, labels = eval_preds

    # Check if preds is a tuple and keep its first element if it is
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds, dtype=np.int64)

    # Negative ids cannot be decoded. Treat them like the -100 placeholders in
    # the labels and map them to the pad token, which decoding then skips.
    preds = np.where(preds < 0, pad_id, preds)

    # len(tokenizer) counts added tokens as well, whereas vocab_size only
    # covers the base vocabulary; using the latter as a bound could rewrite
    # valid ids of added tokens.
    too_large = preds >= len(tokenizer)
    if too_large.any():
        print("Warning: Preds contain out-of-range values")
        preds = np.where(too_large, pad_id, preds)

    # Decode the predictions using the tokenizer
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Simple post-processing: one list of references per prediction
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    # Compute the metric
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Return the computed metric
    return {"bleu": result["score"]}



In [43]:
# define the training arguments
args = Seq2SeqTrainingArguments(
    DIR,  # output directory
    report_to='none',
    eval_strategy="epoch",  # evaluate once per epoch
    save_strategy="no",  # no intermediate checkpoints are written
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    # NOTE(review): presumably has no effect while save_strategy="no" — confirm
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,  # evaluation uses generated sequences
    # NOTE(review): fp16 presumably requires a CUDA device — confirm before running on CPU
    fp16=True,
    push_to_hub=False,
)

In [44]:
# build the trainer: trains on the (augmented) train split and evaluates on the
# validation split with compute_metrics
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


## TRAINING WITH TRAINER FUNCTION

In [45]:
# helper that frees up some room on the GPU
def cleanup():
    """Run the garbage collector, then empty the CUDA cache held by torch."""
    for release in (gc.collect, torch.cuda.empty_cache):
        release()

cleanup()

In [46]:
# evaluate performance on the validation split before any fine-tuning (baseline)
trainer.evaluate(max_length=max_length)

{'eval_loss': 3.4963648319244385,
 'eval_bleu': 3.083315249586182,
 'eval_runtime': 32.1102,
 'eval_samples_per_second': 45.811,
 'eval_steps_per_second': 0.716}

In [48]:
# do some cleanup, run the training and save the model
cleanup()
trainer.train()
# the training arguments use save_strategy="no", so the result is persisted
# explicitly here: tokenizer and model weights both go to DIR
tokenizer.save_pretrained(DIR)
model.save_pretrained(DIR)

Epoch,Training Loss,Validation Loss,Bleu
1,2.8941,2.350066,5.246169
2,2.4889,2.268962,7.740157
3,2.2751,2.235852,7.395943
4,2.1224,2.229784,10.009536
5,1.996,2.226373,10.643348
6,1.9061,2.229146,10.315838
7,1.8315,2.234242,10.788804
8,1.7792,2.243133,11.209866
9,1.7425,2.248103,10.89441
10,1.7172,2.25244,10.916711


Non-default generation parameters: {'max_length': 200}


## TRAINING WITH CUSTOM TRAINING LOOP (OPTIONAL)

In [45]:
from torch.utils.data import DataLoader

# have the tokenized splits return torch tensors
tokenized_datasets.set_format("torch")
# training batches are reshuffled; both loaders collate with data_collator
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=16,
)
# validation batches keep their order
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [46]:
# Use PyTorch's AdamW: the implementation shipped with transformers is
# deprecated in favour of torch.optim.AdamW.
# eps and weight_decay are set explicitly to the values the transformers
# implementation used by default, so the optimisation behaves as before.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0
)



In [47]:
from accelerate import Accelerator

# let accelerate wrap the model, optimizer and both dataloaders; the names are
# rebound to the prepared objects
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [48]:
from transformers import get_scheduler

num_train_epochs = 25
# one optimizer update per batch, so steps per epoch = number of batches
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule over the whole run, without warmup steps
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [50]:
def postprocess(predictions, labels):
    """Turn predicted and reference token ids into stripped strings.

    Args:
        predictions: tensor of generated token ids.
        labels: tensor of reference token ids, where -100 marks positions
            that must not be decoded.

    Returns:
        tuple: ``(decoded_preds, decoded_labels)``; the second element wraps
        every reference in its own single-item list.
    """
    pred_ids = predictions.cpu().numpy()
    label_ids = labels.cpu().numpy()

    # -100 cannot be decoded, so swap it for the pad token first.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Light post-processing: trim whitespace on both sides.
    return (
        [text.strip() for text in pred_texts],
        [[text.strip()] for text in label_texts],
    )

In [51]:
import gc
import torch

# NOTE(review): same definition as the cleanup() helper in the Trainer section
# above; whichever cell runs last provides the binding in use.
def cleanup():
    """Run the garbage collector, then empty the CUDA cache held by torch."""
    gc.collect()
    torch.cuda.empty_cache()

cleanup()

In [None]:
from tqdm.auto import tqdm

# One tick per optimizer step over the whole run.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        # The labels are already a tensor; convert the dtype with .to() rather
        # than re-wrapping in torch.tensor(), which copies and emits a warning.
        labels = batch["labels"].to(torch.int64)

        # Necessary to pad predictions and labels for being gathered
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save and (optionally) upload
    # The checkpoint goes to DIR, the output directory defined in the
    # configuration cell; no separate `output_dir` is defined in the notebook.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(DIR, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(DIR)
        # No `repo` object is created anywhere in the notebook, so only push
        # when one has been defined beforehand instead of raising NameError.
        hub_repo = globals().get("repo")
        if hub_repo is not None:
            hub_repo.push_to_hub(
                commit_message=f"Training in progress epoch {epoch}", blocking=False
            )

# Final save; the model is unwrapped first because accelerator.prepare()
# may have wrapped it.
tokenizer.save_pretrained(DIR)
accelerator.unwrap_model(model).save_pretrained(DIR)

  0%|          | 0/25225 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

  labels = torch.tensor(batch["labels"], dtype=torch.int64)
Non-default generation parameters: {'max_length': 200}


epoch 0, BLEU score: 8.97


  0%|          | 0/184 [00:00<?, ?it/s]

  labels = torch.tensor(batch["labels"], dtype=torch.int64)
Non-default generation parameters: {'max_length': 200}


epoch 1, BLEU score: 10.05


  0%|          | 0/184 [00:00<?, ?it/s]

  labels = torch.tensor(batch["labels"], dtype=torch.int64)
Non-default generation parameters: {'max_length': 200}


epoch 2, BLEU score: 10.52


  0%|          | 0/184 [00:00<?, ?it/s]

  labels = torch.tensor(batch["labels"], dtype=torch.int64)
Non-default generation parameters: {'max_length': 200}


epoch 3, BLEU score: 10.39


  0%|          | 0/184 [00:00<?, ?it/s]

  labels = torch.tensor(batch["labels"], dtype=torch.int64)
Non-default generation parameters: {'max_length': 200}


epoch 4, BLEU score: 11.23


  0%|          | 0/184 [00:00<?, ?it/s]

  labels = torch.tensor(batch["labels"], dtype=torch.int64)
Non-default generation parameters: {'max_length': 200}


epoch 5, BLEU score: 10.71


  0%|          | 0/184 [00:00<?, ?it/s]

  labels = torch.tensor(batch["labels"], dtype=torch.int64)
Non-default generation parameters: {'max_length': 200}


epoch 6, BLEU score: 11.12


  0%|          | 0/184 [00:00<?, ?it/s]

  labels = torch.tensor(batch["labels"], dtype=torch.int64)
Non-default generation parameters: {'max_length': 200}


epoch 7, BLEU score: 11.44
