In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np
import torch
import os
from huggingface_hub import login

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the English-French configuration of the KDE4 corpus.
# trust_remote_code=True is passed explicitly: without it `datasets` shows a
# confirmation message for this script-based dataset, which gets in the way of
# "Restart & Run All". It allows the dataset's loading script to execute
# locally, so only keep it for sources you trust.
raw_datasets = load_dataset("kde4", lang1="en", lang2="fr", trust_remote_code=True)

print(raw_datasets)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})


In [4]:
# Keep 80% of the single "train" split for training and hold out the rest;
# the fixed seed keeps the split reproducible across runs.
split_datasets = raw_datasets["train"].train_test_split(seed=20, train_size=0.8)

print(split_datasets)

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 168138
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 42035
    })
})


In [5]:
# Rename the held-out "test" split to "validation".
# Guarded so that re-running this cell is a no-op instead of raising KeyError
# (after the first run the "test" key no longer exists).
if "test" in split_datasets:
    split_datasets["validation"] = split_datasets.pop("test")

In [6]:
def flatten_translation(examples):
    """Split a batch of translation pairs into parallel "en" and "fr" columns.

    Args:
        examples: Batch mapping whose "translation" entry is a sequence of
            dicts, each holding an "en" and an "fr" string.

    Returns:
        Dict with two lists, "en" and "fr", in the same order as the input pairs.
    """
    flattened = {"en": [], "fr": []}
    for pair in examples["translation"]:
        flattened["en"].append(pair["en"])
        flattened["fr"].append(pair["fr"])
    return flattened

# Replace the nested "translation" column (and the unused "id") with flat
# "en" / "fr" text columns on every split.
equivalent_datasets = split_datasets.map(
    flatten_translation,
    batched=True,
    remove_columns=["id", "translation"],
)

print(equivalent_datasets)

DatasetDict({
    train: Dataset({
        features: ['en', 'fr'],
        num_rows: 168138
    })
    validation: Dataset({
        features: ['en', 'fr'],
        num_rows: 42035
    })
})


In [7]:
# Pretrained Marian English->French checkpoint, used for both tokenizer and model.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`; passing it here had no effect (the tokenizer output below
# is still plain Python lists), so it is dropped to avoid misleading readers.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [8]:
# Take the first training pair and inspect how the tokenizer encodes it.
first_pair = equivalent_datasets["train"][0]
en_sentence = first_pair["en"]
fr_sentence = first_pair["fr"]

# Source text becomes input_ids / attention_mask; text_target becomes "labels".
inputs = tokenizer(en_sentence, text_target=fr_sentence)
print(inputs)
print(tokenizer.decode(inputs["input_ids"]))
print(tokenizer.encode(en_sentence))
print(tokenizer.decode(inputs["labels"]))

{'input_ids': [1232, 13572, 7823, 9, 0], 'attention_mask': [1, 1, 1, 1, 1], 'labels': [22181, 10691, 412, 9, 1232, 21332, 0]}
Web Shortcuts</s>
[1232, 13572, 7823, 9, 0]
Raccourcis WebComment</s>


In [17]:
max_length = 128
def preprocess_function(examples):
    """Tokenize a batch of flat "en" / "fr" columns in a single tokenizer call.

    Args:
        examples: Batch mapping with an "en" list (sources) and an "fr" list
            (targets).

    Returns:
        The tokenizer output for the batch; sources and targets are each
        truncated to the module-level `max_length`.
    """
    return tokenizer(
        examples["en"],
        text_target=examples["fr"],
        max_length=max_length,
        truncation=True,
    )

# Optional sanity check on a small slice before mapping every split:
# data_check = equivalent_datasets["train"][0:4]
# print(preprocess_function(data_check))
tokenized_datasets_eq = equivalent_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=equivalent_datasets["train"].column_names,
)

Map: 100%|██████████| 168138/168138 [00:44<00:00, 3768.36 examples/s]
Map: 100%|██████████| 42035/42035 [00:11<00:00, 3803.06 examples/s]


In [16]:
max_length = 128
def preprocess_function2(examples):
    """Tokenize a batch that still carries the nested "translation" column.

    Args:
        examples: Batch mapping whose "translation" entry is a sequence of
            dicts with "en" and "fr" strings.

    Returns:
        The tokenizer output for the batch; sources and targets are each
        truncated to the module-level `max_length`.
    """
    pairs = examples["translation"]
    sources = [pair["en"] for pair in pairs]
    references = [pair["fr"] for pair in pairs]
    return tokenizer(
        sources,
        text_target=references,
        max_length=max_length,
        truncation=True,
    )

# Optional sanity check on a small slice before mapping every split:
# data_check = split_datasets["train"][0:10]
# print(preprocess_function2(data_check))
tokenized_datasets = split_datasets.map(
    preprocess_function2,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

Map: 100%|██████████| 168138/168138 [00:47<00:00, 3532.26 examples/s]
Map: 100%|██████████| 42035/42035 [00:11<00:00, 3538.98 examples/s]


In [20]:
max_length = 128
def preprocess_function3(examples):
    """Tokenize sources and targets in two separate tokenizer calls.

    Args:
        examples: Batch mapping with an "en" list (sources) and an "fr" list
            (targets).

    Returns:
        The tokenizer output for the sources, with the target token ids
        attached under "labels". Both sides are truncated to the module-level
        `max_length`.
    """
    encoded = tokenizer(examples["en"], max_length=max_length, truncation=True)

    # Targets go through `text_target` so they are tokenized as target-side text.
    target_encoding = tokenizer(
        text_target=examples["fr"], max_length=max_length, truncation=True
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Tokenize every split with the two-call variant, dropping the raw text columns.
tokenized_datasets_3 = equivalent_datasets.map(
    preprocess_function3,
    batched=True,
    remove_columns=equivalent_datasets["train"].column_names,
)

Map: 100%|██████████| 168138/168138 [00:44<00:00, 3802.50 examples/s]
Map: 100%|██████████| 42035/42035 [00:10<00:00, 3924.09 examples/s]


In [21]:
# Print the first training example from each preprocessing variant so the
# three outputs can be compared side by side.
for tokenized_variant in (tokenized_datasets, tokenized_datasets_eq, tokenized_datasets_3):
    print(tokenized_variant["train"][0])

{'input_ids': [1232, 13572, 7823, 9, 0], 'attention_mask': [1, 1, 1, 1, 1], 'labels': [22181, 10691, 412, 9, 1232, 21332, 0]}
{'input_ids': [1232, 13572, 7823, 9, 0], 'attention_mask': [1, 1, 1, 1, 1], 'labels': [22181, 10691, 412, 9, 1232, 21332, 0]}
{'input_ids': [1232, 13572, 7823, 9, 0], 'attention_mask': [1, 1, 1, 1, 1], 'labels': [22181, 10691, 412, 9, 1232, 21332, 0]}


In [25]:
# Load the pretrained seq2seq model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# The collator pads each batch dynamically; it is given the model as well as
# the tokenizer (the collated batch below also carries `decoder_input_ids`).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [49]:
# Collate a single training example to see which keys the collator produces.
batch = data_collator([tokenized_datasets["train"][0]])
print(batch.keys())
# Uncomment to inspect the individual tensors:
# print(batch["labels"])
# print(batch["input_ids"])
# print(batch["attention_mask"])

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])


In [None]:
# Load the "sacrebleu" metric through the `evaluate` library; `compute_metrics`
# below calls `metric.compute(...)` on decoded predictions and references.
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Decode generated ids and labels, then score them with SacreBLEU.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also be a tuple, in which case only its first element is used.

    Returns:
        Dict with a single key, ``"bleu"``, holding the SacreBLEU score.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index used to pad labels (and possibly predictions
    # when batches are concatenated). It is not a valid token id, so swap it
    # for the pad token before decoding. The previous code compared against
    # +100, which corrupted real occurrences of token id 100 and left the
    # -100 padding in place.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    # SacreBLEU takes a list of references per prediction (same shape as the
    # references built by `postprocess` in the custom training loop).
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

['Raccourcis ?Comment'] ['Raccourcis WebComment']
{'score': 0.0, 'counts': [1, 0, 0, 0], 'totals': [3, 2, 1, 0], 'precisions': [33.333333333333336, 25.0, 25.0, 0.0], 'bp': 1.0, 'sys_len': 3, 'ref_len': 2}
{'bleu': 0.0}


In [73]:
# Read the Hugging Face access token from the environment (never hard-code it)
# and authenticate this session with the Hub. The value is None if the
# HF_LOGIN_KEY variable is not set.
hf_login_key = os.getenv("HF_LOGIN_KEY")
login(token=hf_login_key)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\kyanj\.cache\huggingface\token
Login successful


In [None]:
# Training configuration for the Seq2SeqTrainer run.
args = Seq2SeqTrainingArguments(
    # Output directory (plain string: the previous f-string had no placeholders).
    "Model_Files/translation-finetuning-test-en-fr",
    # Evaluate and checkpoint once per epoch; both strategies must match for
    # load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    # Use generate() during evaluation so BLEU is computed on real translations.
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; enabling it unconditionally fails
    # on CPU-only machines, so switch it on only when a GPU is available.
    fp16=torch.cuda.is_available(),
    # A comma was missing after this argument, which made the cell a SyntaxError.
    push_to_hub=True,
    metric_for_best_model="bleu",
    load_best_model_at_end=True,
    report_to=["wandb"],
    run_name="translation-finetuning-test-en-fr-v1",
)

In [None]:
# Wire model, training arguments, tokenized splits, collator and metric
# function into the trainer, then start fine-tuning.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

In [3]:
# Report whether PyTorch can see a CUDA device in this environment.
print(torch.cuda.is_available())

False


In [None]:
from accelerate import Accelerator
from huggingface_hub import Repository, create_repo, get_full_repo_name
from transformers import get_scheduler

# --- Objects required by the custom training loop -------------------------
# `optimizer`, `train_dataloader`, `eval_dataloader`, `output_dir` and `repo`
# were referenced by this cell without being defined anywhere in the notebook,
# so it failed with NameError on a fresh kernel. Hyperparameters mirror the
# Seq2SeqTrainingArguments cell (batch sizes 32/64, lr 1e-4, weight decay 0.01).
# NOTE: `model` is whatever the kernel currently holds; reload it from
# `model_checkpoint` first if it was already fine-tuned by the Trainer cell.
train_dataloader = torch.utils.data.DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=32,
)
eval_dataloader = torch.utils.data.DataLoader(
    tokenized_datasets["validation"],
    collate_fn=data_collator,
    batch_size=64,
)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

# Local checkout and Hub repository that the loop saves to and pushes from.
# A separate directory is used so it does not clash with the Trainer output.
# Adjust both names as needed.
output_dir = "Model_Files/translation-finetuning-test-en-fr-accelerate"
repo_name = get_full_repo_name("translation-finetuning-test-en-fr-accelerate")
create_repo(repo_name, exist_ok=True)  # cloning requires the repo to exist
repo = Repository(output_dir, clone_from=repo_name)

# --- Accelerate wiring ----------------------------------------------------
accelerator = Accelerator()
# prepare() returns wrapped versions of the objects for the current device setup.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

# The schedule length must be computed from the *prepared* dataloader.
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear decay of the learning rate to zero over all training steps, no warmup.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

def postprocess(predictions, labels):
    """Turn tensors of generated ids and label ids into text for the metric.

    Args:
        predictions: Tensor of generated token ids.
        labels: Tensor of label token ids, where -100 marks ignored positions.

    Returns:
        Tuple ``(decoded_preds, decoded_labels)``: a list of stripped
        prediction strings, and a list of single-element lists holding the
        stripped reference strings.
    """
    pred_ids = predictions.cpu().numpy()
    label_ids = labels.cpu().numpy()

    # -100 cannot be decoded, so substitute the pad token id before decoding.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Trim surrounding whitespace; each reference is wrapped in its own list.
    return (
        [text.strip() for text in pred_texts],
        [[text.strip()] for text in label_texts],
    )

from tqdm.auto import tqdm
import torch

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Custom train / evaluate / save loop, run once per epoch.
for epoch in range(num_train_epochs):
    # Training: one forward/backward pass and one optimizer step per batch.
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward().
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation: generate translations and accumulate them into the metric.
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            # generate() is called on the unwrapped model.
            # NOTE(review): max_length is hard-coded to 128 here rather than
            # reusing the module-level `max_length` - keep the two in sync.
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels = batch["labels"]

        # Pad predictions and labels along dim 1 so they can be gathered:
        # predictions with the pad token id, labels with -100.
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        # Decode to text and add this batch to the running metric.
        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute the score over everything added during this epoch.
    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save and upload.
    # NOTE(review): `output_dir` and `repo` are not defined in this loop; they
    # must already exist in the kernel before the first epoch finishes,
    # otherwise the run fails here after a full epoch of training.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process writes the tokenizer and pushes to the Hub.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )