<a href="https://colab.research.google.com/github/jubin0615/DL/blob/main/HW2_2_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies, pinning datasets to 2.21.0 and protobuf to 3.20.3.
!pip install -q "datasets==2.21.0" protobuf==3.20.3 evaluate sacrebleu sentencepiece transformers[torch]

In [None]:
# Installs the same package set as the pinned install cell, but without version pins.
# NOTE(review): this looks redundant with the pinned install — confirm which one is intended to run.
!pip install -q transformers[torch] datasets evaluate sacrebleu sentencepiece

In [None]:
from datasets import load_dataset, DatasetDict

# Load French-English sentence pairs from IWSLT 2017.
# Korean-English alternative:
# dataset = load_dataset("IWSLT/iwslt2017", "iwslt2017-ko-en")
dataset = load_dataset(
    "IWSLT/iwslt2017",
    "iwslt2017-fr-en",
    trust_remote_code=True,
)

## The full training split is large and would take a long time to train on,
## so keep only its first 5000 examples.
# Any subset size may be used instead of 5000; the only requirement is to
# reach a BLEU score higher than 0.1.
dataset['train'] = dataset['train'].select(
    range(5000)
)

# Possible language pairs
#'iwslt2017-en-it', 'iwslt2017-en-nl', 'iwslt2017-en-ro', 'iwslt2017-it-en', 'iwslt2017-it-nl',
#'iwslt2017-it-ro', 'iwslt2017-nl-en', 'iwslt2017-nl-it', 'iwslt2017-nl-ro', 'iwslt2017-ro-en',
#'iwslt2017-ro-it', 'iwslt2017-ro-nl', 'iwslt2017-ar-en', 'iwslt2017-de-en', 'iwslt2017-en-ar',
#'iwslt2017-en-de', 'iwslt2017-en-fr', 'iwslt2017-en-ja', 'iwslt2017-en-ko', 'iwslt2017-en-zh',
#'iwslt2017-fr-en', 'iwslt2017-ja-en', 'iwslt2017-ko-en', 'iwslt2017-zh-en'

# If the dataset you choose provides only a train split, uncomment and run the following; otherwise skip it.
# Split into train (70%), validation (15%), and test (15%)
# train_test_split = dataset['train'].train_test_split(test_size=0.3, seed=42)
# validation_test_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

# Combine splits into a new DatasetDict
# dataset = DatasetDict({
#     'train': train_test_split['train'],
#     'validation': validation_test_split['train'],
#     'test': validation_test_split['test']
# })

#Do not change the below
# Keep only the first 100 examples of the test and validation splits.
dataset['test'] = dataset['test'].select(range(100))
dataset['validation'] = dataset['validation'].select(range(100))
# Show the resulting splits, then print the first 10 validation translation entries.
print(dataset)
for i in dataset['validation']['translation'][:10]:
    print(i)



In [None]:
import torch
import numpy as np
import evaluate
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

# Checkpoint shared by the tokenizer and the model.
model_name = "facebook/mbart-large-50-many-to-many-mmt"

tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
# Input language is French, output language is English.
tokenizer.src_lang, tokenizer.tgt_lang = "fr_XX", "en_XX"

model = MBartForConditionalGeneration.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize a batch of French/English translation pairs.

    Args:
        examples: A batch whose ``'translation'`` entry is a sequence of
            dicts carrying an ``'fr'`` (source) and an ``'en'`` (target) text.

    Returns:
        The output of the module-level ``tokenizer`` called on the French
        texts with the English texts as ``text_target``, truncated to at
        most 64 tokens.
    """
    pairs = examples['translation']
    source_texts = [pair['fr'] for pair in pairs]
    target_texts = [pair['en'] for pair in pairs]

    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=64,
        truncation=True,
    )

# sacreBLEU metric used by compute_metrics to score translations.
metric = evaluate.load("sacrebleu")

# Apply the preprocessing to the dataset, one batch at a time.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
)

def compute_metrics(eval_preds):
    """Score generated translations against the references with sacreBLEU.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is used.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the sacreBLEU score
        divided by 100.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index used to pad labels, and predictions may be
    # padded with it as well. It is not a valid token id, so replace it with
    # the pad token in both arrays before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode token ids back to text.
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip surrounding whitespace; each prediction gets a list of references.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"] / 100}

# Training configuration: evaluate and log once per epoch, and generate
# translations during evaluation so BLEU can be computed on decoded text.
args = Seq2SeqTrainingArguments(
    output_dir="./mbart_fr_en",
    eval_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,

    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,

    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=3,
    predict_with_generate=True,
    # fp16 mixed precision cannot run on CPU; enable it only when a CUDA
    # device is present so the notebook still runs without a GPU.
    # NOTE(review): this also disables fp16 on non-CUDA accelerators — confirm
    # that is acceptable for the intended runtime.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,
    report_to="none",
)

# Collator that assembles tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Force the English language code as the first generated token.
# Recent transformers releases read generation settings from
# model.generation_config rather than model.config, so set it in both places
# to make sure generation during evaluation/prediction targets English.
forced_bos_token_id = tokenizer.lang_code_to_id["en_XX"]
trainer.model.config.forced_bos_token_id = forced_bos_token_id
trainer.model.generation_config.forced_bos_token_id = forced_bos_token_id
trainer.train()

In [None]:
# Run prediction on the tokenized test split and report its BLEU score.
test_results = trainer.predict(
    tokenized_datasets["test"],
)
test_metrics = test_results.metrics
print(f"BLEU score on test data is {test_metrics['test_bleu']}")