In [None]:
#! pip install pyarrow datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece]
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /gdrive


In [None]:
import os
import pandas as pd

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, load_metric
import numpy as np


In [None]:
# !nvcc --version
# Check that PyTorch can see a CUDA device before starting the fine-tuning run.
torch.cuda.is_available()

True

In [None]:
# Run-wide configuration.
# Hub id of the pretrained Korean -> English Marian checkpoint to fine-tune.
model_checkpoint = "Helsinki-NLP/opus-mt-ko-en"
# Turn off the Weights & Biases integration for this run.
os.environ["WANDB_DISABLED"] = "true"
# CUDA device handle.
device = torch.device("cuda")


In [None]:
# Load the preprocessed parallel corpus from JSON files, one file per split.
# Each line of a file is expected to be one record of the form
#   {"translation": {"ko": ko_sentence, "en": en_sentence}}
raw_datasets = load_dataset(
    "json",
    data_files={
        split: f"/content/drive/MyDrive/nlp_study/rawdataset/{split}_ko_en_dataset.json"
        for split in ("train", "validation", "test")
    },
)

Using custom data configuration default-60be4a8afb0af00f


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-60be4a8afb0af00f/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-60be4a8afb0af00f/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Display the loaded dataset object to check the available splits and their row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 378778
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1241
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1241
    })
})

In [None]:
# SacreBLEU metric object; compute_metrics calls metric.compute(...) on decoded text.
metric = load_metric("sacrebleu")
# Tokenizer loaded from the same pretrained checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/822k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/794k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.64M [00:00<?, ?B/s]

In [None]:
# Settings read by preprocess_function.
prefix = ""  # text prepended to every source sentence (empty: nothing is added)
max_input_length = max_target_length = 128  # truncation limits for source / target token sequences
source_lang, target_lang = "ko", "en"  # keys of each "translation" record
def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: batch whose "translation" entry is a sequence of records
            indexable by the ``source_lang`` and ``target_lang`` keys.

    Returns:
        The tokenizer output for the (prefixed) source sentences, with an
        added "labels" entry holding the token ids of the target sentences.
    """
    pairs = examples["translation"]
    source_texts = [prefix + pair[source_lang] for pair in pairs]
    target_texts = [pair[target_lang] for pair in pairs]

    encoded = tokenizer(source_texts, max_length=max_input_length, truncation=True)

    # Target sentences are tokenized inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(target_texts, max_length=max_target_length, truncation=True)

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [None]:
# Apply preprocess_function to every split in batched mode; it returns the tokenized
# inputs plus a "labels" entry for each batch.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-60be4a8afb0af00f/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-66f80348d2bb4450.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-60be4a8afb0af00f/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-30a64e95562d6edc.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-60be4a8afb0af00f/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-121c4e7194a5c4ed.arrow


In [None]:

# Load the pretrained seq2seq model for model_checkpoint; this is the model handed to the trainer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Training configuration.
batch_size = 16
# Checkpoint name without the hub namespace (the part after the last "/").
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",  # evaluate at the end of each epoch
    save_total_limit=3,
    predict_with_generate=True,  # evaluation predictions come from generate(), as compute_metrics decodes them
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Batch collator for the trainer, built from the tokenizer and the model.
# NOTE(review): presumably pads each batch dynamically and pads labels with -100
# (compute_metrics replaces -100 before decoding) — confirm against the library docs.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped predictions, and a
        list in which each stripped reference is wrapped in its own
        one-element list.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_refs = []
    for text in labels:
        wrapped_refs.append([text.strip()])

    return stripped_preds, wrapped_refs
def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: a ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        dict with ``"bleu"`` (the metric's "score") and ``"gen_len"`` (mean
        number of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    generated_ids, label_ids = eval_preds
    if isinstance(generated_ids, tuple):
        generated_ids = generated_ids[0]

    pad_id = tokenizer.pad_token_id
    pred_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # -100 cannot be decoded, so swap it for the pad token id before decoding.
    label_ids = np.where(label_ids != -100, label_ids, pad_id)
    ref_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Light clean-up: strip whitespace and wrap each reference in a list.
    pred_texts, ref_texts = postprocess_text(pred_texts, ref_texts)

    bleu_score = metric.compute(predictions=pred_texts, references=ref_texts)["score"]
    mean_gen_len = np.mean([np.count_nonzero(row != pad_id) for row in generated_ids])

    return {"bleu": round(bleu_score, 4), "gen_len": round(mean_gen_len, 4)}

In [None]:
# Wire the model, training arguments, tokenized splits and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Set the transformers log level to INFO so the training run reports progress and checkpoint saves.
transformers.logging.set_verbosity_info()

In [None]:
# Run fine-tuning; the value returned by train() is displayed as the cell result.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation.
***** Running training *****
  Num examples = 378778
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 23674


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.2395,1.094333,38.772,15.6753


Saving model checkpoint to opus-mt-ko-en-finetuned-ko-to-en/checkpoint-500
Configuration saved in opus-mt-ko-en-finetuned-ko-to-en/checkpoint-500/config.json
Model weights saved in opus-mt-ko-en-finetuned-ko-to-en/checkpoint-500/pytorch_model.bin
tokenizer config file saved in opus-mt-ko-en-finetuned-ko-to-en/checkpoint-500/tokenizer_config.json
Special tokens file saved in opus-mt-ko-en-finetuned-ko-to-en/checkpoint-500/special_tokens_map.json
Saving model checkpoint to opus-mt-ko-en-finetuned-ko-to-en/checkpoint-1000
Configuration saved in opus-mt-ko-en-finetuned-ko-to-en/checkpoint-1000/config.json
Model weights saved in opus-mt-ko-en-finetuned-ko-to-en/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in opus-mt-ko-en-finetuned-ko-to-en/checkpoint-1000/tokenizer_config.json
Special tokens file saved in opus-mt-ko-en-finetuned-ko-to-en/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to opus-mt-ko-en-finetuned-ko-to-en/checkpoint-1500
Configuration saved i

TrainOutput(global_step=23674, training_loss=1.3147646559244517, metrics={'train_runtime': 3024.4674, 'train_samples_per_second': 125.238, 'train_steps_per_second': 7.827, 'total_flos': 2852578658746368.0, 'train_loss': 1.3147646559244517, 'epoch': 1.0})

In [None]:
# Check the fine-tuned model.
# The last checkpoint saved by the training run is checkpoint-23500.
from transformers import MarianMTModel, MarianTokenizer
# NOTE(review): this English sample is reassigned to a Korean sentence in a later cell
# before it is ever used.
src_text = ['My name is Sarah and I live in London']

# Path of the fine-tuned checkpoint directory, relative to the working directory.
trained_model_name = 'opus-mt-ko-en-finetuned-ko-to-en/checkpoint-23500'

In [None]:
# Reload the tokenizer from the fine-tuned checkpoint directory (this rebinds `tokenizer`).
tokenizer = MarianTokenizer.from_pretrained(trained_model_name)
# Print the tokenizer's supported_language_codes attribute.
print(tokenizer.supported_language_codes)

Didn't find file opus-mt-ko-en-finetuned-ko-to-en/checkpoint-23500/added_tokens.json. We won't load it.
Didn't find file opus-mt-ko-en-finetuned-ko-to-en/checkpoint-23500/tokenizer.json. We won't load it.
loading file opus-mt-ko-en-finetuned-ko-to-en/checkpoint-23500/source.spm
loading file opus-mt-ko-en-finetuned-ko-to-en/checkpoint-23500/target.spm
loading file opus-mt-ko-en-finetuned-ko-to-en/checkpoint-23500/vocab.json
loading file opus-mt-ko-en-finetuned-ko-to-en/checkpoint-23500/tokenizer_config.json
loading file None
loading file opus-mt-ko-en-finetuned-ko-to-en/checkpoint-23500/special_tokens_map.json
loading file None


[]


In [None]:
# Sanity-check the fine-tuned model on one Korean sentence.
src_text = ['이 회사의 가치는 향후 20년 동안 기하적으로 급상승할 것이라고 전망이 됩니다.']
# Load the weights from the fine-tuned checkpoint directory. `model_name` must not be
# used here: it is only the base name derived from model_checkpoint ("opus-mt-ko-en"),
# which is neither the checkpoint path nor a full hub id, so the cell would fail (or
# load the wrong model) on a fresh kernel.
model = MarianMTModel.from_pretrained(trained_model_name)
# Tokenize to padded PyTorch tensors and generate the translation token ids.
translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
# Decode each generated sequence; the resulting list is displayed as the cell output.
[tokenizer.decode(t, skip_special_tokens=True) for t in translated]

loading configuration file opus-mt-ko-en-finetuned-ko-to-en/checkpoint-23500/config.json
Model config MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-ko-en",
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      65000
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 65000,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "extra_pos_embeddings": 65001,
  "forced_eos_token_id": 0,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "labe

["The company's value is expected to rise exponentially over the next 20 years."]

In [None]:
import shutil

# Copy every file collected in in_dir to the output folder under /content/drive.
in_dir = r'/content/copy_runs'
out_dir = r'/content/drive/MyDrive/finetune_model_ko_en/final/runs'
# shutil.copy does not create missing parent directories, so make sure the
# destination folder exists before copying into it.
os.makedirs(out_dir, exist_ok=True)
for f in os.listdir(in_dir):
    src_path = os.path.join(in_dir, f)
    if not os.path.isfile(src_path):
        # shutil.copy only handles regular files; report other entries instead of
        # aborting the whole copy part-way through.
        print(f"Skipping non-file entry: {src_path}")
        continue
    new_file_dir = os.path.join(out_dir, f)
    shutil.copy(src_path, new_file_dir)
# Earlier manual step, left commented out, that copied one event file from the
# trainer's runs/ folder into in_dir:
#shutil.copy("//content/opus-mt-ko-en-finetuned-ko-to-en/runs/Jan13_17-03-35_7eabdf4415bb/events.out.tfevents.1642093454.7eabdf4415bb.73.0", "/content/copy_runs/events.out.tfevents.1642093454.7eabdf4415bb.73.0")