In [2]:
%cd ..

/home/haryoaw/documents/courses/nlp802/project/texteditalay


In [1]:
from transformers import MBartForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from indobenchmark import IndoNLGTokenizer

# Single source of truth for the pretrained checkpoint so the tokenizer and the
# model are always loaded from the same place.
MODEL_CHECKPOINT = "indobenchmark/indobart-v2"

tokenizer = IndoNLGTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = MBartForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

In [4]:
import pandas as pd
import datasets

In [24]:
# Read the train and dev CSVs in full first; the complete frames stay available
# under df_train / df_val.
df_train = pd.read_csv("data/stif_indo/train_with_pointing.csv")
df_val = pd.read_csv("data/stif_indo/dev_with_pointing.csv")

# Keep only the two text columns consumed by the tokenization step.
df_train_used = df_train.loc[:, ['informal', 'formal']]
df_val_used = df_val.loc[:, ['informal', 'formal']]

In [25]:
# Wrap each pandas frame in a `datasets.Dataset` so it can be processed with `.map`.
df_train_data = datasets.Dataset.from_pandas(df=df_train_used)
df_val_data = datasets.Dataset.from_pandas(df=df_val_used)

In [26]:
def tokenize_function(examples, src='informal', tgt='formal'):
    """Tokenize a batch of source/target texts into model inputs and labels.

    Uses the module-level ``tokenizer`` with truncation enabled for both sides.

    Args:
        examples: Mapping from column name to the batch of values for that
            column (the notebook calls this through ``Dataset.map`` with
            ``batched=True``).
        src: Name of the column holding the source texts.
        tgt: Name of the column holding the target texts.

    Returns:
        A dict with ``input_ids`` and ``attention_mask`` taken from the
        tokenized source texts, and ``labels`` set to the ``input_ids`` of the
        tokenized target texts.
    """
    source_texts, target_texts = examples[src], examples[tgt]

    source_encoding = tokenizer(source_texts, truncation=True)
    # Only the token ids of the target side are needed; its mask is discarded.
    target_ids = tokenizer(target_texts, truncation=True)['input_ids']

    return dict(
        input_ids=source_encoding['input_ids'],
        attention_mask=source_encoding['attention_mask'],
        labels=target_ids,
    )

In [27]:
# Tokenize both splits in batches of 32 examples and drop the raw text columns,
# leaving only the fields returned by tokenize_function.
train_tokenized = df_train_data.map(
    function=tokenize_function,
    batched=True,
    batch_size=32,
    remove_columns=['informal', 'formal'],
)
val_tokenized = df_val_data.map(
    function=tokenize_function,
    batched=True,
    batch_size=32,
    remove_columns=['informal', 'formal'],
)

Map:   0%|          | 0/1922 [00:00<?, ? examples/s]

Map: 100%|██████████| 1922/1922 [00:01<00:00, 1691.63 examples/s]
Map: 100%|██████████| 214/214 [00:00<00:00, 2239.73 examples/s]


In [28]:
# Show the tokenized training split's summary (feature names and row count).
train_tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1922
})

In [10]:
from transformers import DataCollatorForSeq2Seq

In [11]:
# Batch collator for the seq2seq examples; it is given both the tokenizer and
# the model being fine-tuned.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [12]:
from torch.utils.data import DataLoader

# Stand-alone loader over the tokenized training split, batched with the
# seq2seq collator.
# NOTE(review): not referenced by the other visible cells (the Trainer receives
# the dataset directly) — presumably kept for manual batch inspection; confirm.
train_dataloader = DataLoader(
    dataset=train_tokenized,
    batch_size=32,
    collate_fn=collator,
)

In [16]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

In [31]:
# Early-stopping callback with a patience of 5; it is handed to the Trainer
# through its `callbacks` argument.
early_stopping_cb = EarlyStoppingCallback(
    early_stopping_patience=5,
)

In [32]:
# Fine-tuning configuration.
# Evaluation and checkpointing share the same 100-step cadence so that the best
# checkpoint (lowest eval_loss) can be tracked and reloaded at the end.
training_args = TrainingArguments(
    output_dir='outputs/stif-i-f/indobart-v2/',
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    # Upper bound on training length; the early-stopping callback may end the
    # run sooner. `max_steps` is deliberately left unset: a positive value
    # overrides `num_train_epochs`, and the previous `max_steps=1` stopped
    # training after a single step, before the first evaluation/checkpoint at
    # step 100. Pass `max_steps=1` temporarily only for a quick smoke test.
    num_train_epochs=50,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
)

In [34]:
# Assemble the Trainer from the model, training arguments, tokenized splits,
# collator, and early-stopping callback defined in the earlier cells.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collator,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    callbacks=[early_stopping_cb],
)

In [None]:
# Start fine-tuning with the Trainer configured above.
trainer.train()