# INFORMASI
### Di bagian ini, model IndoBERTweet dilatih lanjut dengan Masked Language Modelling (MLM) sebagai teknik Domain-Adaptive Pretraining (DAPT)

In [None]:
import pandas as pd

In [3]:
import torch
from transformers import (
    BertTokenizer, 
    BertForMaskedLM, 
    LineByLineTextDataset, 
    DataCollatorForLanguageModeling, 
    Trainer, 
    TrainingArguments
)

  from .autonotebook import tqdm as notebook_tqdm


## DAPT IndoBERTweet

In [8]:
# Hugging Face checkpoint used as the starting point for this notebook.
MODEL_NAME = 'indolem/indobertweet-base-uncased'

# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer loaded from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Build the training dataset from the plain-text corpus file.
print("Loading dataset...")
dataset_options = {
    "tokenizer": tokenizer,
    "file_path": "./dapt_corpus.txt",
    "block_size": 128,  # MAX_LEN: 128
}
dataset = LineByLineTextDataset(**dataset_options)

Loading dataset...




- `LineByLineTextDataset` digunakan untuk membaca file txt baris per baris dan mentokenisasinya
- MAX_LEN 128 merupakan standar umum pada model BERT untuk analisis sentimen

In [None]:

# Value passed to the collator as its masking probability.
MASKING_RATE = 0.15

# Collator configured for masked language modelling (mlm=True).
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=MASKING_RATE,
    mlm=True,
    tokenizer=tokenizer,
)

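Data Collator: komponen ini secara otomatis memilih 15% token secara acak untuk ditebak oleh model (secara bawaan, 80% di antaranya diganti dengan [MASK], sisanya diganti token acak atau dibiarkan apa adanya).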

In [11]:
# Load the checkpoint as a BertForMaskedLM and move it to the selected device.
model = BertForMaskedLM.from_pretrained(MODEL_NAME).to(device)

Some weights of the model checkpoint at indolem/indobertweet-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Mixed precision is enabled only when a CUDA GPU is present. TrainingArguments
# rejects fp16=True on a CPU-only machine, while the device selection earlier in
# this notebook explicitly allows falling back to the CPU.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./indobert-medsos-dapt",  # output folder for the new model
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # 8 x 2 = 16 samples per optimizer step on each device
    learning_rate=2e-5,
    fp16=use_fp16,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,  # only the loss is needed at this stage, not accuracy
    dataloader_num_workers=0,
)

In [13]:
# Wire the model, the tokenized corpus and the masking collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Run training; the result is the cell's last expression so it is displayed.
train_output = trainer.train()
train_output

Step,Training Loss
500,3.5711
1000,3.2642
1500,3.0801
2000,3.001
2500,2.9163
3000,2.9132
3500,2.8796


TrainOutput(global_step=3540, training_loss=3.0863787721105886, metrics={'train_runtime': 1000.2501, 'train_samples_per_second': 56.605, 'train_steps_per_second': 3.539, 'total_flos': 1865878118321076.0, 'train_loss': 3.0863787721105886, 'epoch': 3.0})

Loss terakhir sekitar 2,88 sudah cukup bagus. Kita tidak mengharapkan hasil DAPT memiliki loss 0,... seperti pada proses fine-tuning model, karena pada MLM model harus menebak 1 kata dari ~18000 kata lainnya. Loss yang terus menurun hingga 2,88 ini sudah cukup baik.

## Simpan model hasil MLM

In [14]:
# Save the model via the trainer and the tokenizer into the same directory.
dapt_output_dir = "./indobertweet-yt-dapt"
trainer.save_model(dapt_output_dir)
tokenizer.save_pretrained(dapt_output_dir)

('./indobertweet-yt-dapt\\tokenizer_config.json',
 './indobertweet-yt-dapt\\special_tokens_map.json',
 './indobertweet-yt-dapt\\vocab.txt',
 './indobertweet-yt-dapt\\added_tokens.json')