In [3]:
# Switch the working directory to the parent folder; presumably so the relative
# `data/...` paths and the `neo_stif` package used below resolve from there.
# NOTE(review): `%cd ..` is not idempotent - re-running this cell moves up one
# more directory each time, so run it exactly once per kernel.
%cd ..
# Load the autoreload extension and reload imported modules before each cell
# runs (mode 2), so edits to project modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

/home/haryoaw/documents/courses/nlp802/project/texteditalay


In [4]:
import fire
from transformers import AutoTokenizer, BertForTokenClassification, BertConfig, BertForMaskedLM
from neo_stif.components.utils import create_label_map
import pandas as pd
from neo_stif.components.train_data_preparation import prepare_data_tagging_and_pointer
import datasets
from neo_stif.lit import LitPointer, LitTaggerOrInsertion
from torch.utils.data import DataLoader
from neo_stif.components.collator import FelixCollator, FelixInsertionCollator
from lightning import Trainer
from lightning.pytorch.callbacks import RichProgressBar, ModelCheckpoint, EarlyStopping
from neo_stif.components.utils import compute_class_weights
from datasets import load_from_disk


# --- Label-map settings (both are passed to `create_label_map` below) ---
MAX_MASK: int = 30
USE_POINTING: bool = True


# Short alias -> Hugging Face checkpoint identifier.
model_dict = dict(koto="indolem/indobert-base-uncased")


# --- Learning rates, one per component ---
LR_TAGGER: float = 5e-5  # tagger starts from pre-trained weights
LR_POINTER: float = 1e-5  # pointer has no pre-trained weights
LR_INSERTION: float = 2e-5  # insertion model starts from pre-trained weights

# Validation-check interval; presumably meant for the Trainer's
# `val_check_interval` argument.
VAL_CHECK_INTERVAL: int = 20

In [5]:
# Resolve the checkpoint name once and reuse it for both the tokenizer here and
# the models built in later cells, so the two can never drift apart.
model_path_or_name = model_dict["koto"]
tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)

# Initial label map built from the masking / pointing settings.
label_dict = create_label_map(MAX_MASK, USE_POINTING)

# Load the training CSV and wrap it in a Hugging Face `Dataset`.
df_train = pd.read_csv("data/stif_indo/train_with_pointing.csv")
data_train = datasets.Dataset.from_pandas(df_train)

# The helper returns both the processed dataset and a label map; the returned
# map replaces the initial one and is what later cells use for sizing.
data_train, label_dict = prepare_data_tagging_and_pointer(
    data_train, tokenizer, label_dict
)

Map: 100%|██████████| 1922/1922 [00:00<00:00, 10459.69 examples/s]
Map: 100%|██████████| 1922/1922 [00:01<00:00, 1918.60 examples/s]


In [22]:
# Config for the pointer network: a small BERT (2 layers, hidden size 100,
# single attention head) whose vocabulary is the label set plus one extra id
# that serves as the padding token.
pointer_network_config = BertConfig(
    vocab_size=len(label_dict) + 1,  # + 1 as the pad token
    pad_token_id=len(label_dict),
    hidden_size=100,
    num_hidden_layers=2,
    num_attention_heads=1,
)

# Token-classification BERT loaded from the pre-trained checkpoint, with one
# output class per entry in the label map.
pre_trained_bert = BertForTokenClassification.from_pretrained(
    model_path_or_name,
    num_labels=len(label_dict),
)

# Lightning wrapper for the tagger, with the pointer component enabled
# according to USE_POINTING.
lit_tagger = LitTaggerOrInsertion(
    pre_trained_bert,
    lr=LR_TAGGER,
    num_classes=len(label_dict),
    class_weight=None,
    tokenizer=tokenizer,
    label_dict=label_dict,
    pointer_config=pointer_network_config,
    use_pointer=USE_POINTING,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
from torch.utils.data import DataLoader

In [24]:
# Tiny (batch of 2), unshuffled loader over the tagging data for a manual
# smoke test; the collator receives `len(label_dict)` as `pad_label_as_input`,
# the same id the pointer config uses as its pad token.
dl = DataLoader(
    data_train,
    batch_size=2,
    collate_fn=FelixCollator(tokenizer, pad_label_as_input=len(label_dict)),
)

In [25]:
# Pull the first collated batch from the loader to feed the model by hand.
batch = next(iter(dl))

In [26]:
# Keep only the entries of the batch that are fed to the model as inputs.
input_to_model = {
    name: tensor
    for name, tensor in batch.items()
    if name in ("input_ids", "attention_mask", "token_type_ids")
}

# Run the tagger and request hidden states so the final layer's
# representation can be handed to the pointer step.
out_tagger = lit_tagger(output_hidden_states=True, **input_to_model)
logits = out_tagger.logits
last_hidden_state = out_tagger.hidden_states[-1]


In [27]:
# Replace the model's `input_ids` with the batch's `tag_labels_input` entry
# (presumably the tag-label ids the pointer network embeds - confirm against
# FelixCollator). `pop` removes the key from `batch`, so the swap is guarded:
# re-running this cell is a no-op instead of raising KeyError.
if "tag_labels_input" in batch:
    input_to_model["input_ids"] = batch.pop("tag_labels_input")

In [32]:
# Inspect which entries remain in the batch after `tag_labels_input` was popped.
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'token_type_ids', 'labels', 'tag_labels', 'point_labels'])

In [33]:
# Run the pointer step on the swapped inputs, conditioning on the tagger's
# final hidden states and supervising with the batch's pointing labels.
# The displayed result is a (loss, tensor) tuple.
lit_tagger.forward_pointer(
    **input_to_model,
    previous_last_hidden=last_hidden_state,
    labels=batch["point_labels"],
)

tensor([[[ 0.8405,  0.6828,  1.5589,  ...,  0.1676,  0.1404,  0.1342],
         [ 0.0942,  0.6466,  0.9292,  ..., -0.1699,  0.4575, -0.0187],
         [ 0.1260,  0.1223,  0.5908,  ..., -0.1454,  0.1146,  0.1259],
         ...,
         [ 1.4951,  0.7587,  0.4712,  ...,  0.1887,  0.5146, -0.1423],
         [-0.1156,  0.2500,  0.4943,  ...,  0.2851,  0.2963, -0.0067],
         [ 0.5965,  0.7373,  1.0791,  ...,  0.0721,  0.5133, -0.1444]],

        [[ 0.6767,  0.5220,  0.9397,  ...,  0.4351, -0.1700, -0.0956],
         [ 0.1796,  0.7682,  0.5418,  ...,  0.0973, -0.1072, -0.1483],
         [-0.1553,  0.7158,  0.7125,  ..., -0.0647, -0.1159, -0.1457],
         ...,
         [-0.1429,  0.8854,  0.6558,  ..., -0.1628,  0.0914, -0.1264],
         [-0.1189,  0.0296,  1.5637,  ...,  0.1906, -0.1379, -0.0803],
         [ 0.0476,  0.4853,  1.5372,  ..., -0.0752, -0.0934, -0.1675]]],
       grad_fn=<GeluBackward0>)
tensor([[[ 2.5538,  1.0884,  4.3728,  ...,  0.3323, -0.0632,  0.2884],
         [ 0.

(tensor(3.3687, grad_fn=<NllLossBackward0>),
 tensor([[[[0.0000, 0.0415, 0.0338,  ..., 0.0377, 0.0419, 0.0381],
           [0.0384, 0.0408, 0.0335,  ..., 0.0424, 0.0431, 0.0386],
           [0.0412, 0.0431, 0.0391,  ..., 0.0405, 0.0405, 0.0392],
           ...,
           [0.0379, 0.0448, 0.0403,  ..., 0.0354, 0.0372, 0.0343],
           [0.0361, 0.0339, 0.0348,  ..., 0.0410, 0.0395, 0.0386],
           [0.0397, 0.0439, 0.0350,  ..., 0.0382, 0.0409, 0.0389]]],
 
 
         [[[0.0379, 0.0500, 0.0410,  ..., 0.0426, 0.0414, 0.0000],
           [0.0399, 0.0450, 0.0368,  ..., 0.0445, 0.0421, 0.0000],
           [0.0000, 0.0441, 0.0437,  ..., 0.0396, 0.0402, 0.0000],
           ...,
           [0.0394, 0.0451, 0.0443,  ..., 0.0459, 0.0397, 0.0000],
           [0.0397, 0.0477, 0.0442,  ..., 0.0000, 0.0413, 0.0000],
           [0.0400, 0.0000, 0.0485,  ..., 0.0400, 0.0420, 0.0000]]]],
        grad_fn=<MulBackward0>))

In [7]:
# Locations of the pre-processed insertion datasets (saved with `save_to_disk`).
processed_train_data = "data/stif_indo/train_insertion"
processed_dev_data = "data/stif_indo/dev_insertion"
batch_size = 2
device = "cpu"

# Trainer callbacks: rich progress bar plus early stopping on "val_loss".
# NOTE(review): assumes the LightningModule logs a metric named "val_loss".
rich_cb = RichProgressBar()
ea_stop = EarlyStopping(patience=5, monitor="val_loss", mode="min")

train_data = load_from_disk(processed_train_data)
dev_data = load_from_disk(processed_dev_data)

# Training batches are shuffled; validation batches are not, so every
# validation pass sees the same batches in the same order and the monitored
# metric is comparable between checks.
train_dl = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=FelixInsertionCollator(tokenizer),
)
dev_dl = DataLoader(
    dev_data,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=FelixInsertionCollator(tokenizer),
)

# Insertion model: masked-LM BERT from the same checkpoint, wrapped in the
# shared Lightning module in insertion mode.
model = BertForMaskedLM.from_pretrained(model_path_or_name)
lit_insert = LitTaggerOrInsertion(
    model,
    lr=LR_INSERTION,
    num_classes=model.config.vocab_size,
    class_weight=None,
    tokenizer=tokenizer,
    label_dict=label_dict,
    is_insertion=True,
)

# `fast_dev_run=True` makes this a one-batch smoke test with logging and
# checkpointing suppressed; set it to False for a real training run.
# The validation interval comes from the shared constant instead of a
# duplicated literal.
trainer = Trainer(
    accelerator=device,
    devices=1,
    val_check_interval=VAL_CHECK_INTERVAL,
    check_val_every_n_epoch=None,
    callbacks=[rich_cb, ea_stop],
    fast_dev_run=True,
)


Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.


In [8]:
# Train the insertion model, validating on the dev loader.
trainer.fit(
    model=lit_insert,
    train_dataloaders=train_dl,
    val_dataloaders=dev_dl,
)

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


`Trainer.fit` stopped: `max_steps=1` reached.


: 