In [1]:
%cd ..
%load_ext autoreload
%autoreload 2


/home/haryoaw/documents/courses/nlp802/project/texteditalay


In [2]:
import fire
from transformers import AutoTokenizer, BertForTokenClassification, BertConfig, BertForMaskedLM
from neo_stif.components.utils import create_label_map
import pandas as pd
from neo_stif.components.train_data_preparation import prepare_data_tagging_and_pointer
import datasets
from neo_stif.lit import LitTaggerOrInsertion
from torch.utils.data import DataLoader
from neo_stif.components.collator import FelixCollator, FelixInsertionCollator
from lightning import Trainer
from lightning.pytorch.callbacks import RichProgressBar, ModelCheckpoint, EarlyStopping
from neo_stif.components.utils import compute_class_weights
from datasets import load_from_disk


MAX_MASK = 30
USE_POINTING = True


model_dict = {"koto": "indolem/indobert-base-uncased"}


LR_TAGGER = 5e-5 # due to the pre-trained nature
LR_POINTER = 1e-5 # no pre-trained
LR_INSERTION = 2e-5 # due to the pre-trained nature
VAL_CHECK_INTERVAL = 20

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model_path_or_name = model_dict["koto"]

In [4]:
tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
label_dict = create_label_map(MAX_MASK, USE_POINTING)

# Callback for trainer

df_train = pd.read_csv("data/stif_indo/test_with_pointing.csv")
data_train = datasets.Dataset.from_pandas(df_train)
data_train, label_dict = prepare_data_tagging_and_pointer(
    data_train, tokenizer, label_dict
)
model_path_or_name = model_dict["koto"]

Map: 100%|██████████| 363/363 [00:00<00:00, 4748.33 examples/s]
Map: 100%|██████████| 363/363 [00:00<00:00, 1401.45 examples/s]


In [5]:
pre_trained_bert = BertForTokenClassification.from_pretrained(
        model_path_or_name, num_labels=len(label_dict)
    )



Some weights of BertForTokenClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import torch

In [8]:
pointer_network_config = BertConfig(
    vocab_size=len(label_dict) + 1,
    num_hidden_layers=2,
    hidden_size=64,
    num_attention_heads=1,
    pad_token_id=len(label_dict),
)  # + 1 as the pad token

lit_tagger = LitTaggerOrInsertion.load_from_checkpoint(
    "/mnt/d/Documents/temp/last-v1.ckpt",
    model=pre_trained_bert,
    lr=2e-5,
    num_classes=len(label_dict),
    class_weight=None,
    tokenizer=tokenizer,
    label_dict=label_dict,
    use_pointer=USE_POINTING,
    pointer_config=pointer_network_config,
    map_location=torch.device("cpu"),
)

In [9]:
lit_tagger.device

device(type='cpu')

In [10]:
lit_tagger = lit_tagger.eval()
lit_tagger.freeze()


In [11]:
tokenizer_vocab_reverse = {v: k for k, v in tokenizer.vocab.items()}
label_dict

# reverese the label dict
label_dict_reverse = {v: k for k, v in label_dict.items()}

In [12]:
from pprint import pprint

In [13]:
data_0 = data_train[12]

In [14]:
import torch
with torch.no_grad():
    inp_to_model = tokenizer(data_0['informal'], return_tensors="pt").to('cpu')
    out_logits = lit_tagger.forward(**inp_to_model, output_hidden_states=True)
    decoded_seq = [tokenizer_vocab_reverse[x.item()] for x in inp_to_model['input_ids'][0]]
    decoded_label = [label_dict_reverse[x.item()] for x in out_logits.logits.argmax(-1)[0]]
    inp_tag = torch.LongTensor([data_0['tag_labels']])
    _, out_att = lit_tagger.forward_pointer(
        input_ids=inp_tag,
        attention_mask=inp_to_model["attention_mask"],
        token_type_ids=inp_to_model["token_type_ids"],
        previous_last_hidden=out_logits.hidden_states[-1],
    )
    att_output = out_att.argmax(-1)
    pprint(list(zip(list(range(len(decoded_seq))), decoded_seq, decoded_label, att_output[0][0].numpy(), data_0['point_labels'])))

[(0, '[CLS]', 'KEEP', 1, 1),
 (1, 'hal', 'KEEP', 2, 2),
 (2, 'apa', 'KEEP', 3, 3),
 (3, 'yang', 'KEEP', 4, 4),
 (4, 'lebih', 'KEEP|1', 7, 7),
 (5, 'cep', 'DELETE', 0, 0),
 (6, '##et', 'DELETE', 0, 0),
 (7, 'dari', 'KEEP', 8, 8),
 (8, 'gund', 'KEEP', 9, 9),
 (9, '##ala', 'KEEP', 10, 10),
 (10, '?', 'KEEP', 11, 11),
 (11, 'ketika', 'KEEP', 13, 13),
 (12, 'driver', 'KEEP', 0, 0),
 (13, 'dan', 'KEEP', 15, 15),
 (14, 'cs', 'KEEP', 0, 0),
 (15, 'sama', 'KEEP', 16, 16),
 (16, '-', 'KEEP', 17, 17),
 (17, 'sama', 'KEEP|4', 21, 21),
 (18, 'nge', 'DELETE', 0, 0),
 (19, '##cha', 'DELETE', 0, 0),
 (20, '##t', 'DELETE', 0, 0),
 (21, '"', 'DELETE', 24, 24),
 (22, 'oke', 'DELETE', 0, 0),
 (23, '"', 'DELETE', 0, 0),
 (24, '[SEP]', 'KEEP', 0, 0)]


In [15]:
tagger_logit, pointer_att = out_logits.logits.numpy(), out_att

In [16]:
import numpy as np

In [17]:
tagger_logit.shape

(1, 25, 34)

In [18]:
np.argmax(tagger_logit, axis=1)

array([[24, 24, 10,  5,  4, 17, 21, 17, 14, 17, 17,  5, 24,  5,  9, 24,
         5,  5, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
        24, 24]])

In [19]:
input_word_ids = inp_to_model['input_ids'][0].numpy()

In [20]:
last_token_index = inp_to_model['input_ids'][0].tolist().index(tokenizer.vocab['[SEP]'])

In [21]:
deleted_tags = ["DELETE", "PAD_TAG", "PAD"]

In [22]:
predicted_tags = list(np.argmax(tagger_logit, axis=-1))[0]
non_deleted_indexes = set(
    i
    for i, tag in enumerate(predicted_tags[: last_token_index + 1])
    if label_dict_reverse[int(tag)] not in deleted_tags
)
source_tokens = [
    tokenizer_vocab_reverse[x.item()] for x in inp_to_model["input_ids"][0]
]
sep_indexes = set(
    [
        i
        for i, token in enumerate(source_tokens)
        if token == '[SEP]' and i in non_deleted_indexes
    ]
)

In [23]:
non_deleted_indexes

{0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 24}

In [24]:
sep_indexes

{24}

In [56]:
# coding=utf-8
# Copyright 2023 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utility functions for running inference with a Felix model."""

from typing import Optional, Sequence, Set

import numpy as np
import scipy.special


def get_number_of_masks(label):
  """Convert a tag to the number of MASK tokens it represents."""

  if '|' not in label:
    return 0
  return int(label.split('|')[1])


def _normalize_logits(logits):
  numerator = logits
  denominator = scipy.special.logsumexp(logits)
  return numerator - denominator


def beam_search_single_tagging(
    predicted_points_logits,
    good_indexes,
    sep_indexes,
    beam_size,
    end_index = 128,
    max_length = 128):
  """Returns the most likely (according to a beam search) sequence of indexes.

  Args:
    predicted_points_logits: Matrix of logits (timesteps x timesteps). Each
      timestep has logits for every other timestep.
    good_indexes: A restricted set of indexes which the beam must use. As such
      the problem becomes find the most likely permutation of these indexes.
    sep_indexes: A set of indexes for the [SEP] token. This ensure the last
      token is a [SEP].
    beam_size: The size of the beam.
    end_index: The index of the last token (excluding padding)
    max_length: The maximum length of the generation.

  Returns:
    The most likely sequence of indexes.
  """
  # -1 is useful for np.argpartition which splits on smallest.
  predicted_points = -1 * _normalize_logits(predicted_points_logits)
  sequences = [[0]]
  scores = [0]
  finished_sequences = []
  finished_scores = []
  for _ in range(max_length):
    assert len(sequences) == len(scores)
    candidate_scores = []
    candidate_sequences_reconstructor = []
    for j, (sequence, score) in enumerate(zip(sequences, scores)):
      sequence_set = set(sequence)
      next_scores = predicted_points[sequence[-1]]
      for index in range(end_index + 1):
        # Can't predict the same index twice.
        if index in sequence_set:
          continue
        # You must produce a good index.
        if index not in good_indexes:
          continue
        # The last token must be a [SEP].
        if len(sequence) == len(good_indexes) - 1:
          if index not in sep_indexes:
            continue
        # If there is only one SEP don't predict it till the end.
        elif index in sep_indexes and len(sep_indexes) == 1:
          continue

        candidate_scores.append(score + next_scores[index])
        # Don't construct a sequence for every candidate as this is expensive.
        # Instead store a way to reconstruct the sequence.
        candidate_sequences_reconstructor.append((j, index))

    if not candidate_scores:
      break

    if beam_size < 1:
      break
    if beam_size >= len(candidate_scores):
      top_n_indexes = list(range(len(candidate_scores)))
    else:
      # Get the N most likely sequences. (A full sort is not needed).
      top_n_indexes = np.argpartition(candidate_scores, beam_size)[:beam_size]

    new_sequences = []
    new_scores = []

    for top_n_index in top_n_indexes:
      sequence_index, token_index = candidate_sequences_reconstructor[
          top_n_index]
      # Reconstruct the sequence.
      new_sequence = sequences[sequence_index] + [token_index]
      new_score = candidate_scores[top_n_index]

      # For every completed beam we reduce the beamsize by 1.
      if len(new_sequence) == len(good_indexes):
        finished_sequences.append(new_sequence)
        finished_scores.append(-1 * new_score / len(new_sequence))
        beam_size -= 1
      else:
        new_sequences.append(new_sequence)
        new_scores.append(new_score)

    sequences = new_sequences
    scores = new_scores
    if beam_size < 1:
      break
  if not finished_sequences:
    return None

  return finished_sequences[np.argmax(finished_scores)]

def realize_beam_search(
    source_token_ids,
    ordered_source_indexes,
    tags,
    source_length,
    inverse_label_map,
    tokenizer,
):
    """Returns realized prediction using indexes and tags.

    TODO: Refactor this function to share code with
    `_create_masked_source` from insertion_converter.py to reduce code
    duplication and to ensure that the insertion example creation is consistent
    between preprocessing and prediction.

    Args:
      source_token_ids: List of source token ids.
      ordered_source_indexes: The order in which the kept tokens should be
        realized.
      tags: a List of tags.
      source_length: How long is the source input (excluding padding).

    Returns:
      Realized predictions (with deleted tokens).
    """
    source_token_ids_set = set(ordered_source_indexes)
    out_tokens = []
    out_tokens_with_deletes = []
    for j, index in enumerate(ordered_source_indexes):
        token = tokenizer.convert_ids_to_tokens([source_token_ids[index]])
        out_tokens += token
        tag = inverse_label_map[tags[index]]
        out_tokens_with_deletes += token
        # Add the predicted MASK tokens.
        number_of_masks = get_number_of_masks(tag)
        # Can not add phrases after last token.
        if j == len(ordered_source_indexes) - 1:
            number_of_masks = 0
        masks = ["[MASK]"] * number_of_masks
        out_tokens += masks
        out_tokens_with_deletes += masks

        # Find the deleted tokens, which appear after the current token.
        deleted_tokens = []
        for i in range(index + 1, source_length):
            if i in source_token_ids_set:
                break
            deleted_tokens.append(source_token_ids[i])
        # Bracket the deleted tokens, between unused0 and unused1.
        if deleted_tokens:
            deleted_tokens = (
                ["[UNK]"]
                + list(
                    tokenizer.convert_ids_to_tokens(deleted_tokens)
                )
                + ["[PAD]"]
            )
            out_tokens_with_deletes += deleted_tokens
    return out_tokens_with_deletes

In [26]:
pointer_np = pointer_att[0][0].numpy()

In [27]:
best_sequence = beam_search_single_tagging(
    list(pointer_np),
    non_deleted_indexes,
    sep_indexes,
    4,
    last_token_index,
    20,
)

In [28]:
tokenizer.convert_ids_to_tokens([1,2,3,4,5])

['[UNK]', '[MASK]', '[CLS]', '[SEP]', '!']

In [29]:
best_sequence

[0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 13, 15, 16, 17, 14, 12, 24]

In [38]:
realized_inp_insertion = realize_beam_search(input_word_ids, best_sequence, predicted_tags, last_token_index+1, label_dict_reverse, tokenizer)

In [33]:
pre_trained_another_bert = BertForMaskedLM.from_pretrained(
    model_path_or_name
)



Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [35]:
lit_insert = LitTaggerOrInsertion.load_from_checkpoint(
    "/mnt/d/Documents/temp/epoch=9-val_loss=2.21-f1_val_step=0.00.ckpt",
    model=pre_trained_another_bert,
    lr=LR_INSERTION,
    num_classes=pre_trained_another_bert.config.vocab_size,
    class_weight=None,
    tokenizer=tokenizer,
    label_dict=label_dict,
    is_insertion=True,
    map_location=torch.device("cpu"),
)

In [36]:
lit_insert = lit_insert.eval()
lit_insert.freeze()


In [46]:
input_ids = tokenizer.convert_tokens_to_ids(realized_inp_insertion)
attention_mask = [1] * len(input_ids)
token_type_ids = [0] * len(input_ids)

# make them to torch
input_ids = torch.LongTensor([input_ids])
attention_mask = torch.LongTensor([attention_mask])
token_type_ids = torch.LongTensor([token_type_ids])

In [47]:
with torch.no_grad():
    out = lit_insert.forward(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )
input_ids_detokenized = tokenizer.convert_ids_to_tokens(input_ids[0].numpy())
out_ids_detokenized =  tokenizer.convert_ids_to_tokens(out.logits.argmax(-1)[0].numpy())

list(zip(input_ids_detokenized, out_ids_detokenized))

[('[CLS]', '.'),
 ('hal', 'hal'),
 ('apa', 'apa'),
 ('yang', 'yang'),
 ('lebih', 'lebih'),
 ('[MASK]', 'cepat'),
 ('[UNK]', 'dan'),
 ('cep', 'cepat'),
 ('##et', 'cepat'),
 ('[PAD]', 'cepat'),
 ('dari', 'dari'),
 ('gund', 'gund'),
 ('##ala', '##ala'),
 ('?', '?'),
 ('ketika', 'ketika'),
 ('dan', 'dan'),
 ('sama', 'sama'),
 ('-', '-'),
 ('sama', 'sama'),
 ('[MASK]', 'ngomong'),
 ('[MASK]', '##pon'),
 ('[MASK]', 'oke'),
 ('[MASK]', 'oke'),
 ('[UNK]', ','),
 ('nge', 'nge'),
 ('##cha', '##cha'),
 ('##t', '##t'),
 ('"', '"'),
 ('oke', 'oke'),
 ('"', '"'),
 ('[PAD]', '.'),
 ('cs', 'cs'),
 ('driver', 'driver'),
 ('[SEP]', '.')]

In [53]:
tokenizer.convert_ids_to_token(out.logits.argmax(-1)[0])

AttributeError: 'BertTokenizerFast' object has no attribute 'convert_ids_to_token'