## Data Preprocessing

In [2]:
from datasets import load_dataset
from pandas import DataFrame as df

raw_datasets = load_dataset("conll2003")

Reusing dataset conll2003 (/root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
example_entry = raw_datasets["train"][4]

In [5]:
ner_feature = raw_datasets["train"].features["ner_tags"]
feature_names = ner_feature.feature.names

In [6]:
# Pair every token of the example sentence with its NER tag id and tag name.
tokens = example_entry["tokens"]
tag_ids = example_entry["ner_tags"]
# Materialise the names as a list: a bare `map` iterator would be exhausted
# once the DataFrame below consumes it, leaving `tags` empty for any later use.
tags = [feature_names[tag_id] for tag_id in tag_ids]

df(zip(tokens, tag_ids, tags))

Unnamed: 0,0,1,2
0,Germany,5,B-LOC
1,'s,0,O
2,representative,0,O
3,to,0,O
4,the,0,O
5,European,3,B-ORG
6,Union,4,I-ORG
7,'s,0,O
8,veterinary,0,O
9,committee,0,O


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [8]:
tokenizer.is_fast

True

In [9]:
inputs = tokenizer(tokens, is_split_into_words=True, truncation=True)
# inputs = tokenizer(tokens)
# df(inputs.tokens())

In [10]:
# inputs.word_ids()

In [11]:
# df(zip(inputs.tokens(), inputs.word_ids()))

In [12]:
from typing import List, Optional
def align_labels_with_split_tokens(labels_for_word_id: List[int], word_ids: List[Optional[int]]) -> List[int]:
    """
    Expand word-level tag ids to one tag id per tokenizer sub-token.

    Args:
        labels_for_word_id: tag id of each original word, indexed by word id.
        word_ids: for every sub-token, the id of the word it came from, or
            None for tokens that belong to no word (e.g. [CLS] / [SEP]).

    Returns:
        A list of tag ids with the same length as `word_ids`:
        - tokens without a word get -100;
        - the first piece of a word keeps the word's tag id;
        - later pieces of the same word keep the tag id as well, except that
          an odd tag id (a "B-" tag) is bumped by one to the matching "I-" tag.

    For example, if LAMB (B-ORG) is split into LA and ##MB,
    LA is labelled B-ORG and ##MB is labelled I-ORG.
    """
    output = []
    prev_word = None
    for word_id in word_ids:
        if word_id is None:
            # [CLS] or [SEP]
            new_tag_id = -100
        else:
            original_tag_id = labels_for_word_id[word_id]
            # Same word id as the previous token => non-leading piece of a split word.
            is_continuation = word_id == prev_word
            if is_continuation and original_tag_id % 2 == 1:
                # Flip "B-" (odd tag id) into the "I-" tag that follows it.
                new_tag_id = original_tag_id + 1
            else:
                new_tag_id = original_tag_id

        output.append(new_tag_id)
        prev_word = word_id

    return output


In [13]:
new_feature_names = dict(enumerate(feature_names))
new_feature_names[-100] = "SPECIAL"

In [14]:
# Re-align the example sentence's tag ids to the tokenizer's sub-tokens and
# show token / word id / tag id / tag name side by side.
new_labels = align_labels_with_split_tokens(tag_ids, inputs.word_ids())

# A list (not a one-shot `map` iterator) so `new_tags` stays usable after the
# DataFrame below has consumed it.
new_tags = [new_feature_names[tag_id] for tag_id in new_labels]

df(zip(inputs.tokens(), inputs.word_ids(), new_labels, new_tags))

Unnamed: 0,0,1,2,3
0,[CLS],,-100,SPECIAL
1,Germany,0.0,5,B-LOC
2,',1.0,0,O
3,s,1.0,0,O
4,representative,2.0,0,O
5,to,3.0,0,O
6,the,4.0,0,O
7,European,5.0,3,B-ORG
8,Union,6.0,4,I-ORG
9,',7.0,0,O


### Apply to the entire dataset

In [15]:
# Peek at "tokens".
df(raw_datasets["train"]["tokens"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,103,104,105,106,107,108,109,110,111,112
0,EU,rejects,German,call,to,boycott,British,lamb,.,,...,,,,,,,,,,
1,Peter,Blackburn,,,,,,,,,...,,,,,,,,,,
2,BRUSSELS,1996-08-22,,,,,,,,,...,,,,,,,,,,
3,The,European,Commission,said,on,Thursday,it,disagreed,with,German,...,,,,,,,,,,
4,Germany,'s,representative,to,the,European,Union,'s,veterinary,committee,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14036,on,Friday,:,,,,,,,,...,,,,,,,,,,
14037,Division,two,,,,,,,,,...,,,,,,,,,,
14038,Plymouth,2,Preston,1,,,,,,,...,,,,,,,,,,
14039,Division,three,,,,,,,,,...,,,,,,,,,,


In [16]:
def tokenize_and_align(examples):
    """
    Tokenize a batch of pre-split sentences and attach sub-token labels.

    `examples` is indexed with "tokens" (one list of words per sentence) and
    "ner_tags" (one list of tag ids per sentence). Every sentence's tag ids
    are re-aligned to the tokenizer output with
    `align_labels_with_split_tokens` and stored under "labels" on the
    tokenizer result, which is then returned.
    """
    sentences = examples["tokens"]
    tag_ids_per_sentence = examples["ner_tags"]

    # truncation=True truncates to the maximum possible length of the model.
    encoded = tokenizer(
        sentences,
        truncation=True,
        is_split_into_words=True,
    )

    # encoded.word_ids(row) gives the word id of every sub-token of sentence `row`.
    encoded["labels"] = [
        align_labels_with_split_tokens(sentence_tag_ids, encoded.word_ids(row))
        for row, sentence_tag_ids in enumerate(tag_ids_per_sentence)
    ]
    return encoded

In [17]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align,
    batched=True,
    remove_columns=raw_datasets["train"].column_names
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6/cache-3b2f5f9281210785.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6/cache-1a6416a06fcc9d44.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6/cache-712ad618bec2be3e.arrow


## Data Collation

In [18]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
batch = data_collator([tokenized_datasets["train"][index] for index in range(2)])

for key, value in batch.items():
    print(key)
    print(value)
    print()

attention_mask
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])

input_ids
tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
           119,   102],
        [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
             0,     0]])

labels
tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

token_type_ids
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])



In [19]:
from datasets import load_metric
metric = load_metric("seqeval")

In [20]:
# Toy evaluation pair built from the first training sentence: the references
# are its gold tag ids, the "predictions" are a copy in which positions 2 and 6
# are overwritten with tag id 0.
labels = raw_datasets["train"][0]["ner_tags"]
predictions = labels.copy()
predictions[2] = 0
predictions[6] = 0

# Translate both id sequences into tag names.
labels = [feature_names[tag_id] for tag_id in labels]
predictions = [feature_names[tag_id] for tag_id in predictions]

In [21]:
df(zip(labels, predictions))

Unnamed: 0,0,1
0,B-ORG,B-ORG
1,O,O
2,B-MISC,O
3,O,O
4,O,O
5,O,O
6,B-MISC,O
7,O,O
8,O,O


In [22]:
metric.compute(predictions=[predictions], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))


{'MISC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.3333333333333333,
 'overall_f1': 0.5,
 'overall_accuracy': 0.7777777777777778}

In [23]:
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [24]:
# Integer tag id -> tag name, and the inverse. Keys are kept as ints (not
# str(index)) so that the ids found in the dataset and in model predictions can
# be looked up directly, and so label2id maps names back to ints rather than
# to strings such as "0".
id2label = dict(enumerate(label_names))
label2id = {label: index for index, label in id2label.items()}

## Model

In [35]:
import torch
from torch import nn
from torchcrf import CRF
from transformers import BertModel, BertConfig

class CustomModel(nn.Module):
    """
    BERT encoder -> bidirectional LSTM -> dropout -> linear projection -> CRF.

    Both `forward` and `predict` take the collated batch as keyword arguments.
    "attention_mask" is required; "labels" is required by `forward` only.
    Every key except "labels" is passed on to the BERT encoder.

    Args:
        bert_config: configuration used to build the BERT encoder; its
            `hidden_size` and `num_labels` size the LSTM input and the
            tag space.
        rnn_dim: hidden size of each LSTM direction (the projection layer
            therefore receives `2 * rnn_dim` features).

    NOTE(review): `BertModel(bert_config)` builds the architecture from the
    config only; it does not load pretrained weights. If fine-tuning a
    pretrained checkpoint is intended, the encoder has to be created with
    `BertModel.from_pretrained(...)` instead - confirm the intent.
    """

    # Label id the data collator uses for special tokens and padding.
    IGNORE_INDEX = -100

    def __init__(self, bert_config, rnn_dim=128):
        super(CustomModel, self).__init__()
        self.bert = BertModel(bert_config)
        # batch_first=True: the encoder output is (batch, sequence, hidden).
        self.birnn = nn.LSTM(
            bert_config.hidden_size,
            rnn_dim,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        # No trailing comma here: a tuple would neither be callable nor have
        # its parameters registered on the module.
        self.linear = nn.Linear(rnn_dim * 2, bert_config.num_labels)
        self.crf = CRF(bert_config.num_labels, batch_first=True)

        self.dropout = nn.Dropout(0.1)

    def _emissions(self, batch):
        """Run encoder, BiLSTM, dropout and projection; return per-token tag scores."""
        # The encoder does not take labels, so strip them before the call.
        encoder_inputs = {key: value for key, value in batch.items() if key != "labels"}
        bert_embedding = self.bert(**encoder_inputs)[0]
        # nn.LSTM returns (output, (h_n, c_n)); only the per-token output is used.
        bilstm_embedding, _ = self.birnn(bert_embedding)
        bilstm_embedding = self.dropout(bilstm_embedding)
        return self.linear(bilstm_embedding)

    def forward(self, **batch):
        """
        Return the CRF negative log-likelihood of batch["labels"].

        The CRF mask is the attention mask, so [CLS] and [SEP] take part in
        the sequence. Their labels (and those of padding) are IGNORE_INDEX,
        which is not a valid tag index, so they are replaced by tag 0 before
        being handed to the CRF; padded positions are masked out anyway.
        """
        emissions = self._emissions(batch)
        mask = batch["attention_mask"].bool()
        labels = batch["labels"]
        tags = labels.masked_fill(labels == self.IGNORE_INDEX, 0)
        loss = -1 * self.crf(emissions, tags, mask=mask)

        return loss

    def predict(self, **batch):
        """
        Decode the best tag sequence for every sentence without tracking gradients.

        Returns whatever `self.crf.decode` produces for the attended positions
        (special tokens included). Call `.eval()` first so dropout is disabled.
        """
        with torch.no_grad():
            emissions = self._emissions(batch)
            predictions = self.crf.decode(emissions, mask=batch["attention_mask"].bool())

            return predictions

bert_config = BertConfig.from_pretrained("bert-base-cased", num_labels=9)
model = CustomModel(bert_config)

In [25]:
# import torch.nn as nn
# from torchcrf import CRF

# from transformers import BertPreTrainedModel, BertModel

# class BERT_BiLSTM_CRF(BertPreTrainedModel):
#     def __init__(self, config):
        

In [36]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=32,
)

eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    collate_fn=data_collator,
    batch_size=32,
)

In [37]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# model = AutoModelForTokenClassification.from_pretrained(
#     "bert-base-cased",
#     id2label=id2label,
#     label2id=label2id,
# )

In [40]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [41]:
from transformers import get_scheduler

num_train_epochs = 10
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
print(num_training_steps)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

4390


In [None]:
train_data = list(train_dataloader)

In [None]:
train_data[-1]

{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'input_ids': tensor([[  101,   140,  3663,  ...,     0,     0,     0],
         [  101,  3905,  1104,  ...,     0,     0,     0],
         [  101,  7708, 13460,  ...,     0,     0,     0],
         ...,
         [  101,  8988,  3878,  ...,     0,     0,     0],
         [  101, 16972,  9159,  ...,     0,     0,     0],
         [  101,  1109,  1938,  ...,  3292,   119,   102]]),
 'labels': tensor([[-100,    0,    0,  ..., -100, -100, -100],
         [-100,    0,    0,  ..., -100, -100, -100],
         [-100,    0,    0,  ..., -100, -100, -100],
         ...,
         [-100,    7,    0,  ..., -100, -100, -100],
         [-100,    0,    0,  ..., -100, -100, -100],
         [-100,    0,    7,  ...,    2,    0, -100]]),
 'token_type_ids': tensor([[0

In [59]:
# Stand-alone BERT encoder built from the config, used by the cells below to
# inspect the encoder's raw output.
# NOTE(review): when the notebook is run top to bottom, this rebinds the global
# `model`, replacing the CustomModel that was given to `optimizer` and to
# `accelerator.prepare(...)`. Every later cell that uses `model` (including the
# training loop) then sees this bare BertModel - consider a distinct name.
bert_config = BertConfig.from_pretrained("bert-base-cased", num_labels=9)
model = BertModel(bert_config)

# Not used in this cell; presumably kept for BiLSTM experiments - TODO confirm.
rnn_dim = 128

In [82]:
train_data[-1]

{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'input_ids': tensor([[  101,   140,  3663,  ...,     0,     0,     0],
         [  101,  3905,  1104,  ...,     0,     0,     0],
         [  101,  7708, 13460,  ...,     0,     0,     0],
         ...,
         [  101,  8988,  3878,  ...,     0,     0,     0],
         [  101, 16972,  9159,  ...,     0,     0,     0],
         [  101,  1109,  1938,  ...,  3292,   119,   102]]),
 'labels': tensor([[-100,    0,    0,  ..., -100, -100, -100],
         [-100,    0,    0,  ..., -100, -100, -100],
         [-100,    0,    0,  ..., -100, -100, -100],
         ...,
         [-100,    7,    0,  ..., -100, -100, -100],
         [-100,    0,    0,  ..., -100, -100, -100],
         [-100,    0,    7,  ...,    2,    0, -100]]),
 'token_type_ids': tensor([[0

In [75]:
train_data_example_filtered = {
    key: train_data[-1][key] 
    for key in ["input_ids", "token_type_ids", "attention_mask"]
}

with torch.no_grad():
    output = model(**train_data_example_filtered)


In [81]:
output[0].shape

torch.Size([25, 67, 768])

In [None]:
from tqdm.auto import tqdm
import torch


def postprocess(predictions, labels):
    """
    Turn gathered id tensors into lists of tag names for the metric.

    Positions whose gold label is -100 (special tokens and padding) are
    dropped from both sequences, so each returned pair of sentences has
    equal length.

    Returns:
        (true_labels, prediction_labels), each a list of per-sentence lists
        of tag names taken from `label_names`.
    """
    predictions = predictions.detach().cpu().tolist()
    labels = labels.detach().cpu().tolist()

    true_labels = [
        [label_names[gold] for gold in gold_sentence if gold != -100]
        for gold_sentence in labels
    ]
    prediction_labels = [
        [
            label_names[predicted]
            for predicted, gold in zip(predicted_sentence, gold_sentence)
            if gold != -100
        ]
        for predicted_sentence, gold_sentence in zip(predictions, labels)
    ]
    return true_labels, prediction_labels


# NOTE(review): this loop expects `model(**batch)` to return an object with
# `.loss` and `.logits` of shape (batch, sequence, num_labels), as the
# commented-out AutoModelForTokenClassification would. Make sure `model`,
# `optimizer` and the dataloaders all refer to the same prepared objects
# before running it.
progress_bar = tqdm(range(num_training_steps))


for epoch in range(num_train_epochs):
    # ---- training ----
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # ---- evaluation ----
    model.eval()

    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        # Arg-max over the label axis (the last one), giving one tag id per
        # token; dim=1 would reduce over the sequence instead.
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Pad to a common sequence length across processes before gathering.
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_labels, prediction_labels = postprocess(predictions_gathered, labels_gathered)
        # The keyword is `references` (plural), as in metric.compute above.
        metric.add_batch(predictions=prediction_labels, references=true_labels)

    results = metric.compute()
    print(
        f"Epoch {epoch}:", 
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )


accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)

In [None]:
id2label


{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC',
 -100: 'SPECIAL'}