In [1]:
import json
import torch
from pandas import read_parquet
from transformers import BertModel, BertTokenizerFast
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local directory the tokenizer files are loaded from.
# NOTE(review): the path suggests a fine-tuned multilingual BERT checkpoint — confirm.
model_path = "data/mBERT/fine"

# A *fast* tokenizer is used; align_label() below relies on the word_ids()
# mapping of the tokenizer output to line labels up with sub-word tokens.
tokenizer = BertTokenizerFast.from_pretrained(model_path)

In [3]:
# Load the three dataset splits. The cells below read the "tokens" and
# "ner_tags" columns from each frame.
train_data = read_parquet("data/merge/train.parquet")
dev_data = read_parquet("data/merge/dev.parquet")
test_data = read_parquet("data/merge/test.parquet")

# Tag <-> index lookup tables. The files are decoded as UTF-8 explicitly so the
# result does not depend on the platform's locale default encoding.
with open("data/merge/tags_2_idx.json", "r", encoding="utf-8") as f:
    tags2idx = json.load(f)

# NOTE: JSON object keys are always strings, so if this file is an object keyed
# by index, the keys of idx2tags are str (e.g. "0"), not int, after loading.
with open("data/merge/idx_2_tags.json", "r", encoding="utf-8") as f:
    idx2tags = json.load(f)

In [4]:
# For every split, pull the word sequences ("tokens") and their word-level
# labels ("ner_tags") out of the frame as plain Python lists, one entry per row.
sentences_train, tags_train = (
    train_data[column].values.tolist() for column in ("tokens", "ner_tags")
)

sentences_dev, tags_dev = (
    dev_data[column].values.tolist() for column in ("tokens", "ner_tags")
)

sentences_test, tags_test = (
    test_data[column].values.tolist() for column in ("tokens", "ner_tags")
)

In [5]:
def align_label(tokenized_input, tags, tags_2_idx, idx_2_tags, label_all_tokens=True):
    """Expand word-level tags to one label per tokenizer output position.

    Args:
        tokenized_input: Tokenizer output for one sentence. Must expose
            ``word_ids()``, returning for each token the index of the word it
            came from, or ``None`` for tokens that belong to no word.
        tags: Original word-level tags from the dataset, indexable by word
            position.
        tags_2_idx: Unused; kept so existing callers keep working.
        idx_2_tags: Unused; kept so existing callers keep working.
        label_all_tokens: If True, every sub-word of a word receives that
            word's tag. If False, only the first sub-word of each word is
            labelled and the remaining sub-words get -100.

    Returns:
        A list with one entry per element of ``word_ids()``. The value -100
        marks positions without a label: tokens whose word id is ``None``,
        continuation sub-words when ``label_all_tokens`` is False, and words
        for which ``tags`` has no entry.
    """
    # -100 is also the default ``ignore_index`` of torch.nn.CrossEntropyLoss.
    ignore_label = -100

    def tag_for(word_idx):
        # One guarded lookup shared by first and continuation sub-words, so a
        # word without a tag is ignored consistently instead of raising on its
        # second sub-word.
        try:
            return tags[word_idx]
        except (IndexError, KeyError):
            return ignore_label

    label_ids = []
    previous_word_idx = None
    for word_idx in tokenized_input.word_ids():
        if word_idx is None:
            label_ids.append(ignore_label)
        elif word_idx != previous_word_idx or label_all_tokens:
            # First sub-word of a word, or any sub-word when all are labelled.
            label_ids.append(tag_for(word_idx))
        else:
            # Continuation sub-word while only first sub-words are labelled.
            label_ids.append(ignore_label)
        previous_word_idx = word_idx
    return label_ids

def generate_tokenized_input(sentences_raw, tags_raw, max_length=512):
    """Tokenize pre-split sentences and build aligned label ids for each.

    Uses the module-level ``tokenizer``, ``tags2idx`` and ``idx2tags``.

    Args:
        sentences_raw: Sequence of sentences, each a sequence of word strings
            (a numpy array or a plain list).
        tags_raw: Sequence of word-level tag sequences, parallel to
            ``sentences_raw``.
        max_length: Length every sentence is padded/truncated to. Defaults to
            512, the value previously hard-coded here.

    Returns:
        A ``(sentences, tags)`` tuple: the tokenizer output for every sentence
        and, in the same order, the label ids produced by ``align_label``.

    Raises:
        ValueError: If ``sentences_raw`` and ``tags_raw`` differ in length.
    """
    if len(sentences_raw) != len(tags_raw):
        raise ValueError(
            f"sentences_raw has {len(sentences_raw)} items but tags_raw has {len(tags_raw)}"
        )

    sentences = []
    tags = []
    for words, word_tags in zip(sentences_raw, tags_raw):
        # Rows read from parquet arrive as numpy arrays; convert them to a
        # plain list, and accept inputs that already are lists/tuples.
        words = words.tolist() if hasattr(words, "tolist") else list(words)
        tokenized_text = tokenizer(
            words,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
            is_split_into_words=True,
        )
        extended_tags = align_label(tokenized_text, word_tags, tags2idx, idx2tags)
        sentences.append(tokenized_text)
        tags.append(extended_tags)
    return sentences, tags

In [6]:
# Tokenize each split and build its aligned label ids. generate_tokenized_input
# keeps one encoding per sentence, each padded to max_length=512, in memory.
train_sentences, train_tags = generate_tokenized_input(sentences_train, tags_train)
dev_sentences, dev_tags = generate_tokenized_input(sentences_dev, tags_dev)
test_sentences, test_tags = generate_tokenized_input(sentences_test, tags_test)