In [1]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from pandas import read_parquet
from transformers import BertModel, BertTokenizerFast
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support
from torch.nn.utils.rnn import pad_sequence
import evaluate
import pandas as pd
from TorchCRF import CRF

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory of the BERT checkpoint used by this notebook
# (the path suggests a fine-tuned multilingual BERT - TODO confirm).
model_path = "data/mBERT/fine"

# Fast tokenizer loaded from the same checkpoint directory.
tokenizer = BertTokenizerFast.from_pretrained(model_path)

In [3]:
# Train / dev / test splits; the "tokens" and "ner_tags" columns are read from them in the next cell.
train_data = read_parquet("data/merge/train.parquet")
dev_data = read_parquet("data/merge/dev.parquet")
test_data = read_parquet("data/merge/test.parquet")

# Decode the JSON vocabularies explicitly as UTF-8 instead of relying on the
# platform's default encoding, which differs between operating systems.
with open("data/merge/tags_2_idx.json", "r", encoding="utf-8") as f:
    tags2idx = json.load(f)

# Inverse mapping (label id -> tag string), used to decode predictions.
idx2tags = {idx: tag for tag, idx in tags2idx.items()}

# Character vocabulary; collate_fn looks characters up here and falls back to "<unk>".
with open("data/merge/chars2idx.json", "r", encoding="utf-8") as f:
    chars2idx = json.load(f)

In [4]:
# Pull the token and tag columns of every split out of the data frames as plain Python lists.
_COLUMNS = ("tokens", "ner_tags")

sentences_train, tags_train = (train_data[column].values.tolist() for column in _COLUMNS)
sentences_dev, tags_dev = (dev_data[column].values.tolist() for column in _COLUMNS)
sentences_test, tags_test = (test_data[column].values.tolist() for column in _COLUMNS)

In [5]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Reuse `model_path` (defined with the tokenizer) so encoder and tokenizer
# always come from the same checkpoint directory.
bert = BertModel.from_pretrained(model_path).to(device)
# Inference mode: BERT is only used as a feature extractor (it is called under torch.no_grad()).
bert.eval()

Some weights of the model checkpoint at data/mBERT/fine were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [6]:
def pooling_embedding(batch_word_ids, batch_embeddings):
    """Average sub-token embeddings so that every word gets a single vector.

    Args:
        batch_word_ids: one sequence per sentence that maps each token
            position to a word index; ``None`` marks positions that belong to
            no word and are skipped.
        batch_embeddings: per-sentence token embeddings, indexed as
            ``batch_embeddings[b][i]`` for token ``i`` of sentence ``b``.
            Positions beyond ``len(word_ids)`` (e.g. padding) are never read.

    Returns:
        A list with one tensor per sentence of shape ``(num_words, dim)``.
        Consecutive tokens that share a word index are mean-pooled. A sentence
        without any word token yields an empty ``(0, dim)`` tensor instead of
        raising from ``torch.stack`` on an empty list.
    """
    processed_embeddings = []

    for word_ids, embeddings in zip(batch_word_ids, batch_embeddings):
        shrunk_embeddings = []      # one pooled vector per word seen so far
        current_embeddings = []     # token vectors of the word being collected
        previous_word_idx = None
        for i, word_idx in enumerate(word_ids):
            if word_idx is None:
                continue
            if word_idx != previous_word_idx:
                # A new word starts: flush the tokens collected for the previous one.
                if current_embeddings:
                    shrunk_embeddings.append(
                        torch.mean(torch.stack(current_embeddings), dim=0)
                    )
                    current_embeddings = []
                previous_word_idx = word_idx
            current_embeddings.append(embeddings[i])
        # Flush the last word of the sentence.
        if current_embeddings:
            shrunk_embeddings.append(
                torch.mean(torch.stack(current_embeddings), dim=0)
            )

        if shrunk_embeddings:
            processed_embeddings.append(torch.stack(shrunk_embeddings))
        else:
            # No word tokens at all: keep dtype/device and the feature size.
            processed_embeddings.append(
                embeddings.new_zeros((0, embeddings.shape[-1]))
            )

    return processed_embeddings
                    

In [7]:
class MultilingualDataset(Dataset):
    """Index-addressable pairs of tokenised sentences and their tag ids."""

    def __init__(self, sentences, labels):
        # Both containers are stored as given; items must offer ``.tolist()``.
        self.sentences = sentences
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label container.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(tokens as list, tag ids as int64 tensor, idx)`` for example ``idx``."""
        tokens = self.sentences[idx].tolist()
        tag_ids = torch.tensor(self.labels[idx].tolist(), dtype=torch.long)
        return tokens, tag_ids, idx
    
def collate_fn(batch):
    """Collate dataset items into word-level BERT features, labels and character ids.

    Each item of ``batch`` is ``(tokens, label_tensor, index)`` as produced by
    ``MultilingualDataset.__getitem__``. The sentences are tokenised, run
    through the module-level ``bert`` under ``torch.no_grad()``, and the
    sub-token embeddings are mean-pooled back to one vector per word.

    Returns:
        batch_embeddings: ``(B, T, hidden)`` word embeddings, zero-padded to
            the longest sentence of the batch (same device as the BERT output).
        batch_labels: ``(B, T)`` label ids, padded with ``-100``.
        batch_char_ids: ``(B, T, L)`` character ids per word, zero-padded;
            characters missing from ``chars2idx`` map to ``chars2idx["<unk>"]``.

    Raises:
        ValueError: if the padded labels do not line up with the embeddings.
    """
    sentences, labels, ids = zip(*batch)
    B = len(labels)

    tokenized_inputs_list = [
        tokenizer(sentence, is_split_into_words=True, truncation=True, return_tensors="pt")
        for sentence in sentences
    ]
    input_ids = pad_sequence(
        [x["input_ids"][0] for x in tokenized_inputs_list],
        batch_first=True, padding_value=0,
    ).to(device)
    attention_mask = pad_sequence(
        [x["attention_mask"][0] for x in tokenized_inputs_list],
        batch_first=True, padding_value=0,
    ).to(device)
    with torch.no_grad():
        bert_output = bert(input_ids=input_ids, attention_mask=attention_mask)
    bert_embeddings = bert_output["last_hidden_state"]

    word_ids = [x.word_ids() for x in tokenized_inputs_list]
    pooled_embeddings = pooling_embedding(word_ids, bert_embeddings)
    batch_embeddings = pad_sequence(pooled_embeddings, batch_first=True, padding_value=0)
    T = batch_embeddings.shape[1]

    # Tokeniser truncation can leave a sentence with fewer pooled words than it
    # has labels. Cut every sentence to ITS OWN pooled length (not to the batch
    # maximum T); otherwise real labels would sit on zero-padded embeddings
    # whenever a truncated sentence is not the longest one in the batch.
    word_counts = [pooled.shape[0] for pooled in pooled_embeddings]
    labels = [label[:count] for label, count in zip(labels, word_counts)]
    sentences = [sentence[:count] for sentence, count in zip(sentences, word_counts)]
    batch_labels = pad_sequence(labels, batch_first=True, padding_value=-100)

    # default=0 keeps a batch without any word from raising inside max().
    L = max((len(word) for sentence in sentences for word in sentence), default=0)
    unk_id = chars2idx["<unk>"]
    batch_char_ids = torch.zeros(B, T, L, dtype=torch.long)
    for i, sentence in enumerate(sentences):
        for j, word in enumerate(sentence):
            for k, char in enumerate(word):
                batch_char_ids[i, j, k] = chars2idx.get(char, unk_id)

    # Fail loudly instead of printing diagnostics and returning misaligned
    # tensors (which would only blow up later, inside the loss).
    if batch_labels.shape[1] != T:
        raise ValueError(
            f"batch_embeddings: {batch_embeddings.shape} batch_labels: {batch_labels.shape} "
            f"batch_char_ids: {batch_char_ids.shape} (dataset indices: {ids})"
        )
    return batch_embeddings, batch_labels, batch_char_ids

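The next two cells are only used for testing `collate_fn`: the first prints the shapes of one batch produced by the `DataLoader`, the second repeats the collate steps by hand on two sample sentences.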

In [8]:
# Smoke test: collate a single mini-batch of two sentences and show the tensor shapes.
train_dataset = MultilingualDataset(sentences_train, tags_train)
train_loader = DataLoader(train_dataset, batch_size=2, collate_fn=collate_fn)

data = next(iter(train_loader), None)   # first batch only; None if the loader is empty
if data is not None:
    bert_embeddings, labels, char_ids = data
    print(bert_embeddings.shape, labels.shape, char_ids.shape)

torch.Size([2, 17, 768]) torch.Size([2, 17]) torch.Size([2, 17, 1])


In [9]:
# Debug cell: repeat the steps of collate_fn by hand on two sample sentences
# and report any character position that does not fit into the padded tensor.
sentences = [train_dataset[2418][0], train_dataset[2419][0]]
labels = [train_dataset[2418][1], train_dataset[2419][1]]
print(f"original labels size: {[len(x) for x in labels]}")
tokenized_inputs_list = [
    tokenizer(sentence, is_split_into_words=True, truncation=True, return_tensors="pt")
    for sentence in sentences
]
input_ids = [x["input_ids"][0] for x in tokenized_inputs_list]
print([len(x) for x in input_ids])
input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0).to(device)
attention_mask = [x["attention_mask"][0] for x in tokenized_inputs_list]
attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0).to(device)
with torch.no_grad():
    bert_output = bert(input_ids, attention_mask)
bert_embeddings = bert_output["last_hidden_state"]
word_ids = [x.word_ids() for x in tokenized_inputs_list]
pooled_embeddings = pooling_embedding(word_ids, bert_embeddings)
batch_embeddings = pad_sequence(pooled_embeddings, batch_first=True, padding_value=0)
B = len(sentences)   # derived from the samples instead of a hard-coded 2
T = batch_embeddings.shape[1]
labels = [x[:T] for x in labels]
sentences = [x[:T] for x in sentences]
batch_labels = pad_sequence(labels, batch_first=True, padding_value=-100)
L = max(len(word) for sentence in sentences for word in sentence)
batch_char_ids = torch.zeros(B, T, L, dtype=int)
for i in range(B):
    for j in range(len(sentences[i])):
        cur_word = sentences[i][j]
        for k in range(len(cur_word)):
            try:
                batch_char_ids[i][j][k] = chars2idx.get(cur_word[k], chars2idx["<unk>"])
            except IndexError:
                # Only out-of-range positions are reported; any other error
                # (e.g. a missing "<unk>" entry or Ctrl-C) must still surface.
                print(i, j, k)

original labels size: [9, 3]
[11, 5]


# IDCNN model training and evaluation

In [10]:
# class IDCNN(nn.Module):
#     def __init__(self, input_dim=768, output_dim=8, hidden_dim=128, num_layers=3, kernel_size=3, dilation=[1, 2, 3]):
#         super(IDCNN, self).__init__()
#         self.conv_layers = nn.ModuleList()
#         in_channels = input_dim
#         for i in range(num_layers):
#             dilation_rate = dilation[i % len(dilation)]
#             padding = dilation_rate * (kernel_size - 1) // 2
#             conv_layer = nn.Conv1d(in_channels, hidden_dim, kernel_size, padding=padding, dilation=dilation_rate)
#             in_channels = hidden_dim
#             self.conv_layers.append(conv_layer)
#         self.fc = nn.Linear(hidden_dim, output_dim)

#     def forward(self, x):
#         x = x.permute(0, 2, 1)  # 调整维度顺序 [batch_size, embedding_dim, sequence_length]
#         for conv_layer in self.conv_layers:
#             x = torch.relu(conv_layer(x))
#         x = torch.max(x, dim=2)[0]  # 最大池化
#         x = self.fc(x)
#         return x


class IDCNN(nn.Module):
    """Stack of dilated 1-D convolutions followed by a per-position linear classifier.

    Args:
        input_dim: feature size of the input; every convolution keeps this
            number of channels.
        output_dim: number of scores produced per position.
        num_layers: number of convolution layers.
        kernel_size: kernel size shared by all convolutions.
        dilation: dilation rates, cycled through when ``num_layers`` exceeds
            the number of rates. An immutable tuple is the default so the
            default value can never be shared and mutated between instances.

    ``forward`` maps ``(batch, seq_len, input_dim)`` to
    ``(batch, seq_len, output_dim)``; the sequence length is preserved for odd
    ``kernel_size`` because of the symmetric padding chosen below.
    """

    def __init__(self, input_dim=768, output_dim=8, num_layers=3, kernel_size=3, dilation=(1, 2, 4)):
        super().__init__()
        self.conv_layers = nn.ModuleList()
        for i in range(num_layers):
            dilation_rate = dilation[i % len(dilation)]
            # Symmetric padding that offsets the shrinkage caused by the dilated kernel.
            padding = dilation_rate * (kernel_size - 1) // 2
            self.conv_layers.append(
                nn.Conv1d(input_dim, input_dim, kernel_size, padding=padding, dilation=dilation_rate)
            )
        self.fc = nn.Linear(input_dim, output_dim)
        # Not used by forward(); kept so that parameter names in state_dict
        # stay compatible with checkpoints saved from this class.
        self.crf = CRF(output_dim)

    def forward(self, x):
        """Return per-position scores for ``x`` of shape ``(batch, seq_len, input_dim)``."""
        # Conv1d expects channels first: (batch, input_dim, seq_len).
        x = x.permute(0, 2, 1)
        for conv_layer in self.conv_layers:
            x = torch.relu(conv_layer(x))
        # Back to (batch, seq_len, channels) for the position-wise classifier.
        x = x.permute(0, 2, 1)
        return self.fc(x)

In [16]:
# Datasets for the three splits.
train_dataset = MultilingualDataset(sentences_train, tags_train)
dev_dataset = MultilingualDataset(sentences_dev, tags_dev)
test_dataset = MultilingualDataset(sentences_test, tags_test)

# Training batches are shuffled; dev batches keep the dataset order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn
)
dev_loader = DataLoader(
    dataset=dev_dataset,
    batch_size=8,
    shuffle=False,
    collate_fn=collate_fn
)

# Size the output layer from the tag vocabulary instead of relying on the
# hard-coded default of IDCNN, so every predicted index can be decoded through
# idx2tags. (The variable keeps its historical name `lstm_model`; an earlier
# BiLSTM configuration in this cell was sized the same way.)
lstm_model = IDCNN(output_dim=len(idx2tags))
lstm_model = lstm_model.to(device)

# Positions labelled -100 (the padding value used by collate_fn) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = Adam(lstm_model.parameters(), lr=1e-3)
epochs = 10

In [17]:
# Train for `epochs` epochs. After every epoch the model is evaluated on the
# dev set and the weights with the best token-level dev accuracy so far are
# written to "best_DCNN.pt".
best_loss = torch.inf
best_acc = 0
for epoch in range(epochs):
    print(f'Epoch: {epoch + 1}/{epochs}')
    train_loss = 0
    cur_total = 0
    correct = 0
    amount = 0
    lstm_model.train()
    train_loop = tqdm(train_loader, desc=f'Epoch: {epoch + 1}/{epochs}')
    for bert_embeddings, labels, char_ids in train_loop:

        B = bert_embeddings.shape[0]
        bert_embeddings = bert_embeddings.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = lstm_model(bert_embeddings)
        logits_reshape = logits.reshape(-1, logits.shape[-1])
        labels_reshape = labels.reshape(-1)
        loss = criterion(logits_reshape, labels_reshape)
        loss.backward()
        optimizer.step()

        # Per-batch mean loss weighted by the number of sentences in the batch.
        train_loss += loss.item() * B

        # Accuracy only over real (non-padding) positions.
        mask = labels_reshape != -100
        predictions = logits_reshape[mask].argmax(dim=1)
        # Tensor.sum() reduces in a single op; the Python builtin sum() would
        # iterate the tensor element by element and keep the counter on the device.
        correct += (predictions == labels_reshape[mask]).sum().item()
        amount += mask.sum().item()

        cur_total += B
        running_loss = train_loss / cur_total
        running_acc = correct / max(amount, 1)
        train_loop.set_postfix(loss=running_loss, acc=running_acc)

    train_loss /= len(train_dataset)
    train_acc = correct / max(amount, 1)

    dev_loss = 0
    y_true, y_pred = [], []
    correct = 0
    amount = 0
    lstm_model.eval()
    dev_loop = tqdm(dev_loader, desc=f'Epoch: {epoch + 1}/{epochs}')
    with torch.no_grad():
        for bert_embeddings, labels, char_ids in dev_loop:
            B = bert_embeddings.shape[0]
            bert_embeddings = bert_embeddings.to(device)
            labels = labels.to(device)

            logits = lstm_model(bert_embeddings)
            logits_reshape = logits.reshape(-1, logits.shape[-1])
            labels_reshape = labels.reshape(-1)
            loss = criterion(logits_reshape, labels_reshape)
            dev_loss += loss.item() * B

            mask = labels_reshape != -100
            labels_non_pad = labels_reshape[mask]
            predictions = logits_reshape[mask].argmax(dim=1)

            correct += (predictions == labels_non_pad).sum().item()
            amount += mask.sum().item()
            # Plain Python ints for scikit-learn instead of lists of 0-dim tensors.
            y_pred.extend(predictions.tolist())
            y_true.extend(labels_non_pad.tolist())

    dev_loss /= len(dev_dataset)
    val_acc = correct / max(amount, 1)
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)
    print('train_loss: {:.4f}, train_acc: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}'.format(train_loss, train_acc, dev_loss, val_acc))
    print('val_precision: {:.4f}, val_recall: {:.4f}, val_f1: {:.4f}'.format(val_precision, val_recall, val_f1))

    # Lowest dev loss seen so far is tracked for reference only;
    # checkpoint selection below is by dev accuracy.
    best_loss = min(best_loss, dev_loss)

    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(lstm_model.state_dict(), "best_DCNN.pt")

Epoch: 1/10


Epoch: 1/10: 100%|████████████████████████████████████████| 10013/10013 [06:51<00:00, 24.33it/s, acc=0.864, loss=0.414]
Epoch: 1/10: 100%|█████████████████████████████████████████████████████████████████| 5013/5013 [02:48<00:00, 29.78it/s]


train_loss: 0.4136, train_acc: 0.8642, val_loss: 0.2696, val_acc: 0.8895
val_precision: 0.8465, val_recall: 0.8187, val_f1: 0.8310
Epoch: 2/10


Epoch: 2/10: 100%|█████████████████████████████████████████| 10013/10013 [06:46<00:00, 24.60it/s, acc=0.89, loss=0.323]
Epoch: 2/10: 100%|█████████████████████████████████████████████████████████████████| 5013/5013 [02:48<00:00, 29.76it/s]


train_loss: 0.3228, train_acc: 0.8903, val_loss: 0.2583, val_acc: 0.8943
val_precision: 0.8572, val_recall: 0.8207, val_f1: 0.8369
Epoch: 3/10


Epoch: 3/10: 100%|████████████████████████████████████████| 10013/10013 [07:16<00:00, 22.92it/s, acc=0.899, loss=0.293]
Epoch: 3/10: 100%|█████████████████████████████████████████████████████████████████| 5013/5013 [02:46<00:00, 30.18it/s]


train_loss: 0.2930, train_acc: 0.8994, val_loss: 0.2502, val_acc: 0.9002
val_precision: 0.8602, val_recall: 0.8314, val_f1: 0.8440
Epoch: 4/10


Epoch: 4/10: 100%|████████████████████████████████████████| 10013/10013 [06:52<00:00, 24.28it/s, acc=0.907, loss=0.268]
Epoch: 4/10: 100%|█████████████████████████████████████████████████████████████████| 5013/5013 [02:49<00:00, 29.65it/s]


train_loss: 0.2683, train_acc: 0.9068, val_loss: 0.2601, val_acc: 0.9000
val_precision: 0.8596, val_recall: 0.8319, val_f1: 0.8447
Epoch: 5/10


Epoch: 5/10: 100%|████████████████████████████████████████| 10013/10013 [06:58<00:00, 23.93it/s, acc=0.913, loss=0.249]
Epoch: 5/10: 100%|█████████████████████████████████████████████████████████████████| 5013/5013 [02:48<00:00, 29.79it/s]


train_loss: 0.2488, train_acc: 0.9127, val_loss: 0.2590, val_acc: 0.8967
val_precision: 0.8669, val_recall: 0.8158, val_f1: 0.8389
Epoch: 6/10


Epoch: 6/10: 100%|████████████████████████████████████████| 10013/10013 [06:47<00:00, 24.55it/s, acc=0.918, loss=0.232]
Epoch: 6/10: 100%|█████████████████████████████████████████████████████████████████| 5013/5013 [02:47<00:00, 29.93it/s]


train_loss: 0.2316, train_acc: 0.9181, val_loss: 0.2513, val_acc: 0.9048
val_precision: 0.8646, val_recall: 0.8410, val_f1: 0.8525
Epoch: 7/10


Epoch: 7/10: 100%|████████████████████████████████████████| 10013/10013 [06:47<00:00, 24.55it/s, acc=0.922, loss=0.216]
Epoch: 7/10: 100%|█████████████████████████████████████████████████████████████████| 5013/5013 [02:47<00:00, 30.02it/s]


train_loss: 0.2161, train_acc: 0.9225, val_loss: 0.2685, val_acc: 0.9043
val_precision: 0.8567, val_recall: 0.8466, val_f1: 0.8514
Epoch: 8/10


Epoch: 8/10: 100%|██████████████████████████████████████████| 10013/10013 [06:57<00:00, 24.00it/s, acc=0.928, loss=0.2]
Epoch: 8/10: 100%|█████████████████████████████████████████████████████████████████| 5013/5013 [02:57<00:00, 28.24it/s]


train_loss: 0.2003, train_acc: 0.9275, val_loss: 0.2693, val_acc: 0.9012
val_precision: 0.8444, val_recall: 0.8492, val_f1: 0.8462
Epoch: 9/10


Epoch: 9/10: 100%|████████████████████████████████████████| 10013/10013 [07:04<00:00, 23.61it/s, acc=0.932, loss=0.188]
Epoch: 9/10: 100%|█████████████████████████████████████████████████████████████████| 5013/5013 [02:59<00:00, 27.88it/s]


train_loss: 0.1884, train_acc: 0.9315, val_loss: 0.2984, val_acc: 0.9041
val_precision: 0.8698, val_recall: 0.8314, val_f1: 0.8491
Epoch: 10/10


Epoch: 10/10: 100%|███████████████████████████████████████| 10013/10013 [07:16<00:00, 22.96it/s, acc=0.936, loss=0.176]
Epoch: 10/10: 100%|████████████████████████████████████████████████████████████████| 5013/5013 [03:02<00:00, 27.44it/s]


train_loss: 0.1759, train_acc: 0.9359, val_loss: 0.3008, val_acc: 0.9039
val_precision: 0.8694, val_recall: 0.8347, val_f1: 0.8507


In [None]:
# state_dict = torch.load("best_DCNN.pt")
# lstm_model.load_state_dict(state_dict)

## Evaluation

In [23]:
# Load the "seqeval" metric through the `evaluate` library; the cells below call
# seqeval.compute(...) on the predicted and reference tag sequences.
seqeval = evaluate.load("seqeval")

In [25]:
# Predict tags for every training sentence, one sentence at a time.
train_predictions = []
train_references = []
lstm_model.eval()
# The evaluation loader gets its own name: rebinding `train_loader` here would
# silently replace the shuffled training loader, so re-running the training
# cell afterwards would train with batch_size=1 and no shuffling.
train_eval_loader = DataLoader(
    dataset=train_dataset,
    batch_size=1,   # use batch_size=1 => no padding needed
    shuffle=False,
    collate_fn=collate_fn
)
with torch.no_grad():
    for bert_embeddings, labels, char_ids in tqdm(train_eval_loader):
        logits = lstm_model(bert_embeddings.to(device))
        # Highest-scoring tag id per word of the single sentence in the batch.
        prediction = logits[0].cpu().argmax(dim=1)
        label = labels[0]
        assert len(prediction) == len(label)
        # Decode ids into tag strings for seqeval.
        train_predictions.append([idx2tags[p_idx] for p_idx in prediction.tolist()])
        train_references.append([idx2tags[r_idx] for r_idx in label.tolist()])

100%|████████████████████████████████████████████████████████████████████████████| 80100/80100 [24:22<00:00, 54.77it/s]


In [26]:
# Score the training-set predictions against the gold tags with seqeval.
train_result = seqeval.compute(
    references=train_references,
    predictions=train_predictions,
)
train_result

{'LOC': {'precision': 0.7509615606071075,
  'recall': 0.8488935173131997,
  'f1': 0.796930183675714,
  'number': 38410},
 'ORG': {'precision': 0.6441947565543071,
  'recall': 0.7104731214296177,
  'f1': 0.6757125741829428,
  'number': 34135},
 'PER': {'precision': 0.7513860307468829,
  'recall': 0.8889270779629947,
  'f1': 0.8143901127015574,
  'number': 34914},
 'overall_precision': 0.7182584108980069,
 'overall_recall': 0.8179305595622517,
 'overall_f1': 0.7648609842057172,
 'overall_accuracy': 0.94069062728435}

In [27]:
# Predict tags for every dev sentence, one sentence at a time.
dev_predictions = []
dev_references = []
lstm_model.eval()
# The evaluation loader gets its own name: rebinding `dev_loader` here would
# silently replace the batched loader that the training cell validates with.
dev_eval_loader = DataLoader(
    dataset=dev_dataset,
    batch_size=1,   # use batch_size=1 => no padding needed
    shuffle=False,
    collate_fn=collate_fn
)
with torch.no_grad():
    for bert_embeddings, labels, char_ids in tqdm(dev_eval_loader):
        logits = lstm_model(bert_embeddings.to(device))
        # Highest-scoring tag id per word of the single sentence in the batch.
        prediction = logits[0].cpu().argmax(dim=1)
        label = labels[0]
        assert len(prediction) == len(label)
        # Decode ids into tag strings for seqeval.
        dev_predictions.append([idx2tags[p_idx] for p_idx in prediction.tolist()])
        dev_references.append([idx2tags[r_idx] for r_idx in label.tolist()])

100%|████████████████████████████████████████████████████████████████████████████| 40100/40100 [12:13<00:00, 54.69it/s]


In [28]:
# Score the dev-set predictions against the gold tags with seqeval.
dev_result = seqeval.compute(
    references=dev_references,
    predictions=dev_predictions,
)
dev_result

{'LOC': {'precision': 0.6770175121863152,
  'recall': 0.7782504928919788,
  'f1': 0.724112961622013,
  'number': 19274},
 'ORG': {'precision': 0.5287422037422037,
  'recall': 0.6326492537313433,
  'f1': 0.5760475651189128,
  'number': 16080},
 'PER': {'precision': 0.6784286630804915,
  'recall': 0.8303835699184536,
  'f1': 0.7467543049595307,
  'number': 16555},
 'overall_precision': 0.6312136103407451,
 'overall_recall': 0.7497736423356258,
 'overall_f1': 0.685404339250493,
 'overall_accuracy': 0.9023121711792634}

In [29]:
# Predict tags for every test sentence, one sentence at a time.
test_predictions = []
test_references = []
lstm_model.eval()
test_loader = DataLoader(
    test_dataset,
    batch_size=1,   # a single sentence per batch, so no padding is involved
    shuffle=False,
    collate_fn=collate_fn,
)
for bert_embeddings, labels, char_ids in tqdm(test_loader):
    bert_embeddings = bert_embeddings.to(device)
    with torch.no_grad():
        logits = lstm_model(bert_embeddings)
    # Scores and gold labels of the only sentence in the batch.
    scores = logits[0].cpu()
    gold = labels[0]
    predicted = scores.argmax(dim=1)
    assert len(predicted) == len(gold)
    # Decode ids into tag strings for seqeval.
    test_predictions.append([idx2tags[tag_id.item()] for tag_id in predicted])
    test_references.append([idx2tags[tag_id.item()] for tag_id in gold])

100%|████████████████████████████████████████████████████████████████████████████| 40100/40100 [12:25<00:00, 53.81it/s]


In [30]:
# Score the test-set predictions against the gold tags with seqeval.
test_result = seqeval.compute(
    references=test_references,
    predictions=test_predictions,
)
test_result

{'LOC': {'precision': 0.6582594964470567,
  'recall': 0.7668761817159794,
  'f1': 0.7084287299077112,
  'number': 19569},
 'ORG': {'precision': 0.5075318066157761,
  'recall': 0.5900485149686427,
  'f1': 0.5456883344276647,
  'number': 16902},
 'PER': {'precision': 0.6760729816600491,
  'recall': 0.8315697674418605,
  'f1': 0.7458024820106373,
  'number': 17200},
 'overall_precision': 0.6176183887805798,
 'overall_recall': 0.7319222671461311,
 'overall_f1': 0.6699296525261139,
 'overall_accuracy': 0.9019221895230236}

In [None]:
def _tokens_matching_predictions(dataset, predictions):
    """Return each sentence's tokens cut to the length of its prediction.

    ``dataset[i][0]`` is the token list of sentence ``i``. A prediction can be
    shorter than the sentence (never longer), in which case the surplus
    tokens are dropped so both columns of the export line up.
    """
    assert len(dataset) == len(predictions)
    aligned_tokens = []
    for i, prediction in enumerate(predictions):
        token = dataset[i][0]
        assert len(token) >= len(prediction)
        aligned_tokens.append(token[:len(prediction)])
    return aligned_tokens


# One shared helper replaces three copy-pasted loops (one per split).
train_tokens = _tokens_matching_predictions(train_dataset, train_predictions)
dev_tokens = _tokens_matching_predictions(dev_dataset, dev_predictions)
test_tokens = _tokens_matching_predictions(test_dataset, test_predictions)

train_df = pd.DataFrame({
    "tokens": train_tokens,
    "predictions": train_predictions
})
dev_df = pd.DataFrame({
    "tokens": dev_tokens,
    "predictions": dev_predictions
})
test_df = pd.DataFrame({
    "tokens": test_tokens,
    "predictions": test_predictions
})

# Write one CSV per split with the tokens and their predicted tags.
train_df.to_csv("DCNN_train.csv")
dev_df.to_csv("DCNN_dev.csv")
test_df.to_csv("DCNN_test.csv")