<a href="https://colab.research.google.com/github/ksyeon94/untitled/blob/main/ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install transformers
!pip -q install datasets
!pip -q install seqeval

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): no later cell visible here reads from this mount point — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from pathlib import Path
from pprint import pprint
from transformers import AutoModel, AutoTokenizer, BertPreTrainedModel, BertModel, AdamW
from transformers import PreTrainedTokenizer, BertConfig, BertForTokenClassification
from typing import Dict, List, Union, Optional, Tuple
from tqdm import tqdm, trange
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
import numpy as np
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.nn import functional as F

import torch
from torch.utils.data import DataLoader, Dataset

# Tokenizer for the "klue/bert-base" checkpoint; used as a module-level global by
# load_data and collate_fn.
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

In [None]:
from datasets import load_dataset
from pprint import pprint

In [None]:
# Load the 'rlatmddus159/ex' dataset; later cells read its 'train' and 'test' splits.
my_dataset = load_dataset('rlatmddus159/ex')



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from dataclasses import dataclass

@dataclass
class Config():
  """Hyper-parameters and data handles for the NER fine-tuning run."""
  # Checkpoint name passed to BertConfig / BertForTokenClassification.from_pretrained.
  model_name: str = "klue/bert-base"
  # These two have no type annotation, so @dataclass does not turn them into fields:
  # they are plain class attributes and do not appear in an instance's __dict__.
  # That keeps the datasets out of `config.update(init_config.__dict__)`.
  train_data = my_dataset['train']
  test_data = my_dataset['test']
  # Number of passes of the training loop.
  epoch: int = 30
  # Exposed as the module-level `max_length` (tokenizer truncation/padding length).
  max_seq_len: int = 150
  # Exposed as the module-level `batch_size` used by the DataLoaders.
  batch_size: int = 16
  # Learning rate of the classifier head; the encoder group uses learning_rate / 100.
  learning_rate: float = 5e-3
  # `eps` passed to AdamW.
  adam_epsilon: float = 1e-8
  # Device that batches are moved to in the train/valid/test loops.
  device: str = "cuda"
  # Maximum gradient norm used for clipping in train_epoch.
  max_grad_norm: float = 1.0
  # Seed for torch and NumPy.
  seed: int = 1234
  # NOTE(review): not referenced by any cell visible in this notebook.
  intermediate_hidden_size: int = 768
  # Number of output classes of the token classifier.
  # NOTE(review): presumably 2 * (number of entity labels) + 1 — verify against `labels`.
  num_labels: int =  441

In [None]:
# Instantiate the hyper-parameter bundle and display the training split.
init_config = Config()
init_config.train_data

Dataset({
    features: ['entities', 'relations', 'text', 'Comments', 'id', 'Articleid'],
    num_rows: 7934
})

In [None]:
# Start from the checkpoint's BertConfig, then copy every Config dataclass field onto it
# (model_name, epoch, learning_rate, device, num_labels, ...) so that later cells can
# read them from `config`.
config = BertConfig.from_pretrained(init_config.model_name)
config.update(init_config.__dict__)

In [None]:
# Load the pretrained encoder with a token-classification head; the load message below
# reports that some weights are newly initialised rather than read from the checkpoint.
model = BertForTokenClassification.from_pretrained(config.model_name, config=config)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['clas

In [None]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [None]:
# Replace the classification head with a freshly initialised linear layer. The sizes come
# from the encoder's hidden size and the configured label count instead of the literals
# 768 and 441, so the head stays in sync if either setting changes.
model.classifier = torch.nn.Linear(model.config.hidden_size, init_config.num_labels)

In [None]:
# Module-level aliases: `max_length` is read by load_data, NerDataset and collate_fn,
# `batch_size` by the DataLoader cells.
max_length = init_config.max_seq_len
batch_size = init_config.batch_size

# Seed the torch and NumPy random number generators.
# NOTE(review): this runs after the model (including its randomly initialised classifier)
# was created in the cells above, so that initialisation is not covered by the seed.
torch.manual_seed(init_config.seed)
np.random.seed(init_config.seed)

In [None]:
# Build the list of distinct entity labels, in first-seen order, scanning the train split
# and then the test split. The order matters: 전처리 derives the B-/I- tag positions from
# each label's index in this list.
label_list = []
_seen_labels = set()  # O(1) membership test; label_list keeps the order
for _split_name in ('train', 'test'):
  for _entities in my_dataset[_split_name]['entities']:
    for _entity in _entities:
      _entity_label = _entity['label']
      if _entity_label not in _seen_labels:
        _seen_labels.add(_entity_label)
        label_list.append(_entity_label)

In [None]:
# Tag vocabulary: the outside tag first, then a "B-"/"I-" pair per entity label, in
# label_list order (so label k owns positions 2k+1 and 2k+2).
# NOTE(review): the outside tag is the digit "0", not the letter "O" that seqeval
# conventionally treats as "outside" — confirm this is intended.
labels = ['0']
for entity_label in label_list:
  labels.extend(("B-" + entity_label, "I-" + entity_label))
new_label_list = labels

In [None]:
def 전처리(dataset):
    """Convert annotated records into per-character tagged documents.

    Each record must provide ``'text'`` and ``'entities'``; every entity provides
    ``'start_offset'``, ``'end_offset'`` and ``'label'``. The module-level ``label_list``
    and ``labels`` supply the B-/I- tag names.

    Returns:
        One string per record: a ``"##" + text`` header line followed by one
        ``"<char>\\t<tag>"`` line per character, joined with newlines. A character gets
        the B- tag when it sits on an entity's start offset, the I- tag when it lies
        strictly between an entity's start and end offsets, and ``'0'`` otherwise.
        Entities are tried in annotation order and the first match wins.
    """
    documents = []

    for record in dataset:
        text = record['text']

        # (start, end, B-tag, I-tag) for every entity, in annotation order.
        spans = []
        for entity in record['entities']:
            base = label_list.index(entity['label']) * 2
            spans.append(
                (entity['start_offset'], entity['end_offset'], labels[base + 1], labels[base + 2])
            )

        rows = ["##" + text]
        for position, char in enumerate(text):
            tag = '0'
            for begin, finish, begin_tag, inside_tag in spans:
                if begin == position:
                    tag = str(begin_tag)
                    break
                if begin < position < finish:
                    tag = str(inside_tag)
                    break
            rows.append(char + '\t' + tag)

        documents.append("\n".join(rows).rstrip('\n'))

    return documents

In [None]:
def load_data(document_list, tokenizer: PreTrainedTokenizer = None):
    """Turn per-character tagged documents into token-level training examples.

    Args:
        document_list: Strings in the format produced by ``전처리``: lines starting with
            ``"##"`` are skipped, every other line must be ``"<char>\\t<tag>"``.
        tokenizer: Callable tokenizer that returns an ``"offset_mapping"`` entry when
            called with ``return_offsets_mapping=True``.

    Returns:
        A list with one dict per document holding ``"sentence"`` (the characters joined
        together), ``"token_label"`` (the tag of each token's first character, skipping
        tokens whose offsets are ``(0, 0)``), ``"char_label"`` (one tag per character)
        and ``"offset_mapping"`` (as returned by the tokenizer). Tokenization is
        truncated to the module-level ``max_length``.
    """
    data_list = []
    for doc in document_list:
        chars, char_labels = [], []
        for row in doc.split("\n"):
            if not row.startswith("##"):
                char, tag = row.split("\t")
                chars.append(char)
                char_labels.append(tag)
        sentence = "".join(chars)

        encoding = tokenizer(
            sentence,
            max_length=max_length,
            return_offsets_mapping=True,
            truncation=True,
        )
        offset_mappings = encoding["offset_mapping"]

        # A token inherits the tag of the character it starts on; (0, 0) offsets are skipped.
        token_labels = [
            char_labels[start]
            for start, end in offset_mappings
            if not (start == end == 0)
        ]

        data_list.append(
            {
                "sentence": sentence,
                "token_label": token_labels,
                "char_label": char_labels,
                "offset_mapping": offset_mappings,
            }
        )

    return data_list

In [None]:
class NerDataset(Dataset):
    """Map-style dataset that serves pre-built example dicts unchanged."""

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        examples: List,
        shuffle: bool = False,
        **kwargs
    ):
        """Store the tokenizer and examples.

        ``shuffle`` and ``**kwargs`` are accepted but not used. ``max_length`` is copied
        from the module-level variable of the same name at construction time.
        """
        self.tokenizer = tokenizer
        self.dataset = examples
        self.max_length = max_length

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return the example stored at ``index`` as-is."""
        return self.dataset[index]

In [None]:
def collate_fn(input_examples):
    """Batch example dicts into model inputs plus label ids.

    Sentences are encoded with the module-level ``tokenizer`` and padded/truncated to the
    module-level ``max_length``. Tag strings are mapped through ``label2id`` and laid out
    as ``[-100] + ids + [-100, ...]`` so every label row is exactly ``max_length`` long;
    tag lists longer than ``max_length - 2`` are cut to that length first.

    Returns:
        ``(input_ids, token_type_ids, attention_mask, input_labels, offset_mappings,
        char_labels)`` — the first four are tensors, the last two are plain lists taken
        from the examples.
    """
    input_texts = [example["sentence"] for example in input_examples]
    input_labels_str = [example["token_label"] for example in input_examples]
    offset_mappings = [example["offset_mapping"] for example in input_examples]
    char_labels = [example["char_label"] for example in input_examples]

    encoded_texts = tokenizer.batch_encode_plus(
        input_texts,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
        return_token_type_ids=True,
        return_attention_mask=True,
        return_offsets_mapping=True
    )

    # Room left for real tokens once the two special-token slots are reserved.
    label_capacity = max_length - 2
    label_rows = []
    for tag_names in input_labels_str:
        tag_ids = [label2id[tag] for tag in tag_names]
        if len(tag_ids) > label_capacity:
            row = [-100] + tag_ids[:label_capacity] + [-100]
        else:
            row = [-100] + tag_ids + [-100] * (max_length - len(tag_names) - 1)
        label_rows.append(torch.tensor(row).long())

    return (
        encoded_texts["input_ids"],
        encoded_texts["token_type_ids"],
        encoded_texts["attention_mask"],
        torch.stack(label_rows),
        offset_mappings,
        char_labels,
    )

In [None]:
type(전처리(init_config.train_data))

list

In [None]:
# Convert the training split to per-character tagged documents, then to tokenised examples.
examples = load_data(전처리(init_config.train_data), tokenizer)
# Split point at 10% of the examples.
# NOTE(review): the cells below train on examples[:index] (the first 10%) and validate on
# examples[index:] (the remaining 90%) — confirm this is not an inverted 90/10 split.
index = int(len(examples) * 0.1)

In [None]:
# Training set: the examples before the split point.
train_dataset = NerDataset(
    tokenizer,
    examples[:index]
)

In [None]:
# Shuffled batches of the training set, assembled by collate_fn.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn
)

In [None]:
# Validation set: the examples from the split point onwards.
valid_dataset = NerDataset(
    tokenizer,
    examples[index:]
)

In [None]:
# Validation batches in dataset order, assembled by collate_fn.
valid_dataloader = DataLoader(
    dataset=valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

In [None]:
from transformers import AdamW

# Two parameter groups with different learning rates: the pretrained encoder is updated
# at learning_rate / 100, the classifier head at the full learning_rate.
optimizer_grouped_parameters = [
    {'params': model.bert.parameters(), 'lr': config.learning_rate / 100 },
    {'params': model.classifier.parameters(), 'lr': config.learning_rate }
]
# Both groups above set their own 'lr'; the `lr` argument here is the optimizer-level default.
optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)



In [None]:
def train_epoch(epoch, model, dataloader, optimizer):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        epoch: Index of the current epoch (not used inside the function).
        model: Called with ``input_ids``, ``attention_mask``, ``token_type_ids`` and
            ``labels`` keyword arguments; its first output is taken as the loss.
        dataloader: Iterable of batches whose first four items are the input ids, token
            type ids, attention mask and label tensors; must support ``len()``.
        optimizer: Optimizer stepped once per batch.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(dataloader, unit="batch", position=1, leave=True)
    for batch in progress:
        progress.set_description("Train")
        model.zero_grad()

        # Move the tensor part of the batch onto the configured device.
        input_ids, token_type_ids, attention_mask, labels = (
            batch[position].to(config.device) for position in range(4)
        )

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
        )

        loss = outputs[0]
        loss.backward()

        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
        optimizer.step()
        running_loss += loss.item()

        progress.set_postfix(loss=loss.mean().item())

    mean_loss = running_loss / len(dataloader)
    progress.set_postfix(loss=mean_loss)
    return mean_loss

In [None]:
def valid_epoch(epoch, dataloader, model, tokenizer):
    """Evaluate ``model`` on ``dataloader`` and return loss and token-level F1.

    Args:
        epoch: Index of the current epoch (not used inside the function).
        dataloader: Iterable of 6-item batches as produced by ``collate_fn``: input ids,
            token type ids, attention mask, label ids, offset mappings, character labels.
            Must support ``len()``.
        model: Called with the four tensors as keyword arguments; its first two outputs
            are taken as ``(loss, logits)``.
        tokenizer: Not used inside the function.

    Returns:
        ``(mean_loss, token_f1)`` where ``mean_loss`` is the summed batch loss divided by
        ``len(dataloader)`` and ``token_f1`` is the macro-averaged ``f1_score`` over all
        positions whose label id is not -100.

    Character-level predictions are still computed and collected, but they are not part
    of the return value.
    """
    total_loss = 0.0

    model.eval()
    all_char_preds = []
    all_char_labels = []
    all_token_predictions = []
    all_token_labels = []

    tepoch = tqdm(dataloader, unit="batch", leave=False)
    for batch in tepoch:
        tepoch.set_description("Valid")
        with torch.no_grad():
            input_ids = batch[0].to(config.device)
            token_type_ids = batch[1].to(config.device)
            attention_mask = batch[2].to(config.device)
            labels = batch[3].to(config.device)
            offset_mappings = batch[4]
            char_labels = batch[5]
            inputs = {
                "input_ids": input_ids,
                "token_type_ids": token_type_ids,
                "attention_mask": attention_mask,
                "labels": labels,
            }

            outputs = model(**inputs)

            loss, logits = outputs[:2]
            total_loss += loss.item()

            token_predictions = logits.argmax(dim=2).detach().cpu().numpy()

            char_predictions = token_to_char_label(token_predictions, labels, offset_mappings)
            # Keep only sentences whose character-level prediction lines up with the gold
            # labels. The pairs are filtered into the accumulators instead of being
            # deleted from the lists being iterated: deleting in place skips the element
            # after every removal and also mutates the caller's batch.
            for char_pred, char_label in zip(char_predictions, char_labels):
                if len(char_pred) == len(char_label):
                    all_char_preds.append(char_pred)
                    all_char_labels.append(char_label)

            # Convert the labels once per batch rather than once per element.
            for token_prediction, gold_row in zip(token_predictions, labels.tolist()):
                filtered = []
                filtered_label = []
                for predicted_id, gold_id in zip(token_prediction, gold_row):
                    if gold_id == -100:  # special token or padding
                        continue
                    filtered.append(id2label[predicted_id])
                    filtered_label.append(id2label[gold_id])
                all_token_predictions.append(filtered)
                all_token_labels.append(filtered_label)

        tepoch.set_postfix(loss=loss.mean().item())

    token_f1 = f1_score(all_token_labels, all_token_predictions, average="macro")
    return total_loss / len(dataloader),  token_f1

In [None]:
def token_to_char_label(token_predictions, labels, offset_mapping_batch):
    """Expand token-level predicted ids into one tag string per character.

    Args:
        token_predictions: Per-sentence sequences of predicted label ids.
        labels: Per-sentence sequences of gold label ids; positions equal to -100 are
            dropped from the predictions. Each row must support ``.tolist()``.
        offset_mapping_batch: Per-sentence lists of ``(start, end)`` character offsets.
            A leading and/or trailing ``(0, 0)`` entry is ignored. The lists are not
            modified.

    Returns:
        A list with one list of tag strings per sentence. The first character of a token
        gets the token's tag from ``id2label``; the remaining characters get the matching
        ``"I-"`` tag, or the outside tag if the token is outside. Characters not covered
        by any token (e.g. whitespace between tokens) get the outside tag.

    Raises:
        AssertionError: If the number of kept predictions differs from the number of
            offsets left after removing the ``(0, 0)`` entries.
    """
    # The outside tag is the digit "0", which is what the "0" comparison below expects.
    # Using the same tag for gap characters keeps predictions comparable with labels.
    outside_tag = "0"

    char_predictions = []
    for token_predicts, label, offset_mappings in zip(token_predictions, labels, offset_mapping_batch):

        # Drop positions of special tokens / padding (label id -100).
        filtered = [
            predicted_id
            for predicted_id, gold_id in zip(token_predicts, label.tolist())
            if gold_id != -100
        ]

        # Strip the special-token offsets on a copy, so the caller's list (which belongs
        # to the dataset example) is left untouched.
        spans = list(offset_mappings)
        if spans and spans[0][0] == 0 and spans[0][1] == 0:
            spans = spans[1:]
        if spans and spans[-1][0] == 0 and spans[-1][1] == 0:
            spans = spans[:-1]
        assert len(filtered) == len(spans)

        char_prediction = []
        prev_end = 0
        for token_predict, (start, end) in zip(filtered, spans):
            # Fill every character skipped since the previous token (not just one), so a
            # run of several spaces or leading whitespace keeps the output aligned.
            if start > prev_end:
                char_prediction.extend([outside_tag] * (start - prev_end))
            prev_end = end

            label_str = id2label[token_predict]
            for i in range(end - start):
                if i == 0 or label_str == outside_tag:
                    char_prediction.append(label_str)
                else:
                    # Split on the first hyphen only, so entity types that contain a
                    # hyphen themselves are kept whole.
                    char_prediction.append("I-" + label_str.split("-", 1)[1])
        char_predictions.append(char_prediction)
    return char_predictions

In [None]:
def test_epoch(dataloader, model, tokenizer):
    total_loss = 0.0

    model.eval()
    all_char_preds = []
    all_char_labels = []
    all_token_predictions = []
    all_token_labels = []

    tepoch = tqdm(dataloader, unit="batch")
    for batch in tepoch:
        tepoch.set_description(f"Test")
        with torch.no_grad():

            input_ids = batch[0].to(config.device)
            token_type_ids = batch[1].to(config.device)
            attention_mask = batch[2].to(config.device)
            labels = batch[3].to(config.device)
            offset_mappings = batch[4]
            char_labels = batch[5]

            inputs = {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
                "labels": labels,
            }

            outputs = model(**inputs)

            loss, logits = outputs[:2]
            total_loss += loss.item()

            token_predictions = logits.argmax(dim=2) # logits
            token_predictions = token_predictions.detach().cpu().numpy()

            char_predictions = token_to_char_label(token_predictions, labels, offset_mappings)
            for j, (char_pred, char_label) in enumerate(zip(char_predictions, char_labels)):
                if len(char_pred) != len(char_label):
                    # print(tokenizer.decode(batch[0][j]))
                    del char_predictions[j]
                    del char_labels[j]

            all_char_preds.extend(char_predictions)
            all_char_labels.extend(char_labels)

            for token_prediction, label in zip(token_predictions, labels):
                filtered = []
                filtered_label = []
                for i in range(len(token_prediction)):
                    if label[i].tolist() == -100:
                        continue
                    filtered.append(id2label[token_prediction[i]])
                    filtered_label.append(id2label[label[i].tolist()])
                assert len(filtered) == len(filtered_label)
                all_token_predictions.append(filtered)
                all_token_labels.append(filtered_label)

            tepoch.set_postfix(loss=loss.mean().item())

    token_result = classification_report(all_token_labels, all_token_predictions)
    token_f1 = f1_score(all_token_labels, all_token_predictions, average="macro")
    char_result = classification_report(all_char_labels, all_char_preds)
    char_f1 = f1_score(all_char_labels, all_char_preds)

    print(token_result)
    print(char_result)

    tepoch.set_postfix(loss=total_loss / len(dataloader), token_f1=token_f1, char_f1=char_f1)
    return total_loss / len(dataloader), token_f1, char_f1

In [None]:
# Rebuild `examples` from the test split.
# NOTE(review): this rebinds `examples` and `index`, which the train/valid dataset cells
# above also read; re-running those cells after this one would slice the test data instead.
examples = load_data(전처리(init_config.test_data), tokenizer)
index = int(len(examples) * 0.1)

In [None]:
# Test set: every example built from the test split (no slicing).
test_dataset = NerDataset(
    tokenizer,
    examples,
)

In [None]:
# Test batches in dataset order. The loader must wrap `test_dataset`; wrapping
# `valid_dataset` would make the final "test" score a second validation score.
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

In [None]:
# Lookup tables between tag strings and integer ids, used to turn string labels into
# tensors (label2id) and predicted ids back into tag strings (id2label).
label2id = dict(zip(labels, range(len(labels))))
id2label = {tag_id: tag for tag, tag_id in label2id.items()}

In [None]:
import os
# CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernel launches synchronously (a debugging aid);
# CUDA_VISIBLE_DEVICES=0 restricts the process to the first GPU.
# NOTE(review): presumably these only take effect if set before CUDA is initialised in
# this process — confirm, and consider dropping CUDA_LAUNCH_BLOCKING for real training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
len(train_dataloader)

50

In [None]:
# Train for config.epoch epochs, keep the weights of the epoch with the best validation
# F1, and evaluate those weights on the test set.
model.to(config.device)

best_f1 = 0.0
best_model = None
# CPU copy of the best weights seen so far. Assigning `best_model = model` inside the
# loop would only alias the live model, which keeps training afterwards.
best_state = None

tepoch = trange(config.epoch, position=0, leave=True)
for epoch in tepoch:
    tepoch.set_description(f"Epoch {epoch}")

    train_loss = train_epoch(epoch, model, train_dataloader, optimizer)
    valid_loss, token_f1 = valid_epoch(epoch, valid_dataloader, model, tokenizer)

    if best_f1 < token_f1:
        best_f1 = token_f1
        best_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

    tepoch.set_postfix(valid_f1=token_f1)

# Restore the best checkpoint (if any epoch improved on 0.0) before the final evaluation.
if best_state is not None:
    model.load_state_dict(best_state)
    best_model = model

test_loss, token_f1, char_f1 = test_epoch(test_dataloader, model, tokenizer)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Valid:  72%|███████▏  | 322/447 [01:39<00:36,  3.40batch/s, loss=0.825][A
Valid:  72%|███████▏  | 322/447 [01:39<00:36,  3.40batch/s, loss=1.08] [A
Valid:  72%|███████▏  | 323/447 [01:39<00:36,  3.42batch/s, loss=1.08][A
Valid:  72%|███████▏  | 323/447 [01:39<00:36,  3.42batch/s, loss=1.08][A
Valid:  72%|███████▏  | 323/447 [01:39<00:36,  3.42batch/s, loss=0.77][A
Valid:  72%|███████▏  | 324/447 [01:39<00:35,  3.44batch/s, loss=0.77][A
Valid:  72%|███████▏  | 324/447 [01:39<00:35,  3.44batch/s, loss=0.77][A
Valid:  72%|███████▏  | 324/447 [01:40<00:35,  3.44batch/s, loss=2.14][A
Valid:  73%|███████▎  | 325/447 [01:40<00:35,  3.45batch/s, loss=2.14][A
Valid:  73%|███████▎  | 325/447 [01:40<00:35,  3.45batch/s, loss=2.14][A
Valid:  73%|███████▎  | 325/447 [01:40<00:35,  3.45batch/s, loss=1.14][A
Valid:  73%|███████▎  | 326/447 [01:40<00:35,  3.43batch/s, loss=1.14][A
Valid:  73%|███████▎  | 326/447 [01:40<00:35,  3.43batch/s, 

In [None]:

    # (50, 6, 16, 150)
    # 50 batches; each batch holds the 6 items returned by collate_fn, each item covers 16 examples, and each tensor row is 150 long

In [None]:
model