<a href="https://colab.research.google.com/github/hr1588/NLP/blob/main/v12_all.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 데이터셋 로드 (전체 데이터 사용 - OntoNotes 5.0 / english_v12)

In [None]:
from datasets import load_dataset, DatasetDict

In [None]:
# Load the 'english_v12' configuration of the "conll2012_ontonotesv5" dataset
# and display the resulting DatasetDict.
v12_dataset = load_dataset("conll2012_ontonotesv5", 'english_v12')
v12_dataset

DatasetDict({
    train: Dataset({
        features: ['document_id', 'sentences'],
        num_rows: 10539
    })
    validation: Dataset({
        features: ['document_id', 'sentences'],
        num_rows: 1370
    })
    test: Dataset({
        features: ['document_id', 'sentences'],
        num_rows: 1200
    })
})

In [None]:
v12_dataset['train']['sentences'][0][0]['words']

['What', 'kind', 'of', 'memory', '?']

In [None]:
# Print the (rows, columns) shape of every split.
for split_name, split_data in v12_dataset.items():
    print(f"{split_name}: {split_data.shape}")

train: (10539, 2)
validation: (1370, 2)
test: (1200, 2)


# 필요한 데이터만 추출

In [None]:
import numpy as np

# Each document holds a different number of sentences, so the nested lists are
# ragged. Without an explicit dtype=object NumPy emits a deprecation warning
# for such input on older releases and raises ValueError on NumPy >= 1.24.
# With dtype=object the result is a 1-D object array whose elements are the
# per-document lists of sentence dicts.
train_array = np.array(v12_dataset['train']['sentences'], dtype=object)
val_array = np.array(v12_dataset['validation']['sentences'], dtype=object)
test_array = np.array(v12_dataset['test']['sentences'], dtype=object)

  train_array = np.array(v12_dataset['train']['sentences'])
  val_array = np.array(v12_dataset['validation']['sentences'])
  test_array = np.array(v12_dataset['test']['sentences'])


In [None]:
train_array[0][0]['words']

['What', 'kind', 'of', 'memory', '?']

In [None]:
def extract_values(list_of_dicts):
    """Reduce each sentence record to its tokens and NER labels.

    Args:
        list_of_dicts: iterable of mappings that each provide the keys
            'words' and 'named_entities'.

    Returns:
        A list with one ``{'words': ..., 'tags': ...}`` dict per input record,
        in the same order; 'tags' holds the record's 'named_entities' value.

    Raises:
        KeyError: if a record lacks 'words' or 'named_entities'.
    """
    slimmed = []
    for sentence in list_of_dicts:
        slimmed.append({'words': sentence['words'], 'tags': sentence['named_entities']})
    return slimmed

In [None]:
# Apply extract_values to every document of each split; the result is one list
# of {'words', 'tags'} sentence dicts per document.
train_values = [extract_values(document) for document in train_array]
val_values = [extract_values(document) for document in val_array]
test_values = [extract_values(document) for document in test_array]

In [None]:
train_values[0][0]

{'words': ['What', 'kind', 'of', 'memory', '?'], 'tags': [0, 0, 0, 0, 0]}

In [None]:
len(train_values)

10539

In [None]:
len(train_values[0])

235

In [None]:
from datasets import Dataset, DatasetDict


def _flatten_documents(documents):
    """Concatenate the sentence records of all documents into one flat list."""
    return [
        {"words": sentence["words"], "tags": sentence["tags"]}
        for document in documents
        for sentence in document
    ]


def _records_to_dataset(records):
    """Build a Dataset with 'words' and 'tags' columns from flat sentence records."""
    return Dataset.from_dict({
        "words": [record["words"] for record in records],
        "tags": [record["tags"] for record in records],
    })


# One flat list of sentence records per split (documents are no longer kept apart).
train_dict = _flatten_documents(train_values)
val_dict = _flatten_documents(val_values)
test_dict = _flatten_documents(test_values)

# One sentence per row.
train_dataset = _records_to_dataset(train_dict)
val_dataset = _records_to_dataset(val_dict)
test_dataset = _records_to_dataset(test_dict)

# Note that the validation split is stored under the key "val" here.
dataset_dict = DatasetDict({
    "train": train_dataset,
    "val": val_dataset,
    "test": test_dataset
})

In [None]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['words', 'tags'],
        num_rows: 115812
    })
    val: Dataset({
        features: ['words', 'tags'],
        num_rows: 15680
    })
    test: Dataset({
        features: ['words', 'tags'],
        num_rows: 12217
    })
})

In [None]:
# Token count of every training sentence, in document order.
#
# The previous loop appended len(train_values[j]), i.e. it used the sentence
# index j to look up a *document*, so each recorded value was the sentence
# count of an unrelated document. Iterating over the sentences directly removes
# the index mix-up; the list still has one entry per sentence.
# NOTE(review): assumes the intended quantity is the per-sentence token count —
# confirm against how each_length is used downstream.
each_length = [
    len(sentence['words'])
    for document in train_values
    for sentence in document
]

In [None]:
len(each_length) # 길이 확인

115812

In [None]:
each_length[0]

235

# 데이터 정제

## feature의 key, value 확인

In [None]:
# Print the declared feature type of each column in the training split.
train_features = dataset_dict['train'].features
for key, value in train_features.items():
    print(f"{key} : {value}")

words : Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
tags : Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)


In [None]:
# Show every field of the first training example.
element = dataset_dict['train'][0]
for key in element:
    value = element[key]
    print(f"{key}: {value}")

words: ['What', 'kind', 'of', 'memory', '?']
tags: [0, 0, 0, 0, 0]


In [None]:
# Inner feature type of the 'tags' column in the rebuilt training split.
tags = dataset_dict['train'].features['tags'].feature
tags # no label names attached yet (plain integer Value)

Value(dtype='int64', id=None)

In [None]:
# Feature definition of 'named_entities' in the original dataset; its inner
# ClassLabel carries the label names.
v12_tags = v12_dataset['train'].features['sentences'][0]['named_entities']
v12_tags

Sequence(feature=ClassLabel(names=['O', 'B-PERSON', 'I-PERSON', 'B-NORP', 'I-NORP', 'B-FAC', 'I-FAC', 'B-ORG', 'I-ORG', 'B-GPE', 'I-GPE', 'B-LOC', 'I-LOC', 'B-PRODUCT', 'I-PRODUCT', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-PERCENT', 'I-PERCENT', 'B-MONEY', 'I-MONEY', 'B-QUANTITY', 'I-QUANTITY', 'B-ORDINAL', 'I-ORDINAL', 'B-CARDINAL', 'I-CARDINAL', 'B-EVENT', 'I-EVENT', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'B-LAW', 'I-LAW', 'B-LANGUAGE', 'I-LANGUAGE'], id=None), length=-1, id=None)

In [None]:
len(v12_tags.feature.names)

37

In [None]:
from datasets import DatasetDict, ClassLabel

# Rebuild a ClassLabel from the label names of the original dataset.
original_label_names = v12_tags.feature.names
num_classes = len(original_label_names)
ner_labels = ClassLabel(names=original_label_names, num_classes=num_classes)

In [None]:
# Replace the inner feature of each split's 'tags' column with the ClassLabel.
# NOTE(review): this mutates the features object in place rather than casting
# the column — confirm this is sufficient for downstream consumers.
for split in ('train', 'val', 'test'):
    tag_sequence = dataset_dict[split].features['tags']
    tag_sequence.feature = ner_labels

In [None]:
# Re-read the inner feature of 'tags' after the in-place replacement.
tags = dataset_dict['train'].features['tags'].feature
tags # label names are now attached

ClassLabel(names=['O', 'B-PERSON', 'I-PERSON', 'B-NORP', 'I-NORP', 'B-FAC', 'I-FAC', 'B-ORG', 'I-ORG', 'B-GPE', 'I-GPE', 'B-LOC', 'I-LOC', 'B-PRODUCT', 'I-PRODUCT', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-PERCENT', 'I-PERCENT', 'B-MONEY', 'I-MONEY', 'B-QUANTITY', 'I-QUANTITY', 'B-ORDINAL', 'I-ORDINAL', 'B-CARDINAL', 'I-CARDINAL', 'B-EVENT', 'I-EVENT', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'B-LAW', 'I-LAW', 'B-LANGUAGE', 'I-LANGUAGE'], id=None)

## str 파생변수 제작

In [None]:
def create_tag_names(batch):
    """Translate the integer tags of one example into their string names.

    Despite the parameter name, this is mapped without ``batched=True`` in this
    notebook, so ``batch['tags']`` is the tag list of a single sentence.

    Args:
        batch: mapping with a 'tags' entry holding integer tag ids.

    Returns:
        ``{"ner_tags_str": [...]}`` with one name per tag id, looked up through
        the module-level ``tags`` object's ``int2str``.
    """
    tag_names = []
    for idx in batch['tags']:
        tag_names.append(tags.int2str(idx))
    return {"ner_tags_str": tag_names}

In [None]:
# Add the 'ner_tags_str' column to every split by mapping create_tag_names.
ner_dataset = dataset_dict.map(create_tag_names)
ner_dataset

Map:   0%|          | 0/115812 [00:00<?, ? examples/s]

Map:   0%|          | 0/15680 [00:00<?, ? examples/s]

Map:   0%|          | 0/12217 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['words', 'tags', 'ner_tags_str'],
        num_rows: 115812
    })
    val: Dataset({
        features: ['words', 'tags', 'ner_tags_str'],
        num_rows: 15680
    })
    test: Dataset({
        features: ['words', 'tags', 'ner_tags_str'],
        num_rows: 12217
    })
})

## ner tag 빈도 확인

In [None]:
from collections import Counter, defaultdict
import pandas as pd

# For each split, count entities per type. Only tags starting with 'B' are
# counted, so each counted tag contributes once regardless of how many other
# tags follow it.
split2freqs = defaultdict(Counter)

for split, dataset in ner_dataset.items():
    for row in dataset['ner_tags_str']:
        entity_starts = (tag for tag in row if tag.startswith('B'))
        for tag in entity_starts:
            # "B-ORG" -> "ORG"
            split2freqs[split][tag.split("-")[1]] += 1

# Entity types as rows, splits as columns, sorted from most to least frequent.
freq_table = pd.DataFrame.from_dict(split2freqs, orient='index').T
freq_table.sort_values(['train', 'val', 'test'], ascending=[False, False, False])

Unnamed: 0,train,val,test
ORG,24163,3798,2002
PERSON,22035,3163,2134
GPE,21938,3649,2546
DATE,18791,3208,1787
CARDINAL,10901,1720,1005
NORP,9341,1277,990
MONEY,5217,853,355
PERCENT,3802,656,408
ORDINAL,2195,335,207
LOC,2160,316,215


- 3가지 데이터 모두 비슷한 추세를 보이고 있음을 확인

# 모델링

In [None]:
!pip install transformers -qqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
import torch.nn as nn
import numpy as np

from transformers import AutoTokenizer
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel
from transformers import AutoConfig
from transformers import TrainingArguments
from transformers import XLMRobertaForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import Trainer

from huggingface_hub import notebook_login
from seqeval.metrics import f1_score, accuracy_score

# Checkpoint identifiers passed to from_pretrained below.
bert_model_name = 'bert-base-cased'
xlmr_model_name = 'xlm-roberta-base'

# Load one tokenizer per checkpoint.
# NOTE(review): bert_tokenizer is not referenced again in the visible part of
# this notebook — confirm whether it is still needed.
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """Token classifier: a RoBERTa encoder followed by dropout and a linear layer.

    NOTE(review): this definition reuses the name of the
    ``XLMRobertaForTokenClassification`` class imported from ``transformers``
    earlier in this cell; from here on the name refers to this implementation.
    """

    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the encoder and the classification head from ``config``.

        Reads ``num_labels``, ``hidden_dropout_prob`` and ``hidden_size`` from
        the config.
        """
        super().__init__(config)
        self.num_labels = config.num_labels

        # Encoder body, created without the pooling layer.
        self.roberta = RobertaModel(config, add_pooling_layer=False)

        # Token classification head.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Run the base class's weight initialization.
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None, **kwargs):
        """Compute per-token logits and, when ``labels`` is given, the loss.

        Extra keyword arguments are forwarded unchanged to the encoder.

        Returns:
            TokenClassifierOutput carrying ``loss`` (None without labels),
            ``logits``, and the encoder's ``hidden_states`` and ``attentions``.
        """
        encoder_outputs = self.roberta(input_ids, attention_mask=attention_mask,
                                       token_type_ids=token_type_ids, **kwargs)

        # The first element of the encoder output feeds the head.
        token_states = self.dropout(encoder_outputs[0])
        logits = self.classifier(token_states)

        if labels is None:
            loss = None
        else:
            # Flatten so that every token position is one classification sample.
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = nn.CrossEntropyLoss()(flat_logits, flat_labels)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


# Lookup tables between integer ids and label names, in both directions.
index2tag = dict(enumerate(v12_tags.feature.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}


# Configuration for the checkpoint, extended with the label count and the
# id <-> label mappings.
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name,
    num_labels=len(tags.names),
    id2label=index2tag,
    label2id=tag2index,
)



# Use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# NOTE(review): the Trainer below is built with model_init, so this instance
# does not appear to be the one that gets trained — confirm it is needed.
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(
    xlmr_model_name, config=xlmr_config)
xlmr_model = xlmr_model.to(device)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and spread the word-level tags over the tokens.

    For every token position, the tag of its word is used only when the
    position's word id differs from the previous position's word id. Positions
    whose word id is None, or equal to the previous one, get the label -100.

    Args:
        examples: batch mapping with "words" (lists of words) and "tags"
            (lists of integer tags, one per word).

    Returns:
        The tokenizer output with an added "labels" entry holding one label
        list per example.
    """
    tokenized_inputs = xlmr_tokenizer(examples["words"], truncation=True,
                                      is_split_into_words=True)
    aligned_labels = []
    for batch_index, word_labels in enumerate(examples["tags"]):
        label_ids = []
        last_word_idx = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_index):
            starts_new_word = word_idx is not None and word_idx != last_word_idx
            label_ids.append(word_labels[word_idx] if starts_new_word else -100)
            last_word_idx = word_idx
        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

def encode_dataset(corpus):
    """Tokenize ``corpus`` in batches and drop the raw 'tags' and 'words' columns.

    Args:
        corpus: object exposing ``map`` (here a DatasetDict).

    Returns:
        Whatever ``corpus.map`` returns for tokenize_and_align_labels.
    """
    columns_to_drop = ['tags', 'words']
    return corpus.map(tokenize_and_align_labels, batched=True,
                      remove_columns=columns_to_drop)

# Tokenized version of every split, with aligned "labels".
data_encoded = encode_dataset(ner_dataset)

def align_predictions(predictions, labels):
    """Convert raw scores and label ids into per-example lists of tag names.

    Positions whose label id is -100 are dropped from both outputs.

    Args:
        predictions: array of scores; the argmax over axis 2 is taken as the
            predicted tag id.
        labels: 2-D array of label ids aligned with ``predictions``.

    Returns:
        ``(preds_list, labels_list)``: two lists with one list of tag-name
        strings per example, looked up through ``index2tag``.
    """
    pred_ids = np.argmax(predictions, axis=2)
    n_examples, n_positions = pred_ids.shape
    preds_list, labels_list = [], []

    for row in range(n_examples):
        # Keep only positions that carry a real label.
        kept = [col for col in range(n_positions) if labels[row, col] != -100]
        labels_list.append([index2tag[labels[row][col]] for col in kept])
        preds_list.append([index2tag[pred_ids[row][col]] for col in kept])

    return preds_list, labels_list

# Training hyper-parameters.
num_epochs = 2
batch_size = 16
# Log once per epoch. max(1, ...) keeps the value valid when the training split
# is smaller than one batch (e.g. when debugging on a small subset), where the
# plain integer division would yield 0.
logging_steps = max(1, len(data_encoded["train"]) // batch_size)
# NOTE(review): the "-finetuned-panx-de" suffix does not match the OntoNotes
# data used in this notebook — consider renaming the output directory.
model_name = f"{xlmr_model_name}-finetuned-panx-de"

# save_steps is an integer step count; it is set very high here.
training_args = TrainingArguments(
    output_dir=model_name, log_level="error", num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size, evaluation_strategy="epoch",
    save_steps=1_000_000, weight_decay=0.01, disable_tqdm=False,
    logging_steps=logging_steps, push_to_hub=False)

# Interactive Hugging Face Hub login.
# NOTE(review): training_args sets push_to_hub=False, so this login may be
# unnecessary — confirm.
notebook_login()


Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
def compute_metrics(eval_pred):
    """Compute the F1 score for one evaluation pass.

    Args:
        eval_pred: object exposing ``predictions`` and ``label_ids``.

    Returns:
        ``{"f1": score}`` where the score comes from ``f1_score`` applied to the
        aligned gold and predicted tag sequences.
    """
    predicted_tags, gold_tags = align_predictions(eval_pred.predictions,
                                                  eval_pred.label_ids)
    score = f1_score(gold_tags, predicted_tags)
    return {"f1": score}


# Collator built from the XLM-R tokenizer; passed to the Trainer below.
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

def model_init():
    """Load a new model from the XLM-R checkpoint and move it to ``device``.

    Returns:
        The XLMRobertaForTokenClassification instance placed on ``device``.
    """
    model = XLMRobertaForTokenClassification.from_pretrained(
        xlmr_model_name, config=xlmr_config)
    return model.to(device)

# Trains on the "train" split and evaluates on the "val" split.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=data_encoded["train"],
    eval_dataset=data_encoded["val"],
    tokenizer=xlmr_tokenizer,
)

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1
1,0.1044,0.08408,0.842507


KeyboardInterrupt: ignored

- 코랩 무료 버전에서는 1 epoch만 확인