# 모듈 import 및 전역 변수 설정

In [1]:
# Mount Google Drive at /content/drive; the checkpoint and data paths used
# later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 39.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 48.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.2 transformers-4.24.0


In [3]:
pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 5.0 MB/s 
Collecting dill<0.3.6
  Downloading dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
[K     |████████████████████████████████| 95 kB 2.6 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 47.6 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 61.4 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 30.9 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70

In [4]:
import json
import os

import torch
import torch.nn as nn
from tqdm import trange
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
from datasets import load_metric
from sklearn.metrics import f1_score
import pandas as pd
import copy

# Token-id constants (not referenced by the cells visible in this notebook).
S_OPEN_TOKEN, PADDING_TOKEN, S_CLOSE_TOKEN = 0, 1, 2

do_eval = True

# Tokenization / batching.
max_len = 256      # maximum token sequence length
batch_size = 8

# Hugging Face model id of the pretrained encoder.
base_model = 'kykim/electra-kor-base'

# Optimizer / schedule hyperparameters.
learning_rate = 3e-6
eps = 1e-8
num_train_epochs = 60

# Classification head: width of the linear layers and dropout probability.
classifier_hidden_size = 768
classifier_dropout_prob = 0.1

# Polarity labels: list index is the class id; the dict is the inverse lookup.
polarity_id_to_name = ['positive', 'negative', 'neutral']
polarity_name_to_id = {name: idx for idx, name in enumerate(polarity_id_to_name)}

# Prefer the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Placeholder tokens registered with the tokenizer as additional special tokens.
special_tokens_dict = {
    'additional_special_tokens': [
        '&name&',
        '&affiliation&',
        '&social-security-num&',
        '&tel-num&',
        '&card-num&',
        '&bank-account&',
        '&num&',
        '&online-account&',
    ]
}

json 및 jsonl 파일 read, write 함수

In [5]:
def jsonload(fname, encoding="utf-8"):
    """Parse the JSON document stored in ``fname`` and return it.

    Args:
        fname: Path of the JSON file to read.
        encoding: Text encoding used to open the file (default ``"utf-8"``).

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(fname, encoding=encoding) as handle:
        return json.load(handle)

# Save a JSON object under the given file name.
def jsondump(j, fname):
    """Serialize ``j`` as JSON into the file ``fname``.

    The file is opened for writing (replacing any existing content) with
    UTF-8 encoding, and non-ASCII characters are written as-is rather than
    as ``\\uXXXX`` escapes.

    Args:
        j: JSON-serializable object to store.
        fname: Destination file path.
    """
    with open(fname, "w", encoding="UTF8") as sink:
        json.dump(j, sink, ensure_ascii=False)

# Read a JSONL file and collect its records in a list.
def jsonlload(fname, encoding="utf-8"):
    """Load a JSON Lines file: one JSON document per line.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        fname: Path of the ``.jsonl`` file to read.
        encoding: Text encoding used to open the file (default ``"utf-8"``).

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(fname, encoding=encoding) as f:
        # Iterate the file object directly so lines are read one at a time
        # rather than materializing the whole file with readlines().
        return [json.loads(line) for line in f if line.strip()]

# jsonlload('D:\새 폴더\PythonWork\korean_ABSA_baseline-main\nikluge-sa-2022-train.jsonl')

# 모델 정의
`base_model`로 지정한 사전학습 인코더(현재 설정은 `kykim/electra-kor-base`, ELECTRA)를 기반으로 한 classification 모델 이용

In [20]:
class SimpleClassifier(nn.Module):
    """Classification head applied to the first token of a sequence.

    Pipeline: dropout -> linear -> tanh -> dropout -> linear, producing one
    logit per label. Layer width and dropout probability come from the
    module-level ``classifier_hidden_size`` / ``classifier_dropout_prob``.
    """

    def __init__(self, num_label):
        """Build the head.

        Args:
            num_label: Number of output classes (size of the logit vector).
        """
        super().__init__()
        width = classifier_hidden_size
        # Attribute names are kept as-is: they are the prefixes of the
        # parameter names stored in a state_dict.
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(classifier_dropout_prob)
        self.output = nn.Linear(width, num_label)

    def forward(self, features):
        """Compute label logits.

        Args:
            features: 3-D tensor indexed as ``features[:, 0, :]``; index 0 of
                the second axis is the position that gets classified.

        Returns:
            Tensor of logits with ``num_label`` entries in the last axis.
        """
        first_token = features[:, 0, :]
        projected = torch.tanh(self.dense(self.dropout(first_token)))
        return self.output(self.dropout(projected))


class RoBertaBaseClassifier(nn.Module):
    """Pretrained encoder (loaded from ``base_model``) plus a ``SimpleClassifier`` head.

    Despite the class and attribute names, the encoder is whatever
    ``AutoModel.from_pretrained(base_model)`` returns.
    """

    def __init__(self, num_label, len_tokenizer):
        """Load the encoder and attach the classification head.

        Args:
            num_label: Number of output classes.
            len_tokenizer: Tokenizer vocabulary size; the encoder's token
                embedding matrix is resized to this length.
        """
        super().__init__()
        self.num_label = num_label

        encoder = AutoModel.from_pretrained(base_model)
        encoder.resize_token_embeddings(len_tokenizer)
        # The attribute name is part of every encoder parameter's name in a
        # state_dict, so it is kept unchanged.
        self.xlm_roberta = encoder

        self.labels_classifier = SimpleClassifier(num_label)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and the head, optionally computing the loss.

        Args:
            input_ids: Token-id tensor passed to the encoder.
            attention_mask: Attention-mask tensor passed to the encoder.
            labels: Optional target class ids. When given, the
                cross-entropy loss against the logits is returned as well.

        Returns:
            ``(loss, logits)`` where ``loss`` is ``None`` if ``labels`` is
            ``None``.
        """
        encoder_out = self.xlm_roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=None,  # segment ids are deliberately not forwarded
        )
        # The first element of the encoder output is what the head consumes.
        logits = self.labels_classifier(encoder_out[0])

        if labels is None:
            return None, logits

        criterion = nn.CrossEntropyLoss()
        loss = criterion(logits.view(-1, self.num_label), labels.view(-1))
        return loss, logits


# 모델 평가

학습된 모델을 바탕으로 국어원 데이터 형태를 만드는 방법 예시

테스트 데이터에 대한 평가

In [None]:
import re

In [None]:
# Re-predict only the polarity of each annotation.
def predict_from_korean_form(tokenizer, pc_model, data):
    """Replace each annotation's polarity with the model's prediction.

    For every sentence, each existing annotation's first element is paired
    with the sentence text, classified by ``pc_model``, and the sentence's
    ``'annotation'`` list is rewritten as ``[first_element, polarity_name]``
    pairs. ``data`` is modified in place and also returned.

    Args:
        tokenizer: Callable tokenizer returning ``'input_ids'`` and
            ``'attention_mask'`` for a text pair.
        pc_model: Polarity classifier called as
            ``pc_model(input_ids, attention_mask)`` and returning
            ``(loss, logits)``.
        data: Iterable of dicts with ``'sentence_form'`` (text) and
            ``'annotation'`` (sequence of indexable annotations).

    Returns:
        The same ``data`` object with updated ``'annotation'`` lists.
    """
    for sentence in data:
        form = sentence['sentence_form']

        relabelled = []
        for annotation in sentence['annotation']:
            # First element of the annotation (presumably the aspect/entity
            # category; verify against the data schema).
            category = annotation[0]

            # Use the notebook-wide max_len instead of a duplicated literal so
            # inference stays in sync with the configured sequence length.
            tokenized_data = tokenizer(form, category, padding='max_length', max_length=max_len, truncation=True)

            # Wrap in a list to form a batch of one.
            input_ids = torch.tensor([tokenized_data['input_ids']]).to(device)
            attention_mask = torch.tensor([tokenized_data['attention_mask']]).to(device)

            with torch.no_grad():
                _, pc_logits = pc_model(input_ids, attention_mask)

            # Convert the predicted class to a plain int before using it as a
            # list index.
            polarity_id = torch.argmax(pc_logits, dim=-1)[0].item()
            relabelled.append([category, polarity_id_to_name[polarity_id]])

        sentence['annotation'] = relabelled

    return data

In [10]:
# Path of the prediction (pred_data) JSON file whose polarities will be replaced.
# Read as a global by test_sentiment_analysis_save().
test_data_path = '/content/drive/MyDrive/메인프로젝트(5조)/말뭉치 경진대회/H.O.F/11_07/entity_ensemble/11.07_63.89+high_single_1_3_2.json'

In [None]:
def test_sentiment_analysis_save(
    polarity_model_path='/content/drive/MyDrive/메인프로젝트(5조)/말뭉치 경진대회/data_train_dev_polarity_ver.2/saved_model_epoch_7.pt',
    output_path='/content/drive/MyDrive/메인프로젝트(5조)/말뭉치 경진대회/H.O.F/11_08/최고점_갱신_63.90/11.07_63.89+high_single_132_polarity_change_data_7.json',
    data_path=None,
):
    """Re-predict polarities for a prediction file and save the result.

    Loads the tokenizer for ``base_model`` (with the extra special tokens),
    restores the polarity classifier from a checkpoint, rewrites every
    annotation's polarity via ``predict_from_korean_form`` and writes the
    result as JSON.

    Args:
        polarity_model_path: Path of the polarity model ``.pt`` state_dict.
        output_path: Path of the JSON file to write.
        data_path: Path of the input prediction JSON. When ``None`` the
            module-level ``test_data_path`` is used (the original behaviour).

    Returns:
        None. The result is written to ``output_path``.
    """
    if data_path is None:
        data_path = test_data_path

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    # Register the placeholder tokens so len(tokenizer) matches the embedding
    # size the model is resized to below.
    tokenizer.add_special_tokens(special_tokens_dict)

    polarity_model = RoBertaBaseClassifier(len(polarity_id_to_name), len(tokenizer))
    polarity_model.load_state_dict(torch.load(polarity_model_path, map_location=device))
    polarity_model.to(device)
    polarity_model.eval()  # disable dropout for inference

    # The loaded data is used only here, so it can be updated in place
    # without a defensive copy.
    pred_data = predict_from_korean_form(tokenizer, polarity_model, jsonload(data_path))

    jsondump(pred_data, output_path)

In [None]:
# Run the polarity re-prediction and write the result file.
test_sentiment_analysis_save()

Some weights of the model checkpoint at kykim/electra-kor-base were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
