# KoBERT ESG 합친 후


`필요 환경 및 패키지 설치`

In [None]:
# Install the packages this notebook imports (IPython shell escapes, one per line).
!pip install mxnet
!pip install pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch
# gluonnlp is pinned to version 0.10.0.
!pip install gluonnlp==0.10.0
# kobert_tokenizer is installed from the kobert_hf subdirectory of the SKTBrain/KoBERT repository.
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import json

import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import urllib.request
from google.colab import drive

# Mount Google Drive at /content/drive; the checkpoint and JSON paths used
# in later cells all live under this mount point.
drive.mount('/content/drive')

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer loaded from the 'skt/kobert-base-v1' pretrained name.
kobert_tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

# Bound tokenize method, kept under its own name so it can be passed around
# as a plain callable (BERTDataset takes a `word_tokenizer` argument).
word_tokenizer = kobert_tokenizer.tokenize

# Pretrained encoder; return_dict=False makes it return a tuple, which the
# classifier forward() methods unpack as `_, pooler`.
kobertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)

# Vocabulary built from the tokenizer's sentencepiece vocab file, with
# '[PAD]' as the padding token.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(kobert_tokenizer.vocab_file, padding_token='[PAD]')

`사전 파라미터 지정`

In [4]:
# Notebook-wide settings. None of these names is referenced by the cells
# visible in this notebook; the training-style ones (warmup, epochs, gradient
# clipping, logging, learning rate) presumably mirror the training run --
# TODO confirm they are still needed here.
max_len = 128          # presumably the maximum token sequence length
batch_size = 32
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

`데이터셋 구축 & 모델 구축`

- test input dataset

In [5]:
class BERTDataset(Dataset):
    """Dataset of sentences that were transformed once, at construction time.

    Every element of ``dataset`` is passed through a gluonnlp
    ``BERTSentenceTransform`` in ``__init__`` and the results are kept in
    memory, so indexing is a plain list lookup.

    Args:
        dataset: Iterable of raw inputs handed one by one to the transform.
        word_tokenizer: Tokenizer callable given to ``BERTSentenceTransform``.
        max_len: Forwarded as ``max_seq_length``.
        vocab: Vocabulary forwarded to the transform.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
    """

    def __init__(self, dataset, word_tokenizer, max_len, vocab, pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            word_tokenizer,
            max_seq_length=max_len,
            vocab=vocab,
            pad=pad,
            pair=pair,
        )
        self.transform = to_features
        # Transform eagerly; the item layout is whatever the transform returns
        # (presumably token ids, valid length, segment ids -- TODO confirm).
        self.sentences = list(map(to_features, dataset))

    def __getitem__(self, i):
        """Return the pre-computed features of the ``i``-th input."""
        return self.sentences[i]

    def __len__(self):
        """Return the number of transformed inputs."""
        return len(self.sentences)

- model 정의

In [6]:
class BERT_ESG_Classifier(nn.Module):
    """BERT encoder followed by a linear head with one output per ESG class.

    Args:
        bert: Encoder module. ``forward`` calls it with ``input_ids``,
            ``token_type_ids``, ``attention_mask`` and ``return_dict=False``
            and uses the second element of the returned pair as the pooled
            representation.
        hidden_size: Width of the pooled representation fed to the head.
        num_classes: Number of output scores (3 by default).
        dr_rate: Dropout probability applied to the pooled representation.
            ``None`` or ``0`` disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=3,
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.regressor = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask that is 1 on real tokens and 0 on padding.

        Args:
            token_ids: Tensor of token ids, either ``(seq_len,)`` or
                ``(batch, seq_len)``.
            valid_length: Number of real tokens. A scalar, or one value per
                row of ``token_ids`` (int, sequence, NumPy array or tensor).
                A single value is broadcast over every row.

        Returns:
            Float tensor with the same shape and device as ``token_ids``.
        """
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1)
        positions = torch.arange(token_ids.size(-1), device=token_ids.device)
        if token_ids.dim() == 1:
            mask = positions < lengths[0]
        else:
            # Mask along the token axis of each row. Slicing the first axis
            # (``mask[:valid_length]``) would select whole rows of a batched
            # input and leave the padding positions attended.
            mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).expand_as(token_ids)
        return mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the raw (pre-activation) class scores for the inputs.

        Args:
            token_ids: Token id tensor passed to the encoder as ``input_ids``.
            valid_length: Real-token counts used to build the attention mask.
            segment_ids: Segment id tensor, cast to ``long`` and passed as
                ``token_type_ids``.

        Returns:
            Output of the linear head applied to the pooled representation.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.to(token_ids.device),
                              return_dict=False)
        # Without dropout the pooled output goes straight to the head; the
        # name used to be bound only inside the dropout branch.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.regressor(out)

In [7]:
class BERT_SENT_Classifier(nn.Module):
    """BERT encoder followed by a linear head with one output per sentiment class.

    Args:
        bert: Encoder module. ``forward`` calls it with ``input_ids``,
            ``token_type_ids``, ``attention_mask`` and ``return_dict=False``
            and uses the second element of the returned pair as the pooled
            representation.
        hidden_size: Width of the pooled representation fed to the head.
        num_classes: Number of output scores (2 by default).
        dr_rate: Dropout probability applied to the pooled representation.
            ``None`` or ``0`` disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.regressor = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask that is 1 on real tokens and 0 on padding.

        Args:
            token_ids: Tensor of token ids, either ``(seq_len,)`` or
                ``(batch, seq_len)``.
            valid_length: Number of real tokens. A scalar, or one value per
                row of ``token_ids`` (int, sequence, NumPy array or tensor).
                A single value is broadcast over every row.

        Returns:
            Float tensor with the same shape and device as ``token_ids``.
        """
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1)
        positions = torch.arange(token_ids.size(-1), device=token_ids.device)
        if token_ids.dim() == 1:
            mask = positions < lengths[0]
        else:
            # Mask along the token axis of each row. Slicing the first axis
            # (``mask[:valid_length]``) would select whole rows of a batched
            # input and leave the padding positions attended.
            mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).expand_as(token_ids)
        return mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the raw (pre-activation) class scores for the inputs.

        Args:
            token_ids: Token id tensor passed to the encoder as ``input_ids``.
            valid_length: Real-token counts used to build the attention mask.
            segment_ids: Segment id tensor, cast to ``long`` and passed as
                ``token_type_ids``.

        Returns:
            Output of the linear head applied to the pooled representation.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.to(token_ids.device),
                              return_dict=False)
        # Without dropout the pooled output goes straight to the head; the
        # name used to be bound only inside the dropout branch.
        out = self.dropout(pooler) if self.dr_rate else pooler
        posneg = self.regressor(out)
        return posneg

`모델 파라미터 적용하기`

In [8]:
# Build the ESG and sentiment classifiers.
# Each classifier needs its own encoder instance. The checkpoints restore the
# encoder weights as well as the head, so if both classifiers wrapped the same
# `kobertmodel` object, loading the sentiment checkpoint would overwrite the
# encoder weights just restored for the ESG model.
import copy

esg_model = BERT_ESG_Classifier(kobertmodel, dr_rate=0.5).to(device)
sent_model = BERT_SENT_Classifier(copy.deepcopy(kobertmodel), dr_rate=0.5).to(device)

# ESG model: map_location lets a checkpoint saved on a GPU load on whatever
# device this runtime actually has.
esg_checkpoint = torch.load('/content/drive/MyDrive/model_checkpoint/esg/esg_kobert 최종.pt', map_location=device)
esg_model.load_state_dict(esg_checkpoint['model.state_dict'])

# Sentiment model
sent_checkpoint = torch.load('/content/drive/MyDrive/model_checkpoint/sent/sent_kobert최종.pt', map_location=device)
sent_model.load_state_dict(sent_checkpoint['model.state_dict'])

<All keys matched successfully>

`input data 불러오기`

In [9]:
# Load the pre-tokenized inputs; the scoring loop iterates this dict and
# unpacks each value as (token_ids, valid_length, segment_ids).
# The encoding is given explicitly so decoding does not depend on the
# platform default (the result files are written as utf-8 as well).
with open('/content/drive/MyDrive/토큰화.json', 'r', encoding='utf-8') as file:
    kosdaq_emb_dict = json.load(file)

`Test score 도출`

In [None]:
# Score every input with both classifiers and collect (e, s, g, p, n) per key.
esg_activation = nn.Sigmoid()
sent_activation = nn.Softmax(dim=1)  # currently not applied, see NOTE below
sent_pn = {}

# Inference only: eval() switches the dropout layers off so the scores are
# deterministic, and no_grad() avoids building an autograd graph per input.
esg_model.eval()
sent_model.eval()

with torch.no_grad():
    for i, (sent, features) in enumerate(tqdm_notebook(kosdaq_emb_dict.items())):
        if (i + 1) % 100000 == 0:
            # Periodic snapshot of the results so far. It is written before
            # entry i is scored, so it holds entries 0 .. i-1.
            with open(f'/content/drive/MyDrive/코스닥esgpn점수{i}까지.json', 'w', encoding='utf-8') as file:
                json.dump(sent_pn, file, ensure_ascii=False)
            print(f'{i}번 문장 처리중....')

        token_ids, valid_length, segment_ids = features
        token_ids = torch.tensor(token_ids, dtype=torch.long).to(device)
        segment_ids = torch.tensor(segment_ids, dtype=torch.long).to(device)
        valid_length = np.array(valid_length)

        esg_score = esg_model(token_ids, valid_length, segment_ids)
        sent_score = sent_model(token_ids, valid_length, segment_ids)

        # ESG scores go through a sigmoid, one independent score per class.
        sigmoid_score = esg_activation(esg_score)
        e, s, g = (round(v, 4) for v in sigmoid_score[0][:3].tolist())

        # NOTE: p and n are the raw outputs of sent_model; sent_activation
        # (softmax) is intentionally left unapplied, as in the original cell.
        p, n = (round(v, 4) for v in sent_score[0][:2].tolist())

        sent_pn[sent] = (e, s, g, p, n)