# KoBERT ESG 합친 후


`필요 환경 및 패키지 설치`

In [None]:
!pip install mxnet
!pip install pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install gluonnlp==0.10.0
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import json

import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import urllib.request
from google.colab import drive

# Mount Google Drive; the checkpoint, input and output paths used later in
# this notebook all live under /content/drive.
drive.mount('/content/drive')

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Pretrained KoBERT tokenizer; `word_tokenizer` is its bound tokenize method.
kobert_tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
word_tokenizer = kobert_tokenizer.tokenize

# Pretrained KoBERT encoder, configured to return tuples instead of a dict.
kobertmodel = BertModel.from_pretrained(
    'skt/kobert-base-v1',
    return_dict=False,
)

# gluonnlp vocabulary built from the tokenizer's sentencepiece file.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(
    kobert_tokenizer.vocab_file,
    padding_token='[PAD]',
)

`사전 파라미터 지정`

In [4]:
# Settings used by the inference code in this notebook.
max_len = 128      # passed to BERTDataset as the maximum sequence length
batch_size = 32    # DataLoader batch size

# Training-related settings. They are not referenced by the code visible in
# this notebook -- presumably carried over from the training notebook.
learning_rate = 5e-5
num_epochs = 5
warmup_ratio = 0.1
max_grad_norm = 1
log_interval = 200

`데이터셋 구축 & 모델 구축`

- test input dataset

In [5]:
class BERTDataset(Dataset):
    """Label-free dataset that turns raw sentences into BERT model inputs.

    Every sentence is transformed once, in the constructor, with
    ``nlp.data.BERTSentenceTransform``; ``__getitem__`` only indexes the
    cached results. The prediction functions in this notebook unpack each
    item as ``(token_ids, valid_length, segment_ids)``.

    Args:
        dataset: Iterable of ``str`` sentences.
        word_tokenizer: Callable that splits a string into tokens.
        max_len: Maximum sequence length given to the transform.
        vocab: Vocabulary used by the transform to map tokens to ids.
        pad: Forwarded to the transform's ``pad`` argument.
        pair: Forwarded to the transform's ``pair`` argument.
    """

    def __init__(self, dataset, word_tokenizer, max_len, vocab , pad, pair) :
        self.transform = nlp.data.BERTSentenceTransform(
            word_tokenizer, max_seq_length = max_len, vocab = vocab, pad = pad, pair = pair)
        # Non-breaking spaces are removed outright (not replaced by a space).
        self.dataset = [sent.replace('\xa0', '') for sent in dataset]
        # The transform expects a sequence of strings and reads element 0 as
        # the text. Handing it a bare str would make it tokenize only the
        # first character, so each sentence is wrapped in a one-item list.
        self.sentences = [self.transform([sent]) for sent in self.dataset]

    def __getitem__(self, i):
        return self.sentences[i]

    def __len__(self):
        return len(self.sentences)

- esg model 정의

In [6]:
class BERT_ESG_Classifier(nn.Module):
    """BERT encoder with a single linear head that emits ESG logits.

    ``forward`` returns the raw output of the linear layer; no activation is
    applied here. The inference loop in this notebook applies a sigmoid to
    the result and reads the three columns as E, S and G, in that order.

    Args:
        bert: Encoder called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``return_dict=False``; its second return
            value is used as the pooled sentence representation.
        hidden_size: Width of the pooled representation.
        num_classes: Number of output logits.
        dr_rate: Dropout probability applied to the pooled output. When
            falsy (``None`` or ``0``) no dropout layer is created or applied.
        params: Unused; kept for interface compatibility.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 3,
                 dr_rate = None,
                 params = None):
        super().__init__()
        # Attribute names (bert / regressor / dropout) double as state_dict
        # key prefixes, so they must not be renamed.
        self.bert = bert
        self.dr_rate = dr_rate
        self.regressor = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p = dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``: 1 for the first
        ``valid_length[i]`` positions of row ``i``, 0 elsewhere."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return logits of shape ``(batch, num_classes)``."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(
            input_ids = token_ids,
            token_type_ids = segment_ids.long(),
            attention_mask = attention_mask.float().to(token_ids.device),
            return_dict = False,
        )
        # Without a dropout rate the pooled output goes straight to the head.
        # Previously `out` was left unbound in that case and the next line
        # raised UnboundLocalError.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.regressor(out)

- sent model 정의

In [7]:
class BERT_SENT_Classifier(nn.Module):
    """BERT encoder with a single linear head that emits sentiment logits.

    ``forward`` returns the raw output of the linear layer; no activation is
    applied here. The inference loop in this notebook applies a softmax to
    the result and stores column 0 as ``P`` and column 1 as ``N``.

    Args:
        bert: Encoder called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``return_dict=False``; its second return
            value is used as the pooled sentence representation.
        hidden_size: Width of the pooled representation.
        num_classes: Number of output logits.
        dr_rate: Dropout probability applied to the pooled output. When
            falsy (``None`` or ``0``) no dropout layer is created or applied.
        params: Unused; kept for interface compatibility.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 2,
                 dr_rate = None,
                 params = None):
        super().__init__()
        # Attribute names (bert / regressor / dropout) double as state_dict
        # key prefixes, so they must not be renamed.
        self.bert = bert
        self.dr_rate = dr_rate
        self.regressor = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p = dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``: 1 for the first
        ``valid_length[i]`` positions of row ``i``, 0 elsewhere."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return logits of shape ``(batch, num_classes)``."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(
            input_ids = token_ids,
            token_type_ids = segment_ids.long(),
            attention_mask = attention_mask.float().to(token_ids.device),
            return_dict = False,
        )
        # Without a dropout rate the pooled output goes straight to the head.
        # Previously `out` was left unbound in that case and the next line
        # raised UnboundLocalError.
        out = self.dropout(pooler) if self.dr_rate else pooler
        posneg = self.regressor(out)
        return posneg

`모델 파라미터 적용하기`

In [8]:
# Build the ESG and sentiment models.
import copy

# Each classifier needs its own encoder. If both wrapped the same
# `kobertmodel` object, loading the sentiment checkpoint below would also
# overwrite the encoder weights used by the ESG model.
sent_bert = copy.deepcopy(kobertmodel)
esg_model = BERT_ESG_Classifier(kobertmodel,  dr_rate = 0.5).to(device)
sent_model = BERT_SENT_Classifier(sent_bert,  dr_rate = 0.5).to(device)

# Apply the saved .pt checkpoints.
# map_location lets checkpoints saved on a GPU load on a CPU-only runtime.

# esg model
esg_checkpoint = torch.load('/content/drive/MyDrive/model_checkpoint/esg/esg_kobert 최종.pt', map_location = device)
esg_model.load_state_dict(esg_checkpoint['model.state_dict'])
# sent model
sent_checkpoint = torch.load('/content/drive/MyDrive/model_checkpoint/sent/sent_kobert최종.pt', map_location = device)
sent_model.load_state_dict(sent_checkpoint['model.state_dict'])

<All keys matched successfully>

`input data 불러오기`

In [9]:
# Load the input data. The encoding is given explicitly so the Korean text is
# decoded the same way regardless of the runtime's default locale.
with open('/content/drive/MyDrive/kosdaq_input_data/total_kosdaq_result.json', 'r', encoding='utf-8') as kosdaq_json:
    company_date_sentence = json.load(kosdaq_json)

In [None]:
# Inspect the structure: show the top-level keys of the loaded JSON
# (the inference loop below iterates these as company names).
company_date_sentence.keys()

In [61]:
# Inspect the structure: show the keys stored under one company entry
# (the recorded output lists year strings).
company_date_sentence['에스엘에스바이오'].keys()

dict_keys(['2019', '2020', '2021', '2022', '2023', '2024'])

`예측 함수`

In [10]:
def esg_predict(model, test_loader, device):
    """Return the mean ESG logits over every sentence yielded by a loader.

    The function does not switch the model to eval mode or disable gradient
    tracking; the caller is responsible for both.

    Args:
        model: Callable taking ``(token_ids, valid_length, segment_ids)`` and
            returning a ``(batch, 3)`` tensor.
        test_loader: Iterable of ``(token_ids, valid_length, segment_ids)``
            batches.
        device: Device the token and segment ids are moved to, and on which
            the result is accumulated.

    Returns:
        Tensor of shape ``(1, 3)`` holding the per-sentence mean of the
        model outputs.

    Raises:
        ValueError: If ``test_loader`` yields no sentences.
    """
    esg_sum = torch.zeros((1, 3)).to(device)
    n_sentences = 0

    for token_ids, valid_length, segment_ids in test_loader:
        token_ids = token_ids.to(device)
        segment_ids = segment_ids.to(device)

        # Use the model that was passed in, not the notebook-level global.
        esg_out = model(token_ids, valid_length, segment_ids).to(device)

        # Sum rows and count them so every sentence has equal weight.
        # Averaging per-batch means would over-weight a short final batch.
        esg_sum += esg_out.sum(dim = 0)
        n_sentences += esg_out.shape[0]

    if n_sentences == 0:
        raise ValueError('esg_predict: test_loader yielded no sentences')

    total_esg = esg_sum / n_sentences

    return total_esg

In [11]:
def sent_predict(model, test_loader, device):
    """Return the mean sentiment logits over every sentence from a loader.

    The function does not switch the model to eval mode or disable gradient
    tracking; the caller is responsible for both.

    Args:
        model: Callable taking ``(token_ids, valid_length, segment_ids)`` and
            returning a ``(batch, 2)`` tensor.
        test_loader: Iterable of ``(token_ids, valid_length, segment_ids)``
            batches.
        device: Device the token and segment ids are moved to, and on which
            the result is accumulated.

    Returns:
        Tensor of shape ``(1, 2)`` holding the per-sentence mean of the
        model outputs.

    Raises:
        ValueError: If ``test_loader`` yields no sentences.
    """
    sent_sum = torch.zeros((1, 2)).to(device)
    n_sentences = 0

    for token_ids, valid_length, segment_ids in test_loader:
        token_ids = token_ids.to(device)
        segment_ids = segment_ids.to(device)

        # Use the model that was passed in, not the notebook-level global.
        sent_out = model(token_ids, valid_length, segment_ids).to(device)

        # Sum rows and count them so every sentence has equal weight.
        # Averaging per-batch means would over-weight a short final batch.
        sent_sum += sent_out.sum(dim = 0)
        n_sentences += sent_out.shape[0]

    if n_sentences == 0:
        raise ValueError('sent_predict: test_loader yielded no sentences')

    total_sent = sent_sum / n_sentences

    return total_sent

In [12]:
# One independent list per output column; the inference loop appends one
# value to each list for every (company, year) pair it processes.
기업, 년도 = [], []
E, S, G = [], [], []
P, N = [], []

# Not referenced by the code visible in this notebook -- presumably meant for
# periodic checkpointing of results.
save_interval = 50

In [None]:
# Activations applied to the averaged logits of each model.
esg_activation = nn.Sigmoid()
sent_activation = nn.Softmax(dim=1)
esg_model.eval()
sent_model.eval()

# Companies whose enumeration index is <= resume_after are skipped; this run
# only covers the tail of the company list (see the output file name).
resume_after = 1698

with torch.no_grad() :
    for i, corp in enumerate(tqdm_notebook(company_date_sentence)) : # company name
        if i <= resume_after :
            continue
        print(f'[LOG] {i+1} corp processing.. ')

        # Mapping of date -> list of sentences for this company.
        for date, sentences in company_date_sentence[corp].items() :
            input_data = [str(sent) for sent in sentences]

            if not input_data :
                # Nothing to score for this (company, date). Record NaN so
                # the row is kept and the columns stay aligned.
                e_score = s_score = g_score = p_score = n_score = float('nan')
            else :
                input_dataset = BERTDataset(input_data, word_tokenizer, max_len, vocab, True, False)
                input_dataloader = torch.utils.data.DataLoader(input_dataset, batch_size = batch_size, num_workers = 2)

                total_esg = esg_activation(esg_predict(esg_model, input_dataloader, device))
                total_sent = sent_activation(sent_predict(sent_model, input_dataloader, device))

                e_score, s_score, g_score = (round(v, 4) for v in total_esg[0].tolist())
                # NOTE(review): column 0 is stored as P and column 1 as N;
                # confirm this matches the label order used in training.
                p_score, n_score = (round(v, 4) for v in total_sent[0].tolist())

            # Append to all seven columns together, only after every score
            # for this row exists. If a later company fails, the lists still
            # have equal lengths and can be turned into a DataFrame.
            기업.append(corp)
            년도.append(date)
            E.append(e_score)
            S.append(s_score)
            G.append(g_score)
            P.append(p_score)
            N.append(n_score)


kosdaq_exec_df = pd.DataFrame({'기업' : 기업, '년도' : 년도, 'E' : E, 'S' : S, 'G' : G, 'P' : P, 'N' : N})
kosdaq_exec_df.to_csv('/content/drive/MyDrive/kosdaq_exec_df1700_to끝.csv')



