# KOBERT ESG 합치기 한 후


`필요 환경 및 패키지 설치`

In [None]:
!pip install mxnet
!pip install pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install gluonnlp==0.10.0
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd

import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import urllib.request
from google.colab import drive

drive.mount('/content/drive')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

kobert_tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

word_tokenizer = kobert_tokenizer.tokenize

kobertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)

vocab = nlp.vocab.BERTVocab.from_sentencepiece(kobert_tokenizer.vocab_file, padding_token='[PAD]')

`사전 파라미터 지정`

In [5]:
max_len = 128
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

`데이터셋 구축`

In [50]:
class TOTAL_BERTDataset(Dataset):
    def __init__(self, dataset, bert_tokenizer, vocab, max_len, pad, pair):
        self.transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, vocab = vocab, pad=pad, pair=pair)
        self.sentences = [self.transform(sent) for sent in dataset]
        # self.labels = torch.tensor([(int(i[label_idx][0]), int(i[label_idx][1])) for i in dataset])

    def __getitem__(self, i):
        return (self.sentences[i]) # + (self.labels[i], ))

    def __len__(self):
        return (len(self.sentences))

`ESG를 분류하는 모델`

In [51]:
class BERT_TOTAL_Classifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 esg_classes = 3,
                 sent_classes = 2,
                 dr_rate = None,
                 params = None):
        super(BERT_TOTAL_Classifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.esg_classifier = nn.Linear(hidden_size , esg_classes)
        self.sent_classifier = nn.Linear(hidden_size, sent_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p = dr_rate)
        self.activation_sigmoid = nn.Sigmoid()
        self.activation_softmax = nn.Softmax(dim = 1)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device), return_dict = False)
        if self.dr_rate:
            out = self.dropout(pooler)

        esg_score = self.esg_classifier(out)
        activation_esg_score = self.activation_sigmoid(esg_score)

        sent_score = self.sent_classifier(out)
        activation_sent_score = self.activation_softmax(sent_score)


        return activation_esg_score, activation_sent_score

`esg 모델 정의하기`

In [52]:
# BERT  모델 불러오기
model = BERT_TOTAL_Classifier(kobertmodel,  dr_rate = 0.5).to(device)

# model 가중치 적용하기
esg_checkpoint = torch.load('/content/drive/MyDrive/kobert_modeling/esg_version1.pt', map_location=torch.device('cpu'))# gpu 가능하면 없애기 // 가중치 저장할 때 gpu로 해서 cpu밖에 못쓰는 환경이면 에러나는데 이거 방지용!!
sent_checkpoint = torch.load('/content/drive/MyDrive/kobert_modeling/sent_version1.pt', map_location=torch.device('cpu'))
model.esg_classifier.load_state_dict(esg_checkpoint['model_esg_classifier_state_dict'])
model.sent_classifier.load_state_dict(sent_checkpoint['model_sent_classifier.state_dict'])


<All keys matched successfully>

`학습시킨 esg model 기반 esg_predict 함수`

In [67]:
# esg 분류시에 사용한다.
def total_predict(predict_sentence):
    dataset_another = [predict_sentence]
    another_test = TOTAL_BERTDataset(dataset_another, word_tokenizer, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size = batch_size, num_workers = 2)
    model.eval()
    with torch.no_grad() :
      for batch_id, (token_ids, valid_length, segment_ids) in enumerate(test_dataloader):
          # model의 forward 인자 설정
          token_ids = token_ids.long().to(device)
          segment_ids = segment_ids.long().to(device)
          valid_length = valid_length

          # model output 도출
          activation_esg_score, activation_sent_score = model.forward(token_ids, valid_length, segment_ids)
          # 모델의 output은 바로 사용하지 못한다. 밑에 있는 logits 코드 이용하기
          5445

    return activation_esg_score[0], activation_sent_score[0]


`기업 별 문장 가지고 오기`

In [73]:
# 테스트 데이터 불러오기
test_data = pd.read_csv('/content/drive/MyDrive/kobert_modeling/naver_news_test.csv')

input_data = [sent for sent in test_data['content']]

input_data[0]

' 편의점 CU가 일회용품 사용 규제를 철회하기로 한 환경부 방침과 관계없이 환경 보호를 위해 종이 빨대를 계속 사용하기로 했다'

`esg_predict 해보기`

In [None]:
# 여기서 문제가 생기면 함수 return 부분 인덱스 때문일 가능성이 높음.
e_score = 0
s_score = 0
g_score = 0

inputs = input_data[:20]
esg_score = torch.zeros(3)
for data in inputs :
  esg_score = torch.zeros(3)
  activation_esg_score, activation_sent_score = total_predict(data)
  # 긍부정 가중치 구하기
  sent_weight = activation_sent_score[0] - activation_sent_score[1]
  esg_score += activation_esg_score * sent_weight * 100



In [83]:
print(f'해당 기업의 E 점수 : {esg_score[0]}')
print(f'해당 기업의 E 점수 : {esg_score[1]}')
print(f'해당 기업의 E 점수 : {esg_score[2]}')

해당 기업의 E 점수 : 7.379893779754639
해당 기업의 E 점수 : 6.664917469024658
해당 기업의 E 점수 : 8.277997016906738


`기업 별로 e,s,g score 데이터 프레임에 추가하기`