# KOBERT ESG 학습


`필요 환경 및 패키지 설치`

In [None]:
# Install the third-party packages this notebook imports.
!pip install mxnet
!pip install pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch
# gluonnlp is pinned to version 0.10.0.
!pip install gluonnlp==0.10.0
# KoBERT tokenizer, installed from the kobert_hf subdirectory of the SKTBrain/KoBERT repository.
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd

import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import urllib.request
from google.colab import drive

# Mount Google Drive so that files under /content/drive are reachable.
drive.mount('/content/drive')

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Pretrained checkpoint shared by the tokenizer and the encoder.
KOBERT_CHECKPOINT = 'skt/kobert-base-v1'

kobert_tokenizer = KoBERTTokenizer.from_pretrained(KOBERT_CHECKPOINT)

# Tokenize callable handed to the gluonnlp sentence transform further down.
word_tokenizer = kobert_tokenizer.tokenize

# return_dict=False is forwarded to from_pretrained, as in the forward call of the classifier.
kobertmodel = BertModel.from_pretrained(KOBERT_CHECKPOINT, return_dict=False)

# gluonnlp vocabulary built from the tokenizer's SentencePiece file, with '[PAD]' as padding token.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(
    kobert_tokenizer.vocab_file,
    padding_token='[PAD]',
)

`사전 파라미터 지정`

In [4]:
# Data pipeline: maximum token sequence length and samples per batch.
max_len, batch_size = 128, 64

# Optimisation schedule: epochs, peak learning rate, fraction of steps used for warm-up.
num_epochs, learning_rate, warmup_ratio = 5, 5e-5, 0.1

# Upper bound used for gradient-norm clipping.
max_grad_norm = 1

# Print training progress every `log_interval` batches.
log_interval = 200

`esg 데이터셋 불러오기 및 train_test_split`

In [5]:
from sklearn.model_selection import train_test_split

# Load the sentence-level ESG data set.
esg_data = pd.read_csv('/content/drive/MyDrive/kobert_modeling/문장esg테스트용.csv')

# Drop the column that carries no needed information.
del esg_data['Unnamed: 0']

# Pair every sentence with its (E, S, G) labels; everything is stored as strings.
esg_data_list = [
    [str(sentence), (str(e_flag), str(s_flag), str(g_flag))]
    for sentence, e_flag, s_flag, g_flag in zip(
        esg_data['문장'], esg_data['E'], esg_data['S'], esg_data['G']
    )
]

# 80 / 20 shuffled split with a fixed seed so the split is reproducible.
esg_train_data, esg_valid_data = train_test_split(
    esg_data_list,
    test_size=0.2,
    shuffle=True,
    random_state=32,
)

`esg_dataset 구축 및 dataloader 구축`

In [None]:
class ESG_BERTDataset(Dataset):
    """Dataset pairing BERT-transformed sentences with a three-element ESG label.

    Args:
        dataset: Sequence of records. ``record[sent_idx]`` is the sentence and
            ``record[label_idx]`` holds three values convertible with ``int``
            (for example ``(0, 1, 0)``).
        sent_idx: Position of the sentence inside a record.
        label_idx: Position of the label triple inside a record.
        bert_tokenizer: Tokenizer callable passed to ``BERTSentenceTransform``.
        vocab: Vocabulary passed to ``BERTSentenceTransform``.
        max_len: Forwarded as ``max_seq_length``.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        self.transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            vocab=vocab,
            pad=pad,
            pair=pair,
        )

        # First pass: run every sentence through the BERT sentence transform.
        self.sentences = [self.transform([record[sent_idx]]) for record in dataset]

        # Second pass: collect the labels as integer triples, e.g. (0, 1, 0).
        label_rows = []
        for record in dataset:
            raw_label = record[label_idx]
            label_rows.append((int(raw_label[0]), int(raw_label[1]), int(raw_label[2])))
        self.labels = torch.tensor(label_rows)

    def __getitem__(self, i):
        """Return the transformed sentence fields of item ``i`` followed by its label tensor."""
        features = self.sentences[i]
        target = self.labels[i]
        return features + (target,)

    def __len__(self):
        """Return the number of labelled sentences."""
        return len(self.labels)


# Wrap both splits in ESG_BERTDataset: sentence at index 0, label triple at index 1,
# padding enabled, single-sentence (non-pair) input.
esg_data_train = ESG_BERTDataset(
    esg_train_data,
    sent_idx=0,
    label_idx=1,
    bert_tokenizer=word_tokenizer,
    vocab=vocab,
    max_len=max_len,
    pad=True,
    pair=False,
)
esg_data_test = ESG_BERTDataset(
    esg_valid_data,
    sent_idx=0,
    label_idx=1,
    bert_tokenizer=word_tokenizer,
    vocab=vocab,
    max_len=max_len,
    pad=True,
    pair=False,
)


# Build the data loaders; both use the same batch size and five worker processes.
esg_train_dataloader = DataLoader(esg_data_train, batch_size=batch_size, num_workers=5)
esg_test_dataloader = DataLoader(esg_data_test, batch_size=batch_size, num_workers=5)

`ESG를 분류하는 모델`

In [7]:
class BERT_ESG_Classifier(nn.Module):
    """BERT encoder followed by a linear layer and a sigmoid, one score per class.

    Args:
        bert: Encoder called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``return_dict=False``; its second return
            value (the pooled output) feeds the classifier.
        hidden_size: Width of the pooled output, i.e. input size of the linear layer.
        num_classes: Number of output scores (3 by default).
        dr_rate: Dropout probability applied to the pooled output. ``None`` or
            ``0`` disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 3,
                 dr_rate = None,
                 params = None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.esg_classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
        self.activation_sigmoid = nn.Sigmoid()

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids`` with ones on the valid prefix.

        Row ``i`` has its first ``valid_length[i]`` positions set to 1 and the
        remaining positions set to 0.
        """
        attention_mask = torch.zeros_like(token_ids)
        for row, length in enumerate(valid_length):
            attention_mask[row][:length] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return sigmoid scores computed from the encoder's pooled output.

        Args:
            token_ids: Token id tensor, indexed as (row, position).
            valid_length: Per-row count of non-padding tokens.
            segment_ids: Token type ids; cast to ``long`` before the encoder call.

        Returns:
            Tensor of sigmoid-activated scores produced by the linear layer.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(
            input_ids=token_ids,
            token_type_ids=segment_ids.long(),
            attention_mask=attention_mask.float().to(token_ids.device),
            return_dict=False,
        )
        # Fall back to the raw pooled output when dropout is disabled; previously
        # `out` stayed unassigned in that case and the next line raised.
        out = self.dropout(pooler) if self.dr_rate else pooler
        esg_score = self.esg_classifier(out)
        activation_esg_score = self.activation_sigmoid(esg_score)
        return activation_esg_score

`esg 모델 정의하기`

In [None]:
# Build the classifier on top of the pretrained KoBERT encoder and move it to the device.
model = BERT_ESG_Classifier(kobertmodel, dr_rate=0.5).to(device)

# Freeze the KoBERT weights: only parameters of the esg_classifier layer stay trainable.
for param_name, param in model.named_parameters():
    if 'esg_classifier' not in param_name:
        param.requires_grad = False

# Split the parameters into two optimizer groups: names containing one of the
# `no_decay` markers get no weight decay, all others get 0.01.
no_decay = ['bias', 'LayerNorm.weight']
decay_params = []
no_decay_params = []
for param_name, param in model.named_parameters():
    if any(marker in param_name for marker in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]


# Optimizer.
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Loss function: binary cross-entropy on the sigmoid outputs of the model.
loss_fn = nn.BCELoss()

# Total number of optimizer steps and the share of them used for warm-up.
t_total = len(esg_train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

# Cosine learning-rate schedule with linear warm-up.
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total,
)

`esg 정확도 계산 함수 정의`

In [9]:
# Used for ESG classification.
def esg_calc_accuracy(X, Y):
    """Return the exact-match accuracy of thresholded scores against labels.

    A row counts as correct only when every one of its scores, thresholded at
    0.5 (``> 0.5`` becomes 1, otherwise 0), equals the matching label.

    Args:
        X: 2-D tensor of scores, one row per sample. It is not modified.
        Y: Tensor of labels with the same shape as ``X``.

    Returns:
        Fraction of fully correct rows as a Python float. The denominator is
        the number of rows in ``X``, so a partial last batch is scored
        correctly. An empty batch yields ``0.0``.
    """
    num_samples = X.shape[0]
    if num_samples == 0:
        return 0.0
    # Threshold into a new tensor instead of overwriting the caller's scores.
    predictions = (X > 0.5).to(Y.dtype)
    exact_rows = (predictions == Y).all(dim=1)
    return int(exact_rows.sum()) / num_samples

`esg 모델 학습시키기`

In [None]:
# Used for ESG classification: train the model on the ESG training set.
# tqdm.notebook.tqdm replaces tqdm.tqdm_notebook, which emits a deprecation warning.
from tqdm.notebook import tqdm as notebook_tqdm

# Run all `num_epochs` epochs: the scheduler is sized for
# len(esg_train_dataloader) * num_epochs steps, so a single epoch would stop
# part-way through the learning-rate schedule.
for e in range(num_epochs):
    train_acc = 0.0
    model.train()  # switch the model to training mode
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(esg_train_dataloader)):
        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Move the inputs to the target device; valid_length is passed through unchanged.
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.float().to(device)

        # Forward pass; calling the module (not .forward) keeps nn.Module hooks active.
        out = model(token_ids, valid_length, segment_ids)

        # Loss between the model output and the labels.
        loss = loss_fn(out, label)

        # Back-propagate.
        loss.backward()

        # Clip the gradient norm to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        # Apply the update and advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

        # The accuracy bookkeeping needs no autograd history, so pass a detached view.
        train_acc += esg_calc_accuracy(out.detach(), label)

        if batch_id % log_interval == 0:
            print(f'epoch : {e+1} | batch_id : {batch_id + 1} | loss : {loss.item()}| accuracy : {train_acc / (batch_id+1)}')

    print("epoch : {} train acc : {}".format(e+1, train_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(esg_train_dataloader)):


  0%|          | 0/951 [00:00<?, ?it/s]

  self.pid = os.fork()


epoch : 1 | batch_id : 1 | loss : 0.6844475865364075| accuracy : 0.234375
epoch : 1 | batch_id : 201 | loss : 0.7116233706474304| accuracy : 0.13440609452736318


`학습시킨 esg model 기반 esg_predict 함수`

In [None]:
# Used for ESG classification.
def esg_predict(predict_sentence):
    """Score one sentence with the trained ESG model.

    Args:
        predict_sentence: The sentence to score.

    Returns:
        numpy array with the model's sigmoid scores for the sentence, in the
        label order used by the data set (E, S, G).
    """
    # ESG_BERTDataset requires a label, so attach a dummy (0, 0, 0) that is never read.
    dataset_another = [[predict_sentence, (0, 0, 0)]]
    another_test = ESG_BERTDataset(dataset_another, 0, 1, word_tokenizer, vocab, max_len, True, False)
    # A single sentence does not need worker processes; load it in the main process.
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    model.eval()
    # The data set holds exactly one item, so the loader yields exactly one batch.
    token_ids, valid_length, segment_ids, _ = next(iter(test_dataloader))
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        out = model(token_ids, valid_length, segment_ids)

    # The batch dimension is still present; take the row and convert it to numpy.
    logits = out[-1].cpu().numpy()
    return logits


`esg_predict 해보기`

In [None]:
# Load the test data.
test_data = pd.read_csv('/content/drive/MyDrive/kobert_modeling/naver_news_test.csv')

# Take the text column and score its first ten entries.
sents = test_data['content']
first_ten = sents.iloc[:10]
for i, sent in enumerate(first_ten):
    esg_output = esg_predict(sent)
    # Each score is scaled by 100 for display.
    report_line = f'{i+1}번 문장 =  E : {esg_output[0] * 100}점, S : {esg_output[1] * 100}점, G : {esg_output[2] * 100}점'
    print(report_line)

`모델 저장하기`

In [None]:
# Save a checkpoint holding only the state dict of the added esg_classifier layer.
checkpoint_path = '/content/drive/MyDrive/model_checkpoint/esg_version1.pt'
checkpoint = {'model_esg_classifier_state_dict': model.esg_classifier.state_dict()}
torch.save(checkpoint, checkpoint_path)