# KOBERT ESG 학습


`필요 환경 및 패키지 설치`

In [None]:
!pip install mxnet
!pip install pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install gluonnlp==0.10.0
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd

import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import urllib.request
from google.colab import drive
from sklearn.model_selection import train_test_split

drive.mount('/content/drive')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

kobert_tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

word_tokenizer = kobert_tokenizer.tokenize

kobertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)

vocab = nlp.vocab.BERTVocab.from_sentencepiece(kobert_tokenizer.vocab_file, padding_token='[PAD]')

`사전 작업`

- cpu 개수 확인하기

In [4]:
import os

cpu_count = os.cpu_count()  # 2
cpu_count # 8개 -> num_workers = 5 설정하기

12

- 사전 파라미터 정의

In [5]:
max_len = 200
batch_size = 100
warmup_ratio = 0.1
num_epochs = 35
max_grad_norm = 1
log_interval = 100
learning_rate =  2e-4

- Dataset 사전 정의하기

In [6]:
class ESG_BERTDataset(Dataset):
    def __init__(self, dataset, att_idx, label_idx) :
        self.sentences = [i[att_idx] for i in dataset]
        self.labels = torch.tensor([(int(i[label_idx][0]), int(i[label_idx][1]), int(i[label_idx][2])) for i in dataset])

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

- 모델 사전 정의하기

In [7]:
class BERT_ESG_Classifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 3,
                 dr_rate = None,
                 params = None):
        super(BERT_ESG_Classifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.esg_classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p = dr_rate)
        self.activation_sigmoid = nn.Sigmoid()

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device), return_dict = False)
        if self.dr_rate:
            out = self.dropout(pooler)
        result = self.esg_classifier(out)
        activation_esg_score = self.activation_sigmoid(result)
        return activation_esg_score

- 검증을 위한 esg 정확도 함수 사전 정의

In [8]:
def esg_calc_accuracy(X,Y):
    X[X <= 0.5 ] = 0
    X[X > 0.5 ] = 1
    e_answer = 0
    s_answer = 0
    g_answer = 0
    for pred in (X - Y) :
        if pred[0] == 0 :
            e_answer += 1
        if pred[1] == 0 :
            s_answer += 1
        if pred[2] == 0 :
            g_answer += 1
    e_train_acc = e_answer/batch_size
    s_train_acc = s_answer/batch_size
    g_train_acc = g_answer/batch_size
    return e_train_acc, s_train_acc, g_train_acc

In [9]:
# esg 분류시에 사용한다.
def predict_valid_data_acc(esg_test_dataloader):
    model.eval()
    valid_acc_log = []
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(esg_test_dataloader):
        train_acc = 0
        # model의 forward 인자 설정
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        label = label.float().to(device)
        # model output 도출
        out = model.forward(token_ids, valid_length, segment_ids)
        # 모델의 output은 바로 사용하지 못한다. 밑에 있는 logits 코드 이용하기
        e_train_acc, s_train_acc, g_train_acc = esg_calc_accuracy(out, label)
        valid_acc_log.append([e_train_acc, s_train_acc, g_train_acc])
    e_valid_mean_acc = sum([accuracy[0] for accuracy in valid_acc_log]) / len([accuracy[0] for accuracy in valid_acc_log])
    s_valid_mean_acc = sum([accuracy[1] for accuracy in valid_acc_log]) / len([accuracy[1] for accuracy in valid_acc_log])
    g_valid_mean_acc = sum([accuracy[2] for accuracy in valid_acc_log]) / len([accuracy[2] for accuracy in valid_acc_log])
    return e_valid_mean_acc, s_valid_mean_acc, g_valid_mean_acc

`실전 데이터 학습 구조 짜기`

- 학습 / 검증용 데이터 불러오기

In [10]:
total_inputs = pd.read_csv('/content/drive/MyDrive/kobert_modeling/실전용/csv파일 실전용/모델별최종학습데이터셋/최종esg분류데이터셋.csv')

total_inputs

Unnamed: 0.1,Unnamed: 0,Sentence,E,S,G,조합
0,0,특히 한화그룹은 올해 상반기까지 실적 개선세가 두드러지면서 향후 성장에 대한 관심도...,0,0,1,1
1,1,올 상반기 한화에어로스페이스의 매출액은 2조8988억원으로 지난해 같은 기간보다 30,0,0,1,1
2,2,특히 지난 2분기 매출액은 2조7775억원 영업이익은 2211억원으로 분기 기준 사...,0,0,1,1
3,3,지속가능한 에너지 사용을 추진하기 위해 경기도 용인의 라이프파크 연수원 건물과 간판...,1,1,0,110
4,4,페이퍼리스 회의 문화 정착과 2015년부터 운영하고 있는 스마트 플래너 전자청약시스...,0,0,1,1
...,...,...,...,...,...,...
1046903,32997,전체적으로 연차 눈치 안보고 쓸 수 있는 것 같음. 그래도 칼퇴하는 사람 많은 것 같음.,0,0,1,1
1046904,32998,생각보다 경직화되어 있는 조직문화 여타복지로 커버한다고는 하지만 낮은 연봉,0,1,0,10
1046905,32999,대외적인 이미지를 중요시함. 임원들간에 정치가 심해 조직개편이 수시로 일어남. 그로...,0,0,1,1
1046906,33000,과거의 성장 동력이었던 요소들이 현재는 일정부분 발목을 잡고있는 형국임. 그 타개책...,0,0,1,1


- 학습을 위한 모델, 옵티마이자, 손실함수, 스케쥴러 초기화 및 requires_grad 설정

In [None]:
# BERT  모델 불러오기
model = BERT_ESG_Classifier(kobertmodel,  dr_rate = 0.5).to(device)

# kober model freezing 하기
for name, para in model.named_parameters() :
    if True :
        para.requires_grad = True

# 역전파 확인
for name, para in model.named_parameters() :
    print(para.requires_grad)

# 옵티마이저 생성 시 전달해줄 파라미터 정의
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [weight for name, weight in model.named_parameters() if not any(nd in name for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [weight for name, weight in model.named_parameters() if any(nd in name for nd in no_decay)], 'weight_decay': 0.0} ]

# 옵티마이저 정의
optimizer = AdamW(optimizer_grouped_parameters, lr = learning_rate)

# 손실함수 정의
loss_fn = nn.BCELoss() # esg는 3가지 비율을 측정하는 것이므로 BCE가 좋다는 판단

# 스케쥴러 정의 -> 밑에서 확인해보고 수정할 것(아직 정확히 계산 못하겠음)
test_size = 0.1
scale_weight = 0.8
len_esg_train_dataloader = len(total_inputs) * (1-test_size) * scale_weight / batch_size # 전체 데이터의 샘플 수 X train_split 비율 X 최소값 scale 가중치 / 배치사이즈(512)
t_total = len_esg_train_dataloader * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps = warmup_step, num_training_steps = t_total)

`모델 학습 코드`

In [12]:
train_data_loss = []
e_train_data_accuracy = []
s_train_data_accuracy = []
g_train_data_accuracy = []
e_valid_data_mean_accuracy = []
s_valid_data_mean_accuracy = []
g_valid_data_mean_accuracy = []

transform = nlp.data.BERTSentenceTransform(word_tokenizer, max_seq_length=max_len, vocab = vocab, pad=True, pair=False)
matching_sent_att = {sent : transform(sent) for sent in total_inputs['Sentence']}

In [None]:
for epoch in range(num_epochs) :
    model.train() # predict_valid_data_acc 함수에 model.eval() 존재 따라서 반복문 안에서 훈련/평가 모드 전환한다.
    print(f'{epoch+1}번째 학습에 대한 정확도입니다.')
    train_input, valid_input = train_test_split(total_inputs, test_size = test_size, shuffle = True, random_state = epoch + 1, stratify = total_inputs['조합'])

    train_scale_num = train_input['조합'].value_counts().min()
    valid_scale_num = valid_input['조합'].value_counts().min()

    train_input = train_input.groupby('조합').head(train_scale_num * scale_weight)
    valid_input = valid_input.groupby('조합').head(valid_scale_num * scale_weight)

    esg_train_data = [[matching_sent_att[sent], (str(e_label), str(s_label), str(g_label))] for sent, e_label, s_label, g_label in zip(train_input['Sentence'], train_input['E'], train_input['S'], train_input['G'])]
    esg_valid_data = [[matching_sent_att[sent], (str(e_label), str(s_label), str(g_label))] for sent, e_label, s_label, g_label in zip(valid_input['Sentence'], valid_input['E'], valid_input['S'], valid_input['G'])]

    esg_train_data = ESG_BERTDataset(esg_train_data, 0, 1)
    esg_valid_data = ESG_BERTDataset(esg_valid_data, 0, 1)


    esg_train_dataloader = torch.utils.data.DataLoader(esg_train_data, batch_size = batch_size, num_workers = 5)
    esg_test_dataloader = torch.utils.data.DataLoader(esg_valid_data, batch_size = batch_size, num_workers = 5)

    # 학습 과정 시작
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(esg_train_dataloader)):
        train_acc = 0.0
        optimizer.zero_grad()

        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.float().to(device)

        out = model.forward(token_ids, valid_length, segment_ids)

        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()

        e_train_acc, s_train_acc, g_train_acc = esg_calc_accuracy(out, label)
        if batch_id % log_interval == 0:
             print(f'train_data_accuracy = epoch : {epoch+1} | batch_id : {batch_id} | loss : {loss.data.cpu().numpy()} | e_accuracy : {e_train_acc} | s_accuracy : {s_train_acc} | g_accuracy : {g_train_acc}')
             train_data_loss.append(loss.data.cpu().numpy())
             e_train_data_accuracy.append(e_train_acc)
             s_train_data_accuracy.append(s_train_acc)
             g_train_data_accuracy.append(g_train_acc)


    # 학습이 완료 되었으니
    e_valid_mean_acc, s_valid_mean_acc, g_valid_mean_acc = predict_valid_data_acc(esg_test_dataloader)
    print(f'{epoch+1}번째 valid_data esg 평균 정확도 : [E 평균 정확도 : {e_valid_mean_acc}, S 평균 정확도 : {s_valid_mean_acc}, G 평균 정확도 : {g_valid_mean_acc}]')
    e_valid_data_mean_accuracy.append(e_valid_mean_acc)
    s_valid_data_mean_accuracy.append(s_valid_mean_acc)
    g_valid_data_mean_accuracy.append(g_valid_mean_acc)

    # 에폭 = random_state --> 에폭마다 학습하는 것은 고정되어 있다. 끊길 위험성 배제 : 에폭마다 모델 저장하기
    torch.save({'model_state_dict': model.state_dict()}, f'/content/drive/MyDrive/model_checkpoint/esg/esg_version1-{epoch + 1}.pt')

train_data_accuracy = epoch : 2 | batch_id : 1600 | loss : 0.6408862471580505 | e_accuracy : 0.75 | s_accuracy : 0.51 | g_accuracy : 0.72
train_data_accuracy = epoch : 2 | batch_id : 1700 | loss : 0.5443394184112549 | e_accuracy : 0.52 | s_accuracy : 0.94 | g_accuracy : 0.56
train_data_accuracy = epoch : 2 | batch_id : 1800 | loss : 0.5677791833877563 | e_accuracy : 0.47 | s_accuracy : 0.9 | g_accuracy : 0.62
train_data_accuracy = epoch : 2 | batch_id : 1900 | loss : 0.5459916591644287 | e_accuracy : 0.63 | s_accuracy : 0.92 | g_accuracy : 0.53
train_data_accuracy = epoch : 2 | batch_id : 2000 | loss : 0.5129324197769165 | e_accuracy : 0.65 | s_accuracy : 0.86 | g_accuracy : 0.83
train_data_accuracy = epoch : 2 | batch_id : 2100 | loss : 0.5431194305419922 | e_accuracy : 0.73 | s_accuracy : 0.82 | g_accuracy : 0.76
train_data_accuracy = epoch : 2 | batch_id : 2200 | loss : 0.5491596460342407 | e_accuracy : 0.65 | s_accuracy : 0.84 | g_accuracy : 0.76
train_data_accuracy = epoch : 2 | b

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(esg_train_dataloader)):


  0%|          | 0/2678 [00:00<?, ?it/s]

  self.pid = os.fork()


train_data_accuracy = epoch : 3 | batch_id : 0 | loss : 2.720496416091919 | e_accuracy : 0.44 | s_accuracy : 0.58 | g_accuracy : 0.49
train_data_accuracy = epoch : 3 | batch_id : 100 | loss : 0.7031360268592834 | e_accuracy : 0.45 | s_accuracy : 0.57 | g_accuracy : 0.52
train_data_accuracy = epoch : 3 | batch_id : 200 | loss : 0.690646767616272 | e_accuracy : 0.56 | s_accuracy : 0.51 | g_accuracy : 0.56
train_data_accuracy = epoch : 3 | batch_id : 300 | loss : 0.6970274448394775 | e_accuracy : 0.48 | s_accuracy : 0.42 | g_accuracy : 0.53
train_data_accuracy = epoch : 3 | batch_id : 400 | loss : 0.6864483952522278 | e_accuracy : 0.54 | s_accuracy : 0.58 | g_accuracy : 0.59


- 학습 데이터로 학습시키기 + dataloader에 맞는 스케쥴러 정의

`시각화`

In [None]:
import matplotlib.pyplot as plt

fig, axs = plt.subplots(3, 3, figsize=(20, 15))

axs[0, 0].plot(train_data_loss, label='train loss')
axs[0, 0].set_title('Train Data Loss')
axs[0, 0].set_xlabel('Epochs')
axs[0, 0].set_ylabel('Loss')
axs[0, 0].legend()

axs[0, 1].plot(e_train_data_accuracy, label='train accuracy', color='orange')
axs[0, 1].set_title('E_train Data Accuracy')
axs[0, 1].set_xlabel('Epochs')
axs[0, 1].set_ylabel('Accuracy')
axs[0, 1].legend()

axs[0, 2].plot(s_train_data_accuracy, label='train accuracy', color='orange')
axs[0, 2].set_title('S_train Data Accuracy')
axs[0, 2].set_xlabel('Epochs')
axs[0, 2].set_ylabel('Accuracy')
axs[0, 2].legend()

axs[1, 0].plot(g_train_data_accuracy, label='train accuracy', color='orange')
axs[1, 0].set_title('G_train Data Accuracy')
axs[1, 0].set_xlabel('Epochs')
axs[1, 0].set_ylabel('Accuracy')
axs[1, 0].legend()

axs[1, 1].plot(e_valid_data_mean_accuracy, label='e_validation accuracy', color='green')
axs[1, 1].set_title('E_Validation Data Mean Accuracy')
axs[1, 1].set_xlabel('Epochs')
axs[1, 1].set_ylabel('Accuracy')
axs[1, 1].legend()

axs[1, 2].plot(s_valid_data_mean_accuracy, label='s_validation accuracy', color='green')
axs[1, 2].set_title('S_Validation Data Mean Accuracy')
axs[1, 2].set_xlabel('Epochs')
axs[1, 2].set_ylabel('Accuracy')
axs[1, 2].legend()

axs[2, 0].plot(g_valid_data_mean_accuracy, label='g_validation accuracy', color='green')
axs[2, 0].set_title('G_Validation Data Mean Accuracy')
axs[2, 0].set_xlabel('Epochs')
axs[2, 0].set_ylabel('Accuracy')
axs[2, 0].legend()

axs[2, 1].axis('off')
axs[2, 2].axis('off')

plt.tight_layout()
plt.show()
