In [None]:
#========================================================================================================================
# Example: training a sentence-bert (sbert) CrossEncoder on NLI data.
# => A cross-encoder takes two sentences (sentence1, sentence2) as one joint input. With a single output it
#    emits a similarity score (0~1); in this notebook it is created with num_labels=3, so it predicts one of
#    the NLI classes (entailment / neutral / contradiction) instead.
#
# => References : https://www.sbert.net/examples/training/cross-encoder/README.html
#         https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/cross-encoder/training_stsbenchmark.py  
#========================================================================================================================
import torch 
import os
import time
import numpy as np
from os import sys
from datetime import datetime
sys.path.append('../../')
from myutils import seed_everything, GPU_info, mlogging
from torch.utils.data import DataLoader
import math

from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator
from sentence_transformers import InputExample

# Ask the project helper for the compute device. What GPU_info() returns is defined in
# myutils (not visible here) -- presumably a torch device; verify there.
device = GPU_info()

# Logger shared by every cell below; mlogging() receives the logger name and the
# log-file path stem exactly as given.
logger = mlogging(
    loggername="sbertcross",
    logfilename="../../../log/sbert-crossencocer-train-sts",
)

In [None]:

# ---------------------------------------------------------------------------
# Optimisation hyper-parameters
# ---------------------------------------------------------------------------
train_batch_size = 64
num_epochs = 3
lr = 3e-5             # default=2e-5
eps = 1e-8            # small epsilon handed to the optimizer to avoid division by zero
max_seq_length = 128  # max_length given to the CrossEncoder
seed = 111

# ---------------------------------------------------------------------------
# Corpus switches: a corpus is loaded only when its flag equals 1
# ---------------------------------------------------------------------------
use_kornli = 1   # KorNLI (.tsv)
use_kluenli = 1  # KLUE-NLI v1.1 (.json)
use_gluenli = 1  # GLUE MNLI (.tsv)

# ---------------------------------------------------------------------------
# Corpus locations (train / eval)
# ---------------------------------------------------------------------------
# KorNLI
train_kornli_file = "../../../data11/korpora/kornli/snli_1.0_train.ko.tsv"
eval_kornli_file = "../../../data11/korpora/kornli/xnli.dev.ko-1.tsv"

# KLUE-NLI
train_kluenli_file = "../../../data11/korpora/klue-nli/klue-nli-v1.1_train.json"
eval_kluenli_file = "../../../data11/korpora/klue-nli/klue-nli-v1.1_dev.json"

# GLUE MNLI
train_gluenli_file = "../../../data11/korpora/gluemnli/glue-mnli-train.tsv"
eval_gluenli_file = "../../../data11/korpora/gluemnli/glue-mnli-valid.tsv"

# Class name -> integer id; the position in the tuple is the id.
label2int = {
    name: idx
    for idx, name in enumerate(("entailment", "neutral", "contradiction"))
}

# Starting checkpoint and the directory the fine-tuned model is written to.
model_path = "../../../data11/model/moco/cross/klue-cross-sts-nli-sts/bertmodel"
model_save_path = "../../../data11/model/moco/cross/klue-cross-sts-nli-sts-nli"  # +datetime.now().strftime("%Y-%m-%d_%H-%M")

# Seed via the project helper (its exact scope is defined in myutils).
seed_everything(seed)

# Author's note: when going from an STS model to an NLI model, the checkpoint must first
# be re-saved as a plain BERT model and training started from that. The snippet below is
# kept as an inert string literal (it is not executed).
'''
from transformers import BertModel, BertTokenizer, DistilBertModel, DistilBertTokenizer, AlbertModel, AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained(model_path, do_lower_case=True, keep_accent=False)
bertmodel = AlbertModel.from_pretrained(model_path)
OUTPATH = model_path + "/bertmodel"
os.makedirs(OUTPATH, exist_ok=True)
bertmodel.save_pretrained(OUTPATH)
tokenizer.save_pretrained(OUTPATH)
'''

In [None]:
# Load the training data.
# => Every record becomes InputExample(texts=[sentence1, sentence2], label=<int class id>).
import json


def _iter_tsv_triples(path, skip_header=False):
    """Yield (sentence1, sentence2, label_name) from a 3-column tab-separated file.

    The file is streamed line by line rather than loaded with readlines().
    Blank lines (for example a trailing empty line) are skipped. Any other
    line that does not split into exactly three tab-separated fields still
    raises ValueError, so malformed data is not silently dropped.

    Args:
        path: Path of the .tsv file (read as UTF-8).
        skip_header: When True the first line is discarded.
    """
    with open(path, "rt", encoding="utf-8") as f:
        if skip_header:
            next(f, None)
        for line in f:
            if not line.strip():
                continue
            s1, s2, label = line.split('\t')
            yield s1, s2, label.strip()


def _iter_klue_triples(path):
    """Yield (premise, hypothesis, gold_label) from a JSON file holding a list of records."""
    with open(path, "rt", encoding="utf-8") as f:
        records = json.load(f)
    for record in records:
        yield (
            record["premise"].strip(),
            record["hypothesis"].strip(),
            record["gold_label"].strip(),
        )


def _extend_with_examples(samples, triples, preview=5):
    """Append one InputExample per triple to `samples` and return how many were added.

    Label names are mapped through label2int (an unknown name raises KeyError).
    The first `preview` converted records are printed as a quick sanity check.
    """
    added = 0
    for s1, s2, label_name in triples:
        label = label2int[label_name]
        if added < preview:
            print(f"{s1}, {s2}, {label}")
        samples.append(InputExample(texts=[s1, s2], label=label))
        added += 1
    return added


train_samples = []

####################################################################################################
# KorNLI training set (.tsv, read without skipping a header line)
####################################################################################################
if use_kornli == 1:
    logger.info(f"\r\nRead NLI train dataset:{train_kornli_file}")
    count = _extend_with_examples(train_samples, _iter_tsv_triples(train_kornli_file))
    logger.info(f'*kornli len: {count}')

####################################################################################################
# KLUE-NLI training set (.json)
# NOTE(review): the corpus can presumably also be pulled with
# datasets.load_dataset("klue", "nli", split="train"); field names there may differ
# from this JSON file -- confirm before switching.
####################################################################################################
if use_kluenli == 1:
    logger.info(f"\r\nRead NLI train dataset:{train_kluenli_file}")
    count = _extend_with_examples(train_samples, _iter_klue_triples(train_kluenli_file))
    logger.info(f'*kluenli len: {count}')

####################################################################################################
# GLUE MNLI training set (.tsv, first line is skipped as a header)
####################################################################################################
if use_gluenli == 1:
    logger.info(f"\r\nRead NLI train dataset:{train_gluenli_file}")
    count = _extend_with_examples(
        train_samples, _iter_tsv_triples(train_gluenli_file, skip_header=True)
    )
    logger.info(f'*gluenli len: {count}')

logger.info(f'*train_samples_len:{len(train_samples)}')

In [None]:
# Load the evaluation (dev) data.
# => Every record becomes InputExample(texts=[sentence1, sentence2], label=<int class id>).
import json


def _iter_tsv_triples(path, skip_header=False):
    """Yield (sentence1, sentence2, label_name) from a 3-column tab-separated file.

    The file is streamed line by line rather than loaded with readlines().
    Blank lines (for example a trailing empty line) are skipped. Any other
    line that does not split into exactly three tab-separated fields still
    raises ValueError, so malformed data is not silently dropped.

    Args:
        path: Path of the .tsv file (read as UTF-8).
        skip_header: When True the first line is discarded.
    """
    with open(path, "rt", encoding="utf-8") as f:
        if skip_header:
            next(f, None)
        for line in f:
            if not line.strip():
                continue
            s1, s2, label = line.split('\t')
            yield s1, s2, label.strip()


def _iter_klue_triples(path):
    """Yield (premise, hypothesis, gold_label) from a JSON file holding a list of records."""
    with open(path, "rt", encoding="utf-8") as f:
        records = json.load(f)
    for record in records:
        yield (
            record["premise"].strip(),
            record["hypothesis"].strip(),
            record["gold_label"].strip(),
        )


def _extend_with_examples(samples, triples, preview=5):
    """Append one InputExample per triple to `samples` and return how many were added.

    Label names are mapped through label2int (an unknown name raises KeyError).
    The first `preview` converted records are printed as a quick sanity check.
    """
    added = 0
    for s1, s2, label_name in triples:
        label = label2int[label_name]
        if added < preview:
            print(f"{s1}, {s2}, {label}")
        samples.append(InputExample(texts=[s1, s2], label=label))
        added += 1
    return added


dev_samples = []

####################################################################################################
# KorNLI evaluation set (.tsv, read without skipping a header line)
####################################################################################################
if use_kornli == 1:
    logger.info(f"\r\nRead NLI dev dataset:{eval_kornli_file}")
    count = _extend_with_examples(dev_samples, _iter_tsv_triples(eval_kornli_file))
    logger.info(f'*kornli len: {count}')

####################################################################################################
# KLUE-NLI evaluation set (.json)
# NOTE(review): the corpus can presumably also be pulled with datasets.load_dataset("klue", "nli", ...)
# using the validation split; field names there may differ from this JSON file -- confirm
# before switching.
####################################################################################################
if use_kluenli == 1:
    logger.info(f"\r\nRead NLI dev dataset:{eval_kluenli_file}")
    count = _extend_with_examples(dev_samples, _iter_klue_triples(eval_kluenli_file))
    logger.info(f'*kluenli len: {count}')

####################################################################################################
# GLUE MNLI evaluation set (.tsv, first line is skipped as a header)
####################################################################################################
if use_gluenli == 1:
    logger.info(f"\r\nRead NLI dev dataset:{eval_gluenli_file}")
    count = _extend_with_examples(
        dev_samples, _iter_tsv_triples(eval_gluenli_file, skip_header=True)
    )
    logger.info(f'*gluenli len: {count}')

logger.info(f'*dev_samples_len:{len(dev_samples)}')

In [None]:
# train_samples (a list of InputExample) is batched by a shuffling PyTorch DataLoader.
train_dataloader = DataLoader(
    train_samples,
    batch_size=train_batch_size,
    shuffle=True,
)

# Accuracy on dev_samples is measured during training with CESoftmaxAccuracyEvaluator.
# `name` is just a label handed to the evaluator (presumably used when it names its
# result output -- verify in the library); note that dev_samples may mix several corpora.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(
    dev_samples,
    name='xnli.dev.ko.tsv',
)

In [None]:
# Build the cross-encoder from the starting checkpoint, with one output per NLI class
# in label2int and max_seq_length as its max_length.
model = CrossEncoder(
    model_path,
    num_labels=len(label2int),
    max_length=max_seq_length,
)

In [None]:
# Start training.
# => Per the original author's note, the model and the evaluator's results CSV are
#    written under model_save_path.
total_train_steps = len(train_dataloader) * num_epochs  # batches per epoch * epochs

# Warm-up covers 10% of all training steps; evaluation runs every 20% (twice the warm-up).
warmup_steps = math.ceil(total_train_steps * 0.1)
evaluation_steps = warmup_steps * 2

logger.info(f"model:{model_path}, save_model:{model_save_path}")
# Note: the "train_dataset" field reports len(train_dataloader), i.e. the number of
# batches per epoch, not the number of samples.
logger.info(
    f"*train_batch: {train_batch_size}, epoch:{num_epochs}, lr:{lr}, eps:{eps}, "
    f"train_dataset:{len(train_dataloader)}, Warmup-steps: {warmup_steps}, "
    f"evaluation_step: {evaluation_steps}"
)

# NOTE(review): 'correct_bias' is accepted by the AdamW implementation shipped with
# transformers; confirm that the installed sentence-transformers version still uses an
# optimizer that takes this keyword.
optimizer_settings = {'lr': lr, 'eps': eps, 'correct_bias': False}

# Train the model. With save_best_model=True (the author notes this is the default) the
# checkpoint with the best evaluator score is the one stored at output_path.
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=evaluation_steps,
    warmup_steps=warmup_steps,
    optimizer_params=optimizer_settings,
    save_best_model=True,
    output_path=model_save_path,
)

In [None]:
# Load the stored model and evaluate its accuracy on the KorNLI XNLI test split.
# => Reloads the cross-encoder that training saved to model_save_path and scores it.
##############################################################################
import time 

# Alternative checkpoints. If one of these is used instead of the model trained in this
# notebook, make sure label2int matches the class order that checkpoint was trained with.
#model_save_path = "bongsoo/albert-small-kor-v1"
#model_save_path = "../../../data11/model/moco/cross/albert-small-kor-cross-nli"

test_file = '../../../data11/korpora/kornli/xnli.test.ko-1.tsv'

# Build the test samples.
# The global label2int used for training is reused on purpose: redefining it here with a
# different class order would make the gold ids disagree with the ids the model was
# trained to predict, so the reported accuracy would be meaningless.
test_samples = []
with open(test_file, 'rt', encoding='utf-8') as fIn:
    # Stream the file; skip blank lines (e.g. a trailing empty line) instead of crashing
    # on the 3-way unpack. Lines with a wrong column count still raise ValueError.
    for line in fIn:
        if not line.strip():
            continue
        s1, s2, label = line.split('\t')
        label = label2int[label.strip()]
        test_samples.append(InputExample(texts=[s1, s2], label=label))

start = time.time()       
# Same max_length as during training/dev evaluation so inputs are truncated identically.
model = CrossEncoder(model_save_path, max_length=max_seq_length, num_labels=len(label2int))

evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(test_samples, name='xnli.test.ko.tsv')
result = evaluator(model)

# The elapsed time below covers model loading plus evaluation.
logger.info(f"\n")
logger.info(f"model path: {model_save_path}")
logger.info(f'=== result: {result} ===')
logger.info(f'=== 처리시간: {time.time() - start:.3f} 초 ===')
logger.info("==============================================")
logger.info("\n")

In [None]:
# Re-save the trained checkpoint as a plain BERT encoder + tokenizer.
# => Author's note: a later STS training run has to start from a plain BERT model again,
#    so one is exported into a "bertmodel" sub-directory of model_save_path.
from transformers import BertModel, BertTokenizer, DistilBertModel, DistilBertTokenizer, AlbertModel, AlbertTokenizer

# NOTE(review): do_lower_case=True overrides whatever the saved tokenizer config says, and
# 'keep_accent' does not look like a BertTokenizer option -- confirm both are intended
# for this (Korean) checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    model_save_path,
    do_lower_case=True,
    keep_accent=False,
)
bertmodel = BertModel.from_pretrained(model_save_path)

OUTPATH = f"{model_save_path}/bertmodel"
os.makedirs(OUTPATH, exist_ok=True)

# Weights first, then tokenizer files, both into the same directory.
for artifact in (bertmodel, tokenizer):
    artifact.save_pretrained(OUTPATH)
    