In [None]:
#========================================================================================================================
# Example of STS training for a sentence-bert (sbert) CrossEncoder.
# => A cross-encoder takes two sentences (sentence 1, sentence 2) as input and outputs their similarity (a value in 0~1).
# => num_labels must be 1. (*An NLI model has num_labels=3, so it cannot be trained on STS as-is.)
# => The difference from a bi-encoder is the Evaluator that is used.
# => bi-encoder: EmbeddingSimilarityEvaluator vs cross-encoder: CECorrelationEvaluator
#
# => Reference : https://www.sbert.net/examples/training/cross-encoder/README.html
#         https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/cross-encoder/training_stsbenchmark.py  
#========================================================================================================================
import torch 
import os
import time
import numpy as np
from os import sys
from datetime import datetime
sys.path.append('../../')
from myutils import seed_everything, GPU_info, mlogging
from torch.utils.data import DataLoader
import math

from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sentence_transformers import InputExample

# Ask the project helper for the compute device.
# NOTE(review): GPU_info() comes from myutils; presumably it reports/selects the GPU — confirm there.
device = GPU_info()

# Project logger named "sbertcross"; the log file path is relative to this notebook.
logger = mlogging(
    loggername="sbertcross",
    logfilename="../../../log/sbert-crossencocer-train-sts",
)

In [None]:
#===============================================================================
# Hyper-parameters, corpus switches and file paths
#===============================================================================

seed = 111
num_epochs = 10        # about 10 epochs is enough to find the best model (*sbert saves the best-eval model as the output model)
train_batch_size = 32  # author's guidance: 128 for base-or-larger models, 32 for small models
eval_batch_size = 64
max_seq_length = 128
lr = 1e-4              # default=2e-5
eps = 1e-6             # epsilon forwarded to the optimizer as `eps` (small constant guarding against division by zero)

# Corpus on/off switches (1 = use, 0 = skip)
use_korsts = 1     # Korean korsts files (tsv, 5,749 pairs)
use_kluests = 1    # Korean kluests_v1.1 files (json, 11,668 pairs)
use_sts17 = 1      # Korean sts17-crosslingual-sts (jsonl, 2,846 pairs)
use_glue_sts = 1   # English glue_sts (load_dataset, 5,749 pairs)
use_en_sts = 1     # English sts (load_dataset, 15,676 pairs) = stsb_multi_mt (5,749) + mteb/sickr-sts (9,927)

# Author's note: from sentence_transformers 2.2.2 the 'correct_bias' argument is gone.
# => with correct_bias=False the STS score drops (*cause unknown)
use_correct_bias = 0

# Optimizer keyword arguments; 'correct_bias' is only added when explicitly requested.
opt_params = {'lr': lr, 'eps': eps}  # default
if use_correct_bias != 0:
    opt_params['correct_bias'] = False
    print(f'**correct_bias:False')

# KorSTS train / eval files
train_korsts_file = '../../../data11/korpora/korsts/tune_train.tsv'
eval_korsts_file = '../../../data11/korpora/korsts/tune_dev.tsv'

# KlueSTS train / eval files
train_kluests_file = '../../../data11/korpora/klue-sts/klue-sts-v1.1_train.json'
eval_kluests_file = '../../../data11/korpora/klue-sts/klue-sts-v1.1_dev.json'

# sts17-crosslingual-sts train file (*there is no eval file)
train_sts17_file = '../../../data11/korpora/sts17-crosslingual-sts/ko-ko.jsonl'

# Model paths
model_path = "../../../data11/model/moco/cross/albert-small-kor-cross-nli/bertmodel"            # checkpoint to load
model_save_path = '../../../data11/model/moco/cross/albert-small-kor-cross-nli-sts'   # where the trained model is written

seed_everything(seed)

# The string literal below is disabled code kept for reference: it re-saves an NLI
# checkpoint (num_labels=3) as a plain bert model so that it can be STS-trained.
'''
# nli 모델인 경우 bert 모델로 저장
# => nli 모델은 num_labels=3이므로, sts 훈련을 위해서는 bert모델로 다시 만들어야 함
from transformers import BertModel, BertTokenizer, DistilBertModel, DistilBertTokenizer, AlbertModel, AlbertTokenizer
model_save_path = "../../../data11/model/moco/cross/albert-small-kor-cross-nli"

tokenizer = AlbertTokenizer.from_pretrained(model_save_path, do_lower_case=True, keep_accent=False)
bertmodel = AlbertModel.from_pretrained(model_save_path)
OUTPATH = model_save_path + "/bertmodel"
os.makedirs(OUTPATH, exist_ok=True)
bertmodel.save_pretrained(OUTPATH)
tokenizer.save_pretrained(OUTPATH)
'''

In [None]:
# Load the checkpoint as a CrossEncoder with a single output.
# => an NLI model (num_labels=3) cannot be STS-trained; num_labels must be 1.
model = CrossEncoder(
    model_path,
    num_labels=1,
    max_length=max_seq_length,
)
print(model)

In [None]:
import json
from datasets import load_dataset

# Accumulates every training pair from all enabled corpora.
train_samples = []

####################################################################################################
# KorSTS training set (.tsv file): one "sentence1<TAB>sentence2<TAB>score" record per line
####################################################################################################
if use_korsts == 1:
    count = 0
    logger.info(f"Read STS train dataset=>{train_korsts_file}")
    with open(train_korsts_file, "rt", encoding="utf-8") as f:
        lines = f.readlines()

    for line in lines:
        text_a, text_b, score = line.split('\t')
        # Scores are divided by 5 so the label lies in 0~1.
        score = float(score.strip()) / 5.0

        # Echo the first three pairs as a sanity check.
        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts=[text_a, text_b], label=score))
        count += 1
    logger.info(f'*{train_korsts_file} count: {count}')
####################################################################################################

####################################################################################################
# KLUE training set (.json file)
# => It can also be fetched with load_dataset, as sketched below.
# datas = load_dataset("klue", "sts", split="train")
# for data in datas:
#        text_a = data["sentence1"]
#        text_b = data["sentence2"]
#        score = data["labels"]["label"]
#        score = float(score) / 5.0
###################################################################################################
if use_kluests == 1:
    count = 0
    logger.info(f"Read STS train dataset=>{train_kluests_file}")
    with open(train_kluests_file, "rt", encoding="utf-8") as f:
        datas = json.load(f)

    for data in datas:
        text_a, text_b = data["sentence1"], data["sentence2"]
        # Scores are divided by 5 so the label lies in 0~1.
        score = float(data["labels"]["label"]) / 5.0

        # Echo the first three pairs as a sanity check.
        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts=[text_a, text_b], label=score))
        count += 1
    logger.info(f'*{train_kluests_file} len: {count}')
####################################################################################################
# Korean sts17-crosslingual-sts training set
# => jsonl : a file holding one JSON document per line
# => package install : !pip install jsonlines
####################################################################################################
if use_sts17 == True:
    import jsonlines
    count = 0
    logger.info(f"Read STS train dataset=>{train_sts17_file}")
    with jsonlines.open(train_sts17_file, "r") as f:
        for line in f:
            text_a = line["sentence1"]
            # Fix: sentence2 must go into text_b. It used to overwrite text_a, so each
            # pair was (sentence2, <stale text_b left over from an earlier loader>).
            text_b = line["sentence2"]
            score = line["score"]
            score = float(score) / 5.0  # divide by 5 so the label lies in 0~1

            # Echo the first three pairs as a sanity check.
            if count < 3:
                print(f"{text_a}, {text_b}, {score}")

            train_samples.append(InputExample(texts=[text_a, text_b], label=score))
            count += 1

    # Fix: report the sts17 file here (the KLUE path was logged by mistake).
    logger.info(f'*{train_sts17_file} len: {count}')
####################################################################################################

#############################################################################################
# English STS training sets (load_dataset)
# => loads the English stsb_multi_mt train split and the mteb/sickr-sts test split
#############################################################################################
if use_en_sts == 1:
    # stsb_multi_mt (English)
    count = 0
    en_sts_dataset = load_dataset("stsb_multi_mt", name="en", split="train")
    for data in en_sts_dataset:
        text_a, text_b = data["sentence1"], data["sentence2"]
        # Scores are divided by 5 so the label lies in 0~1.
        score = float(data["similarity_score"]) / 5.0

        # Echo the first three pairs as a sanity check.
        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts=[text_a, text_b], label=score))
        count += 1
    logger.info(f'*stsb_multi_mt_en len: {count}')

    # mteb/sickr-sts: its "test" split is used here as extra training data
    count = 0
    en_sts_dataset = load_dataset("mteb/sickr-sts", split="test")
    for data in en_sts_dataset:
        text_a, text_b = data["sentence1"], data["sentence2"]
        # Scores are divided by 5 so the label lies in 0~1.
        score = float(data["score"]) / 5.0

        # Echo the first three pairs as a sanity check.
        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts=[text_a, text_b], label=score))
        count += 1
    logger.info(f'*mteb/sickr-sts len: {count}')
#############################################################################################
 
#############################################################################################
# GLUE STS-B training set (load_dataset)
#############################################################################################
if use_glue_sts == 1:
    # glue stsb train split (5,749 pairs per the author's note)
    count = 0
    en_sts_dataset = load_dataset("glue", "stsb", split="train")
    for data in en_sts_dataset:
        text_a, text_b = data["sentence1"], data["sentence2"]
        # Scores are divided by 5 so the label lies in 0~1.
        score = float(data["label"]) / 5.0

        # Echo the first three pairs as a sanity check.
        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts=[text_a, text_b], label=score))
        count += 1
    logger.info(f'*glue_stsb len: {count}')
#############################################################################################

# Summary: total number of training pairs gathered, plus a peek at the first three.
logger.info(f'------------------------------------------------------------------------')
logger.info(f'*train_samples_len:{len(train_samples)}')
print(train_samples[0:3])

In [None]:
# Read the STS benchmark data used as the development (evaluation) set.
# The korsts file rates the similarity of two sentences numerically (5.0 is the maximum = very similar).
dev_samples = []

####################################################################################################
# KorSTS evaluation set (.tsv file): one "sentence1<TAB>sentence2<TAB>score" record per line
####################################################################################################
if use_korsts == 1:
    count = 0
    logger.info(f"Read STS dev dataset=>{eval_korsts_file}")
    with open(eval_korsts_file, 'rt', encoding='utf-8') as f:
        lines = f.readlines()

    for line in lines:
        text_a, text_b, score = line.split('\t')
        # Scores are divided by 5 so the label lies in 0~1.
        score = float(score.strip()) / 5.0

        # Echo the first five pairs as a sanity check.
        if count < 5:
            print(f"{text_a}, {text_b}, {score}")

        dev_samples.append(InputExample(texts=[text_a, text_b], label=score))
        count += 1
    logger.info(f'*{eval_korsts_file} len: {count}')
####################################################################################################

####################################################################################################
# KlueSTS evaluation set (.json file)
# => It can also be fetched with load_dataset, as sketched below.
# datas = load_dataset("klue", "sts", split="test")
# for data in datas:
#        text_a = data["sentence1"]
#        text_b = data["sentence2"]
#        score = data["labels"]["label"]
#        score = float(score) / 5.0
####################################################################################################
if use_kluests == 1:
    count = 0
    logger.info(f"Read STS dev dataset=>{eval_kluests_file}")
    with open(eval_kluests_file, "rt", encoding="utf-8") as f:
        datas = json.load(f)

    for data in datas:
        text_a, text_b = data["sentence1"], data["sentence2"]
        # Scores are divided by 5 so the label lies in 0~1.
        score = float(data["labels"]["label"]) / 5.0

        # Echo the first five pairs as a sanity check.
        if count < 5:
            print(f"{text_a}, {text_b}, {score}")

        dev_samples.append(InputExample(texts=[text_a, text_b], label=score))
        count += 1
    logger.info(f'*{eval_kluests_file} len: {count}')
####################################################################################################

####################################################################################################
# English stsb_multi_mt evaluation set (load_dataset)
####################################################################################################
# Load the "dev" split of the English stsb_multi_mt data.
if use_en_sts == 1:
    count = 0
    en_sts_dataset = load_dataset("stsb_multi_mt", name="en", split="dev")
    for data in en_sts_dataset:
        text_a, text_b = data["sentence1"], data["sentence2"]
        # Scores are divided by 5 so the label lies in 0~1.
        score = float(data["similarity_score"]) / 5.0

        # Echo the first three pairs as a sanity check.
        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        dev_samples.append(InputExample(texts=[text_a, text_b], label=score))
        count += 1
    logger.info(f'*stsb_multi_mt len: {count}')
####################################################################################################

####################################################################################################
# English GLUE STS-B evaluation set (load_dataset)
####################################################################################################
if use_glue_sts == 1:
    count = 0
    glue_stsb_dataset = load_dataset("glue", "stsb", split="validation")
    for data in glue_stsb_dataset:
        text_a, text_b = data["sentence1"], data["sentence2"]
        # Scores are divided by 5 so the label lies in 0~1.
        score = float(data["label"]) / 5.0

        # Echo the first three pairs as a sanity check.
        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        dev_samples.append(InputExample(texts=[text_a, text_b], label=score))
        count += 1

        #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        # **When training a Korean model, keep only about 500 pairs (of 1,500 in total)
        #   as a small evaluation set by enabling the two lines below.
        #if count > 499:
        #    break
        #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

    logger.info(f'*glue-stsb len: {count}')
####################################################################################################

# Summary: total number of evaluation pairs gathered, plus a peek at the first three.
logger.info(f'------------------------------------------------------------------------')
logger.info(f'*dev_samples_len:{len(dev_samples)}')
print(dev_samples[0:3])

In [None]:
# Data loader and evaluator.
# train_samples (a list of InputExample) is wrapped in a shuffling PyTorch DataLoader.
train_dataloader = DataLoader(
    train_samples,
    batch_size=train_batch_size,
    shuffle=True,
)

# ** This is where a cross-encoder differs from a bi-encoder: the evaluator used
#    to score the model during training is CECorrelationEvaluator.
# NOTE(review): eval_batch_size is defined in the parameter cell but is not passed here.
evaluator = CECorrelationEvaluator.from_input_examples(
    dev_samples,
    name='sts-dev',
)

In [None]:
# Start training.
# => the model and the evaluation file CECorrelationEvaluator_sts-dev_results.csv are written to model_save_path

# Warm-up covers 10% of all optimizer steps. One step is one batch, so the count must
# come from the DataLoader length (batches per epoch), not from the number of examples.
# Fix: the previous len(train_samples)-based value was train_batch_size times too large,
# i.e. longer than the whole run, so the learning rate never reached its peak.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

# evaluation_steps is set to 20% of the total number of training steps.
# NOTE(review): if fit() counts steps per epoch, this value exceeds one epoch whenever
# num_epochs > 5 and only the end-of-epoch evaluation runs — confirm this is intended.
evaluation_steps = int(len(train_samples) * num_epochs / train_batch_size * 0.2)

logger.info("Warmup-steps: {}".format(warmup_steps))

logger.info(f"*IN-model:{model_path}")
logger.info(f"*OUT-model:{model_save_path}")
logger.info("*seed:{}, train_batch:{}, eval_batch:{}, epoch:{}, lr:{}, eps:{}, max_seq_length:{}, train_samples:{}, Warmup-steps: {}, evaluation_step: {}".format(seed, train_batch_size, eval_batch_size, num_epochs, lr, eps, max_seq_length, len(train_samples), warmup_steps, evaluation_steps))
logger.info(f"*use_correct_bias:{use_correct_bias}")

# Train the model
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=evaluation_steps,
          warmup_steps=warmup_steps,
          optimizer_params=opt_params,
          save_best_model=True,  # **default = True : the best-eval model is saved to output_path
          output_path=model_save_path)

In [None]:
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
# => reload the trained cross-encoder that was saved and measure its score
##############################################################################
import time

# Alternative checkpoints to evaluate (uncomment one to override model_save_path):
#model_save_path = "../../../data11/model/moco/cross/albert-small-kor-cross-sts"
#model_save_path = "../../../model/classification/bmc-fpt-wiki_20190620_mecab_false_0311-nouns-0327-ft-nli-0328/bertmodel"


test_file = '../../../data11/korpora/korsts/tune_test.tsv'

# Load the TEST pairs: one "sentence1<TAB>sentence2<TAB>score" record per line.
test_samples = []
with open(test_file, 'rt', encoding='utf-8') as fIn:
    lines = fIn.readlines()

for line in lines:
    s1, s2, score = line.split('\t')
    # Scores are divided by 5 so the label lies in 0~1.
    score = float(score.strip()) / 5.0
    test_samples.append(InputExample(texts=[s1, s2], label=score))

# The timer covers both model loading and evaluation.
start = time.time()
model = CrossEncoder(model_save_path)

evaluator = CECorrelationEvaluator.from_input_examples(test_samples, name='sts-test')
result = evaluator(model)

logger.info(f"\n")
logger.info(f"model path: {model_save_path}")
logger.info(f'=== result: {result} ===')
logger.info(f'=== 처리시간: {time.time() - start:.3f} 초 ===')
logger.info("==============================================")
logger.info("\n")