In [None]:
#======================================================================================================
# Sentence-BERT training and evaluation example on STS datasets
#
# => Takes an existing (distil)BERT model, trains and evaluates it on STS, and saves it as an S-BERT.
#
# => When an output dimension smaller than 768 is wanted, a Dense module can be appended to reduce it.
# => Setting reduce_out_dimension = True enables the reduced output embedding dimension.
#
# => Implemented with the sentence-transformers package (install: pip install -U sentence-transformers).
#
# ** The default learning rate is 2e-5.
#
# Documentation    : https://www.sbert.net/index.html
# Source reference : https://github.com/BM-K/KoSentenceBERT-ETRI
#
# pip install -U sentence-transformers
#
# ** skt/kobert-base-V1: after building the S-BERT, change tokenizer_class:"KoBERTTokenizer" to
#    tokenizer_class:"XLNetTokenizer" in tokenizer_config.json.
#======================================================================================================
import torch.nn as nn
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import sys
import os
import gzip
import csv

# Add the parent directory to the import path before importing `myutils`
# (presumably that is where the project-local module lives -- confirm).
sys.path.append('..')
from myutils import seed_everything, GPU_info, mlogging

# Logger used by every cell below; mlogging is a project helper from myutils.
logger = mlogging(loggername="s-bert-sts", logfilename="../../log/s-bert-sts")
# NOTE(review): presumably reports/selects the available GPU; `device` is not
# referenced again in the visible cells -- confirm before relying on it.
device = GPU_info()

In [None]:
import os

# NOTE: after building an S-BERT from skt/kobert-base-V1, edit the saved tokenizer_config.json and
# change tokenizer_class:"KoBERTTokenizer" to tokenizer_class:"XLNetTokenizer".
bisSKKobertModel = 1  # set to 1 when using the skt/kobert-base-V1 Hugging Face model
# Path (or hub id) of the source BERT model that will be turned into an S-BERT
model_path = "skt/kobert-base-v1"
#model_path = "bongsoo/mdistilbertV1.1"

# Output directory where the trained S-BERT is saved.
# => Per the original author's note, the per-evaluation results are recorded in
#    eval/similarity_evaluation_sts-dev_result.csv under this directory.
#smodel_path = 'output/training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
smodel_path = "../../data11/model/sbert/kobert-base-v1-sts-b32/"


# Dataset switches: 1 enables the source, 0 disables it (row counts are the author's figures).
use_korsts = 1     # Korean KorSTS file (tsv, 5,749 rows)
use_kluests = 1    # Korean kluests_v1.1 file (json, 11,668 rows)
use_sts17 = 1      # Korean sts17-crosslingual-sts (jsonl, 2,846 rows)
use_glue_sts = 1   # English glue_sts (load_dataset, 5,749 rows)
use_en_sts = 1     # English (load_dataset, 15,676 rows) = stsb_multi_mt (5,749) + mteb/sickr-sts (9,927)

# KorSTS training / evaluation files
train_korsts_file = '../../data11/korpora/korsts/tune_train.tsv'
eval_korsts_file = '../../data11/korpora/korsts/tune_dev.tsv'

# KlueSTS training / evaluation files
train_kluests_file = '../../data11/korpora/klue-sts/klue-sts-v1.1_train.json'
eval_kluests_file = '../../data11/korpora/klue-sts/klue-sts-v1.1_dev.json'

# sts17-crosslingual-sts training file (there is no evaluation file)
train_sts17_file = '../../data11/korpora/sts17-crosslingual-sts/ko-ko.jsonl'

train_batch_size = 32
eval_batch_size = 32
num_epochs = 5      # around 128 also finds the best model (the best-eval model is what gets saved as the output model)
#num_epochs = 800
max_seq_length = 72
lr = 1e-4            # default=2e-5
eps = 1e-6           # optimizer epsilon, meant to guard against division by zero
seed=111
#============================================================================
# To shrink the output dimension, set this to True and put the target size in out_dimension
reduce_out_dimension = False  # True => reduce the dimension by appending a Dense module
out_dimension = 128
#============================================================================

# Seed via the project helper so runs are repeatable (helper comes from myutils).
seed_everything(seed)

# Load the model and its tokenizer.
# => The vocabulary files (vocab.txt, *.json) and the model files (config.json, pytorch_model.bin)
#    must live in the same path.
word_embedding_model = models.Transformer(model_path, max_seq_length=max_seq_length, do_lower_case=False)

#========================================================================================================
# skt/kobert uses an XLNet-style tokenizer, so its own KoBERTTokenizer has to be loaded and used instead.
# => install : !pip install 'git+https://github.com/SKTBrain/KOBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
# => source  : https://velog.io/@m0oon0/KoBERT-%EC%82%AC%EC%9A%A9%EB%B2%95
if bisSKKobertModel == 1:
    from kobert_tokenizer import KoBERTTokenizer
    # Replace the tokenizer that models.Transformer loaded with the KoBERT one.
    word_embedding_model.tokenizer = KoBERTTokenizer.from_pretrained(model_path)
    print(f'load koBertTokenizer:{word_embedding_model.tokenizer}')
#========================================================================================================

# Resize the embedding matrix to the tokenizer length; this must run after the
# tokenizer swap above so the size reflects the tokenizer actually in use.
print(f'token_len:{len(word_embedding_model.tokenizer)}')
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

# Print the word embedding model
print(word_embedding_model)

In [None]:
# 2. Choose how token embeddings are pooled into one sentence vector
#    (CLS token, mean of token embeddings, or max over token embeddings).
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),  # model dimension (768 per the author)
                               pooling_mode_mean_tokens=True,  # use the mean of the token embeddings
                               pooling_mode_cls_token=False,   # do not use the CLS token
                               pooling_mode_max_tokens=False)  # do not use the max over token embeddings
# Print the pooling model and the sentence embedding dimension it produces
print(pooling_model)
print(pooling_model.get_sentence_embedding_dimension())

In [None]:
# 3. Optional Dense module.
# => Appended only when an output dimension smaller than the pooled embedding is wanted.
# => See https://www.sbert.net/docs/training/overview.html?highlight=dense
# dense_model is only defined when reduce_out_dimension is truthy.
if reduce_out_dimension:
    dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), # input size = embedding dimension of the pooling module
                               out_features=out_dimension,  # output dimension
                               activation_function=nn.Tanh())  # Tanh activation

In [None]:
# Assemble the S-BERT: transformer -> pooling, plus the Dense reducer when enabled.
sbert_modules = [word_embedding_model, pooling_model]
if reduce_out_dimension:
    # dense_model only exists when the reduction flag is on.
    sbert_modules.append(dense_model)

model = SentenceTransformer(modules=sbert_modules)
print(model)

In [None]:
import json
from datasets import load_dataset

# Collects the InputExample pairs from every enabled training source.
train_samples = []

# KorSTS training set (.tsv): each line is split on tabs into
# sentence 1, sentence 2 and a score.
if use_korsts == True:
    logger.info(f"Read STS train dataset=>{train_korsts_file}")
    count = 0
    with open(train_korsts_file, "rt", encoding="utf-8") as f:
        # count is 1-based here and ends as the number of rows read.
        for count, line in enumerate(f.readlines(), start=1):
            text_a, text_b, score = line.split('\t')
            # Divide by 5 so the label falls between 0 and 1 (scores presumably range 0-5).
            score = float(score.strip()) / 5.0

            # Echo the first three pairs as a quick sanity check.
            if count <= 3:
                print(f"{text_a}, {text_b}, {score}")

            train_samples.append(InputExample(texts=[text_a, text_b], label=score))
    logger.info(f'*{train_korsts_file} count: {count}')
####################################################################################################

####################################################################################################
# klue 훈련 데이터 셋 설정(.json 파일)
# => 아래처럼 load_dataset으로 불러와서 사용할수도 있음.
# datas = load_dataset("klue", "sts", split="train")
# for data in datas:
#        text_a = data["sentence1"]
#        text_b = data["sentence2"]
#        score = data["labels"]["label"]
#        score = float(score) / 5.0  
###################################################################################################           
# KLUE STS training set (.json): a list of records with "sentence1",
# "sentence2" and a nested labels.label score.
if use_kluests == True:
    logger.info(f"Read STS train dataset=>{train_kluests_file}")
    with open(train_kluests_file, "rt", encoding="utf-8") as f:
        datas = json.load(f)

    count = 0
    # count is 1-based here and ends as the number of records read.
    for count, data in enumerate(datas, start=1):
        text_a, text_b = data["sentence1"], data["sentence2"]
        # Divide by 5 so the label falls between 0 and 1 (scores presumably range 0-5).
        score = float(data["labels"]["label"]) / 5.0

        # Echo the first three pairs as a quick sanity check.
        if count <= 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts=[text_a, text_b], label=score))
    logger.info(f'*{train_kluests_file} len: {count}')
####################################################################################################
# 한국어 sts17-crosslingual-sts 훈련 데이터셋 설정
# => jsonl : 여러개의 json 형식 파일이 각 줄마다 기록되어 있는 형태 파일
# => 패키지 설치 : !pip install jsonlines
####################################################################################################
# Korean sts17-crosslingual-sts training set (.jsonl: one JSON object per line).
# Needs the third-party `jsonlines` package (pip install jsonlines).
if use_sts17 == True:
    import jsonlines
    count = 0
    logger.info(f"Read STS train dataset=>{train_sts17_file}")
    with jsonlines.open(train_sts17_file, "r") as f:
        for line in f:
            text_a = line["sentence1"]
            # Fix: sentence2 must go into text_b. It used to overwrite text_a, so
            # every pair was built from sentence2 plus a stale text_b left over
            # from whichever dataset had been loaded before this one.
            text_b = line["sentence2"]
            # Divide by 5 so the label falls between 0 and 1 (scores presumably range 0-5).
            score = float(line["score"]) / 5.0

            # Echo the first three pairs as a quick sanity check.
            if count < 3:
                print(f"{text_a}, {text_b}, {score}")

            train_samples.append(InputExample(texts=[text_a, text_b], label=score))
            count += 1

    # Fix: report the sts17 file here (the KLUE file name was logged by mistake).
    logger.info(f'*{train_sts17_file} len: {count}')
####################################################################################################

#############################################################################################
# 영문 sts 데이터셋 설정 (load_dataset)
# => stsb_multi_mt , mteb/sickr-sts 영문 sts 훈련 데이터 셋 불러오기
#############################################################################################
# English STS training data pulled through load_dataset: stsb_multi_mt (en, train split)
# followed by mteb/sickr-sts (test split, used here as training data).
if use_en_sts == True:
    en_sts_dataset = load_dataset("stsb_multi_mt", name="en", split="train")
    count = 0
    # count is 1-based here and ends as the number of rows read.
    for count, data in enumerate(en_sts_dataset, start=1):
        text_a, text_b = data["sentence1"], data["sentence2"]
        # Divide by 5 so the label falls between 0 and 1 (scores presumably range 0-5).
        score = float(data["similarity_score"]) / 5.0

        # Echo the first three pairs as a quick sanity check.
        if count <= 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts=[text_a, text_b], label=score))
    logger.info(f'*stsb_multi_mt_en len: {count}')

    # Second source: mteb/sickr-sts, whose score lives under the "score" key.
    en_sts_dataset = load_dataset("mteb/sickr-sts", split="test")
    count = 0
    for count, data in enumerate(en_sts_dataset, start=1):
        text_a, text_b = data["sentence1"], data["sentence2"]
        score = float(data["score"]) / 5.0

        if count <= 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts=[text_a, text_b], label=score))
    logger.info(f'*mteb/sickr-sts len: {count}')
#############################################################################################           
 
#############################################################################################
# GLUE STS 훈련 데이터셋 설정 (load_dataset)
#############################################################################################
# GLUE STS-B training split, loaded through load_dataset.
# NOTE(review): the config comments give the same row count (5,749) for this and
# stsb_multi_mt -- possibly the same sentence pairs added twice; confirm.
if use_glue_sts == True:
    en_sts_dataset = load_dataset("glue", "stsb", split="train")
    count = 0
    # count is 1-based here and ends as the number of rows read.
    for count, data in enumerate(en_sts_dataset, start=1):
        text_a, text_b = data["sentence1"], data["sentence2"]
        # Divide by 5 so the label falls between 0 and 1 (scores presumably range 0-5).
        score = float(data["label"]) / 5.0

        # Echo the first three pairs as a quick sanity check.
        if count <= 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts=[text_a, text_b], label=score))
    logger.info(f'*glue_stsb len: {count}')

# Summary of everything gathered for training.
logger.info(f'------------------------------------------------------------------------')
logger.info(f'*train_samples_len:{len(train_samples)}')
print(train_samples[:3])

In [None]:
# Define the dataset, the data loader and the loss function.

# Wrap the collected InputExamples for the model.
train_dataset = SentencesDataset(train_samples, model)
# Shuffled mini-batches of train_batch_size examples.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
# Cosine-similarity loss bound to the model being trained.
train_loss = losses.CosineSimilarityLoss(model=model)


In [None]:
# Load the evaluation (development) data.
# Each pair carries a similarity score that is divided by 5 before use
# (5.0 is the maximum = very similar, per the original author's note).
dev_samples = []

# KorSTS dev set (.tsv): each line is split on tabs into
# sentence 1, sentence 2 and a score.
if use_korsts == True:
    logger.info(f"Read STS dev dataset=>{eval_korsts_file}")
    count = 0
    with open(eval_korsts_file, 'rt', encoding='utf-8') as f:
        # count is 1-based here and ends as the number of rows read.
        for count, line in enumerate(f.readlines(), start=1):
            text_a, text_b, score = line.split('\t')
            # Divide by 5 so the label falls between 0 and 1.
            score = float(score.strip()) / 5.0

            # Echo the first five pairs as a quick sanity check.
            if count <= 5:
                print(f"{text_a}, {text_b}, {score}")

            dev_samples.append(InputExample(texts=[text_a, text_b], label=score))
    logger.info(f'*{eval_korsts_file} len: {count}')
####################################################################################################  

####################################################################################################
# KlueSTS 평가 데이터 셋 설정(.json 파일)
# => 아래처럼 load_dataset으로 불러와서 사용할수도 있음.
# datas = load_dataset("klue", "sts", split="test")
# for data in datas:
#        text_a = data["sentence1"]
#        text_b = data["sentence2"]
#        score = data["labels"]["label"]
#        score = float(score) / 5.0  
####################################################################################################           
# KLUE STS dev set (.json): a list of records with "sentence1",
# "sentence2" and a nested labels.label score.
if use_kluests == True:
    logger.info(f"Read STS dev dataset=>{eval_kluests_file}")
    with open(eval_kluests_file, "rt", encoding="utf-8") as f:
        datas = json.load(f)

    count = 0
    # count is 1-based here and ends as the number of records read.
    for count, data in enumerate(datas, start=1):
        text_a, text_b = data["sentence1"], data["sentence2"]
        # Divide by 5 so the label falls between 0 and 1.
        score = float(data["labels"]["label"]) / 5.0

        # Echo the first five pairs as a quick sanity check.
        if count <= 5:
            print(f"{text_a}, {text_b}, {score}")

        dev_samples.append(InputExample(texts=[text_a, text_b], label=score))
    logger.info(f'*{eval_kluests_file} len: {count}')
####################################################################################################  

####################################################################################################
# 영문 stsb_multi_mt 데이터 셋 설정(load_dataset)
####################################################################################################                
# stsb_multi_mt 영문 sts dev 데이터 셋 불러오기
# English stsb_multi_mt dev split, loaded through load_dataset.
if use_en_sts == True:
    en_sts_dataset = load_dataset("stsb_multi_mt", name="en", split="dev")
    count = 0
    # count is 1-based here and ends as the number of rows read.
    for count, data in enumerate(en_sts_dataset, start=1):
        text_a, text_b = data["sentence1"], data["sentence2"]
        # Divide by 5 so the label falls between 0 and 1.
        score = float(data["similarity_score"]) / 5.0

        # Echo the first three pairs as a quick sanity check.
        if count <= 3:
            print(f"{text_a}, {text_b}, {score}")

        dev_samples.append(InputExample(texts=[text_a, text_b], label=score))
    logger.info(f'*stsb_multi_mt len: {count}')
####################################################################################################  

####################################################################################################
# 영문 GLUE 데이터 셋 설정(load_dataset)
####################################################################################################  
# GLUE STS-B validation split, loaded through load_dataset.
if use_glue_sts == True:
    glue_stsb_dataset = load_dataset("glue", "stsb", split="validation")
    count = 0
    # count is 1-based here and ends as the number of rows read.
    for count, data in enumerate(glue_stsb_dataset, start=1):
        text_a, text_b = data["sentence1"], data["sentence2"]
        # Divide by 5 so the label falls between 0 and 1.
        score = float(data["label"]) / 5.0

        # Echo the first three pairs as a quick sanity check.
        if count <= 3:
            print(f"{text_a}, {text_b}, {score}")

        dev_samples.append(InputExample(texts=[text_a, text_b], label=score))
    logger.info(f'*glue-stsb len: {count}')
####################################################################################################  

# Summary of everything gathered for evaluation.
logger.info(f'------------------------------------------------------------------------')        
logger.info(f'*dev_samples_len:{len(dev_samples)}')
print(dev_samples[0:3])

# Evaluator built from the dev pairs. Per the original author's note, it takes the cosine
# similarity of the two sentence embeddings and compares it against the gold score.
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, 
                                                                 batch_size=eval_batch_size, 
                                                                 name='sts-dev')

In [None]:
# Step count derived from dataset size, epochs and batch size (kept as a float).
total_steps = len(train_dataset) * num_epochs / train_batch_size

# Warm up over the first 10% of the steps, rounded up.
warmup_steps = math.ceil(total_steps * 0.1)
#warmup_steps = 0 # following the kcbert config

# Run the dev evaluator every 20% of the steps, truncated to an int.
evaluation_steps = int(total_steps * 0.2)

logger.info(f"*IN-model:{model_path}")
logger.info(f"*OUT-model:{smodel_path}")
logger.info(
    "*seed:{}, train_batch:{}, eval_batch:{}, epoch:{}, lr:{}, eps:{}, max_seq_length:{}, train_dataset:{}, Warmup-steps: {}, evaluation_step: {}".format(
        seed,
        train_batch_size,
        eval_batch_size,
        num_epochs,
        lr,
        eps,
        max_seq_length,
        len(train_dataset),
        warmup_steps,
        evaluation_steps,
    )
)

# Train the model.
# => The library's default learning rate is 2e-5; `lr` from the config overrides it here.
# NOTE(review): 'correct_bias' is an option of the transformers AdamW implementation;
# confirm the optimizer used by the installed sentence-transformers version accepts it.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=evaluation_steps,
    warmup_steps=warmup_steps,
    optimizer_params={'lr': lr, 'eps': eps, 'correct_bias': False},
    save_best_model=True,  # keep the best-evaluating model in output_path
    output_path=smodel_path,
)


In [None]:
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
# => 훈련되어서 저장된 s-bert 모델을 불러와서 성능 평가 해봄
##############################################################################
import time
from sentence_transformers.evaluation import SimilarityFunction

# KorSTS test file used for the final evaluation
test_file = '../../data11/korpora/korsts/tune_test.tsv'

# Directory that receives the evaluator's similarity result file
output_path = './sts-test2'
os.makedirs(output_path, exist_ok=True)

# Each line is split on tabs into sentence 1, sentence 2 and a score.
test_samples = []
with open(test_file, 'rt', encoding='utf-8') as fIn:
    for line in fIn:
        s1, s2, score = line.split('\t')
        # Divide by 5 so the label falls between 0 and 1.
        score = float(score.strip()) / 5.0
        test_samples.append(InputExample(texts=[s1, s2], label=score))

logger.info("\n")
logger.info("======================TEST===================")
logger.info("\n\n")
logger.info(f"model save path > {smodel_path}")

# Timing covers both reloading the saved model and running the evaluator.
start = time.time()
# Rebinds `model` to the S-BERT that training wrote to smodel_path.
model = SentenceTransformer(smodel_path)

# Similarity measure reported as the main score: one of COSINE, EUCLIDEAN,
# MANHATTAN, DOT_PRODUCT (all Spearman-based, per the original author's note).
# => With None, the maximum of those values is reported.
#main_similarity = None
main_similarity = SimilarityFunction.COSINE
#main_similarity = SimilarityFunction.EUCLIDEAN
#main_similarity = SimilarityFunction.MANHATTAN
#main_similarity = SimilarityFunction.DOT_PRODUCT

logger.info(f"main_similarity: {main_similarity}")

test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples,
    main_similarity=main_similarity,
    batch_size=eval_batch_size,
    name='sts-test',
    show_progress_bar=True,
)
result = test_evaluator(model, output_path=output_path)

logger.info(f"\n")
logger.info(f"model path: {smodel_path}")
logger.info(f'=== result: {result} ===')
logger.info(f'=== 처리시간: {time.time() - start:.3f} 초 ===')
logger.info("==============================================")
logger.info("\n")

In [None]:
# 마지막 model 저장
#output_path = "../../data11/model/sbert/sbert-mdistilbertV3.1-last"
#model.save(output_path)