In [1]:
#======================================================================================================
# sentence-bert STS 데이터셋을 가지고, 훈련 및 평가 예시
#
# => 기존 (distil)bert 모델을 가지고, STS 훈련 및 평가 후, S-BERT로 만드는 예시임.

#=> 필요에 따라 출력 dimension을 768보다 작게 줄이고 싶을때 dense 모델을 추가해서 줄일수 있음
#=> reduce_out_dimension = True 로 하면, 출력 임베딩 dimension이 줄어들게 설정가능함

# => sentence-transformers 패키지를 이용하여 구현 함.(*pip install -U sentence-transformers 설치 필요)
#
# **learning rate는 기본이 2e-5임
#
# 도큐먼트 : https://www.sbert.net/index.html
# 소스참고 : https://github.com/BM-K/KoSentenceBERT-ETRI
#
# pip install -U sentence-transformers
#
# # ** skt/kobert-base-V1  sbert 만들고 나서는 tokenizer_config.json 에 tokenizer_class:"KoBERTTokenizer" 를 tokenizer_class:"XLNetTokenizer" 로 변경해야함.
#==========================================================================================================

import torch.nn as nn
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import SimilarityFunction, EmbeddingSimilarityEvaluator
from datetime import datetime
import sys
import os
import gzip
import csv

sys.path.append('..')
from myutils import seed_everything, GPU_info, mlogging

logger = mlogging(loggername="s-bert-sts", logfilename="../../log/s-bert-sts")
device = GPU_info()



logfilepath:../../log/s-bert-sts_2023-01-05.log
True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30


In [2]:
import os
from transformers import BertTokenizer, BertTokenizerFast

# ** skt/kobert-base-V1  sbert 만들고 나서는 tokenizer_config.json 에 tokenizer_class:"KoBERTTokenizer" 를 tokenizer_class:"XLNetTokenizer" 로 변경해야함.
bisSKKobertModel = 0  # skt/kobert-base-V1 허깅페이스 모델 사용시에는 1로 해줌
# s-bert로 만들 원본 bert 경로
model_path = '../../data11/model/moco/sbert-klue-nli-mean-64-2-sts-128/'#"skt/kobert-base-v1"
#model_path = "kykim/albert-kor-base"

# 원본 bert를 sentencebert로 만든후 만들어진 s-bert 저장 경로
# => **해당 경로\eval 폴더에 similarity_evaluation_sts-dev_result.csv 파일로 각 epoch 마다 평가된 결과가 기록된다.
#smodel_path = 'output/training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
#smodel_path = '../../data11/model/bert/mbertV3.0-aihub-NSPMLM-checkout/checkpoint-4129542-sts-b128-lower-bias'#'../../data11/model/sbert/mdistilbertV3.1-sts-b32-lower'
smodel_path = '../../data11/model/moco/sbert-klue-nli-mean-64-2-sts-128-2'
#=======================================================================================================
train_batch_size = 128
eval_batch_size = 64
num_epochs = 128      # 128 정도 해도 최상의 모델을 찾을수 있음 (*sbert는 eval이 최상인 모델이 out모델로 저장됨)
max_seq_length = 72
lr = 3e-5            # default=2e-5
eps = 1e-6           #lr이 0으로 나뉘어져 계산이 엉키는 것을 방지하기 위해 epsilion
seed=111

# 임베딩 벡터 폴링 모드 선택 (*아래값중 문자열로 입력함, 기본=mean)
# mean=단어 평균, max=최대값, cls=문장, 
#['mean', 'max', 'cls', 'weightedmean', 'lasttoken']
pooling_mode = 'mean'

do_lower_case_param = True # true = 대.소문자 구분없이 모두 소문자로 변환(*한국어는 True해도 상관없음)

# sentence_transformers 2.2.2 부터는 'correct_bias' 인자가 없어졌음. => correct_bias : False 하면 sts 성능이 떨어짐(*원인 모름)
use_correct_bias = 0

if use_correct_bias == 0:
    opt_params = {'lr': lr, 'eps': eps}  # defalut
else:
    opt_params = {'lr': lr, 'eps': eps, 'correct_bias': False}
    print(f'**correct_bias:False')
    
# 평가 유사도 측정방식(COSINE, EUCLIDEAN, MANHATTAN, DOT_PRODUCT 중 선택 , 모두 spearman 방식임)
# => None 이면 아래 값들중 MAX 값 추력함
#main_similarity = None
main_similarity = SimilarityFunction.COSINE
#main_similarity = SimilarityFunction.EUCLIDEAN
#main_similarity = SimilarityFunction.MANHATTAN
#main_similarity = SimilarityFunction.DOT_PRODUCT

#=======================================================================================================

use_korsts = 1     # 한국어 korsts 파일 (tsv 5,749개)
use_kluests = 1    # 한국어 kluests_v1.1 파일 (json 11,668개)
use_sts17 = 1      # 한국어 sts17-crosslingual-sts (jsonl 2,846개)
use_glue_sts = 1   # 영어 glue_sts (load_dataset 5,749개)
use_en_sts = 0     # 영어 stsb_multi_mt(load_dataset 15,676개) = stsb_multi_mt(5,749개) + mteb/sickr-sts(9,927개)

# KorSTS 학습, 평가 파일들
train_korsts_file = '../../data11/korpora/korsts/tune_train.tsv'
eval_korsts_file = '../../data11/korpora/korsts/tune_dev.tsv'

# KlueSTS 학습, 평가 파일들
train_kluests_file = '../../data11/korpora/klue-sts/klue-sts-v1.1_train.json'
eval_kluests_file = '../../data11/korpora/klue-sts/klue-sts-v1.1_dev.json'

# sts17-crosslingual-sts  학습 파일(*평가파일 없음)
train_sts17_file = '../../data11/korpora/sts17-crosslingual-sts/ko-ko.jsonl'


#============================================================================
# *출력 dimension을 줄일 경우에는 True로 하고, out_dimension에 줄일 값을 설정함
reduce_out_dimension = False  # True이면 dimension을 줄임=>Dense 모델 추가됨
out_dimension = 128
#============================================================================

seed_everything(seed)

#==========================================================================================================
# 모델 설정
# => * 훈련시킬 모델이 이미 sentencebert일지라도, 아래처럼 SentenceTransformer(model_path) 이용하지 않고, 
# word_embedding_model, pooling_model 을 각각 만들어서 처리하는것이 테스트 시 효율의 좋음
#
# [모델 생성 방법]
# 1) word_embedding 모델 생성
# 2) pooling 모델 생성 : pooling 정책을 설정함 : CLS, 평균, MAX 정책중 택1(*평균 정책이 효율의 가장 좋다고 함)
# 3) 1) + 2) 모델을 연결시켜서 하나의 sbert 모델 만듬
#==========================================================================================================

# 모델과 tokenizer 를 불러옴
# => **사전파일(vocab.txt, *.json) 와 model 경로(config.json, pytorch_model.bin)가 같은 경로에 있어야 함.
#========================================================================================================
# **tokenier가 모델과 다른 경우에는 tokenizer_path 를 모델과 같은 임이 tokenizer로 설정함.
# => 모델과 동일한 tokenizer를 로딩한 후, 밑에서 다시 word_embedding_model.tokenizer를 다시 설정함.
# => 이렇게 하는 이유는 Transformer 함수 내부에서 AutoTokenizer 를 호출시 모델과 동일한 tokenizer를 자동으로 호출하므로 에러 발생해서 
# 꽁수로 모델과 동일한 tokenizer 로딩후, 아래에서 다시 설정하는 것임
tokenizer_path = model_path #'bongsoo/albert-small-kor-v1' # model_Path
#========================================================================================================
word_embedding_model = models.Transformer(model_path, max_seq_length=max_seq_length, do_lower_case=do_lower_case_param, tokenizer_name_or_path=tokenizer_path)

#========================================================================================================
# **tokenier가 모델과 다른 경우에는 word_embedding_model.tokenizer 다시 설정함.
# => 예: 모델은 albert 인데, tokenizer는 berttokenizer 인 경우(원래는 AlbertTokenizer 이어야 함)
#word_embedding_model.tokenizer = BertTokenizerFast.from_pretrained(model_path)
#========================================================================================================

#========================================================================================================
# skt/kobert 모델은 tokenizer을 XLNET Tokenizer 이므로, 자체 KoBERTTOkenizer 를 불러와서 사용해야 함.
# => 설치 : !pip install 'git+https://github.com/SKTBrain/KOBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
# => 출처 : https://velog.io/@m0oon0/KoBERT-%EC%82%AC%EC%9A%A9%EB%B2%95
if bisSKKobertModel == 1:
    from kobert_tokenizer import KoBERTTokenizer
    word_embedding_model.tokenizer = KoBERTTokenizer.from_pretrained(model_path)
    print(f'load koBertTokenizer:{word_embedding_model.tokenizer}')
#========================================================================================================

# embedding 길이를 재조정 필요할때 auto_model.resize_token_embeddings 해줌
print(f'token_len:{len(word_embedding_model.tokenizer)}')
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

# word embedding_model 출력 
print(word_embedding_model)

token_len:30000
Transformer({'max_seq_length': 72, 'do_lower_case': True}) with Transformer model: AlbertModel 


In [3]:
# 2 bert 모델의 임베딩 풀링 정책을 설정(cls 이용, 워드임베딩 평균이용, 워드임베딩 max 이용)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),  #모델이 dimension(768)
                               pooling_mode=pooling_mode)  
# pooling model 출력 
print(pooling_model)
print(pooling_model.get_sentence_embedding_dimension())

Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
768


In [4]:
# 3. dense 모델 추가(옵션)
#=> 필요에 따라 출력 dimension을 768보다 작게 줄이고 싶을때 dense 모델을 추가해서 줄임.
#=> https://www.sbert.net/docs/training/overview.html?highlight=dense 참조
if reduce_out_dimension:
    dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), # 입력 dimension은 앞에 pooling모델 embedding dimension으로 지정
                               out_features=out_dimension,  # 출력 dimension
                               activation_function=nn.Tanh())  # activation function은 Tahn으로 정의

In [5]:
# SBERT 모델 생성
if reduce_out_dimension:
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])
else:
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 72, 'do_lower_case': True}) with Transformer model: AlbertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)


In [6]:
import json
from datasets import load_dataset

train_samples = []

####################################################################################################
# KorSTS 훈련 데이터 셋 설정(.tsv 파일)
####################################################################################################
if use_korsts == True:
    count = 0
    logger.info(f"Read STS train dataset=>{train_korsts_file}")
    with open(train_korsts_file, "rt", encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            text_a, text_b, score = line.split('\t')
            score = score.strip()
            score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함
            
            if count < 3:
                print(f"{text_a}, {text_b}, {score}")
                
            train_samples.append(InputExample(texts= [text_a,text_b], label=score))
            count += 1
    logger.info(f'*{train_korsts_file} count: {count}')
####################################################################################################

####################################################################################################
# klue 훈련 데이터 셋 설정(.json 파일)
# => 아래처럼 load_dataset으로 불러와서 사용할수도 있음.
# datas = load_dataset("klue", "sts", split="train")
# for data in datas:
#        text_a = data["sentence1"]
#        text_b = data["sentence2"]
#        score = data["labels"]["label"]
#        score = float(score) / 5.0  
###################################################################################################           
if use_kluests == True:  
    count = 0
    logger.info(f"Read STS train dataset=>{train_kluests_file}")
    with open(train_kluests_file, "rt", encoding="utf-8") as f:
        datas = json.load(f)
        for data in datas:
            text_a = data["sentence1"]
            text_b = data["sentence2"]
            score = data["labels"]["label"]
            score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함

            if count < 3:
                print(f"{text_a}, {text_b}, {score}")

            train_samples.append(InputExample(texts= [text_a,text_b], label=score))
            count += 1
    logger.info(f'*{train_kluests_file} len: {count}')
####################################################################################################
# 한국어 sts17-crosslingual-sts 훈련 데이터셋 설정
# => jsonl : 여러개의 json 형식 파일이 각 줄마다 기록되어 있는 형태 파일
# => 패키지 설치 : !pip install jsonlines
####################################################################################################
if use_sts17 == True:
    import jsonlines
    count = 0
    logger.info(f"Read STS train dataset=>{train_sts17_file}")
    with jsonlines.open(train_sts17_file, "r") as f:
        for line in f:
            text_a = line["sentence1"]
            text_a = line["sentence2"]
            score = line["score"]
            score = float(score) / 5.0
            
            if count < 3:
                print(f"{text_a}, {text_b}, {score}")
                
            train_samples.append(InputExample(texts= [text_a,text_b], label=score))
            count += 1
            
    logger.info(f'*{train_kluests_file} len: {count}')
####################################################################################################

#############################################################################################
# 영문 sts 데이터셋 설정 (load_dataset)
# => stsb_multi_mt , mteb/sickr-sts 영문 sts 훈련 데이터 셋 불러오기
#############################################################################################
if use_en_sts == True:
    count = 0
    en_sts_dataset = load_dataset("stsb_multi_mt", name="en", split="train")
    for data in en_sts_dataset:
        text_a = data["sentence1"]
        text_b = data["sentence2"]
        score = data["similarity_score"]
        score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함

        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts= [text_a,text_b], label=score))
        count += 1
    logger.info(f'*stsb_multi_mt_en len: {count}')
    
    # mteb/sickr-sts 훈련데이터 불러옴
    count = 0    
    en_sts_dataset = load_dataset("mteb/sickr-sts", split="test")
    for data in en_sts_dataset:
        text_a = data["sentence1"]
        text_b = data["sentence2"]
        score = data["score"]
        score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함

        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts= [text_a,text_b], label=score))
        count += 1
    logger.info(f'*mteb/sickr-sts len: {count}')
#############################################################################################           
 
#############################################################################################
# GLUE STS 훈련 데이터셋 설정 (load_dataset)
#############################################################################################
if use_glue_sts == True:
    # glue stsb 훈련데이터 불러옴(5,749개)
    count = 0    
    en_sts_dataset = load_dataset("glue","stsb", split="train")
    for data in en_sts_dataset:
        text_a = data["sentence1"]
        text_b = data["sentence2"]
        score = data["label"]
        score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함

        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts= [text_a,text_b], label=score))
        count += 1
    logger.info(f'*glue_stsb len: {count}')
#############################################################################################

logger.info(f'------------------------------------------------------------------------')        
logger.info(f'*train_samples_len:{len(train_samples)}')
print(train_samples[0:3])

2023-01-05 13:01:40,304 - s-bert-sts - INFO - Read STS train dataset=>../../data11/korpora/korsts/tune_train.tsv
2023-01-05 13:01:40,321 - s-bert-sts - INFO - *../../data11/korpora/korsts/tune_train.tsv count: 5749
2023-01-05 13:01:40,322 - s-bert-sts - INFO - Read STS train dataset=>../../data11/korpora/klue-sts/klue-sts-v1.1_train.json


비행기가 이륙하고 있다., 비행기가 이륙하고 있다., 1.0
한 남자가 큰 플루트를 연주하고 있다., 남자가 플루트를 연주하고 있다., 0.76
한 남자가 피자에 치즈를 뿌려놓고 있다., 한 남자가 구운 피자에 치즈 조각을 뿌려놓고 있다., 0.76


2023-01-05 13:01:40,621 - s-bert-sts - INFO - *../../data11/korpora/klue-sts/klue-sts-v1.1_train.json len: 11668
2023-01-05 13:01:40,628 - s-bert-sts - INFO - Read STS train dataset=>../../data11/korpora/sts17-crosslingual-sts/ko-ko.jsonl
2023-01-05 13:01:40,644 - s-bert-sts - INFO - *../../data11/korpora/klue-sts/klue-sts-v1.1_train.json len: 2846


숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다., 숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다., 0.74
위반행위 조사 등을 거부·방해·기피한 자는 500만원 이하 과태료 부과 대상이다., 시민들 스스로 자발적인 예방 노력을 한 것은 아산 뿐만이 아니었다., 0.0
회사가 보낸 메일은 이 지메일이 아니라 다른 지메일 계정으로 전달해줘., 사람들이 주로 네이버 메일을 쓰는 이유를 알려줘, 0.06
안전모를 쓴 한 남자가 춤을 추고 있다., 학회 홍보 메일은 회신 하지마, 1.0
아이가 말을 타고 있다., 학회 홍보 메일은 회신 하지마, 0.95
남자가 뱀에게 쥐를 먹이고 있다., 학회 홍보 메일은 회신 하지마, 1.0


Reusing dataset glue (/MOCOMSYS/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


A plane is taking off., An air plane is taking off., 1.0
A man is playing a large flute., A man is playing a flute., 0.7599999904632568
A man is spreading shreded cheese on a pizza., A man is spreading shredded cheese on an uncooked pizza., 0.7599999904632568


2023-01-05 13:01:42,676 - s-bert-sts - INFO - *glue_stsb len: 5749
2023-01-05 13:01:42,678 - s-bert-sts - INFO - ------------------------------------------------------------------------
2023-01-05 13:01:42,680 - s-bert-sts - INFO - *train_samples_len:26012


[<sentence_transformers.readers.InputExample.InputExample object at 0x7fea3fc44040>, <sentence_transformers.readers.InputExample.InputExample object at 0x7fea3bb8de50>, <sentence_transformers.readers.InputExample.InputExample object at 0x7fea3bb8d970>]


In [7]:
# 데이터 셋, 데이터 로더, 손실함수 정의

train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


In [8]:
#Read STSbenchmark dataset and use it as development set
# 평가데이터 불러오기
#korsts 파일로 두 문장간 유사도를 수치로(5.0이 만점=매우 유사) 측정함.
dev_samples = []

####################################################################################################
# KorSTS 평가 데이터 셋 설정(.tsv 파일)
####################################################################################################
if use_korsts == True:
    count = 0
    logger.info(f"Read STS dev dataset=>{eval_korsts_file}")
    with open(eval_korsts_file, 'rt', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            text_a, text_b, score = line.split('\t')
            score = score.strip()
            score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함
            
            if count < 5:
                print(f"{text_a}, {text_b}, {score}")
            
            dev_samples.append(InputExample(texts= [text_a,text_b], label=score))
            count += 1
    logger.info(f'*{eval_korsts_file} len: {count}')
####################################################################################################  

####################################################################################################
# KlueSTS 평가 데이터 셋 설정(.json 파일)
# => 아래처럼 load_dataset으로 불러와서 사용할수도 있음.
# datas = load_dataset("klue", "sts", split="test")
# for data in datas:
#        text_a = data["sentence1"]
#        text_b = data["sentence2"]
#        score = data["labels"]["label"]
#        score = float(score) / 5.0  
####################################################################################################           
if use_kluests == True:
    count = 0
    logger.info(f"Read STS dev dataset=>{eval_kluests_file}")
    with open(eval_kluests_file, "rt", encoding="utf-8") as f:
        datas = json.load(f)
        for data in datas:
            text_a = data["sentence1"]
            text_b = data["sentence2"]
            score = data["labels"]["label"]
            score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함

            if count < 5:
                print(f"{text_a}, {text_b}, {score}")

            dev_samples.append(InputExample(texts= [text_a,text_b], label=score))
            count += 1
    logger.info(f'*{eval_kluests_file} len: {count}')
####################################################################################################  

####################################################################################################
# 영문 stsb_multi_mt 데이터 셋 설정(load_dataset)
####################################################################################################                
# stsb_multi_mt 영문 sts dev 데이터 셋 불러오기
if use_en_sts == True:
    count = 0
    en_sts_dataset = load_dataset("stsb_multi_mt", name="en", split="dev")
    for data in en_sts_dataset:
        text_a = data["sentence1"]
        text_b = data["sentence2"]
        score = data["similarity_score"]
        score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함

        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        dev_samples.append(InputExample(texts= [text_a,text_b], label=score))
        count += 1
    logger.info(f'*stsb_multi_mt len: {count}')
####################################################################################################  

####################################################################################################
# 영문 GLUE 데이터 셋 설정(load_dataset)
####################################################################################################  
if use_glue_sts == True:
    count = 0
    glue_stsb_dataset = load_dataset("glue","stsb", split="validation")
    for data in glue_stsb_dataset:
        text_a = data["sentence1"]
        text_b = data["sentence2"]
        score = data["label"]
        score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함
        
        if count < 3:
            print(f"{text_a}, {text_b}, {score}")
            
        dev_samples.append(InputExample(texts= [text_a,text_b], label=score))
        count += 1
    logger.info(f'*glue-stsb len: {count}')
####################################################################################################  

logger.info(f'------------------------------------------------------------------------')        
logger.info(f'*dev_samples_len:{len(dev_samples)}')
print(dev_samples[0:3])

# 2개의 bert 모델에서 구한 2개의 embedding 값들의 cosine 유사도를 구해서, 이를 실제 score와 비교해서 유사도 측정함
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, 
                                                                 main_similarity=main_similarity,
                                                                 batch_size=eval_batch_size, 
                                                                 name='sts-dev')

2023-01-05 13:01:42,705 - s-bert-sts - INFO - Read STS dev dataset=>../../data11/korpora/korsts/tune_dev.tsv
2023-01-05 13:01:42,711 - s-bert-sts - INFO - *../../data11/korpora/korsts/tune_dev.tsv len: 1500
2023-01-05 13:01:42,712 - s-bert-sts - INFO - Read STS dev dataset=>../../data11/korpora/klue-sts/klue-sts-v1.1_dev.json
2023-01-05 13:01:42,727 - s-bert-sts - INFO - *../../data11/korpora/klue-sts/klue-sts-v1.1_dev.json len: 519


안전모를 가진 한 남자가 춤을 추고 있다., 안전모를 쓴 한 남자가 춤을 추고 있다., 1.0
어린아이가 말을 타고 있다., 아이가 말을 타고 있다., 0.95
한 남자가 뱀에게 쥐를 먹이고 있다., 남자가 뱀에게 쥐를 먹이고 있다., 1.0
한 여성이 기타를 연주하고 있다., 한 남자가 기타를 치고 있다., 0.48
한 여성이 플루트를 연주하고 있다., 남자가 플루트를 연주하고 있다., 0.55
무엇보다도 호스트분들이 너무 친절하셨습니다., 무엇보다도, 호스트들은 매우 친절했습니다., 0.9800000000000001
주요 관광지 모두 걸어서 이동가능합니다., 위치는 피렌체 중심가까지 걸어서 이동 가능합니다., 0.27999999999999997
학생들의 균형 있는 영어능력을 향상시킬 수 있는 학교 수업을 유도하기 위해 2018학년도 수능부터 도입된 영어 영역 절대평가는 올해도 유지한다., 영어 영역의 경우 학생들이 한글 해석본을 암기하는 문제를 해소하기 위해 2016학년도부터 적용했던 EBS 연계 방식을 올해도 유지한다., 0.26
다만, 도로와 인접해서 거리의 소음이 들려요., 하지만, 길과 가깝기 때문에 거리의 소음을 들을 수 있습니다., 0.74
형이 다시 캐나다 들어가야 하니 가족모임 일정은 바꾸지 마세요., 가족 모임 일정은 바꾸지 말도록 하십시오., 0.5


Reusing dataset glue (/MOCOMSYS/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
2023-01-05 13:01:43,921 - s-bert-sts - INFO - *glue-stsb len: 1500
2023-01-05 13:01:43,922 - s-bert-sts - INFO - ------------------------------------------------------------------------
2023-01-05 13:01:43,923 - s-bert-sts - INFO - *dev_samples_len:3519


A man with a hard hat is dancing., A man wearing a hard hat is dancing., 1.0
A young child is riding a horse., A child is riding a horse., 0.95
A man is feeding a mouse to a snake., The man is feeding a mouse to the snake., 1.0
[<sentence_transformers.readers.InputExample.InputExample object at 0x7fea45a41310>, <sentence_transformers.readers.InputExample.InputExample object at 0x7fea45a441f0>, <sentence_transformers.readers.InputExample.InputExample object at 0x7fea45a44250>]


In [9]:
#warmup_step은 10% 로 설정
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) 
#warmup_steps = 0 # kcbert-config 참조함 

# evaluation_steps은 20%로 설정
evaluation_steps = int(len(train_dataset) * num_epochs / train_batch_size * 0.2)

logger.info(f"*IN-model:{model_path}")
logger.info(f"*OUT-model:{smodel_path}")
logger.info("*seed:{}, train_batch:{}, eval_batch:{}, epoch:{}, lr:{}, eps:{}, max_seq_length:{}, train_dataset:{}, Warmup-steps: {}, evaluation_step: {}".format(seed, train_batch_size, eval_batch_size, num_epochs, lr, eps, max_seq_length, len(train_dataset), warmup_steps, evaluation_steps))
logger.info(f"*do_lower_case:{do_lower_case_param}, use_correct_bias:{use_correct_bias}")

# Train the model
# => **learning rate는 기본이 2e-5임
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=evaluation_steps,
          warmup_steps=warmup_steps,
          optimizer_params= opt_params, 
          save_best_model=True, # **기본 = True : eval 가장 best 모델을 output_Path에 저장함
          output_path=smodel_path
          )


2023-01-05 13:01:43,933 - s-bert-sts - INFO - *IN-model:../../data11/model/moco/sbert-albert-small-nli-cls-64/
2023-01-05 13:01:43,936 - s-bert-sts - INFO - *OUT-model:../../data11/model/moco/sbert-albert-small-nli-cls-64-sts-128
2023-01-05 13:01:43,937 - s-bert-sts - INFO - *seed:111, train_batch:128, eval_batch:64, epoch:128, lr:3e-05, eps:1e-06, max_seq_length:72, train_dataset:26012, Warmup-steps: 2602, evaluation_step: 5202
2023-01-05 13:01:43,938 - s-bert-sts - INFO - *do_lower_case:True, use_correct_bias:0


Epoch:   0%|          | 0/128 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

Iteration:   0%|          | 0/204 [00:00<?, ?it/s]

In [10]:
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
# => 훈련되어서 저장된 s-bert 모델을 불러와서 성능 평가 해봄
##############################################################################
import time
from sentence_transformers.evaluation import SimilarityFunction

# 테스트 파일=KorSTS 테스트파일 경로 지정
test_file = '../../data11/korpora/korsts/tune_test.tsv'

# 평가시 cosine 유사도등 측정 결과값 파일 (similarity_evaluation_xxxx.xls) 저장될 경로
output_path = '../../log'
os.makedirs(output_path, exist_ok=True)

test_samples = []
with open(test_file, 'rt', encoding='utf-8') as fIn:
    lines = fIn.readlines()
    for line in lines:
        s1, s2, score = line.split('\t')
        score = score.strip()
        score = float(score) / 5.0
        test_samples.append(InputExample(texts=[s1,s2], label=score))

logger.info("\n")
logger.info("======================TEST===================")
logger.info("\n\n")
logger.info(f"model save path > {smodel_path}")
start = time.time()
model = SentenceTransformer(smodel_path)

# 유사도 측정방식(COSINE, EUCLIDEAN, MANHATTAN, DOT_PRODUCT 중 선택 , 모두 spearman 방식임)
# => None 이면 아래 값들중 MAX 값 추력함
#main_similarity = None
main_similarity = SimilarityFunction.COSINE
#main_similarity = SimilarityFunction.EUCLIDEAN
#main_similarity = SimilarityFunction.MANHATTAN
#main_similarity = SimilarityFunction.DOT_PRODUCT

logger.info(f"main_similarity: {main_similarity}")

test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, main_similarity=main_similarity, batch_size=eval_batch_size, name='sts-test', show_progress_bar=True)
result = test_evaluator(model, output_path=output_path)

logger.info(f"\n")
logger.info(f"model path: {smodel_path}")
logger.info(f'=== result: {result} ===')
logger.info(f'=== 처리시간: {time.time() - start:.3f} 초 ===')
logger.info("==============================================")
logger.info("\n")

2023-01-05 14:55:00,103 - s-bert-sts - INFO - 

2023-01-05 14:55:00,105 - s-bert-sts - INFO - 


2023-01-05 14:55:00,105 - s-bert-sts - INFO - model save path > ../../data11/model/moco/sbert-albert-small-nli-cls-64-sts-128
2023-01-05 14:55:00,271 - s-bert-sts - INFO - main_similarity: SimilarityFunction.COSINE


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2023-01-05 14:55:01,120 - s-bert-sts - INFO - 

2023-01-05 14:55:01,121 - s-bert-sts - INFO - model path: ../../data11/model/moco/sbert-albert-small-nli-cls-64-sts-128
2023-01-05 14:55:01,122 - s-bert-sts - INFO - === result: 0.8420636980852383 ===
2023-01-05 14:55:01,123 - s-bert-sts - INFO - === 처리시간: 1.017 초 ===
2023-01-05 14:55:01,124 - s-bert-sts - INFO - 



In [11]:
# 마지막 model 저장
#output_path = "../../data11/model/sbert/sbert-mdistilbertV3.1-last"
#model.save(output_path)